* [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-13 17:15 [PATCH 0/11] Add support for write life time hints Jens Axboe
@ 2017-06-13 17:15 ` Jens Axboe
2017-06-13 19:01 ` Andreas Dilger
0 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-13 17:15 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: Jens Axboe
No functional changes in this patch, but it allows each bio to
carry a stream ID and disallows merging of bio's with different
stream IDs.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/bio.c | 2 ++
block/blk-merge.c | 14 ++++++++++++++
include/linux/blk_types.h | 17 +++++++++++++++++
3 files changed, 33 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index 888e7801c638..77f4be1f7428 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -595,6 +595,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio->bi_opf = bio_src->bi_opf;
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
+ bio->bi_stream = bio_src->bi_stream;
bio_clone_blkcg_association(bio, bio_src);
}
@@ -678,6 +679,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
bio->bi_opf = bio_src->bi_opf;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+ bio->bi_stream = bio_src->bi_stream;
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..28998acdd675 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -693,6 +693,13 @@ static struct request *attempt_merge(struct request_queue *q,
return NULL;
/*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if (req->bio->bi_stream != next->bio->bi_stream)
+ return NULL;
+
+ /*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
@@ -811,6 +818,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
+ /*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if (rq->bio->bi_stream != bio->bi_stream)
+ return false;
+
return true;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 61339bc44400..1940876a7fa7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -36,6 +36,8 @@ struct bio {
unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
+ unsigned int bi_stream; /* write life time hint */
+
struct bvec_iter bi_iter;
/* Number of segments in this BIO after
@@ -312,4 +314,19 @@ struct blk_rq_stat {
u64 batch;
};
+static inline void bio_set_streamid(struct bio *bio, unsigned int stream)
+{
+ bio->bi_stream = stream;
+}
+
+static inline bool bio_stream_valid(struct bio *bio)
+{
+ return bio->bi_stream != 0;
+}
+
+static inline unsigned int bio_stream(struct bio *bio)
+{
+ return bio->bi_stream;
+}
+
#endif /* __LINUX_BLK_TYPES_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* Re: [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-13 17:15 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
@ 2017-06-13 19:01 ` Andreas Dilger
0 siblings, 0 replies; 40+ messages in thread
From: Andreas Dilger @ 2017-06-13 19:01 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block
[-- Attachment #1: Type: text/plain, Size: 3201 bytes --]
On Jun 13, 2017, at 11:15 AM, Jens Axboe <axboe@kernel.dk> wrote:
>
> No functional changes in this patch, but it allows each bio to
> carry a stream ID and disallows merging of bio's with different
> stream IDs.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
> ---
> block/bio.c | 2 ++
> block/blk-merge.c | 14 ++++++++++++++
> include/linux/blk_types.h | 17 +++++++++++++++++
> 3 files changed, 33 insertions(+)
>
> diff --git a/block/bio.c b/block/bio.c
> index 888e7801c638..77f4be1f7428 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -595,6 +595,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
> bio->bi_opf = bio_src->bi_opf;
> bio->bi_iter = bio_src->bi_iter;
> bio->bi_io_vec = bio_src->bi_io_vec;
> + bio->bi_stream = bio_src->bi_stream;
>
> bio_clone_blkcg_association(bio, bio_src);
> }
> @@ -678,6 +679,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
> bio->bi_opf = bio_src->bi_opf;
> bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
> bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
> + bio->bi_stream = bio_src->bi_stream;
>
> switch (bio_op(bio)) {
> case REQ_OP_DISCARD:
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 3990ae406341..28998acdd675 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -693,6 +693,13 @@ static struct request *attempt_merge(struct request_queue *q,
> return NULL;
>
> /*
> + * Don't allow merge of different streams, or for a stream with
> + * non-stream IO.
> + */
> + if (req->bio->bi_stream != next->bio->bi_stream)
> + return NULL;
> +
> + /*
> * If we are allowed to merge, then append bio list
> * from next to rq and release next. merge_requests_fn
> * will have updated segment counts, update sector
> @@ -811,6 +818,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
> !blk_write_same_mergeable(rq->bio, bio))
> return false;
>
> + /*
> + * Don't allow merge of different streams, or for a stream with
> + * non-stream IO.
> + */
> + if (rq->bio->bi_stream != bio->bi_stream)
> + return false;
> +
> return true;
> }
>
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 61339bc44400..1940876a7fa7 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -36,6 +36,8 @@ struct bio {
> unsigned short bi_flags; /* status, etc and bvec pool number */
> unsigned short bi_ioprio;
>
> + unsigned int bi_stream; /* write life time hint */
> +
> struct bvec_iter bi_iter;
>
> /* Number of segments in this BIO after
> @@ -312,4 +314,19 @@ struct blk_rq_stat {
> u64 batch;
> };
>
> +static inline void bio_set_streamid(struct bio *bio, unsigned int stream)
> +{
> + bio->bi_stream = stream;
> +}
> +
> +static inline bool bio_stream_valid(struct bio *bio)
> +{
> + return bio->bi_stream != 0;
> +}
> +
> +static inline unsigned int bio_stream(struct bio *bio)
> +{
> + return bio->bi_stream;
> +}
> +
> #endif /* __LINUX_BLK_TYPES_H */
> --
> 2.7.4
>
Cheers, Andreas
[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-14 19:05 [PATCHSET v3] " Jens Axboe
@ 2017-06-14 19:05 ` Jens Axboe
2017-06-14 20:37 ` Christoph Hellwig
0 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-14 19:05 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
No functional changes in this patch, we just add four flags
that will be used to denote a stream type, and ensure that we
don't merge across different stream types.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 16 ++++++++++++++++
include/linux/blk_types.h | 11 +++++++++++
2 files changed, 27 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..7d299df3b12b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -693,6 +693,14 @@ static struct request *attempt_merge(struct request_queue *q,
return NULL;
/*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((req->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (next->cmd_flags & REQ_WRITE_LIFE_MASK))
+ return NULL;
+
+ /*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
@@ -811,6 +819,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
+ /*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((rq->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (bio->bi_opf & REQ_WRITE_LIFE_MASK))
+ return false;
+
return true;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 61339bc44400..57d1eb530799 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -201,6 +201,10 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
+ __REQ_WRITE_SHORT, /* short life time write */
+ __REQ_WRITE_MEDIUM, /* medium life time write */
+ __REQ_WRITE_LONG, /* long life time write */
+ __REQ_WRITE_EXTREME, /* extremely long life time write */
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -221,6 +225,13 @@ enum req_flag_bits {
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
+#define REQ_WRITE_SHORT (1ULL << __REQ_WRITE_SHORT)
+#define REQ_WRITE_MEDIUM (1ULL << __REQ_WRITE_MEDIUM)
+#define REQ_WRITE_LONG (1ULL << __REQ_WRITE_LONG)
+#define REQ_WRITE_EXTREME (1ULL << __REQ_WRITE_EXTREME)
+
+#define REQ_WRITE_LIFE_MASK (REQ_WRITE_SHORT | REQ_WRITE_MEDIUM | \
+ REQ_WRITE_LONG | REQ_WRITE_EXTREME)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* Re: [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-14 19:05 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
@ 2017-06-14 20:37 ` Christoph Hellwig
2017-06-14 20:44 ` Jens Axboe
0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-14 20:37 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
Btw, I think these could also easily map to DSM field in the NVMe
write command, except that these unfortunately mix in read information
as well.
> + __REQ_WRITE_SHORT, /* short life time write */
-> Frequent writes and infrequent reads to the LBA range indicated.
or
-> Frequent writes and frequent reads to the LBA range indicated.
> + __REQ_WRITE_MEDIUM, /* medium life time write */
-> Typical number of reads and writes expected for this LBA range.
> + __REQ_WRITE_LONG, /* long life time write */
-> Infrequent writes and infrequent reads to the LBA range indicated.
or
-> Infrequent writes and frequent reads to the LBA range indicated.
> + __REQ_WRITE_EXTREME, /* extremely long life time write */
-> One time write. E.g. command is due to virus scan, backup, file
copy, or archive.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-14 20:37 ` Christoph Hellwig
@ 2017-06-14 20:44 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-14 20:44 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, linux-block, adilger, martin.petersen
On 06/14/2017 02:37 PM, Christoph Hellwig wrote:
> Btw, I think these could also easily map to DSM field in the NVMe
> write command, except that these unfortunately mix in read information
> as well.
But that's the problem, they are read/write mixed flags. I'd much
rather keep them separate. If some application finds it useful
to specify read access patterns, we should have separate flags for
those imho.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCHSET v4] Add support for write life time hints
@ 2017-06-15 3:45 Jens Axboe
2017-06-15 3:45 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
` (11 more replies)
0 siblings, 12 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen
A new iteration of this patchset, previously known as write streams.
As before, this patchset aims at enabling applications split up
writes into separate streams, based on the perceived life time
of the data written. This is useful for a variety of reasons:
- With NVMe 1.3 compliant devices, the device can expose multiple
streams. Separating data written into streams based on life time
can drastically reduce the write amplification. This helps device
endurance, and increases performance. Testing just performed
internally at Facebook with these patches showed up to a 25%
reduction in NAND writes in a RocksDB setup.
- Software caching solutions can make more intelligent decisions
on how and where to place data.
Contrary to previous patches, we're not exposing numeric stream values anymore.
I've previously advocated for just doing a set of hints that makes sense
instead. See the coverage from the LSFMM summit this year:
https://lwn.net/Articles/717755/
This patchset attempts to do that. We define 4 flags for the pwritev2
system call:
RWF_WRITE_LIFE_SHORT Data written with this flag is expected to have
a high overwrite rate, or life time.
RWF_WRITE_LIFE_MEDIUM Longer life time than SHORT
RWF_WRITE_LIFE_LONG Longer life time than MEDIUM
RWF_WRITE_LIFE_EXTREME Longer life time than LONG
The idea is that these are relative values, so an application can
use them as they see fit. The underlying device can then place
data appropriately, or be free to ignore the hint. It's just a hint.
A branch based on current master can be pulled
from here:
git://git.kernel.dk/linux-block write-stream.4
Changes since v3:
- Change any naming of stream ID to write hint.
- Various little API changes, suggested by Christoph
- Cleanup the NVMe bits, dump the debug info.
- Change NVMe to lazily allocate the streams.
- Various NVMe error handling improvements and command checking.
Changes since v2:
- Get rid of bio->bi_stream and replace with four request/bio flags.
These map directly to the RWF_WRITE_* flags that the user passes in.
- Cleanup the NVMe stream setting.
- Drivers now responsible for updating the queue stream write counter,
as they determine what stream to map a given flag to.
Changes since v1:
- Guard queue stream stats to ensure we don't mess up memory, if
bio_stream() ever were to return a larger value than we support.
- NVMe: ensure we set the stream modulo the name space defined count.
- Cleanup the RWF_ and IOCB_ flags. Set aside 4 bits, and just store
the stream value in there. This makes the passing of stream ID from
RWF_ space to IOCB_ (and IOCB_ to bio) more efficient, and cleans it
up in general.
- Kill the block internal definitions of the stream type, we don't need
them anymore. See above.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 01/11] block: add support for carrying stream information in a bio
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 02/11] blk-mq: expose stream write stats through debugfs Jens Axboe
` (10 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
No functional changes in this patch, we just add four flags
that will be used to denote a stream type, and ensure that we
don't merge across different stream types.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 16 ++++++++++++++++
include/linux/blk_types.h | 11 +++++++++++
2 files changed, 27 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..7d299df3b12b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -693,6 +693,14 @@ static struct request *attempt_merge(struct request_queue *q,
return NULL;
/*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((req->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (next->cmd_flags & REQ_WRITE_LIFE_MASK))
+ return NULL;
+
+ /*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
@@ -811,6 +819,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
+ /*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((rq->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (bio->bi_opf & REQ_WRITE_LIFE_MASK))
+ return false;
+
return true;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 61339bc44400..57d1eb530799 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -201,6 +201,10 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
+ __REQ_WRITE_SHORT, /* short life time write */
+ __REQ_WRITE_MEDIUM, /* medium life time write */
+ __REQ_WRITE_LONG, /* long life time write */
+ __REQ_WRITE_EXTREME, /* extremely long life time write */
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -221,6 +225,13 @@ enum req_flag_bits {
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
+#define REQ_WRITE_SHORT (1ULL << __REQ_WRITE_SHORT)
+#define REQ_WRITE_MEDIUM (1ULL << __REQ_WRITE_MEDIUM)
+#define REQ_WRITE_LONG (1ULL << __REQ_WRITE_LONG)
+#define REQ_WRITE_EXTREME (1ULL << __REQ_WRITE_EXTREME)
+
+#define REQ_WRITE_LIFE_MASK (REQ_WRITE_SHORT | REQ_WRITE_MEDIUM | \
+ REQ_WRITE_LONG | REQ_WRITE_EXTREME)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 02/11] blk-mq: expose stream write stats through debugfs
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
2017-06-15 3:45 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 8:16 ` Christoph Hellwig
2017-06-15 3:45 ` [PATCH 03/11] fs: add support for an inode to carry stream related data Jens Axboe
` (9 subsequent siblings)
11 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Useful to verify that things are working the way they should.
Reading the file will return number of kb written to each
stream. Writing the file will reset the statistics. No care
is taken to ensure that we don't race on updates.
Drivers will write to q->stream_writes[] if they handle a stream.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-mq-debugfs.c | 24 ++++++++++++++++++++++++
include/linux/blkdev.h | 3 +++
2 files changed, 27 insertions(+)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 803aed4d7221..0a37c848961d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -133,6 +133,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
}
}
+static int queue_streams_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ int i;
+
+ for (i = 0; i < BLK_MAX_STREAM; i++)
+ seq_printf(m, "stream%d: %llu\n", i, q->stream_writes[i]);
+
+ return 0;
+}
+
+static ssize_t queue_streams_store(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct request_queue *q = data;
+ int i;
+
+ for (i = 0; i < BLK_MAX_STREAM; i++)
+ q->stream_writes[i] = 0;
+
+ return count;
+}
+
static int queue_poll_stat_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
@@ -656,6 +679,7 @@ const struct file_operations blk_mq_debugfs_fops = {
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{"poll_stat", 0400, queue_poll_stat_show},
{"state", 0600, queue_state_show, queue_state_write},
+ {"streams", 0600, queue_streams_show, queue_streams_store},
{},
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ab92c4ea138b..88719c6f3edf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -586,6 +586,9 @@ struct request_queue {
size_t cmd_size;
void *rq_alloc_data;
+
+#define BLK_MAX_STREAM 5
+ u64 stream_writes[BLK_MAX_STREAM];
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 03/11] fs: add support for an inode to carry stream related data
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
2017-06-15 3:45 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
2017-06-15 3:45 ` [PATCH 02/11] blk-mq: expose stream write stats through debugfs Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 8:17 ` Christoph Hellwig
2017-06-15 3:45 ` [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints Jens Axboe
` (8 subsequent siblings)
11 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
No functional changes in this patch, just in preparation for
allowing applications to pass in hints about data life times
for writes.
Pack the i_write_hint field into a 2-byte hole, so we don't grow
the size of the inode.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/inode.c | 1 +
include/linux/fs.h | 9 +++++++++
2 files changed, 10 insertions(+)
diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..bd8bf44f3f31 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -149,6 +149,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_blocks = 0;
inode->i_bytes = 0;
inode->i_generation = 0;
+ inode->i_write_hint = 0;
inode->i_pipe = NULL;
inode->i_bdev = NULL;
inode->i_cdev = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9b2654..f4f9df8ed059 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -591,6 +591,7 @@ struct inode {
struct timespec i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
+ unsigned short i_write_hint;
unsigned int i_blkbits;
blkcnt_t i_blocks;
@@ -655,6 +656,14 @@ struct inode {
void *i_private; /* fs or device private pointer */
};
+static inline unsigned int inode_write_hint(struct inode *inode)
+{
+ if (inode)
+ return inode->i_write_hint;
+
+ return 0;
+}
+
static inline unsigned int i_blocksize(const struct inode *node)
{
return (1 << node->i_blkbits);
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (2 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 03/11] fs: add support for an inode to carry stream related data Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 4:15 ` Darrick J. Wong
2017-06-15 3:45 ` [PATCH 05/11] block: add helpers for setting/checking write hint validity Jens Axboe
` (7 subsequent siblings)
11 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Add four flags for the pwritev2(2) system call, allowing an application
to give the kernel a hint about what on-media life times can be
expected from a given write.
The intent is for these values to be relative to each other, no
absolute meaning should be attached to these flag names.
Define IOCB flags to carry this over this information from the pwritev2
RWF_WRITE_LIFE_* flags.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/read_write.c | 9 ++++++++-
include/linux/fs.h | 12 ++++++++++++
include/uapi/linux/fs.h | 10 ++++++++++
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d4484df9..9cb2314efca3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
struct kiocb kiocb;
ssize_t ret;
- if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
+ if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
return -EOPNOTSUPP;
init_sync_kiocb(&kiocb, filp);
@@ -688,6 +688,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
kiocb.ki_flags |= IOCB_DSYNC;
if (flags & RWF_SYNC)
kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+ if (flags & RWF_WRITE_LIFE_MASK) {
+ struct inode *inode = file_inode(filp);
+
+ inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
+ RWF_WRITE_LIFE_SHIFT;
+ kiocb.ki_flags |= inode->i_write_hint << IOCB_WRITE_LIFE_SHIFT;
+ }
kiocb.ki_pos = *ppos;
if (type == READ)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f4f9df8ed059..63798b67fcfe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -269,6 +269,12 @@ struct writeback_control;
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
+/*
+ * Steal 4-bits for stream information, this allows 16 valid streams
+ */
+#define IOCB_WRITE_LIFE_SHIFT 7
+#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9) | BIT(10))
+
struct kiocb {
struct file *ki_filp;
loff_t ki_pos;
@@ -292,6 +298,12 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
};
}
+static inline int iocb_write_hint(const struct kiocb *iocb)
+{
+ return (iocb->ki_flags & IOCB_WRITE_LIFE_MASK) >>
+ IOCB_WRITE_LIFE_SHIFT;
+}
+
/*
* "descriptor" for what we're up to with a read.
* This allows us to use the same read code yet
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 24e61a54feaa..58b7ee06b380 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -361,4 +361,14 @@ struct fscrypt_key {
#define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */
#define RWF_SYNC 0x00000004 /* per-IO O_SYNC */
+/*
+ * Data life time write flags, steal 4 bits for that
+ */
+#define RWF_WRITE_LIFE_SHIFT 4
+#define RWF_WRITE_LIFE_MASK 0x000000f0 /* 4 bits of stream ID */
+#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT)
+
#endif /* _UAPI_LINUX_FS_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 05/11] block: add helpers for setting/checking write hint validity
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (3 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 06/11] fs: add O_DIRECT support for sending down bio stream information Jens Axboe
` (6 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
We map the RWF_WRITE_* life time flags to the internal flags.
Drivers can then, in turn, map those flags to a suitable stream
type.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/bio.c | 16 ++++++++++++++++
include/linux/bio.h | 1 +
include/linux/blk_types.h | 5 +++++
3 files changed, 22 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index 888e7801c638..8804439945b2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2082,6 +2082,22 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
#endif /* CONFIG_BLK_CGROUP */
+static const unsigned int rwf_write_to_opf_flag[] = {
+ 0, REQ_WRITE_SHORT, REQ_WRITE_MEDIUM, REQ_WRITE_LONG, REQ_WRITE_EXTREME
+};
+
+/*
+ * 'stream_flags' is one of RWF_WRITE_LIFE_* values
+ */
+unsigned int bio_op_write_hint(unsigned int rwf_flags)
+{
+ if (WARN_ON_ONCE(rwf_flags >= ARRAY_SIZE(rwf_write_to_opf_flag)))
+ return 0;
+
+ return rwf_write_to_opf_flag[rwf_flags];
+}
+EXPORT_SYMBOL_GPL(bio_op_write_hint);
+
static void __init biovec_init_slabs(void)
{
int i;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d1b04b0e99cf..debd60d641dd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,6 +443,7 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
gfp_t, int);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
+extern unsigned int bio_op_write_hint(unsigned int rwf_flags);
void generic_start_io_acct(int rw, unsigned long sectors,
struct hd_struct *part);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 57d1eb530799..23646eb433e7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -323,4 +323,9 @@ struct blk_rq_stat {
u64 batch;
};
+static inline bool op_write_hint_valid(unsigned int opf)
+{
+ return (opf & REQ_WRITE_LIFE_MASK) != 0;
+}
+
#endif /* __LINUX_BLK_TYPES_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 06/11] fs: add O_DIRECT support for sending down bio stream information
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (4 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 05/11] block: add helpers for setting/checking write hint validity Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 07/11] fs: add support for buffered writeback to pass down write hints Jens Axboe
` (5 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/block_dev.c | 2 ++
fs/direct-io.c | 2 ++
fs/iomap.c | 1 +
3 files changed, 5 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 519599dddd36..de4301168710 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -239,6 +239,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
should_dirty = true;
} else {
bio.bi_opf = dio_bio_write_op(iocb);
+ bio.bi_opf |= bio_op_write_hint(iocb_write_hint(iocb));
task_io_account_write(ret);
}
@@ -374,6 +375,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio_set_pages_dirty(bio);
} else {
bio->bi_opf = dio_bio_write_op(iocb);
+ bio->bi_opf |= bio_op_write_hint(iocb_write_hint(iocb));
task_io_account_write(bio->bi_iter.bi_size);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea77de8..98874478ec8a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,6 +386,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
else
bio->bi_end_io = dio_bio_end_io;
+ bio->bi_opf |= bio_op_write_hint(iocb_write_hint(dio->iocb));
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/iomap.c b/fs/iomap.c
index 4b10892967a5..7e18e760e421 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -804,6 +804,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
if (dio->flags & IOMAP_DIO_WRITE) {
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ bio->bi_opf |= bio_op_write_hint(inode_write_hint(inode));
task_io_account_write(bio->bi_iter.bi_size);
} else {
bio_set_op_attrs(bio, REQ_OP_READ, 0);
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 07/11] fs: add support for buffered writeback to pass down write hints
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (5 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 06/11] fs: add O_DIRECT support for sending down bio stream information Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 08/11] ext4: add support for passing in write hints for buffered writes Jens Axboe
` (4 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/buffer.c | 14 +++++++++-----
fs/mpage.c | 1 +
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 161be58c5cb0..3faf73a71d4b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc);
+ unsigned int stream, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -1883,7 +1884,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc)
+ unsigned int write_hint, struct writeback_control *wbc)
{
struct bio *bio;
@@ -3134,6 +3136,8 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
+
+ op_flags |= bio_op_write_hint(write_hint);
bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
@@ -3142,7 +3146,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, NULL);
+ return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
diff --git a/fs/mpage.c b/fs/mpage.c
index baff8f820c29..df0635c8a512 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -614,6 +614,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
goto confused;
wbc_init_bio(wbc, bio);
+ bio->bi_opf |= bio_op_write_hint(inode_write_hint(inode));
}
/*
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 08/11] ext4: add support for passing in write hints for buffered writes
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (6 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 07/11] fs: add support for buffered writeback to pass down write hints Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 09/11] xfs: " Jens Axboe
` (3 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/ext4/page-io.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1a82138ba739..764bf0ddecd4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -349,6 +349,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0;
+ io_op_flags |= bio_op_write_hint(inode_write_hint(io->io_end->inode));
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -396,6 +397,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
+ io->io_bio->bi_opf |= bio_op_write_hint(inode_write_hint(inode));
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 09/11] xfs: add support for passing in write hints for buffered writes
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (7 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 08/11] ext4: add support for passing in write hints for buffered writes Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 10/11] btrfs: " Jens Axboe
` (2 subsequent siblings)
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/xfs/xfs_aops.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 09af0f7cd55e..fe11fe47d235 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -505,6 +505,7 @@ xfs_submit_ioend(
return status;
}
+ ioend->io_bio->bi_opf |= bio_op_write_hint(inode_write_hint(ioend->io_inode));
submit_bio(ioend->io_bio);
return 0;
}
@@ -564,6 +565,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ ioend->io_bio->bi_opf |= bio_op_write_hint(inode_write_hint(ioend->io_inode));
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 10/11] btrfs: add support for passing in write hints for buffered writes
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (8 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 09/11] xfs: " Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 11/11] nvme: add support for streams and directives Jens Axboe
2017-06-15 8:12 ` [PATCHSET v4] Add support for write life time hints Christoph Hellwig
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/btrfs/extent_io.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d3619e010005..2bc2dfca87c2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2826,6 +2826,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ op_flags |= bio_op_write_hint(inode_write_hint(page->mapping->host));
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* [PATCH 11/11] nvme: add support for streams and directives
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (9 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 10/11] btrfs: " Jens Axboe
@ 2017-06-15 3:45 ` Jens Axboe
2017-06-15 8:12 ` [PATCHSET v4] Add support for write life time hints Christoph Hellwig
11 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 3:45 UTC (permalink / raw)
To: linux-fsdevel, linux-block; +Cc: adilger, hch, martin.petersen, Jens Axboe
This adds support for Directives in NVMe, particular for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data,
so that it the device can do so most effiently. If an application is
managing and writing data with different life times, mixing differently
retentioned data onto the same locations on flash can cause write
amplification to grow. This, in turn, will reduce performance and
life time of the device.
We default to allocating 4 streams per name space, but it is
configurable with the 'streams_per_ns' module option. If a write stream
is set in a write, flag is as such before sending it to the device. The
streams are allocated lazily - if we get a write request with a life
time hint, then background allocate streams and use them once that
is done.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/core.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/nvme.h | 5 ++
include/linux/nvme.h | 48 +++++++++++++
3 files changed, 228 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 903d5813023a..30a6473b68cc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -65,6 +65,10 @@ static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+static char streams_per_ns = 4;
+module_param(streams_per_ns, byte, 0644);
+MODULE_PARM_DESC(streams_per_ns, "if available, allocate this many streams per NS");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
@@ -331,6 +335,151 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_MQ_RQ_QUEUE_OK;
}
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_send;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+ c.directive.dtype = NVME_DIR_IDENTIFY;
+ c.directive.tdtype = NVME_DIR_STREAMS;
+ c.directive.endir = NVME_DIR_ENDIR;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_probe_directives(struct nvme_ctrl *ctrl)
+{
+ struct streams_directive_params s;
+ struct nvme_command c;
+ int ret;
+
+ if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+ return 0;
+
+ ret = nvme_enable_streams(ctrl);
+ if (ret)
+ return ret;
+
+ memset(&c, 0, sizeof(c));
+ memset(&s, 0, sizeof(s));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.numd = sizeof(s);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+ c.directive.dtype = NVME_DIR_STREAMS;
+
+ ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, &s, sizeof(s));
+ if (ret)
+ return ret;
+
+ ctrl->nssa = le16_to_cpu(s.nssa);
+ return 0;
+}
+
+/*
+ * Returns number of streams allocated for use by this ns, or -1 on error.
+ */
+static int nvme_streams_allocate(struct nvme_ns *ns, unsigned int streams)
+{
+ struct nvme_command c;
+ union nvme_result res;
+ int ret;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(ns->ns_id);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_RESOURCE;
+ c.directive.dtype = NVME_DIR_STREAMS;
+ c.directive.endir = streams;
+
+ ret = __nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, &res, NULL, 0, 0,
+ NVME_QID_ANY, 0, 0);
+ if (ret)
+ return -1;
+
+ return le32_to_cpu(res.u32) & 0xffff;
+}
+
+static int nvme_streams_deallocate(struct nvme_ns *ns)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_send;
+ c.directive.nsid = cpu_to_le32(ns->ns_id);
+ c.directive.doper = NVME_DIR_SND_ST_OP_REL_RSC;
+ c.directive.dtype = NVME_DIR_STREAMS;
+
+ return nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0);
+}
+
+static void nvme_write_hint_work(struct work_struct *work)
+{
+ struct nvme_ns *ns = container_of(work, struct nvme_ns, write_hint_work);
+ int ret, nr_streams;
+
+ if (ns->nr_streams)
+ return;
+
+ nr_streams = streams_per_ns;
+ if (nr_streams > ns->ctrl->nssa)
+ nr_streams = ns->ctrl->nssa;
+
+ ret = nvme_streams_allocate(ns, nr_streams);
+ if (ret <= 0)
+ goto err;
+
+ ns->nr_streams = ret;
+ dev_info(ns->ctrl->device, "successfully enabled %d streams\n", ret);
+ return;
+err:
+ dev_info(ns->ctrl->device, "failed enabling streams\n");
+ ns->ctrl->failed_streams = true;
+}
+
+static void nvme_configure_streams(struct nvme_ns *ns)
+{
+ /*
+ * If we already called this function, we've either marked it
+ * as a failure or set the number of streams.
+ */
+ if (ns->ctrl->failed_streams)
+ return;
+ if (ns->nr_streams)
+ return;
+ schedule_work(&ns->write_hint_work);
+}
+
+static unsigned int nvme_get_write_stream(struct nvme_ns *ns,
+ struct request *req)
+{
+ unsigned int streamid = 0;
+
+ if (req->cmd_flags & REQ_WRITE_SHORT)
+ streamid = 1;
+ else if (req->cmd_flags & REQ_WRITE_MEDIUM)
+ streamid = 2;
+ else if (req->cmd_flags & REQ_WRITE_LONG)
+ streamid = 3;
+ else if (req->cmd_flags & REQ_WRITE_EXTREME)
+ streamid = 4;
+
+ req->q->stream_writes[streamid] += blk_rq_bytes(req) >> 9;
+
+ if (streamid <= ns->nr_streams)
+ return streamid;
+
+ /* for now just round-robin, do something more clever later */
+ return (streamid % (ns->nr_streams + 1));
+}
+
static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
@@ -351,6 +500,25 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ /*
+ * If we support streams and it isn't enabled, so so now. Until it's
+ * enabled, we won't flag the write with a stream. If we don't support
+ * streams, just ignore the life time hint.
+ */
+ if (req_op(req) == REQ_OP_WRITE && op_write_hint_valid(req->cmd_flags)) {
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ if (ns->nr_streams) {
+ unsigned int stream = nvme_get_write_stream(ns, req);
+
+ if (stream) {
+ control |= NVME_RW_DTYPE_STREAMS;
+ dsmgmt |= (stream << 16);
+ }
+ } else if (ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)
+ nvme_configure_streams(ns);
+ }
+
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
@@ -1650,6 +1818,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
nvme_configure_apst(ctrl);
+ nvme_probe_directives(ctrl);
ctrl->identified = true;
@@ -2049,6 +2218,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
+ INIT_WORK(&ns->write_hint_work, nvme_write_hint_work);
+
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
if (nvme_revalidate_ns(ns, &id))
@@ -2105,6 +2276,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
+ flush_work(&ns->write_hint_work);
+
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
@@ -2112,6 +2285,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
&nvme_ns_attr_group);
if (ns->ndev)
nvme_nvm_unregister_sysfs(ns);
+ if (ns->nr_streams)
+ nvme_streams_deallocate(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9d6a070d4391..918b6126d38b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -118,6 +118,7 @@ enum nvme_ctrl_state {
struct nvme_ctrl {
enum nvme_ctrl_state state;
bool identified;
+ bool failed_streams;
spinlock_t lock;
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
@@ -147,6 +148,7 @@ struct nvme_ctrl {
u16 oncs;
u16 vid;
u16 oacs;
+ u16 nssa;
atomic_t abort_limit;
u8 event_limit;
u8 vwc;
@@ -192,6 +194,7 @@ struct nvme_ns {
u8 uuid[16];
unsigned ns_id;
+ unsigned nr_streams;
int lba_shift;
u16 ms;
bool ext;
@@ -203,6 +206,8 @@ struct nvme_ns {
u64 mode_select_num_blocks;
u32 mode_select_block_len;
+
+ struct work_struct write_hint_work;
};
struct nvme_ctrl_ops {
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b625bacf37ef..8b2f5b140134 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -245,6 +245,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
@@ -295,6 +296,19 @@ enum {
};
enum {
+ NVME_DIR_IDENTIFY = 0x00,
+ NVME_DIR_STREAMS = 0x01,
+ NVME_DIR_SND_ID_OP_ENABLE = 0x01,
+ NVME_DIR_SND_ST_OP_REL_ID = 0x01,
+ NVME_DIR_SND_ST_OP_REL_RSC = 0x02,
+ NVME_DIR_RCV_ID_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_STATUS = 0x02,
+ NVME_DIR_RCV_ST_OP_RESOURCE = 0x03,
+ NVME_DIR_ENDIR = 0x01,
+};
+
+enum {
NVME_NS_FEAT_THIN = 1 << 0,
NVME_NS_FLBAS_LBA_MASK = 0xf,
NVME_NS_FLBAS_META_EXT = 0x10,
@@ -535,6 +549,7 @@ enum {
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_DTYPE_STREAMS = 1 << 4,
};
struct nvme_dsm_cmd {
@@ -604,6 +619,8 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_directive_send = 0x19,
+ nvme_admin_directive_recv = 0x1a,
nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
@@ -756,6 +773,24 @@ struct nvme_get_log_page_command {
__u32 rsvd14[2];
};
+struct nvme_directive_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __le32 numd;
+ __u8 doper;
+ __u8 dtype;
+ __le16 dspec;
+ __u8 endir;
+ __u8 tdtype;
+ __u16 rsvd15;
+
+ __u32 rsvd16[3];
+};
+
/*
* Fabrics subcommands.
*/
@@ -886,6 +921,18 @@ struct nvme_dbbuf {
__u32 rsvd12[6];
};
+struct streams_directive_params {
+ __u16 msl;
+ __u16 nssa;
+ __u16 nsso;
+ __u8 rsvd[10];
+ __u32 sws;
+ __u16 sgs;
+ __u16 nsa;
+ __u16 nso;
+ __u8 rsvd2[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -906,6 +953,7 @@ struct nvme_command {
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
struct nvme_dbbuf dbbuf;
+ struct nvme_directive_cmd directive;
};
};
--
2.7.4
^ permalink raw reply related [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 3:45 ` [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints Jens Axboe
@ 2017-06-15 4:15 ` Darrick J. Wong
2017-06-15 4:33 ` Jens Axboe
2017-06-15 11:24 ` Al Viro
0 siblings, 2 replies; 40+ messages in thread
From: Darrick J. Wong @ 2017-06-15 4:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
On Wed, Jun 14, 2017 at 09:45:05PM -0600, Jens Axboe wrote:
> Add four flags for the pwritev2(2) system call, allowing an application
> to give the kernel a hint about what on-media life times can be
> expected from a given write.
>
> The intent is for these values to be relative to each other, no
> absolute meaning should be attached to these flag names.
>
> Define IOCB flags to carry this over this information from the pwritev2
> RWF_WRITE_LIFE_* flags.
>
> Reviewed-by: Andreas Dilger <adilger@dilger.ca>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
> fs/read_write.c | 9 ++++++++-
> include/linux/fs.h | 12 ++++++++++++
> include/uapi/linux/fs.h | 10 ++++++++++
> 3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d4484df9..9cb2314efca3 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
> struct kiocb kiocb;
> ssize_t ret;
>
> - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
> + if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
> return -EOPNOTSUPP;
>
> init_sync_kiocb(&kiocb, filp);
> @@ -688,6 +688,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
> kiocb.ki_flags |= IOCB_DSYNC;
> if (flags & RWF_SYNC)
> kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
> + if (flags & RWF_WRITE_LIFE_MASK) {
> + struct inode *inode = file_inode(filp);
> +
> + inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
> + RWF_WRITE_LIFE_SHIFT;
Hmm, so once set, hints stick around until someone else sets a different
one. I suppose it's unlikely that you'd have two programs writing to
the same inode with different write hints, right?
Just wondering if anyone will be surprised that they thought they were
writing to an _EXTREME hint file but someone else switched it to _SHORT
on them.
Also, how does userspace query the write hint value once set?
> + kiocb.ki_flags |= inode->i_write_hint << IOCB_WRITE_LIFE_SHIFT;
> + }
> kiocb.ki_pos = *ppos;
>
> if (type == READ)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f4f9df8ed059..63798b67fcfe 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -269,6 +269,12 @@ struct writeback_control;
> #define IOCB_SYNC (1 << 5)
> #define IOCB_WRITE (1 << 6)
>
> +/*
> + * Steal 4-bits for stream information, this allows 16 valid streams
> + */
> +#define IOCB_WRITE_LIFE_SHIFT 7
> +#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9) | BIT(10))
> +
> struct kiocb {
> struct file *ki_filp;
> loff_t ki_pos;
> @@ -292,6 +298,12 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
> };
> }
>
> +static inline int iocb_write_hint(const struct kiocb *iocb)
> +{
> + return (iocb->ki_flags & IOCB_WRITE_LIFE_MASK) >>
> + IOCB_WRITE_LIFE_SHIFT;
> +}
> +
> /*
> * "descriptor" for what we're up to with a read.
> * This allows us to use the same read code yet
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 24e61a54feaa..58b7ee06b380 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -361,4 +361,14 @@ struct fscrypt_key {
> #define RWF_DSYNC 0x00000002 /* per-IO O_DSYNC */
> #define RWF_SYNC 0x00000004 /* per-IO O_SYNC */
>
> +/*
> + * Data life time write flags, steal 4 bits for that
> + */
> +#define RWF_WRITE_LIFE_SHIFT 4
> +#define RWF_WRITE_LIFE_MASK 0x000000f0 /* 4 bits of stream ID */
> +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT)
> +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT)
Should O_TMPFILE files ought to be created with i_write_hint =
RWF_WRITE_LIFE_SHORT by default?
--D
> +
> #endif /* _UAPI_LINUX_FS_H */
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 4:15 ` Darrick J. Wong
@ 2017-06-15 4:33 ` Jens Axboe
2017-06-15 8:19 ` Christoph Hellwig
2017-06-15 11:24 ` Al Viro
1 sibling, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 4:33 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
On 06/14/2017 10:15 PM, Darrick J. Wong wrote:
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d4484df9..9cb2314efca3 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>> struct kiocb kiocb;
>> ssize_t ret;
>>
>> - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
>> + if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_WRITE_LIFE_MASK))
>> return -EOPNOTSUPP;
>>
>> init_sync_kiocb(&kiocb, filp);
>> @@ -688,6 +688,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
>> kiocb.ki_flags |= IOCB_DSYNC;
>> if (flags & RWF_SYNC)
>> kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
>> + if (flags & RWF_WRITE_LIFE_MASK) {
>> + struct inode *inode = file_inode(filp);
>> +
>> + inode->i_write_hint = (flags & RWF_WRITE_LIFE_MASK) >>
>> + RWF_WRITE_LIFE_SHIFT;
>
> Hmm, so once set, hints stick around until someone else sets a different
> one. I suppose it's unlikely that you'd have two programs writing to
> the same inode with different write hints, right?
You'd hope so... There's really no good way to support that with
buffered writes. For the NVMe use case, you'd be no worse off than you
were without hints, however.
But I do think one change should be made above - we only reset the hint
if someone passes a new hint in. But we probably also want to do so for
the case where no hint is passed in, but one is currently set.
> Also, how does userspace query the write hint value once set?
It doesn't. Ideally this hint would be "for this write only", but that's
not really possible with deferred write back.
>> +/*
>> + * Data life time write flags, steal 4 bits for that
>> + */
>> +#define RWF_WRITE_LIFE_SHIFT 4
>> +#define RWF_WRITE_LIFE_MASK 0x000000f0 /* 4 bits of stream ID */
>> +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT)
>> +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT)
>
> Should O_TMPFILE files ought to be created with i_write_hint =
> RWF_WRITE_LIFE_SHORT by default?
The answer here is "it depends". If we're already using hints on that
device, then yes, O_TMPFILE is a clear candidate for
RWF_WRITE_LIFE_SHORT. However, if we are not, then we should not set it
as it may have implications on how the device manages writes. For that
case we'll potentially only be driving a single stream, short writes,
and that may not be enough to saturate device bandwidth.
I would rather leave that for future experimentation. There are similar
things we can do with short lived writes, like apply them to the log
writes in the file system. But all of that should be built on top of
what we end up agreeing on, not included from the get-go. I'd rather get
the basic usage and model going first before we further complicate
matters.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCHSET v4] Add support for write life time hints
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
` (10 preceding siblings ...)
2017-06-15 3:45 ` [PATCH 11/11] nvme: add support for streams and directives Jens Axboe
@ 2017-06-15 8:12 ` Christoph Hellwig
2017-06-15 14:23 ` Jens Axboe
11 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-15 8:12 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
On Wed, Jun 14, 2017 at 09:45:01PM -0600, Jens Axboe wrote:
> A new iteration of this patchset, previously known as write streams.
> As before, this patchset aims at enabling applications split up
> writes into separate streams, based on the perceived life time
> of the data written. This is useful for a variety of reasons:
>
> - With NVMe 1.3 compliant devices, the device can expose multiple
> streams.
Nitpick: while the feature is introduce in NVMe 1.3, the NVMe versioning
rules also allow devices that claim conformance with any older spec
version to implement the feature.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 02/11] blk-mq: expose stream write stats through debugfs
2017-06-15 3:45 ` [PATCH 02/11] blk-mq: expose stream write stats through debugfs Jens Axboe
@ 2017-06-15 8:16 ` Christoph Hellwig
2017-06-15 14:24 ` Jens Axboe
0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-15 8:16 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
On Wed, Jun 14, 2017 at 09:45:03PM -0600, Jens Axboe wrote:
> Useful to verify that things are working the way they should.
> Reading the file will return number of kb written to each
> stream. Writing the file will reset the statistics. No care
> is taken to ensure that we don't race on updates.
>
> Drivers will write to q->stream_writes[] if they handle a stream.
What's blk-mq specific about this? Seems like this either should
be generic block or nvme specific, but probably not blk-mq. And
of course the name should change with the rest of the names now.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 03/11] fs: add support for an inode to carry stream related data
2017-06-15 3:45 ` [PATCH 03/11] fs: add support for an inode to carry stream related data Jens Axboe
@ 2017-06-15 8:17 ` Christoph Hellwig
2017-06-15 14:22 ` Jens Axboe
0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-15 8:17 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen
On Wed, Jun 14, 2017 at 09:45:04PM -0600, Jens Axboe wrote:
> No functional changes in this patch, just in preparation for
> allowing applications to pass in hints about data life times
> for writes.
>
> Pack the i_write_hint field into a 2-byte hole, so we don't grow
> the size of the inode.
A u8 should be plenty. But talking about the representation -
your write lifetime hints are a 5 option enum basically. I wonder
if we really should encode it as flags, or if we should have an
enum (which could be packed into a 3-bit bitfield) and then pass
it down the stack in that form instead of changing the representation
N times.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 4:33 ` Jens Axboe
@ 2017-06-15 8:19 ` Christoph Hellwig
2017-06-15 14:21 ` Jens Axboe
0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-15 8:19 UTC (permalink / raw)
To: Jens Axboe
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger, hch,
martin.petersen
I think Darrick has a very valid concern here - using RWF_* flags
to affect inode or fd-wide state is extremely counter productive.
Combined with the fact that the streams need a special setup in NVMe
I'm tempted to say that the interface really should be fadvise or
similar, which would keep the setup out of the I/O path and make clear
it's a sticky interface. For direct I/O RWF_* would make some sense,
but we'd still have to deal with the setup issue.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 4:15 ` Darrick J. Wong
2017-06-15 4:33 ` Jens Axboe
@ 2017-06-15 11:24 ` Al Viro
1 sibling, 0 replies; 40+ messages in thread
From: Al Viro @ 2017-06-15 11:24 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Jens Axboe, linux-fsdevel, linux-block, adilger, hch,
martin.petersen
On Wed, Jun 14, 2017 at 09:15:03PM -0700, Darrick J. Wong wrote:
> > + */
> > +#define RWF_WRITE_LIFE_SHIFT 4
> > +#define RWF_WRITE_LIFE_MASK 0x000000f0 /* 4 bits of stream ID */
> > +#define RWF_WRITE_LIFE_SHORT (1 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_MEDIUM (2 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_LONG (3 << RWF_WRITE_LIFE_SHIFT)
> > +#define RWF_WRITE_LIFE_EXTREME (4 << RWF_WRITE_LIFE_SHIFT)
>
> Should O_TMPFILE files ought to be created with i_write_hint =
> RWF_WRITE_LIFE_SHORT by default?
Depends... One of the uses for that is to have them linked into permanent
place once completely written...
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 8:19 ` Christoph Hellwig
@ 2017-06-15 14:21 ` Jens Axboe
2017-06-15 15:23 ` Jens Axboe
2017-06-16 7:33 ` Christoph Hellwig
0 siblings, 2 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 14:21 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/15/2017 02:19 AM, Christoph Hellwig wrote:
> I think Darrick has a very valid concern here - using RWF_* flags
> to affect inode or fd-wide state is extremely counter productive.
>
> Combined with the fact that the streams need a special setup in NVMe
> I'm tempted to say that the interface really should be fadvise or
> similar, which would keep the setup out of the I/O path and make clear
> it's a sticky interface. For direct I/O RWF_* would make some sense,
> but we'd still have to deal with the setup issue.
OK, which is exactly how I had implemented the interface 2 years ago.
I can resurrect that part and dump the RWF_* flags. I agree the RWF_*
flags are confusing for buffered IO, since they are persistent. For
O_DIRECT, they make more sense. So the question is if we want to
retain the RWF_WRITE_LIFE_* hints at all, or simply go back to the
fadvise with something ala:
POSIX_FADV_WRITE_HINT_SET Set write life time hint
POSIX_FADV_WRITE_HINT_GET Get write life time hint
I'd still very much like to stick to the same four hints, and not
require any special setup. I think the lazy setup for nvme is fine. Some
devices can do it at probe time, others will want to do it lazily to not
waste resources. Do we really want to have the HINT_SET bubble down to
the device and allocate streams?
I want to keep this simple to use. The RWF_WRITE_LIFE_* are easy to use
and adopt. But I do agree that being able to query the setting is
useful, and then we may as well introduce a get/set fadvise interface
for doing so.
Do people like the S_WRITE_LIFE_* part, or should we keep the
i_write_hint and just shrink it to 8 bytes?
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 03/11] fs: add support for an inode to carry stream related data
2017-06-15 8:17 ` Christoph Hellwig
@ 2017-06-15 14:22 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 14:22 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, linux-block, adilger, martin.petersen
On 06/15/2017 02:17 AM, Christoph Hellwig wrote:
> On Wed, Jun 14, 2017 at 09:45:04PM -0600, Jens Axboe wrote:
>> No functional changes in this patch, just in preparation for
>> allowing applications to pass in hints about data life times
>> for writes.
>>
>> Pack the i_write_hint field into a 2-byte hole, so we don't grow
>> the size of the inode.
>
> A u8 should be plenty. But talking about the representation -
> your write lifetime hints are a 5 option enum basically. I wonder
> if we really should encode it as flags, or if we should have an
> enum (which could be packed into a 3-bit bitfield) and then pass
> it down the stack in that form instead of changing the representation
> N times.
If we keep the RWF_WRITE_LIFE_* flags, then yes, I think we should
unify the RWF_WRITE_LIFE_*, IOCB_WRITE_LIFE_*, and REQ_WRITE_LIFE_*
flags into a specific type.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCHSET v4] Add support for write life time hints
2017-06-15 8:12 ` [PATCHSET v4] Add support for write life time hints Christoph Hellwig
@ 2017-06-15 14:23 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 14:23 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, linux-block, adilger, martin.petersen
On 06/15/2017 02:12 AM, Christoph Hellwig wrote:
> On Wed, Jun 14, 2017 at 09:45:01PM -0600, Jens Axboe wrote:
>> A new iteration of this patchset, previously known as write streams.
>> As before, this patchset aims at enabling applications split up
>> writes into separate streams, based on the perceived life time
>> of the data written. This is useful for a variety of reasons:
>>
>> - With NVMe 1.3 compliant devices, the device can expose multiple
>> streams.
>
> Nitpick: while the feature is introduce in NVMe 1.3, the NVMe versioning
> rules also allow devices that claim conformance with any older spec
> version to implement the feature.
Yes, that's true. I'll make that change, to not offend any of the NVMe
members.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 02/11] blk-mq: expose stream write stats through debugfs
2017-06-15 8:16 ` Christoph Hellwig
@ 2017-06-15 14:24 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 14:24 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: linux-fsdevel, linux-block, adilger, martin.petersen
On 06/15/2017 02:16 AM, Christoph Hellwig wrote:
> On Wed, Jun 14, 2017 at 09:45:03PM -0600, Jens Axboe wrote:
>> Useful to verify that things are working the way they should.
>> Reading the file will return number of kb written to each
>> stream. Writing the file will reset the statistics. No care
>> is taken to ensure that we don't race on updates.
>>
>> Drivers will write to q->stream_writes[] if they handle a stream.
>
> What's blk-mq specific about this? Seems like this either should
> be generic block or nvme specific, but probably not blk-mq. And
> of course the name should change with the rest of the names now.
There is nothing blk-mq specific about it, but it's a handy place
to stick some debug data, as we don't have debugfs support for non
blk-mq. I'm fine with even dumping this, but it's useful for debugging
that things are being passed all the way through correctly for now.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 14:21 ` Jens Axboe
@ 2017-06-15 15:23 ` Jens Axboe
2017-06-16 7:30 ` Christoph Hellwig
2017-06-16 7:33 ` Christoph Hellwig
1 sibling, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-15 15:23 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/15/2017 08:21 AM, Jens Axboe wrote:
> On 06/15/2017 02:19 AM, Christoph Hellwig wrote:
>> I think Darrick has a very valid concern here - using RWF_* flags
>> to affect inode or fd-wide state is extremely counter productive.
>>
>> Combined with the fact that the streams need a special setup in NVMe
>> I'm tempted to say that the interface really should be fadvise or
>> similar, which would keep the setup out of the I/O path and make clear
>> it's a sticky interface. For direct I/O RWF_* would make some sense,
>> but we'd still have to deal with the setup issue.
>
> OK, which is exactly how I had implemented the interface 2 years ago.
> I can resurrect that part and dump the RWF_* flags. I agree the RWF_*
> flags are confusing for buffered IO, since they are persistent. For
> O_DIRECT, they make more sense. So the question is if we want to
> retain the RWF_WRITE_LIFE_* hints at all, or simply go back to the
> fadvise with something ala:
>
> POSIX_FADV_WRITE_HINT_SET Set write life time hint
> POSIX_FADV_WRITE_HINT_GET Get write life time hint
And then I remembered why fadvise _sucks_. It returns the error value
directly. So 0 is success > 0 is some error. That does not work well
for adding a set/get interface. Additionally, with fadvise, we have
to overload either 'offset' or 'length' for the write hint for the
set operation. Not super pretty.
Any objections to making the auxiliary interface fcntl(2) based?
That would be a cleaner fit, imho.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 15:23 ` Jens Axboe
@ 2017-06-16 7:30 ` Christoph Hellwig
2017-06-16 14:35 ` Jens Axboe
0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-16 7:30 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Darrick J. Wong, linux-fsdevel, linux-block,
adilger, martin.petersen
On Thu, Jun 15, 2017 at 09:23:02AM -0600, Jens Axboe wrote:
> And then I remembered why fadvise _sucks_. It returns the error value
> directly. So 0 is success > 0 is some error. That does not work well
> for adding a set/get interface. Additionally, with fadvise, we have
> to overload either 'offset' or 'length' for the write hint for the
> set operation. Not super pretty.
>
> Any objections to making the auxiliary interface fcntl(2) based?
> That would be a cleaner fit, imho.
fcntl sounds fine to me.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-15 14:21 ` Jens Axboe
2017-06-15 15:23 ` Jens Axboe
@ 2017-06-16 7:33 ` Christoph Hellwig
2017-06-16 14:35 ` Jens Axboe
1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-16 7:33 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Darrick J. Wong, linux-fsdevel, linux-block,
adilger, martin.petersen
On Thu, Jun 15, 2017 at 08:21:18AM -0600, Jens Axboe wrote:
> I'd still very much like to stick to the same four hints, and not
Agreed. In fact I'd go a little further: we should have a
u16 hints;
that goes all the way down from fcntl to the driver, right now
we'll allocate the first 3 bits for the write lifetime hints (2.5,
so we have one value spare, as they don't need to flags but can be
enum values), leaving more space for other kinds of hints.
> require any special setup. I think the lazy setup for nvme is fine. Some
> devices can do it at probe time, others will want to do it lazily to not
> waste resources. Do we really want to have the HINT_SET bubble down to
> the device and allocate streams?
If I look at the mess for allocating the streams I think we need it
to bubble down. That way the device can allocate the stream at time
of the fcntl and we can keep the low level driver very simple.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 7:33 ` Christoph Hellwig
@ 2017-06-16 14:35 ` Jens Axboe
2017-06-16 14:53 ` Jens Axboe
2017-06-16 15:52 ` Christoph Hellwig
0 siblings, 2 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 14:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 01:33 AM, Christoph Hellwig wrote:
> On Thu, Jun 15, 2017 at 08:21:18AM -0600, Jens Axboe wrote:
>> I'd still very much like to stick to the same four hints, and not
>
> Agreed. In fact I'd go a little further: we should have a
>
> u16 hints;
>
> that goes all the way down from fcntl to the driver, right now
> we'll allocate the first 3 bits for the write lifetime hints (2.5,
> so we have one value spare, as they don't need to flags but can be
> enum values), leaving more space for other kinds of hints.
Did you see v5? It adds enum write_hint and passes it all the way down,
until we transform them into rq/bio flags.
>> require any special setup. I think the lazy setup for nvme is fine. Some
>> devices can do it at probe time, others will want to do it lazily to not
>> waste resources. Do we really want to have the HINT_SET bubble down to
>> the device and allocate streams?
>
> If I look at the mess for allocating the streams I think we need it
> to bubble down. That way the device can allocate the stream at time
> of the fcntl and we can keep the low level driver very simple.
Mess? The NVMe code seems pretty straight forward to me. Is it purely
the lazy alloc part you're referring to? I'm fine with bubbling down the
hint and setting everything up inline from the fcntl() call, but that
means we need to make things like btrfs and md/dm aware of it and
resolve all mappings. That sort-of sucks, especially since they don't
otherwise need to know about it.
If another driver needs similar setup like NVMe, I'd much rather just
fix it up there. From the API point of view, there would be no changes.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 7:30 ` Christoph Hellwig
@ 2017-06-16 14:35 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 14:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 01:30 AM, Christoph Hellwig wrote:
> On Thu, Jun 15, 2017 at 09:23:02AM -0600, Jens Axboe wrote:
>> And then I remembered why fadvise _sucks_. It returns the error value
>> directly. So 0 is success > 0 is some error. That does not work well
>> for adding a set/get interface. Additionally, with fadvise, we have
>> to overload either 'offset' or 'length' for the write hint for the
>> set operation. Not super pretty.
>>
>> Any objections to making the auxiliary interface fcntl(2) based?
>> That would be a cleaner fit, imho.
>
> fcntl sounds fine to me.
v5 implemented that, much better fit than fadvise.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 14:35 ` Jens Axboe
@ 2017-06-16 14:53 ` Jens Axboe
2017-06-16 15:52 ` Christoph Hellwig
1 sibling, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 14:53 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 08:35 AM, Jens Axboe wrote:
>> If I look at the mess for allocating the streams I think we need it
>> to bubble down. That way the device can allocate the stream at time
>> of the fcntl and we can keep the low level driver very simple.
>
> Mess? The NVMe code seems pretty straight forward to me. Is it purely
> the lazy alloc part you're referring to? I'm fine with bubbling down the
> hint and setting everything up inline from the fcntl() call, but that
> means we need to make things like btrfs and md/dm aware of it and
> resolve all mappings. That sort-of sucks, especially since they don't
> otherwise need to know about it.
This is what the old code looked like, back when it was implemented as
managed streams, for btrfs:
http://git.kernel.dk/cgit/linux-block/commit/?id=93b4d9370250ea9273e1e71775df85964ff52922
We don't need the close part anymore, but the device iteration for
constituent devices would be the same. On top of that, we'd have to
touch all of drivers/md/ as well, or it won't work for anyone that's
using raid or dm.
Why not just let it resolve lazily? That way we don't have to touch
drivers that need not know about this, since the information is already
being passed all the way down without having to modify drivers.
If you object to the nvme driver open coding this, I can put that in
block instead and catch it earlier. Then nvme would not have to worry
about it. But honestly, I'd much rather make it generic once we have
another user that needs this, than do it up front.
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 14:35 ` Jens Axboe
2017-06-16 14:53 ` Jens Axboe
@ 2017-06-16 15:52 ` Christoph Hellwig
2017-06-16 15:59 ` Jens Axboe
1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-16 15:52 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Darrick J. Wong, linux-fsdevel, linux-block,
adilger, martin.petersen
On Fri, Jun 16, 2017 at 08:35:07AM -0600, Jens Axboe wrote:
> > Agreed. In fact I'd go a little further: we should have a
> >
> > u16 hints;
> >
> > that goes all the way down from fcntl to the driver, right now
> > we'll allocate the first 3 bits for the write lifetime hints (2.5,
> > so we have one value spare, as they don't need to flags but can be
> > enum values), leaving more space for other kinds of hints.
>
> Did you see v5? It adds enum write_hint and passes it all the way down,
> until we transform them into rq/bio flags.
Yes. But with all the way down I mean all the way down to the driver :)
> Mess? The NVMe code seems pretty straight forward to me. Is it purely
> the lazy alloc part you're referring to?
Yes.
> I'm fine with bubbling down the
> hint and setting everything up inline from the fcntl() call, but that
> means we need to make things like btrfs and md/dm aware of it and
> resolve all mappings. That sort-of sucks, especially since they don't
> otherwise need to know about it.
True, that sucks. But taking the hint and doing things behind the
scenes sounds nasty.
> If another driver needs similar setup like NVMe, I'd much rather just
> fix it up there. From the API point of view, there would be no changes.
Ok..
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 15:52 ` Christoph Hellwig
@ 2017-06-16 15:59 ` Jens Axboe
2017-06-16 16:14 ` Jens Axboe
2017-06-16 18:02 ` Christoph Hellwig
0 siblings, 2 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 15:59 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 09:52 AM, Christoph Hellwig wrote:
> On Fri, Jun 16, 2017 at 08:35:07AM -0600, Jens Axboe wrote:
>>> Agreed. In fact I'd go a little further: we should have a
>>>
>>> u16 hints;
>>>
>>> that goes all the way down from fcntl to the driver, right now
>>> we'll allocate the first 3 bits for the write lifetime hints (2.5,
>>> so we have one value spare, as they don't need to flags but can be
>>> enum values), leaving more space for other kinds of hints.
>>
>> Did you see v5? It adds enum write_hint and passes it all the way down,
>> until we transform them into rq/bio flags.
>
> Yes. But with all the way down I mean all the way down to the driver :)
Only missing part is the request flags. And why make that any different
than the flags we already have now? It'd be trivial to pack the value
into the request flags as well, but I'm struggling to see the point of
that, honestly.
Please expand on why you think changing the request flags to also
carry that value would be useful, as opposed to just mapping it when
we setup the request. If you have a valid concern I don't mind making
the change, but I just don't see one right now.
>> Mess? The NVMe code seems pretty straight forward to me. Is it purely
>> the lazy alloc part you're referring to?
>
> Yes.
>
>> I'm fine with bubbling down the
>> hint and setting everything up inline from the fcntl() call, but that
>> means we need to make things like btrfs and md/dm aware of it and
>> resolve all mappings. That sort-of sucks, especially since they don't
>> otherwise need to know about it.
>
> True, that sucks. But taking the hint and doing things behind the
> scenes sounds nasty.
How so? It's pretty straight forward. The only downside I see is that
we'll have a delay between seeing the first valid write hint and to
the time when nvme has it enabled. But that's so short, and in the
grander scheme of things, doesn't matter one bit.
The alternative is a bunch of infrastructure to do this inline. I
think that choice is pretty clear here.
>> If another driver needs similar setup like NVMe, I'd much rather just
>> fix it up there. From the API point of view, there would be no changes.
>
> Ok..
>
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 15:59 ` Jens Axboe
@ 2017-06-16 16:14 ` Jens Axboe
2017-06-16 18:00 ` Christoph Hellwig
2017-06-16 18:02 ` Christoph Hellwig
1 sibling, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 16:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 09:59 AM, Jens Axboe wrote:
> On 06/16/2017 09:52 AM, Christoph Hellwig wrote:
>> On Fri, Jun 16, 2017 at 08:35:07AM -0600, Jens Axboe wrote:
>>>> Agreed. In fact I'd go a little further: we should have a
>>>>
>>>> u16 hints;
>>>>
>>>> that goes all the way down from fcntl to the driver, right now
>>>> we'll allocate the first 3 bits for the write lifetime hints (2.5,
>>>> so we have one value spare, as they don't need to flags but can be
>>>> enum values), leaving more space for other kinds of hints.
>>>
>>> Did you see v5? It adds enum write_hint and passes it all the way down,
>>> until we transform them into rq/bio flags.
>>
>> Yes. But with all the way down I mean all the way down to the driver :)
>
> Only missing part is the request flags. And why make that any different
> than the flags we already have now? It'd be trivial to pack the value
> into the request flags as well, but I'm struggling to see the point of
> that, honestly.
>
> Please expand on why you think changing the request flags to also
> carry that value would be useful, as opposed to just mapping it when
> we setup the request. If you have a valid concern I don't mind making
> the change, but I just don't see one right now.
So that would look like the below change on top of the current v5. I
skipped the callers, since those are all easy
s/bio_op_write_hint/write_hint_to_opf changes.
diff --git a/block/bio.c b/block/bio.c
index 758d83d91bb0..888e7801c638 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2082,22 +2082,6 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
#endif /* CONFIG_BLK_CGROUP */
-static const unsigned int rwf_write_to_opf_flag[] = {
- 0, REQ_WRITE_SHORT, REQ_WRITE_MEDIUM, REQ_WRITE_LONG, REQ_WRITE_EXTREME
-};
-
-/*
- * Convert WRITE_LIFE_* hints into req/bio flags
- */
-unsigned int bio_op_write_hint(enum write_hint hint)
-{
- if (WARN_ON_ONCE(hint >= ARRAY_SIZE(rwf_write_to_opf_flag)))
- return 0;
-
- return rwf_write_to_opf_flag[hint];
-}
-EXPORT_SYMBOL_GPL(bio_op_write_hint);
-
static void __init biovec_init_slabs(void)
{
int i;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e9360dc5ea07..d1b04b0e99cf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -443,7 +443,6 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
gfp_t, int);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
-extern unsigned int bio_op_write_hint(enum write_hint hint);
void generic_start_io_acct(int rw, unsigned long sectors,
struct hd_struct *part);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 23646eb433e7..f4d348cd3a6b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/bvec.h>
+#include <linux/fs.h>
struct bio_set;
struct bio;
@@ -201,10 +202,9 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
- __REQ_WRITE_SHORT, /* short life time write */
- __REQ_WRITE_MEDIUM, /* medium life time write */
- __REQ_WRITE_LONG, /* long life time write */
- __REQ_WRITE_EXTREME, /* extremely long life time write */
+ __REQ_WRITE_HINT_SHIFT, /* 3 bits for life time hint */
+ __REQ_WRITE_HINT_PAD1,
+ __REQ_WRITE_HINT_PAD2,
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -225,13 +225,12 @@ enum req_flag_bits {
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
-#define REQ_WRITE_SHORT (1ULL << __REQ_WRITE_SHORT)
-#define REQ_WRITE_MEDIUM (1ULL << __REQ_WRITE_MEDIUM)
-#define REQ_WRITE_LONG (1ULL << __REQ_WRITE_LONG)
-#define REQ_WRITE_EXTREME (1ULL << __REQ_WRITE_EXTREME)
+#define REQ_WRITE_SHORT (WRITE_HINT_SHORT << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_MEDIUM (WRITE_HINT_MEDIUM << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_LONG (WRITE_HINT_LONG << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_EXTREME (WRITE_HINT_EXTREME << __REQ_WRITE_HINT_SHIFT)
-#define REQ_WRITE_LIFE_MASK (REQ_WRITE_SHORT | REQ_WRITE_MEDIUM | \
- REQ_WRITE_LONG | REQ_WRITE_EXTREME)
+#define REQ_WRITE_LIFE_MASK (0x7 << __REQ_WRITE_HINT_SHIFT)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
@@ -328,4 +327,9 @@ static inline bool op_write_hint_valid(unsigned int opf)
return (opf & REQ_WRITE_LIFE_MASK) != 0;
}
+static inline unsigned int write_hint_to_opf(enum write_hint hint)
+{
+ return hint << __REQ_WRITE_HINT_SHIFT;
+}
+
#endif /* __LINUX_BLK_TYPES_H */
--
Jens Axboe
^ permalink raw reply related [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 16:14 ` Jens Axboe
@ 2017-06-16 18:00 ` Christoph Hellwig
0 siblings, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-16 18:00 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Darrick J. Wong, linux-fsdevel, linux-block,
adilger, martin.petersen
On Fri, Jun 16, 2017 at 10:14:01AM -0600, Jens Axboe wrote:
> So that would look like the below change on top of the current v5. I
> skipped the callers, since those are all easy
> s/bio_op_write_hint/write_hint_to_opf changes.
That's better. But my idead was to actually go back to req_hints and
bi_hints fields. Especially as we now have various little holes due
to the size reduction from bi_error to bi_status (and the request
equivalent). But this mostly makes sense if we're planning for more
hints.
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 15:59 ` Jens Axboe
2017-06-16 16:14 ` Jens Axboe
@ 2017-06-16 18:02 ` Christoph Hellwig
2017-06-16 19:35 ` Jens Axboe
1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2017-06-16 18:02 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, Darrick J. Wong, linux-fsdevel, linux-block,
adilger, martin.petersen
On Fri, Jun 16, 2017 at 09:59:38AM -0600, Jens Axboe wrote:
> >> Did you see v5? It adds enum write_hint and passes it all the way down,
> >> until we transform them into rq/bio flags.
> >
> > Yes. But with all the way down I mean all the way down to the driver :)
>
> Only missing part is the request flags. And why make that any different
> than the flags we already have now? It'd be trivial to pack the value
> into the request flags as well, but I'm struggling to see the point of
> that, honestly.
>
> Please expand on why you think changing the request flags to also
> carry that value would be useful, as opposed to just mapping it when
> we setup the request. If you have a valid concern I don't mind making
> the change, but I just don't see one right now.
Mostly so that we can pass the value down in one form through the whole
stack.
> How so? It's pretty straight forward. The only downside I see is that
> we'll have a delay between seeing the first valid write hint and to
> the time when nvme has it enabled. But that's so short, and in the
> grander scheme of things, doesn't matter one bit.
I'll take your word for that. To be honest I hope future standards
won't make the mistake to come up with something as ugly as streams
and just take something like our hints on a per-I/O basis..
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints
2017-06-16 18:02 ` Christoph Hellwig
@ 2017-06-16 19:35 ` Jens Axboe
0 siblings, 0 replies; 40+ messages in thread
From: Jens Axboe @ 2017-06-16 19:35 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, linux-fsdevel, linux-block, adilger,
martin.petersen
On 06/16/2017 12:02 PM, Christoph Hellwig wrote:
> On Fri, Jun 16, 2017 at 09:59:38AM -0600, Jens Axboe wrote:
>>>> Did you see v5? It adds enum write_hint and passes it all the way down,
>>>> until we transform them into rq/bio flags.
>>>
>>> Yes. But with all the way down I mean all the way down to the driver :)
>>
>> Only missing part is the request flags. And why make that any different
>> than the flags we already have now? It'd be trivial to pack the value
>> into the request flags as well, but I'm struggling to see the point of
>> that, honestly.
>>
>> Please expand on why you think changing the request flags to also
>> carry that value would be useful, as opposed to just mapping it when
>> we setup the request. If you have a valid concern I don't mind making
>> the change, but I just don't see one right now.
>
> Mostly so that we can pass the value down in one form through the whole
> stack.
OK good, so we have that now. I guess the one change we could further make
is have the write hints be a subset of the possible hints. I already made
changes to reflect that in the latest posting, where we have a rw_hint
enum. I haven't made any division of it into read vs write hints, but that
could always be done and it won't impact the API since we can just keep
the write hints in the lower 8-16 bits, or whatever we choose.
>> How so? It's pretty straight forward. The only downside I see is that
>> we'll have a delay between seeing the first valid write hint and to
>> the time when nvme has it enabled. But that's so short, and in the
>> grander scheme of things, doesn't matter one bit.
>
> I'll take your word for that. To be honest I hope future standards
> won't make the mistake to come up with something as ugly as streams
> and just take something like our hints on a per-I/O basis..
Well, at least streams is better than how it started out...
--
Jens Axboe
^ permalink raw reply [flat|nested] 40+ messages in thread
end of thread, other threads:[~2017-06-16 19:35 UTC | newest]
Thread overview: 40+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-15 3:45 [PATCHSET v4] Add support for write life time hints Jens Axboe
2017-06-15 3:45 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
2017-06-15 3:45 ` [PATCH 02/11] blk-mq: expose stream write stats through debugfs Jens Axboe
2017-06-15 8:16 ` Christoph Hellwig
2017-06-15 14:24 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 03/11] fs: add support for an inode to carry stream related data Jens Axboe
2017-06-15 8:17 ` Christoph Hellwig
2017-06-15 14:22 ` Jens Axboe
2017-06-15 3:45 ` [PATCH 04/11] fs: add support for allowing applications to pass in write life time hints Jens Axboe
2017-06-15 4:15 ` Darrick J. Wong
2017-06-15 4:33 ` Jens Axboe
2017-06-15 8:19 ` Christoph Hellwig
2017-06-15 14:21 ` Jens Axboe
2017-06-15 15:23 ` Jens Axboe
2017-06-16 7:30 ` Christoph Hellwig
2017-06-16 14:35 ` Jens Axboe
2017-06-16 7:33 ` Christoph Hellwig
2017-06-16 14:35 ` Jens Axboe
2017-06-16 14:53 ` Jens Axboe
2017-06-16 15:52 ` Christoph Hellwig
2017-06-16 15:59 ` Jens Axboe
2017-06-16 16:14 ` Jens Axboe
2017-06-16 18:00 ` Christoph Hellwig
2017-06-16 18:02 ` Christoph Hellwig
2017-06-16 19:35 ` Jens Axboe
2017-06-15 11:24 ` Al Viro
2017-06-15 3:45 ` [PATCH 05/11] block: add helpers for setting/checking write hint validity Jens Axboe
2017-06-15 3:45 ` [PATCH 06/11] fs: add O_DIRECT support for sending down bio stream information Jens Axboe
2017-06-15 3:45 ` [PATCH 07/11] fs: add support for buffered writeback to pass down write hints Jens Axboe
2017-06-15 3:45 ` [PATCH 08/11] ext4: add support for passing in write hints for buffered writes Jens Axboe
2017-06-15 3:45 ` [PATCH 09/11] xfs: " Jens Axboe
2017-06-15 3:45 ` [PATCH 10/11] btrfs: " Jens Axboe
2017-06-15 3:45 ` [PATCH 11/11] nvme: add support for streams and directives Jens Axboe
2017-06-15 8:12 ` [PATCHSET v4] Add support for write life time hints Christoph Hellwig
2017-06-15 14:23 ` Jens Axboe
-- strict thread matches above, loose matches on Subject: below --
2017-06-14 19:05 [PATCHSET v3] " Jens Axboe
2017-06-14 19:05 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
2017-06-14 20:37 ` Christoph Hellwig
2017-06-14 20:44 ` Jens Axboe
2017-06-13 17:15 [PATCH 0/11] Add support for write life time hints Jens Axboe
2017-06-13 17:15 ` [PATCH 01/11] block: add support for carrying stream information in a bio Jens Axboe
2017-06-13 19:01 ` Andreas Dilger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).