* [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-20 23:09 ` Bart Van Assche
2017-06-19 17:04 ` [PATCH 2/9] block: add support for write hints in a bio Jens Axboe
` (7 subsequent siblings)
8 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
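Define a set of write life time hints:
#define RWH_WRITE_LIFE_NONE	0
#define RWH_WRITE_LIFE_SHORT	1
#define RWH_WRITE_LIFE_MEDIUM	2
#define RWH_WRITE_LIFE_LONG	3
#define RWH_WRITE_LIFE_EXTREME	4
and add an fcntl interface for querying and setting these hints: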
F_GET_RW_HINT Returns the read/write hint set.
F_SET_RW_HINT Pass one of the above write hints.
The user passes in a pointer to a 64-bit value to get/set these hints,
and the interface returns 0/-1 on success/error.
Sample program testing/implementing basic setting/getting of write
hints is below.
Add support for storing the write life time hint in the inode flags,
and pass them to the kiocb flags as well. This is in preparation
for utilizing these hints in the block layer, to guide on-media
data placement.
/*
* writehint.c: check or set a file/inode write hint
*/
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Fallbacks for userspace headers that predate the fcntl commands added
 * by this patch (see include/uapi/linux/fcntl.h in the diff below).
 */
#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE	1024
#endif
#ifndef F_GET_RW_HINT
#define F_GET_RW_HINT	(F_LINUX_SPECIFIC_BASE + 11)
#endif
#ifndef F_SET_RW_HINT
#define F_SET_RW_HINT	(F_LINUX_SPECIFIC_BASE + 12)
#endif

/* Printable names of the write life time hints, indexed by hint value. */
static const char *const str[] = { "WRITE_LIFE_NONE", "WRITE_LIFE_SHORT",
				   "WRITE_LIFE_MEDIUM", "WRITE_LIFE_LONG",
				   "WRITE_LIFE_EXTREME" };

/*
 * Usage: writehint <dev> [hint]
 *
 * With only <dev>, query the current write hint of the file and print it.
 * With [hint] (a number, any base accepted by strtoull), set that hint
 * and print it back. Range checking of the hint is left to the kernel so
 * that its -EINVAL path can be exercised as well.
 *
 * Exit status: 0 on success, 1 on bad usage or unparsable hint, 2 if the
 * file cannot be opened, 3 if F_GET_RW_HINT fails, 4 if F_SET_RW_HINT fails.
 */
int main(int argc, char *argv[])
{
	const size_t nr_hints = sizeof(str) / sizeof(str[0]);
	uint64_t hint = 0;
	int set_hint = 0;
	int status = 0;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "%s: dev <hint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 2;
	}

	if (argc > 2) {
		char *end;

		/* strtoull() instead of atoi(): garbage input is detectable */
		errno = 0;
		hint = strtoull(argv[2], &end, 0);
		if (errno || end == argv[2] || *end != '\0') {
			fprintf(stderr, "%s: invalid hint '%s'\n",
				argv[0], argv[2]);
			status = 1;
			goto out;
		}
		set_hint = 1;
	}

	if (set_hint) {
		if (fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
			perror("fcntl: F_SET_RW_HINT");
			status = 4;
			goto out;
		}
	} else {
		if (fcntl(fd, F_GET_RW_HINT, &hint) < 0) {
			perror("fcntl: F_GET_RW_HINT");
			status = 3;
			goto out;
		}
	}

	/*
	 * Track set-vs-get with an explicit flag: after a successful get,
	 * 'hint' holds the returned value and can no longer act as a sentinel.
	 * Never index str[] with a value outside the table.
	 */
	if (hint < nr_hints)
		printf("%s: %shint %s\n", argv[1], set_hint ? "set " : "",
		       str[hint]);
	else
		printf("%s: %shint %llu (unknown)\n", argv[1],
		       set_hint ? "set " : "", (unsigned long long)hint);

out:
	close(fd);
	return status;
}
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/fcntl.c | 43 ++++++++++++++++++++++++++++++++
fs/inode.c | 11 +++++++++
include/linux/fs.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/fcntl.h | 15 ++++++++++++
4 files changed, 130 insertions(+)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f4e7267d117f..113b78c11631 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -243,6 +243,45 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
+static long fcntl_rw_hint(struct file *file, unsigned int cmd,
+ u64 __user *ptr)
+{
+ struct inode *inode = file_inode(file);
+ long ret = 0;
+ u64 hint;
+
+ switch (cmd) {
+ case F_GET_RW_HINT:
+ hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
+ if (put_user(hint, ptr))
+ ret = -EFAULT;
+ break;
+ case F_SET_RW_HINT:
+ if (get_user(hint, ptr)) {
+ ret = -EFAULT;
+ break;
+ }
+ switch (hint) {
+ case WRITE_LIFE_NONE:
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_MEDIUM:
+ case WRITE_LIFE_LONG:
+ case WRITE_LIFE_EXTREME:
+ inode_set_write_hint(inode, hint);
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -337,6 +376,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_GET_SEALS:
err = shmem_fcntl(filp, cmd, arg);
break;
+ case F_GET_RW_HINT:
+ case F_SET_RW_HINT:
+ err = fcntl_rw_hint(filp, cmd, (u64 __user *) arg);
+ break;
default:
break;
}
diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..defb015a2c6d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2120,3 +2120,14 @@ struct timespec current_time(struct inode *inode)
return timespec_trunc(now, inode->i_sb->s_time_gran);
}
EXPORT_SYMBOL(current_time);
+
+void inode_set_write_hint(struct inode *inode, enum rw_hint hint)
+{
+ unsigned int flags = write_hint_to_mask(hint, S_WRITE_LIFE_SHIFT);
+
+ if (flags != mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT)) {
+ inode_lock(inode);
+ inode_set_flags(inode, flags, S_WRITE_LIFE_MASK);
+ inode_unlock(inode);
+ }
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 023f0324762b..8720251cc153 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -270,6 +270,12 @@ struct writeback_control;
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
+/*
+ * Steal 3 bits for stream information, this allows 8 valid streams
+ */
+#define IOCB_WRITE_LIFE_SHIFT 7
+#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9))
+
struct kiocb {
struct file *ki_filp;
loff_t ki_pos;
@@ -293,6 +299,12 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
};
}
+static inline int iocb_write_hint(const struct kiocb *iocb)
+{
+ return (iocb->ki_flags & IOCB_WRITE_LIFE_MASK) >>
+ IOCB_WRITE_LIFE_SHIFT;
+}
+
/*
* "descriptor" for what we're up to with a read.
* This allows us to use the same read code yet
@@ -1829,6 +1841,14 @@ struct super_operations {
#endif
/*
+ * Expected life time hint of a write for this inode. This uses the
+ * WRITE_LIFE_* encoding, we just need to define the shift. We need
+ * 3 bits for this. Next S_* value is 131072, bit 17.
+ */
+#define S_WRITE_LIFE_MASK 0x1c000 /* bits 14..16 */
+#define S_WRITE_LIFE_SHIFT 14 /* 16384, next bit */
+
+/*
* Note that nosuid etc flags are inode-specific: setting some file-system
* flags just means all the inodes inherit those flags by default. It might be
* possible to override it selectively if you really wanted to with some
@@ -1875,6 +1895,37 @@ static inline bool HAS_UNMAPPED_ID(struct inode *inode)
}
/*
+ * Write life time hint values.
+ */
+enum rw_hint {
+ WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+ WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+ WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME
+};
+
+static inline unsigned int write_hint_to_mask(enum rw_hint hint,
+ unsigned int shift)
+{
+ return hint << shift;
+}
+
+static inline enum rw_hint mask_to_write_hint(unsigned int mask,
+ unsigned int shift)
+{
+ return (mask >> shift) & 0x7;
+}
+
+static inline unsigned int inode_write_hint(struct inode *inode)
+{
+ if (inode)
+ return mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
+
+ return 0;
+}
+
+/*
* Inode state bits. Protected by inode->i_lock
*
* Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
@@ -2758,6 +2809,7 @@ extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int should_remove_suid(struct dentry *);
extern int file_remove_privs(struct file *);
+extern void inode_set_write_hint(struct inode *inode, enum rw_hint hint);
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
static inline void insert_inode_hash(struct inode *inode)
@@ -3045,7 +3097,9 @@ static inline bool io_is_direct(struct file *filp)
static inline int iocb_flags(struct file *file)
{
+ struct inode *inode = file_inode(file);
int res = 0;
+
if (file->f_flags & O_APPEND)
res |= IOCB_APPEND;
if (io_is_direct(file))
@@ -3054,6 +3108,13 @@ static inline int iocb_flags(struct file *file)
res |= IOCB_DSYNC;
if (file->f_flags & __O_SYNC)
res |= IOCB_SYNC;
+ if (mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT)) {
+ enum rw_hint hint;
+
+ hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
+ res |= write_hint_to_mask(hint, IOCB_WRITE_LIFE_SHIFT);
+ }
+
return res;
}
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 813afd6eee71..def8f70e8bae 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,21 @@
/* (1U << 31) is reserved for signed error codes */
/*
+ * Set/Get write life time hints.
+ */
+#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
+
+/*
+ * Valid hint values for F_{GET,SET}_RW_HINT
+ */
+#define RWH_WRITE_LIFE_NONE 0
+#define RWH_WRITE_LIFE_SHORT 1
+#define RWH_WRITE_LIFE_MEDIUM 2
+#define RWH_WRITE_LIFE_LONG 3
+#define RWH_WRITE_LIFE_EXTREME 4
+
+/*
* Types of directory notifications that may be requested.
*/
#define DN_ACCESS 0x00000001 /* File accessed */
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints
2017-06-19 17:04 ` [PATCH 1/9] fs: add fcntl() interface for setting/getting " Jens Axboe
@ 2017-06-20 23:09 ` Bart Van Assche
2017-06-20 23:49 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Bart Van Assche @ 2017-06-20 23:09 UTC (permalink / raw)
To: linux-block@vger.kernel.org, axboe@kernel.dk,
linux-fsdevel@vger.kernel.org
Cc: hch@infradead.org, adilger@dilger.ca,
linux-nvme@lists.infradead.org, martin.petersen@oracle.com
On Mon, 2017-06-19 at 11:04 -0600, Jens Axboe wrote:
> +static long fcntl_rw_hint(struct file *file, unsigned int cmd,
> + u64 __user *ptr)
> +{
> + struct inode *inode = file_inode(file);
> + long ret = 0;
> + u64 hint;
> +
> + switch (cmd) {
> + case F_GET_RW_HINT:
> + hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
> + if (put_user(hint, ptr))
> + ret = -EFAULT;
> + break;
> + case F_SET_RW_HINT:
> + if (get_user(hint, ptr)) {
> + ret = -EFAULT;
> + break;
> + }
> + switch (hint) {
> + case WRITE_LIFE_NONE:
> + case WRITE_LIFE_SHORT:
> + case WRITE_LIFE_MEDIUM:
> + case WRITE_LIFE_LONG:
> + case WRITE_LIFE_EXTREME:
> + inode_set_write_hint(inode, hint);
> + ret = 0;
> + break;
> + default:
> + ret = -EINVAL;
> + }
> + break;
> + default:
> + ret = -EINVAL;
> + break;
> + }
> +
> + return ret;
> +}
Hello Jens,
Do we need an (inline) helper function for checking the validity of a
numerical WRITE_LIFE value next to the definition of the WRITE_LIFE_*
constants, e.g. WRITE_LIFE_NONE <= hint && hint <= WRITE_LIFE_EXTREME?
> +/*
> + * Steal 3 bits for stream information, this allows 8 valid streams
> + */
> +#define IOCB_WRITE_LIFE_SHIFT 7
> +#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9))
A minor comment: how about making this easier to read by defining
IOCB_WRITE_LIFE_MASK as (7 << IOCB_WRITE_LIFE_SHIFT)?
> /*
> + * Expected life time hint of a write for this inode. This uses the
> + * WRITE_LIFE_* encoding, we just need to define the shift. We need
> + * 3 bits for this. Next S_* value is 131072, bit 17.
> + */
> +#define S_WRITE_LIFE_MASK 0x1c000 /* bits 14..16 */
> +#define S_WRITE_LIFE_SHIFT 14 /* 16384, next bit */
Another minor comment: how about making this easier to read by defining
S_WRITE_LIFE_MASK as (7 << S_WRITE_LIFE_SHIFT)?
> /*
> + * Write life time hint values.
> + */
> +enum rw_hint {
> + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
> + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
> + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
> + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
> + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME
> +};
> [ ... ]
> +/*
> + * Valid hint values for F_{GET,SET}_RW_HINT
> + */
> +#define RWH_WRITE_LIFE_NONE 0
> +#define RWH_WRITE_LIFE_SHORT 1
> +#define RWH_WRITE_LIFE_MEDIUM 2
> +#define RWH_WRITE_LIFE_LONG 3
> +#define RWH_WRITE_LIFE_EXTREME 4
Maybe I missed something, but it's not clear to me why we have both an enum and
defines with the same numerical values? BTW, I prefer an enum above #defines.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/9] fs: add fcntl() interface for setting/getting write life time hints
2017-06-20 23:09 ` Bart Van Assche
@ 2017-06-20 23:49 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-20 23:49 UTC (permalink / raw)
To: Bart Van Assche, linux-block@vger.kernel.org,
linux-fsdevel@vger.kernel.org
Cc: hch@infradead.org, adilger@dilger.ca,
linux-nvme@lists.infradead.org, martin.petersen@oracle.com
On 06/20/2017 05:09 PM, Bart Van Assche wrote:
> On Mon, 2017-06-19 at 11:04 -0600, Jens Axboe wrote:
>> +static long fcntl_rw_hint(struct file *file, unsigned int cmd,
>> + u64 __user *ptr)
>> +{
>> + struct inode *inode = file_inode(file);
>> + long ret = 0;
>> + u64 hint;
>> +
>> + switch (cmd) {
>> + case F_GET_RW_HINT:
>> + hint = mask_to_write_hint(inode->i_flags, S_WRITE_LIFE_SHIFT);
>> + if (put_user(hint, ptr))
>> + ret = -EFAULT;
>> + break;
>> + case F_SET_RW_HINT:
>> + if (get_user(hint, ptr)) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> + switch (hint) {
>> + case WRITE_LIFE_NONE:
>> + case WRITE_LIFE_SHORT:
>> + case WRITE_LIFE_MEDIUM:
>> + case WRITE_LIFE_LONG:
>> + case WRITE_LIFE_EXTREME:
>> + inode_set_write_hint(inode, hint);
>> + ret = 0;
>> + break;
>> + default:
>> + ret = -EINVAL;
>> + }
>> + break;
>> + default:
>> + ret = -EINVAL;
>> + break;
>> + }
>> +
>> + return ret;
>> +}
>
> Hello Jens,
>
> Do we need an (inline) helper function for checking the validity of a
> numerical WRITE_LIFE value next to the definition of the WRITE_LIFE_*
> constants, e.g. WRITE_LIFE_NONE <= hint && hint <= WRITE_LIFE_EXTREME?
Might not hurt in general, I can fold something like that in.
>> +/*
>> + * Steal 3 bits for stream information, this allows 8 valid streams
>> + */
>> +#define IOCB_WRITE_LIFE_SHIFT 7
>> +#define IOCB_WRITE_LIFE_MASK (BIT(7) | BIT(8) | BIT(9))
>
> A minor comment: how about making this easier to read by defining
> IOCB_WRITE_LIFE_MASK as (7 << IOCB_WRITE_LIFE_SHIFT)?
Agree, that would be prettier.
>> /*
>> + * Expected life time hint of a write for this inode. This uses the
>> + * WRITE_LIFE_* encoding, we just need to define the shift. We need
>> + * 3 bits for this. Next S_* value is 131072, bit 17.
>> + */
>> +#define S_WRITE_LIFE_MASK 0x1c000 /* bits 14..16 */
>> +#define S_WRITE_LIFE_SHIFT 14 /* 16384, next bit */
>
> Another minor comment: how about making this easier to read by defining
> S_WRITE_LIFE_MASK as (7 << S_WRITE_LIFE_SHIFT)?
Agree, I'll make that change too.
>> /*
>> + * Write life time hint values.
>> + */
>> +enum rw_hint {
>> + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
>> + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
>> + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
>> + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
>> + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME
>> +};
>> [ ... ]
>> +/*
>> + * Valid hint values for F_{GET,SET}_RW_HINT
>> + */
>> +#define RWH_WRITE_LIFE_NONE 0
>> +#define RWH_WRITE_LIFE_SHORT 1
>> +#define RWH_WRITE_LIFE_MEDIUM 2
>> +#define RWH_WRITE_LIFE_LONG 3
>> +#define RWH_WRITE_LIFE_EXTREME 4
>
> Maybe I missed something, but it's not clear to me why we have both an
> enum and defines with the same numerical values? BTW, I prefer an enum
> above #defines.
We use the enum internally, that's the hint that the fs and block layer
sees. The reason for the defines is for the user interface, where we
don't want that to be an enum. So the mapping between the two is the
definition of the enum rw_hint values.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 2/9] block: add support for write hints in a bio
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
2017-06-19 17:04 ` [PATCH 1/9] fs: add fcntl() interface for setting/getting " Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-19 17:04 ` [PATCH 3/9] blk-mq: expose stream write hints through debugfs Jens Axboe
` (6 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
No functional changes in this patch, we just set aside 3 bits
in the bio/request flags, which can be used to hold a WRITE_LIFE_*
life time hint.
Ensure that we don't merge requests that have different life time
hints assigned to them.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-merge.c | 16 ++++++++++++++++
include/linux/blk_types.h | 20 ++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index cea544ec5d96..17cd0c3f872d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -670,6 +670,14 @@ static struct request *attempt_merge(struct request_queue *q,
return NULL;
/*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((req->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (next->cmd_flags & REQ_WRITE_LIFE_MASK))
+ return NULL;
+
+ /*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
@@ -788,6 +796,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
!blk_write_same_mergeable(rq->bio, bio))
return false;
+ /*
+ * Don't allow merge of different streams, or for a stream with
+ * non-stream IO.
+ */
+ if ((rq->cmd_flags & REQ_WRITE_LIFE_MASK) !=
+ (bio->bi_opf & REQ_WRITE_LIFE_MASK))
+ return false;
+
return true;
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dcd45b15a3a5..fde216924aa3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/bvec.h>
+#include <linux/fs.h>
struct bio_set;
struct bio;
@@ -220,6 +221,9 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
+ __REQ_WRITE_HINT_SHIFT, /* 3 bits for life time hint */
+ __REQ_WRITE_HINT_PAD1,
+ __REQ_WRITE_HINT_PAD2,
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP, /* do not free blocks when zeroing */
@@ -240,6 +244,12 @@ enum req_flag_bits {
#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH)
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
+#define REQ_WRITE_SHORT (WRITE_LIFE_SHORT << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_MEDIUM (WRITE_LIFE_MEDIUM << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_LONG (WRITE_LIFE_LONG << __REQ_WRITE_HINT_SHIFT)
+#define REQ_WRITE_EXTREME (WRITE_LIFE_EXTREME << __REQ_WRITE_HINT_SHIFT)
+
+#define REQ_WRITE_LIFE_MASK (0x7 << __REQ_WRITE_HINT_SHIFT)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
@@ -331,4 +341,14 @@ struct blk_rq_stat {
u64 batch;
};
+static inline unsigned int write_hint_to_opf(enum rw_hint hint)
+{
+ return hint << __REQ_WRITE_HINT_SHIFT;
+}
+
+static inline enum rw_hint opf_to_write_hint(unsigned int opf)
+{
+ return (opf & REQ_WRITE_LIFE_MASK) >> __REQ_WRITE_HINT_SHIFT;
+}
+
#endif /* __LINUX_BLK_TYPES_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 3/9] blk-mq: expose stream write hints through debugfs
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
2017-06-19 17:04 ` [PATCH 1/9] fs: add fcntl() interface for setting/getting " Jens Axboe
2017-06-19 17:04 ` [PATCH 2/9] block: add support for write hints in a bio Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-19 17:04 ` [PATCH 4/9] fs: add O_DIRECT support for sending down write life time hints Jens Axboe
` (5 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Useful to verify that things are working the way they should.
Reading the file will return number of kb written with each
write hint. Writing the file will reset the statistics. No care
is taken to ensure that we don't race on updates.
Drivers will write to q->write_hints[] if they handle a given
write hint.
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
block/blk-mq-debugfs.c | 24 ++++++++++++++++++++++++
include/linux/blkdev.h | 3 +++
2 files changed, 27 insertions(+)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9edebbdce0bd..9ebc2945f991 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -135,6 +135,29 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
}
}
+static int queue_write_hint_show(void *data, struct seq_file *m)
+{
+ struct request_queue *q = data;
+ int i;
+
+ for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+ seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]);
+
+ return 0;
+}
+
+static ssize_t queue_write_hint_store(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct request_queue *q = data;
+ int i;
+
+ for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
+ q->write_hints[i] = 0;
+
+ return count;
+}
+
static int queue_poll_stat_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
@@ -730,6 +753,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{"poll_stat", 0400, queue_poll_stat_show},
{"requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops},
{"state", 0600, queue_state_show, queue_state_write},
+ {"write_hints", 0600, queue_write_hint_show, queue_write_hint_store},
{},
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 22cfba64ce81..687394b70924 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -586,6 +586,9 @@ struct request_queue {
size_t cmd_size;
void *rq_alloc_data;
+
+#define BLK_MAX_WRITE_HINTS 5
+ u64 write_hints[BLK_MAX_WRITE_HINTS];
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 4/9] fs: add O_DIRECT support for sending down write life time hints
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (2 preceding siblings ...)
2017-06-19 17:04 ` [PATCH 3/9] blk-mq: expose stream write hints through debugfs Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-19 17:04 ` [PATCH 5/9] fs: add support for buffered writeback to pass down write hints Jens Axboe
` (4 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/block_dev.c | 2 ++
fs/direct-io.c | 2 ++
fs/iomap.c | 1 +
3 files changed, 5 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index dd91c99e9ba0..30e1fb65c2fa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -183,6 +183,8 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
/* avoid the need for a I/O completion work item */
if (iocb->ki_flags & IOCB_DSYNC)
op |= REQ_FUA;
+
+ op |= write_hint_to_opf(iocb_write_hint(iocb));
return op;
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e8baaabebf13..9e9adca0c592 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -385,6 +385,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
else
bio->bi_end_io = dio_bio_end_io;
+ bio->bi_opf |= write_hint_to_opf(iocb_write_hint(dio->iocb));
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/iomap.c b/fs/iomap.c
index 18f2f2b8ba2c..63b9f87d9461 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -804,6 +804,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
if (dio->flags & IOMAP_DIO_WRITE) {
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ bio->bi_opf |= write_hint_to_opf(inode_write_hint(inode));
task_io_account_write(bio->bi_iter.bi_size);
} else {
bio_set_op_attrs(bio, REQ_OP_READ, 0);
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 5/9] fs: add support for buffered writeback to pass down write hints
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (3 preceding siblings ...)
2017-06-19 17:04 ` [PATCH 4/9] fs: add O_DIRECT support for sending down write life time hints Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-19 17:04 ` [PATCH 6/9] ext4: add support for passing in write hints for buffered writes Jens Axboe
` (3 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/buffer.c | 14 +++++++++-----
fs/mpage.c | 1 +
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 306b720f7383..1259524715c8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,7 +49,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc);
+ unsigned int stream, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -1829,7 +1829,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -1883,7 +1884,8 @@ int __block_write_full_page(struct inode *inode, struct page *page,
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
+ inode_write_hint(inode), wbc);
nr_underway++;
}
bh = next;
@@ -3091,7 +3093,7 @@ void guard_bio_eod(int op, struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- struct writeback_control *wbc)
+ unsigned int write_hint, struct writeback_control *wbc)
{
struct bio *bio;
@@ -3134,6 +3136,8 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
+
+ op_flags |= write_hint_to_opf(write_hint);
bio_set_op_attrs(bio, op, op_flags);
submit_bio(bio);
@@ -3142,7 +3146,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, NULL);
+ return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
diff --git a/fs/mpage.c b/fs/mpage.c
index 9524fdde00c2..d8a750873bf4 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -615,6 +615,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
goto confused;
wbc_init_bio(wbc, bio);
+ bio->bi_opf |= write_hint_to_opf(inode_write_hint(inode));
}
/*
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 6/9] ext4: add support for passing in write hints for buffered writes
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (4 preceding siblings ...)
2017-06-19 17:04 ` [PATCH 5/9] fs: add support for buffered writeback to pass down write hints Jens Axboe
@ 2017-06-19 17:04 ` Jens Axboe
2017-06-19 17:05 ` [PATCH 7/9] xfs: " Jens Axboe
` (2 subsequent siblings)
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:04 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/ext4/page-io.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 930ca0fc9a0f..92834b702728 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -350,6 +350,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0;
+ io_op_flags |= write_hint_to_opf(inode_write_hint(io->io_end->inode));
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
@@ -397,6 +398,7 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
+ io->io_bio->bi_opf |= write_hint_to_opf(inode_write_hint(inode));
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 7/9] xfs: add support for passing in write hints for buffered writes
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (5 preceding siblings ...)
2017-06-19 17:04 ` [PATCH 6/9] ext4: add support for passing in write hints for buffered writes Jens Axboe
@ 2017-06-19 17:05 ` Jens Axboe
2017-06-19 17:05 ` [PATCH 8/9] btrfs: " Jens Axboe
2017-06-19 17:05 ` [PATCH 9/9] nvme: add support for streams and directives Jens Axboe
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:05 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/xfs/xfs_aops.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 76b6f988e2fa..e4d9d470402c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -506,6 +506,7 @@ xfs_submit_ioend(
return status;
}
+ ioend->io_bio->bi_opf |= write_hint_to_opf(inode_write_hint(ioend->io_inode));
submit_bio(ioend->io_bio);
return 0;
}
@@ -565,6 +566,7 @@ xfs_chain_bio(
bio_chain(ioend->io_bio, new);
bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ ioend->io_bio->bi_opf |= write_hint_to_opf(inode_write_hint(ioend->io_inode));
submit_bio(ioend->io_bio);
ioend->io_bio = new;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 8/9] btrfs: add support for passing in write hints for buffered writes
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (6 preceding siblings ...)
2017-06-19 17:05 ` [PATCH 7/9] xfs: " Jens Axboe
@ 2017-06-19 17:05 ` Jens Axboe
2017-06-19 17:05 ` [PATCH 9/9] nvme: add support for streams and directives Jens Axboe
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:05 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
fs/btrfs/extent_io.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 19eedf2e630b..3e57cfaa6dd6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2830,6 +2830,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ op_flags |= write_hint_to_opf(inode_write_hint(page->mapping->host));
bio_set_op_attrs(bio, op, op_flags);
if (wbc) {
wbc_init_bio(wbc, bio);
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 9/9] nvme: add support for streams and directives
2017-06-19 17:04 [PATCHSET v8] Add support for write life time hints Jens Axboe
` (7 preceding siblings ...)
2017-06-19 17:05 ` [PATCH 8/9] btrfs: " Jens Axboe
@ 2017-06-19 17:05 ` Jens Axboe
8 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-19 17:05 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
This adds support for Directives in NVMe, in particular for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data, so
that the device can do so most efficiently. If an application is
managing and writing data with different life times, mixing data with
different retention onto the same locations on flash can cause write
amplification to grow. This, in turn, will reduce performance and life
time of the device.
We default to allocating 4 streams, controller wide, so we can use them
on all namespaces. This is configurable with the 'streams' module
parameter. If a write stream is set in a write, flag it as such before
sending it to the device.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/core.c | 189 +++++++++++++++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 4 +
include/linux/nvme.h | 48 ++++++++++++
3 files changed, 236 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index aee37b73231d..fa1acfa27fa4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -68,6 +68,10 @@ MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if qu
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
+static bool streams = true;
+module_param(streams, bool, 0644);
+MODULE_PARM_DESC(streams, "use streams, if available");
+
static LIST_HEAD(nvme_ctrl_list);
static DEFINE_SPINLOCK(dev_list_lock);
@@ -297,6 +301,141 @@ struct request *nvme_alloc_request(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);
+/*
+
+ * Returns number of streams allocated for use, or -1 on error.
+ */
+static int nvme_streams_allocate(struct nvme_ctrl *ctrl, unsigned int nstreams)
+{
+ struct nvme_command c;
+ union nvme_result res;
+ int ret;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_RESOURCE;
+ c.directive.dtype = NVME_DIR_STREAMS;
+ c.directive.endir = nstreams;
+
+ ret = __nvme_submit_sync_cmd(ctrl->admin_q, &c, &res, NULL, 0, 0,
+ NVME_QID_ANY, 0, 0);
+ if (ret)
+ return -1;
+
+ return le32_to_cpu(res.u32) & 0xffff;
+}
+
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_send;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+ c.directive.dtype = NVME_DIR_IDENTIFY;
+ c.directive.tdtype = NVME_DIR_STREAMS;
+ c.directive.endir = NVME_DIR_ENDIR;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
+ struct streams_directive_params *s, u32 nsid)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ memset(s, 0, sizeof(*s));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(nsid);
+ c.directive.numd = sizeof(*s);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+ c.directive.dtype = NVME_DIR_STREAMS;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
+}
+
+static int nvme_setup_directives(struct nvme_ctrl *ctrl)
+{
+ struct streams_directive_params s;
+ unsigned int nstreams;
+ int ret;
+
+ if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+ return 0;
+ if (!streams)
+ return 0;
+
+ ret = nvme_enable_streams(ctrl);
+ if (ret)
+ return ret;
+
+ ret = nvme_get_stream_params(ctrl, &s, 0xffffffff);
+ if (ret)
+ return ret;
+
+ ctrl->nssa = le16_to_cpu(s.nssa);
+
+ nstreams = min_t(unsigned int, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ ret = nvme_streams_allocate(ctrl, nstreams);
+ if (ret < 0)
+ return ret;
+
+ /* require at least 2 streams to use them effectively */
+ if (ret > 1) {
+ ret = min(ret, BLK_MAX_WRITE_HINTS - 1);
+ ctrl->nr_streams = ret;
+ dev_info(ctrl->device, "successfully enabled %d streams\n", ret);
+ }
+
+ return 0;
+}
+
+/*
+ * Write hint number to stream mappings
+ */
+static const unsigned int stream_mappings[BLK_MAX_WRITE_HINTS][BLK_MAX_WRITE_HINTS] = {
+ /* 0 or 1 stream, we don't use streams */
+ { 0, },
+ { 0, },
+ /* collapse short+medium to short, and long+extreme to medium */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM, WRITE_LIFE_MEDIUM },
+ /* collapse long+extreme to long */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_LONG },
+ /* 4 streams, no collapsing needed */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_EXTREME },
+};
+
+/*
+ * Check if 'req' has a write hint associated with it. If it does, map the
+ * hint to a stream via stream_mappings[] and flag the write as a streams
+ * write. The request size is also added to the queue's write_hints[] count.
+ */
+static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
+ struct request *req, u16 *control,
+ u32 *dsmgmt)
+{
+ enum rw_hint streamid;
+
+ streamid = opf_to_write_hint(req->cmd_flags);
+ if (streamid != WRITE_LIFE_NONE) {
+ streamid = stream_mappings[ctrl->nr_streams][streamid];
+ *control |= NVME_RW_DTYPE_STREAMS;
+ *dsmgmt |= streamid << 16;
+ }
+
+ if (streamid < ARRAY_SIZE(req->q->write_hints))
+ req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+}
+
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@@ -348,6 +487,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
u32 dsmgmt = 0;
@@ -375,6 +515,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
+ nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
+
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
@@ -1088,14 +1231,21 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
static void nvme_config_discard(struct nvme_ns *ns)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
u32 logical_block_size = queue_logical_block_size(ns->queue);
+ struct nvme_ctrl *ctrl = ns->ctrl;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
+ if (ctrl->nr_streams && ns->sws && ns->sgs) {
+ unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+
+ ns->queue->limits.discard_alignment = sz;
+ ns->queue->limits.discard_granularity = sz;
+ } else {
+ ns->queue->limits.discard_alignment = logical_block_size;
+ ns->queue->limits.discard_granularity = logical_block_size;
+ }
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
@@ -1135,6 +1285,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 bs;
/*
@@ -1149,7 +1300,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
blk_mq_freeze_queue(disk->queue);
- if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+ if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
nvme_prep_integrity(disk, id, bs);
blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
@@ -1161,7 +1312,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
}
@@ -1766,6 +1917,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
nvme_configure_apst(ctrl);
+ nvme_setup_directives(ctrl);
ctrl->identified = true;
@@ -2158,6 +2310,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return ret;
}
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!ctrl->nr_streams)
+ return 0;
+
+ ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
+ if (ret)
+ return ret;
+
+ ns->sws = le32_to_cpu(s.sws);
+ ns->sgs = le16_to_cpu(s.sgs);
+
+ if (ns->sws) {
+ unsigned int bs = 1 << ns->lba_shift;
+
+ blk_queue_io_min(ns->queue, bs * ns->sws);
+ if (ns->sgs)
+ blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
+ }
+
+ return 0;
+}
+
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -2187,6 +2365,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
+ nvme_setup_streams_ns(ctrl, ns);
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ec8c7363934d..f616835afc4c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -147,6 +147,8 @@ struct nvme_ctrl {
u16 oncs;
u16 vid;
u16 oacs;
+ u16 nssa;
+ u16 nr_streams;
atomic_t abort_limit;
u8 event_limit;
u8 vwc;
@@ -199,6 +201,8 @@ struct nvme_ns {
unsigned ns_id;
int lba_shift;
u16 ms;
+ u16 sgs;
+ u32 sws;
bool ext;
u8 pi_type;
unsigned long flags;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 291587a0743f..f516a975bb21 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -253,6 +253,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
@@ -304,6 +305,19 @@ enum {
};
enum {
+ NVME_DIR_IDENTIFY = 0x00,
+ NVME_DIR_STREAMS = 0x01,
+ NVME_DIR_SND_ID_OP_ENABLE = 0x01,
+ NVME_DIR_SND_ST_OP_REL_ID = 0x01,
+ NVME_DIR_SND_ST_OP_REL_RSC = 0x02,
+ NVME_DIR_RCV_ID_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_STATUS = 0x02,
+ NVME_DIR_RCV_ST_OP_RESOURCE = 0x03,
+ NVME_DIR_ENDIR = 0x01,
+};
+
+enum {
NVME_NS_FEAT_THIN = 1 << 0,
NVME_NS_FLBAS_LBA_MASK = 0xf,
NVME_NS_FLBAS_META_EXT = 0x10,
@@ -560,6 +574,7 @@ enum {
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_DTYPE_STREAMS = 1 << 4,
};
struct nvme_dsm_cmd {
@@ -634,6 +649,8 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_directive_send = 0x19,
+ nvme_admin_directive_recv = 0x1a,
nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
@@ -797,6 +814,24 @@ struct nvme_get_log_page_command {
__u32 rsvd14[2];
};
+struct nvme_directive_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __le32 numd;
+ __u8 doper;
+ __u8 dtype;
+ __le16 dspec;
+ __u8 endir;
+ __u8 tdtype;
+ __u16 rsvd15;
+
+ __u32 rsvd16[3];
+};
+
/*
* Fabrics subcommands.
*/
@@ -927,6 +962,18 @@ struct nvme_dbbuf {
__u32 rsvd12[6];
};
+struct streams_directive_params {
+ __u16 msl;
+ __u16 nssa;
+ __u16 nsso;
+ __u8 rsvd[10];
+ __u32 sws;
+ __u16 sgs;
+ __u16 nsa;
+ __u16 nso;
+ __u8 rsvd2[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -947,6 +994,7 @@ struct nvme_command {
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
struct nvme_dbbuf dbbuf;
+ struct nvme_directive_cmd directive;
};
};
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 9/9] nvme: add support for streams and directives
2017-06-21 0:21 [PATCHSET v9] Add support for write life time hints Jens Axboe
@ 2017-06-21 0:22 ` Jens Axboe
2017-06-26 9:59 ` Christoph Hellwig
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2017-06-21 0:22 UTC (permalink / raw)
To: linux-fsdevel, linux-block
Cc: adilger, hch, martin.petersen, linux-nvme, Jens Axboe
This adds support for Directives in NVMe, particularly for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data, so
that the device can do so most efficiently. If an application is
managing and writing data with different life times, mixing data with
different retention onto the same locations on flash can cause write
amplification to grow. This, in turn, will reduce performance and life
time of the device.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/core.c | 142 +++++++++++++++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 4 ++
include/linux/nvme.h | 48 ++++++++++++++++
3 files changed, 190 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index aee37b73231d..fcccc1534f7b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -297,6 +297,100 @@ struct request *nvme_alloc_request(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_send;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+ c.directive.dtype = NVME_DIR_IDENTIFY;
+ c.directive.tdtype = NVME_DIR_STREAMS;
+ c.directive.endir = NVME_DIR_ENDIR;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
+ struct streams_directive_params *s, u32 nsid)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ memset(s, 0, sizeof(*s));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(nsid);
+ c.directive.numd = sizeof(*s);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+ c.directive.dtype = NVME_DIR_STREAMS;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
+}
+
+static int nvme_configure_directives(struct nvme_ctrl *ctrl)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+ return 0;
+
+ ret = nvme_enable_streams(ctrl);
+ if (ret)
+ return ret;
+
+ ret = nvme_get_stream_params(ctrl, &s, 0xffffffff);
+ if (ret)
+ return ret;
+
+ ctrl->nssa = le16_to_cpu(s.nssa);
+ ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ return 0;
+}
+
+/*
+ * Write hint number to stream mappings
+ */
+static const unsigned int stream_mappings[BLK_MAX_WRITE_HINTS][BLK_MAX_WRITE_HINTS] = {
+ /* 0 or 1 stream, we don't use streams */
+ { 0, },
+ { 0, },
+ /* collapse short+medium to short, and long+extreme to medium */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM, WRITE_LIFE_MEDIUM },
+ /* collapse long+extreme to long */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_LONG },
+ /* 4 streams, no collapsing needed */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_EXTREME },
+};
+
+/*
+ * Check if 'req' has a write hint associated with it. If it does, map the
+ * hint to a stream via stream_mappings[] and flag the write as a streams
+ * write. The request size is also added to the queue's write_hints[] count.
+ */
+static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
+ struct request *req, u16 *control,
+ u32 *dsmgmt)
+{
+ enum rw_hint streamid;
+
+ streamid = opf_to_write_hint(req->cmd_flags);
+ if (streamid != WRITE_LIFE_NONE) {
+ streamid = stream_mappings[ctrl->nr_streams][streamid - 1];
+ *control |= NVME_RW_DTYPE_STREAMS;
+ *dsmgmt |= streamid << 16;
+ }
+
+ if (streamid < ARRAY_SIZE(req->q->write_hints))
+ req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+}
+
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@@ -348,6 +442,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
u32 dsmgmt = 0;
@@ -375,6 +470,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
+ nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
+
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
@@ -1094,8 +1192,15 @@ static void nvme_config_discard(struct nvme_ns *ns)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
+ if (ctrl->nr_streams && ns->sws && ns->sgs) {
+ unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+
+ ns->queue->limits.discard_alignment = sz;
+ ns->queue->limits.discard_granularity = sz;
+ } else {
+ ns->queue->limits.discard_alignment = logical_block_size;
+ ns->queue->limits.discard_granularity = logical_block_size;
+ }
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
@@ -1135,6 +1240,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 bs;
/*
@@ -1149,7 +1255,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
blk_mq_freeze_queue(disk->queue);
- if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+ if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
nvme_prep_integrity(disk, id, bs);
blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
@@ -1161,7 +1267,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
}
@@ -1766,6 +1872,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
nvme_configure_apst(ctrl);
+ nvme_configure_directives(ctrl);
ctrl->identified = true;
@@ -2158,6 +2265,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return ret;
}
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!ctrl->nr_streams)
+ return 0;
+
+ ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
+ if (ret)
+ return ret;
+
+ ns->sws = le32_to_cpu(s.sws);
+ ns->sgs = le16_to_cpu(s.sgs);
+
+ if (ns->sws) {
+ unsigned int bs = 1 << ns->lba_shift;
+
+ blk_queue_io_min(ns->queue, bs * ns->sws);
+ if (ns->sgs)
+ blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
+ }
+
+ return 0;
+}
+
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -2187,6 +2320,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
+ nvme_setup_streams_ns(ctrl, ns);
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ec8c7363934d..f616835afc4c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -147,6 +147,8 @@ struct nvme_ctrl {
u16 oncs;
u16 vid;
u16 oacs;
+ u16 nssa;
+ u16 nr_streams;
atomic_t abort_limit;
u8 event_limit;
u8 vwc;
@@ -199,6 +201,8 @@ struct nvme_ns {
unsigned ns_id;
int lba_shift;
u16 ms;
+ u16 sgs;
+ u32 sws;
bool ext;
u8 pi_type;
unsigned long flags;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 291587a0743f..f516a975bb21 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -253,6 +253,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
@@ -304,6 +305,19 @@ enum {
};
enum {
+ NVME_DIR_IDENTIFY = 0x00,
+ NVME_DIR_STREAMS = 0x01,
+ NVME_DIR_SND_ID_OP_ENABLE = 0x01,
+ NVME_DIR_SND_ST_OP_REL_ID = 0x01,
+ NVME_DIR_SND_ST_OP_REL_RSC = 0x02,
+ NVME_DIR_RCV_ID_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_STATUS = 0x02,
+ NVME_DIR_RCV_ST_OP_RESOURCE = 0x03,
+ NVME_DIR_ENDIR = 0x01,
+};
+
+enum {
NVME_NS_FEAT_THIN = 1 << 0,
NVME_NS_FLBAS_LBA_MASK = 0xf,
NVME_NS_FLBAS_META_EXT = 0x10,
@@ -560,6 +574,7 @@ enum {
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_DTYPE_STREAMS = 1 << 4,
};
struct nvme_dsm_cmd {
@@ -634,6 +649,8 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_directive_send = 0x19,
+ nvme_admin_directive_recv = 0x1a,
nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
@@ -797,6 +814,24 @@ struct nvme_get_log_page_command {
__u32 rsvd14[2];
};
+struct nvme_directive_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __le32 numd;
+ __u8 doper;
+ __u8 dtype;
+ __le16 dspec;
+ __u8 endir;
+ __u8 tdtype;
+ __u16 rsvd15;
+
+ __u32 rsvd16[3];
+};
+
/*
* Fabrics subcommands.
*/
@@ -927,6 +962,18 @@ struct nvme_dbbuf {
__u32 rsvd12[6];
};
+struct streams_directive_params {
+ __u16 msl;
+ __u16 nssa;
+ __u16 nsso;
+ __u8 rsvd[10];
+ __u32 sws;
+ __u16 sgs;
+ __u16 nsa;
+ __u16 nso;
+ __u8 rsvd2[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -947,6 +994,7 @@ struct nvme_command {
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
struct nvme_dbbuf dbbuf;
+ struct nvme_directive_cmd directive;
};
};
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-21 0:22 ` [PATCH 9/9] nvme: add support for streams and directives Jens Axboe
@ 2017-06-26 9:59 ` Christoph Hellwig
2017-06-26 13:56 ` Jens Axboe
2017-06-26 17:52 ` Martin K. Petersen
0 siblings, 2 replies; 25+ messages in thread
From: Christoph Hellwig @ 2017-06-26 9:59 UTC (permalink / raw)
To: Jens Axboe
Cc: linux-fsdevel, linux-block, adilger, hch, martin.petersen,
linux-nvme
Looks mostly good,
but two nit-picks:
- can we keep a module option to disable streams, or in fact for
now maybe to explicitly enable it? I expect this to be interesting
at least for the first devices that implement it. Also given that
it needs to be explicitly enabled I would expect some overhead of
just enabling it when never used
- do we even need the < 4 streams fallback now that they are global
instead of per-ns, rather than just disabling the feature for now?
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 9:59 ` Christoph Hellwig
@ 2017-06-26 13:56 ` Jens Axboe
2017-06-26 19:36 ` Andreas Dilger
2017-06-27 14:11 ` Christoph Hellwig
2017-06-26 17:52 ` Martin K. Petersen
1 sibling, 2 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-26 13:56 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, linux-block, adilger, martin.petersen, linux-nvme
On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
> Looks mostly good,
>
> but two nit-picks:
>
> - can we keep a module option to disable streams, or in fact for
> now maybe to explicitly enable it? I expect this to be interesting
> at least for the first devices that implement it. Also given that
> it needs to be explicitly enabled I would expect some overhead of
> just enabling it when never used
Fine with me, I can add the 'streams' parameter back, but just default
it to false.
> - do we even need the < 4 streams fallback now that they are global
> instead of per-ns instead of just disabling the feature for now?
Maybe the device only supports 2? or 3?
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 13:56 ` Jens Axboe
@ 2017-06-26 19:36 ` Andreas Dilger
2017-06-26 19:39 ` Jens Axboe
2017-06-27 14:11 ` Christoph Hellwig
1 sibling, 1 reply; 25+ messages in thread
From: Andreas Dilger @ 2017-06-26 19:36 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, linux-fsdevel, linux-block, Martin Petersen,
linux-nvme
[-- Attachment #1: Type: text/plain, Size: 895 bytes --]
On Jun 26, 2017, at 7:56 AM, Jens Axboe <axboe@kernel.dk> wrote:
>
> On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
>> Looks mostly good,
>>
>> but two nit-picks:
>>
>> - can we keep a module option to disable streams, or in fact for
>> now maybe to explicitly enable it? I expect this to be interesting
>> at least for the first devices that implement it. Also given that
>> it needs to be explicitly enabled I would expect some overhead of
>> just enabling it when never used
>
> Fine with me, I can add the 'streams' parameter back, but just default
> it to false.
Better would be a parameter to set the default streams count, 0 by default.
>> - do we even need the < 4 streams fallback now that they are global
>> instead of per-ns instead of just disabling the feature for now?
>
> Maybe the device only supports 2? or 3?
>
> --
> Jens Axboe
>
Cheers, Andreas
[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 195 bytes --]
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 19:36 ` Andreas Dilger
@ 2017-06-26 19:39 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-26 19:39 UTC (permalink / raw)
To: Andreas Dilger
Cc: Christoph Hellwig, linux-fsdevel, linux-block, Martin Petersen,
linux-nvme
On 06/26/2017 01:36 PM, Andreas Dilger wrote:
> On Jun 26, 2017, at 7:56 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>
>> On 06/26/2017 03:59 AM, Christoph Hellwig wrote:
>>> Looks mostly good,
>>>
>>> but two nit-picks:
>>>
>>> - can we keep a module option to disable streams, or in fact for
>>> now maybe to explicitly enable it? I expect this to be interesting
>>> at least for the first devices that implement it. Also given that
>>> it needs to be explicitly enabled I would expect some overhead of
>>> just enabling it when never used
>>
>> Fine with me, I can add the 'streams' parameter back, but just default
>> it to false.
>
> Better would be a parameter to set the default streams count, 0 by default.
The user should not need to know. If streams is enabled (bool), then it'll
ask for as many as we need on the block side right now, and scale down if
we have to. So I'd rather keep it as a "use streams or not" bool on the
nvme side.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 13:56 ` Jens Axboe
2017-06-26 19:36 ` Andreas Dilger
@ 2017-06-27 14:11 ` Christoph Hellwig
2017-06-27 14:16 ` Jens Axboe
1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2017-06-27 14:11 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, linux-fsdevel, linux-block, adilger,
martin.petersen, linux-nvme
On Mon, Jun 26, 2017 at 07:56:22AM -0600, Jens Axboe wrote:
> > - do we even need the < 4 streams fallback now that they are global
> > instead of per-ns instead of just disabling the feature for now?
>
> Maybe the device only supports 2? or 3?
My crystal ball indicates that those are unlikely to see the
light. IFF we need to handle them we can still add code for it.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-27 14:11 ` Christoph Hellwig
@ 2017-06-27 14:16 ` Jens Axboe
2017-06-27 14:44 ` Christoph Hellwig
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2017-06-27 14:16 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, linux-block, adilger, martin.petersen, linux-nvme
On 06/27/2017 08:11 AM, Christoph Hellwig wrote:
> On Mon, Jun 26, 2017 at 07:56:22AM -0600, Jens Axboe wrote:
>>> - do we even need the < 4 streams fallback now that they are global
>>> instead of per-ns instead of just disabling the feature for now?
>>
>> Maybe the device only supports 2? or 3?
>
> My crystal ball indicates that those are unlikely too see the
> light. IFF we need to handle them we can still add code for it.
But we have to handle it, not doing so would be fragile. So our
options are:
1) Keep the stream_mappings[] array. It's simple, and it'll work
for any number of streams.
2) Kill stream_mappings[] and just do the MOD again.
I'd strongly lean towards #1. I don't have a lot of faith in
crystal balls.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-27 14:16 ` Jens Axboe
@ 2017-06-27 14:44 ` Christoph Hellwig
2017-06-27 14:46 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2017-06-27 14:44 UTC (permalink / raw)
To: Jens Axboe
Cc: Christoph Hellwig, linux-fsdevel, linux-block, adilger,
martin.petersen, linux-nvme
On Tue, Jun 27, 2017 at 08:16:49AM -0600, Jens Axboe wrote:
> But we have to handle it, not doing so would be fragile. So our
> options are:
>
> 1) Keep the stream_mappings[] array. It's simple, and it'll work
> for any number of streams.
>
> 2) Kill stream_mappings[] and just do the MOD again.
3) print a message and tell the user streams aren't supported on this device.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-27 14:44 ` Christoph Hellwig
@ 2017-06-27 14:46 ` Jens Axboe
2017-06-27 14:56 ` Jens Axboe
0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2017-06-27 14:46 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, linux-block, adilger, martin.petersen, linux-nvme
On 06/27/2017 08:44 AM, Christoph Hellwig wrote:
> On Tue, Jun 27, 2017 at 08:16:49AM -0600, Jens Axboe wrote:
>> But we have to handle it, not doing so would be fragile. So our
>> options are:
>>
>> 1) Keep the stream_mappings[] array. It's simple, and it'll work
>> for any number of streams.
>>
>> 2) Kill stream_mappings[] and just do the MOD again.
>
> 3) print a message and tell streams aren't supported on this device.
Is that your nvme preference - if less than 4 streams, just ignore it?
Would seem a shame to lose out with 2 streams, I can think of several
hot/cold scenarios that would probably work fine with that. But at the
same time, not a big deal to me, if you prefer just turning it off
for < 4 streams, that's fine with me.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-27 14:46 ` Jens Axboe
@ 2017-06-27 14:56 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-27 14:56 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, linux-block, adilger, martin.petersen, linux-nvme
On 06/27/2017 08:46 AM, Jens Axboe wrote:
> On 06/27/2017 08:44 AM, Christoph Hellwig wrote:
>> On Tue, Jun 27, 2017 at 08:16:49AM -0600, Jens Axboe wrote:
>>> But we have to handle it, not doing so would be fragile. So our
>>> options are:
>>>
>>> 1) Keep the stream_mappings[] array. It's simple, and it'll work
>>> for any number of streams.
>>>
>>> 2) Kill stream_mappings[] and just do the MOD again.
>>
>> 3) print a message and tell streams aren't supported on this device.
>
> Is that your nvme preference - if less than 4 streams, just ignore it?
> Would seem a shame to lose out with 2 streams, I can think of several
> hot/cold scenarios that would probably work fine with that. But at the
> same time, not a big deal to me, if you prefer just turning it off
> for < 4 streams, that's fine with me.
http://git.kernel.dk/cgit/linux-block/commit/?h=write-stream&id=98335ae1347a9a08adc831e77e8fefcf06ab8282
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 9:59 ` Christoph Hellwig
2017-06-26 13:56 ` Jens Axboe
@ 2017-06-26 17:52 ` Martin K. Petersen
2017-06-26 18:00 ` Jens Axboe
1 sibling, 1 reply; 25+ messages in thread
From: Martin K. Petersen @ 2017-06-26 17:52 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, linux-fsdevel, linux-block, adilger, martin.petersen,
linux-nvme
Christoph,
> - can we keep a module option to disable streams, or in fact for
> now maybe to explicitly enable it? I expect this to be interesting
> at least for the first devices that implement it. Also given that
> it needs to be explicitly enabled I would expect some overhead of
> just enabling it when never used
Yeah, based on my experiments we'll need to drive this as an opt-in
feature for now. Short term the module option is OK. Once more devices
start materializing we probably need a white/blacklist/quirk scheme.
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 17:52 ` Martin K. Petersen
@ 2017-06-26 18:00 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-26 18:00 UTC (permalink / raw)
To: Martin K. Petersen, Christoph Hellwig
Cc: linux-fsdevel, linux-block, adilger, linux-nvme
On 06/26/2017 11:52 AM, Martin K. Petersen wrote:
>
> Christoph,
>
>> - can we keep a module option to disable streams, or in fact for
>> now maybe to explicitly enable it? I expect this to be interesting
>> at least for the first devices that implement it. Also given that
>> it needs to be explicitly enabled I would expect some overhead of
>> just enabling it when never used
>
> Yeah, based on my experiments we'll need to drive this as an opt-in
> feature for now. Short term the module option is OK. Once more devices
> start materializing we probably need a white/blacklist/quirk scheme.
Completely agree. Might even need quirks for stream allocations too,
for instance. But let's hope we can keep it clean.
--
Jens Axboe
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 9/9] nvme: add support for streams and directives
2017-06-26 15:37 [PATCHSET v10] Add support for write life time hints Jens Axboe
@ 2017-06-26 15:38 ` Jens Axboe
0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2017-06-26 15:38 UTC (permalink / raw)
To: linux-block; +Cc: linux-fsdevel, hch, martin.petersen, Jens Axboe
This adds support for Directives in NVMe, in particular for the Streams
directive. Support for Directives is a new feature in NVMe 1.3. It
allows a user to pass in information about where to store the data, so
that the device can do so most efficiently. If an application is
managing and writing data with different life times, mixing data of
different retention onto the same locations on flash can cause write
amplification to grow. This, in turn, will reduce performance and life
time of the device.
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/core.c | 148 +++++++++++++++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 4 ++
include/linux/nvme.h | 48 +++++++++++++++
3 files changed, 196 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index aee37b73231d..2d9835617953 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -65,6 +65,10 @@ static bool force_apst;
module_param(force_apst, bool, 0644);
MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+static bool streams;
+module_param(streams, bool, 0644);
+MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
@@ -297,6 +301,102 @@ struct request *nvme_alloc_request(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(nvme_alloc_request);
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+
+ c.directive.opcode = nvme_admin_directive_send;
+ c.directive.nsid = cpu_to_le32(0xffffffff);
+ c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+ c.directive.dtype = NVME_DIR_IDENTIFY;
+ c.directive.tdtype = NVME_DIR_STREAMS;
+ c.directive.endir = NVME_DIR_ENDIR;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
+ struct streams_directive_params *s, u32 nsid)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ memset(s, 0, sizeof(*s));
+
+ c.directive.opcode = nvme_admin_directive_recv;
+ c.directive.nsid = cpu_to_le32(nsid);
+ c.directive.numd = sizeof(*s);
+ c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+ c.directive.dtype = NVME_DIR_STREAMS;
+
+ return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
+}
+
+static int nvme_configure_directives(struct nvme_ctrl *ctrl)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+ return 0;
+ if (!streams)
+ return 0;
+
+ ret = nvme_enable_streams(ctrl);
+ if (ret)
+ return ret;
+
+ ret = nvme_get_stream_params(ctrl, &s, 0xffffffff);
+ if (ret)
+ return ret;
+
+ ctrl->nssa = le16_to_cpu(s.nssa);
+ ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ return 0;
+}
+
+/*
+ * Write hint number to stream mappings
+ */
+static const unsigned int stream_mappings[BLK_MAX_WRITE_HINTS][BLK_MAX_WRITE_HINTS] = {
+ /* 0 or 1 stream, we don't use streams */
+ { 0, },
+ { 0, },
+ /* collapse short+medium to short, and long+extreme to medium */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM, WRITE_LIFE_MEDIUM },
+ /* collapse long+extreme to long */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_LONG },
+ /* 4 streams, no collapsing needed */
+ { WRITE_LIFE_NONE, WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG, WRITE_LIFE_EXTREME },
+};
+
+/*
+ * Check if 'req' has a write hint associated with it. If it does, map the
+ * hint to a stream via stream_mappings[] and tag the write with that stream.
+ * The request size (in 512-byte units) is added to the queue's write_hints[].
+ */
+static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
+ struct request *req, u16 *control,
+ u32 *dsmgmt)
+{
+ enum rw_hint streamid;
+
+ streamid = opf_to_write_hint(req->cmd_flags);
+ if (streamid != WRITE_LIFE_NONE) {
+ streamid = stream_mappings[ctrl->nr_streams][streamid - 1];
+ *control |= NVME_RW_DTYPE_STREAMS;
+ *dsmgmt |= streamid << 16;
+ }
+
+ if (streamid < ARRAY_SIZE(req->q->write_hints))
+ req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+}
+
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@@ -348,6 +448,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
u32 dsmgmt = 0;
@@ -375,6 +476,9 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
+ nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
+
if (ns->ms) {
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
@@ -1094,8 +1198,15 @@ static void nvme_config_discard(struct nvme_ns *ns)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
+ if (ctrl->nr_streams && ns->sws && ns->sgs) {
+ unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+
+ ns->queue->limits.discard_alignment = sz;
+ ns->queue->limits.discard_granularity = sz;
+ } else {
+ ns->queue->limits.discard_alignment = logical_block_size;
+ ns->queue->limits.discard_granularity = logical_block_size;
+ }
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
@@ -1135,6 +1246,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
u16 bs;
/*
@@ -1149,7 +1261,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
blk_mq_freeze_queue(disk->queue);
- if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
+ if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
nvme_prep_integrity(disk, id, bs);
blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
@@ -1161,7 +1273,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
else
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
- if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
}
@@ -1766,6 +1878,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
nvme_configure_apst(ctrl);
+ nvme_configure_directives(ctrl);
ctrl->identified = true;
@@ -2158,6 +2271,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return ret;
}
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
+{
+ struct streams_directive_params s;
+ int ret;
+
+ if (!ctrl->nr_streams)
+ return 0;
+
+ ret = nvme_get_stream_params(ctrl, &s, ns->ns_id);
+ if (ret)
+ return ret;
+
+ ns->sws = le32_to_cpu(s.sws);
+ ns->sgs = le16_to_cpu(s.sgs);
+
+ if (ns->sws) {
+ unsigned int bs = 1 << ns->lba_shift;
+
+ blk_queue_io_min(ns->queue, bs * ns->sws);
+ if (ns->sgs)
+ blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
+ }
+
+ return 0;
+}
+
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -2187,6 +2326,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
+ nvme_setup_streams_ns(ctrl, ns);
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ec8c7363934d..f616835afc4c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -147,6 +147,8 @@ struct nvme_ctrl {
u16 oncs;
u16 vid;
u16 oacs;
+ u16 nssa;
+ u16 nr_streams;
atomic_t abort_limit;
u8 event_limit;
u8 vwc;
@@ -199,6 +201,8 @@ struct nvme_ns {
unsigned ns_id;
int lba_shift;
u16 ms;
+ u16 sgs;
+ u32 sws;
bool ext;
u8 pi_type;
unsigned long flags;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 291587a0743f..f516a975bb21 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -253,6 +253,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
@@ -304,6 +305,19 @@ enum {
};
enum {
+ NVME_DIR_IDENTIFY = 0x00,
+ NVME_DIR_STREAMS = 0x01,
+ NVME_DIR_SND_ID_OP_ENABLE = 0x01,
+ NVME_DIR_SND_ST_OP_REL_ID = 0x01,
+ NVME_DIR_SND_ST_OP_REL_RSC = 0x02,
+ NVME_DIR_RCV_ID_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_PARAM = 0x01,
+ NVME_DIR_RCV_ST_OP_STATUS = 0x02,
+ NVME_DIR_RCV_ST_OP_RESOURCE = 0x03,
+ NVME_DIR_ENDIR = 0x01,
+};
+
+enum {
NVME_NS_FEAT_THIN = 1 << 0,
NVME_NS_FLBAS_LBA_MASK = 0xf,
NVME_NS_FLBAS_META_EXT = 0x10,
@@ -560,6 +574,7 @@ enum {
NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_DTYPE_STREAMS = 1 << 4,
};
struct nvme_dsm_cmd {
@@ -634,6 +649,8 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_directive_send = 0x19,
+ nvme_admin_directive_recv = 0x1a,
nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
@@ -797,6 +814,24 @@ struct nvme_get_log_page_command {
__u32 rsvd14[2];
};
+struct nvme_directive_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __le32 numd;
+ __u8 doper;
+ __u8 dtype;
+ __le16 dspec;
+ __u8 endir;
+ __u8 tdtype;
+ __u16 rsvd15;
+
+ __u32 rsvd16[3];
+};
+
/*
* Fabrics subcommands.
*/
@@ -927,6 +962,18 @@ struct nvme_dbbuf {
__u32 rsvd12[6];
};
+struct streams_directive_params {
+ __u16 msl;
+ __u16 nssa;
+ __u16 nsso;
+ __u8 rsvd[10];
+ __u32 sws;
+ __u16 sgs;
+ __u16 nsa;
+ __u16 nso;
+ __u8 rsvd2[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -947,6 +994,7 @@ struct nvme_command {
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
struct nvme_dbbuf dbbuf;
+ struct nvme_directive_cmd directive;
};
};
--
2.7.4
^ permalink raw reply related [flat|nested] 25+ messages in thread