[PATCH] NVMe: Add rw

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] NVMe: Add rw_page support
@ 2014-11-14  0:05 Keith Busch
  2014-11-14  1:29 ` Jens Axboe
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Keith Busch @ 2014-11-14  0:05 UTC (permalink / raw)


This adds the rw_page entry point to the nvme driver so a page can be
written/read without going through the block layer and not requiring
additional allocations.

Just because we implement this doesn't mean we want to use it. I only see
a performance win on some types of work, like swap where I see about 15%
reduction in system time (compared to 20% prior to blk-mq when we didn't
allocate a request to get a command id). Even then, system time accounts
for very little of the real time anyway, and it's only an over-all win
if the device has very low-latency. But the driver doesn't know this
nor if the expected workload will even benefit from using page io,
so I added a queue flag that a user can toggle on/off.

The other benefit besides reduced system time is that we can swap
pages in/out without having to allocate anything since everything is
preallocated in this path.

Signed-off-by: Keith Busch <keith.busch at intel.com>
---
 block/blk-sysfs.c         |    8 +++++
 drivers/block/nvme-core.c |   78 +++++++++++++++++++++++++++++++++++++++++++++
 fs/block_dev.c            |    4 +--
 include/linux/blkdev.h    |    2 ++
 4 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1fac434..b29de5f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -221,6 +221,7 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
 QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
 QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_BIT_FNS(rw_page, RW_PG, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
@@ -404,6 +405,12 @@ static struct queue_sysfs_entry queue_random_entry = {
 	.store = queue_store_random,
 };
 
+static struct queue_sysfs_entry queue_rw_page_entry = {
+	.attr = {.name = "rw_page", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_show_rw_page,
+	.store = queue_store_rw_page,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -427,6 +434,7 @@ static struct attribute *default_attrs[] = {
 	&queue_rq_affinity_entry.attr,
 	&queue_iostats_entry.attr,
 	&queue_random_entry.attr,
+	&queue_rw_page_entry.attr,
 	NULL,
 };
 
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 8393f91..96a1d61 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -143,6 +143,7 @@ struct nvme_cmd_info {
 	nvme_completion_fn fn;
 	void *ctx;
 	int aborted;
+	dma_addr_t dma;
 	struct nvme_queue *nvmeq;
 };
 
@@ -1807,8 +1808,85 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	return 0;
 }
 
+static void pgrd_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req = ctx;
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct page *page = req->special;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_FROM_DEVICE);
+	page_endio(page, READ, status != NVME_SC_SUCCESS);
+	blk_put_request(req);
+}
+
+static void pgwr_completion(struct nvme_queue *nvmeq, void *ctx,
+						struct nvme_completion *cqe)
+{
+	struct request *req = ctx;
+	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+	struct page *page = req->special;
+	u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_TO_DEVICE);
+	page_endio(page, WRITE, status != NVME_SC_SUCCESS);
+	blk_put_request(req);
+}
+
+static const enum dma_data_direction nvme_to_direction[] = {
+	DMA_NONE, DMA_TO_DEVICE, DMA_FROM_DEVICE, DMA_BIDIRECTIONAL
+};
+
+static int nvme_rw_page(struct block_device *bdev, sector_t sector,
+					struct page *page, int rw)
+{
+	dma_addr_t dma;
+	struct request *req;
+	struct nvme_command *cmd;
+	struct nvme_queue *nvmeq;
+	struct nvme_cmd_info *cmd_rq;
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+	nvme_completion_fn fn = (rw & WRITE) ? pgwr_completion : pgrd_completion;
+	u8 op = (rw & WRITE) ? nvme_cmd_write : nvme_cmd_read;
+	enum dma_data_direction dma_dir = nvme_to_direction[op & 3];
+
+	req = blk_mq_alloc_request(ns->queue, rw, GFP_ATOMIC, false);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->special = page;
+	cmd_rq = blk_mq_rq_to_pdu(req);
+	nvmeq = cmd_rq->nvmeq;
+
+	nvme_set_info(cmd_rq, req, fn);
+
+	dma = dma_map_page(nvmeq->q_dmadev, page, 0, PAGE_CACHE_SIZE, dma_dir);
+	cmd_rq->dma = dma;
+
+	spin_lock_irq(&nvmeq->q_lock);
+	cmd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+	memset(cmd, 0, sizeof(*cmd));
+
+	cmd->rw.opcode = op;
+	cmd->rw.command_id = req->tag;
+	cmd->rw.nsid = cpu_to_le32(ns->ns_id);
+	cmd->rw.slba = cpu_to_le64(nvme_block_nr(ns, sector));
+	cmd->rw.length = cpu_to_le16((PAGE_CACHE_SIZE >> ns->lba_shift) - 1);
+	cmd->rw.prp1 = cpu_to_le64(dma);
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+
+	nvme_process_cq(nvmeq);
+	spin_unlock_irq(&nvmeq->q_lock);
+
+	return 0;
+}
+
 static const struct block_device_operations nvme_fops = {
 	.owner		= THIS_MODULE,
+	.rw_page	= nvme_rw_page,
 	.ioctl		= nvme_ioctl,
 	.compat_ioctl	= nvme_compat_ioctl,
 	.open		= nvme_open,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d411..f17f95d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -380,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 			struct page *page)
 {
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || !blk_queue_rw_page(bdev->bd_queue))
 		return -EOPNOTSUPP;
 	return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
 }
@@ -411,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 	int result;
 	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
-	if (!ops->rw_page)
+	if (!ops->rw_page || !blk_queue_rw_page(bdev->bd_queue))
 		return -EOPNOTSUPP;
 	set_page_writeback(page);
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 77db6dc..17a6058 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -509,6 +509,7 @@ struct request_queue {
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
 #define QUEUE_FLAG_SG_GAPS     22	/* queue doesn't support SG gaps */
+#define QUEUE_FLAG_RW_PG       23	/* use .rw_page if implemented */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -596,6 +597,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_rw_page(q)	test_bit(QUEUE_FLAG_RW_PG, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14  0:05 [PATCH] NVMe: Add rw_page support Keith Busch
@ 2014-11-14  1:29 ` Jens Axboe
  2014-11-14 14:58   ` Matthew Wilcox
  2014-11-14 14:55 ` Matthew Wilcox
       [not found] ` <CANvN+ekQTdNgPe33iaM_9=2Hjrfds2B2R3d3XK06K9n=SY+ZKA@mail.gmail.com>
  2 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-11-14  1:29 UTC (permalink / raw)


On 2014-11-13 17:05, Keith Busch wrote:
> This adds the rw_page entry point to the nvme driver so a page can be
> written/read without going through the block layer and not requiring
> additional allocations.
>
> Just because we implement this doesn't mean we want to use it. I only see
> a performance win on some types of work, like swap where I see about 15%
> reduction in system time (compared to 20% prior to blk-mq when we didn't
> allocate a request to get a command id). Even then, system time accounts
> for very little of the real time anyway, and it's only an over-all win
> if the device has very low-latency. But the driver doesn't know this
> nor if the expected workload will even benefit from using page io,
> so I added a queue flag that a user can toggle on/off.

Of the reduced system time, where was that spent?

> The other benefit besides reduced system time is that we can swap
> pages in/out without having to allocate anything since everything is
> preallocated in this path.

I have an iod prealloc patch that gets rid of the last kmalloc/kfree in 
the IO path for nvme, for smaller IO (defaults to 8KB, or 2 segments), 
making the generic IO path contain no allocations/frees for smaller IO. 
Probably that would get us pretty close to rw_page. For direct/sync IO, 
we'll go direct to ->queue_rq mostly, too.

The downside I see is that this is an OOB IO path. Once we start adding 
IO scheduling for those that need that, then this will completely bypass 
that.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14  0:05 [PATCH] NVMe: Add rw_page support Keith Busch
  2014-11-14  1:29 ` Jens Axboe
@ 2014-11-14 14:55 ` Matthew Wilcox
       [not found] ` <CANvN+ekQTdNgPe33iaM_9=2Hjrfds2B2R3d3XK06K9n=SY+ZKA@mail.gmail.com>
  2 siblings, 0 replies; 14+ messages in thread
From: Matthew Wilcox @ 2014-11-14 14:55 UTC (permalink / raw)


On Thu, Nov 13, 2014@05:05:38PM -0700, Keith Busch wrote:
> +static void pgrd_completion(struct nvme_queue *nvmeq, void *ctx,
> +						struct nvme_completion *cqe)
> +{
> +	struct request *req = ctx;
> +	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
> +	struct page *page = req->special;
> +	u16 status = le16_to_cpup(&cqe->status) >> 1;
> +
> +	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_FROM_DEVICE);
> +	page_endio(page, READ, status != NVME_SC_SUCCESS);

The third parameter to page_endio() is supposed to be an errno.  This will
happen to work fine since anything other than 0 and -ENOSPC gets translated
to -EIO, but it'd be better to make this line:

	page_endio(page, READ, nvme_error_status(status));

> +static void pgwr_completion(struct nvme_queue *nvmeq, void *ctx,
> +						struct nvme_completion *cqe)
> +{
> +	struct request *req = ctx;
> +	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
> +	struct page *page = req->special;
> +	u16 status = le16_to_cpup(&cqe->status) >> 1;
> +
> +	dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_TO_DEVICE);
> +	page_endio(page, WRITE, status != NVME_SC_SUCCESS);

Here too.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14  1:29 ` Jens Axboe
@ 2014-11-14 14:58   ` Matthew Wilcox
  2014-11-14 15:07     ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2014-11-14 14:58 UTC (permalink / raw)

On Thu, Nov 13, 2014@06:29:37PM -0700, Jens Axboe wrote:
> The downside I see is that this is an OOB IO path. Once we start adding IO
> scheduling for those that need that, then this will completely bypass that.

The idea is that you would only enable it for devices that are based on
NVM that is of "near-DRAM" speeds, and can complete small I/Os as fast
as they are issued.  For those kinds of devices, there is absolutely no
value to any kind of IO scheduling.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 14:58   ` Matthew Wilcox
@ 2014-11-14 15:07     ` Jens Axboe
  2014-11-14 15:52       ` Matthew Wilcox
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-11-14 15:07 UTC (permalink / raw)


On 11/14/2014 07:58 AM, Matthew Wilcox wrote:
> On Thu, Nov 13, 2014@06:29:37PM -0700, Jens Axboe wrote:
>> The downside I see is that this is an OOB IO path. Once we start adding IO
>> scheduling for those that need that, then this will completely bypass that.
> 
> The idea is that you would only enable it for devices that are based on
> NVM that is of "near-DRAM" speeds, and can complete small I/Os as fast
> as they are issued.  For those kinds of devices, there is absolutely no
> value to any kind of IO scheduling.

I agree, that's not the kind of device that people would generally do
scheduling on, and we can't at those rates. But if that's the case, why
isn't this a sync interface? "Near DRAM speeds" and interrupt driven
seems like a poor choice.


-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 15:07     ` Jens Axboe
@ 2014-11-14 15:52       ` Matthew Wilcox
  2014-11-14 16:32         ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Matthew Wilcox @ 2014-11-14 15:52 UTC (permalink / raw)

On Fri, Nov 14, 2014@08:07:49AM -0700, Jens Axboe wrote:
> On 11/14/2014 07:58 AM, Matthew Wilcox wrote:
> > On Thu, Nov 13, 2014@06:29:37PM -0700, Jens Axboe wrote:
> >> The downside I see is that this is an OOB IO path. Once we start adding IO
> >> scheduling for those that need that, then this will completely bypass that.
> > 
> > The idea is that you would only enable it for devices that are based on
> > NVM that is of "near-DRAM" speeds, and can complete small I/Os as fast
> > as they are issued.  For those kinds of devices, there is absolutely no
> > value to any kind of IO scheduling.
> 
> I agree, that's not the kind of device that people would generally do
> scheduling on, and we can't at those rates. But if that's the case, why
> isn't this a sync interface? "Near DRAM speeds" and interrupt driven
> seems like a poor choice.

It could be done as a sync interface; zram and brd do implement it
synchronously.  But if you look at the callers, mostly they try to send
several pages before waiting on each of them to complete, and so we can
overlap the work of sending each page with the drive handling the I/O
of the previous page.  You'll notice that we check the completion queue
before returning from nvme_rw_page(), so not waiting for an interrupt
to fire for anything that already completed.

The missing piece that I think we need is something like the
patch I sent last year to spin instead of sleeping in io_schedule()
(https://lwn.net/Articles/555886/).  That will ensure that we pick up
the last I/O or two without waiting for an interrupt.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 15:52       ` Matthew Wilcox
@ 2014-11-14 16:32         ` Jens Axboe
  2014-11-14 17:05           ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-11-14 16:32 UTC (permalink / raw)


On 11/14/2014 08:52 AM, Matthew Wilcox wrote:
> On Fri, Nov 14, 2014@08:07:49AM -0700, Jens Axboe wrote:
>> On 11/14/2014 07:58 AM, Matthew Wilcox wrote:
>>> On Thu, Nov 13, 2014@06:29:37PM -0700, Jens Axboe wrote:
>>>> The downside I see is that this is an OOB IO path. Once we start adding IO
>>>> scheduling for those that need that, then this will completely bypass that.
>>>
>>> The idea is that you would only enable it for devices that are based on
>>> NVM that is of "near-DRAM" speeds, and can complete small I/Os as fast
>>> as they are issued.  For those kinds of devices, there is absolutely no
>>> value to any kind of IO scheduling.
>>
>> I agree, that's not the kind of device that people would generally do
>> scheduling on, and we can't at those rates. But if that's the case, why
>> isn't this a sync interface? "Near DRAM speeds" and interrupt driven
>> seems like a poor choice.
> 
> It could be done as a sync interface; zram and brd do implement it
> synchronously.  But if you look at the callers, mostly they try to send
> several pages before waiting on each of them to complete, and so we can
> overlap the work of sending each page with the drive handling the I/O
> of the previous page.  You'll notice that we check the completion queue
> before returning from nvme_rw_page(), so not waiting for an interrupt
> to fire for anything that already completed.

For the cases where you do indeed end up submitting multiple, it's even
more of a shame to bypass the normal IO path. There are various tricks
we can do in there to speed things up, like batched doorbell rings. And
if we kill that last alloc/free per IO, then I'd really be curious to
know why rw_page is faster. Seems it should be possible to fix that up
instead.

> The missing piece that I think we need is something like the
> patch I sent last year to spin instead of sleeping in io_schedule()
> (https://lwn.net/Articles/555886/).  That will ensure that we pick up
> the last I/O or two without waiting for an interrupt.

Yes, we need to look more into that at some point. If we can eliminate a
sleep/wakeup cycle, we're way ahead in the game.


-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 16:32         ` Jens Axboe
@ 2014-11-14 17:05           ` Keith Busch
  2014-11-14 20:53             ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-11-14 17:05 UTC (permalink / raw)


On Fri, 14 Nov 2014, Jens Axboe wrote:
> For the cases where you do indeed end up submitting multiple, it's even
> more of a shame to bypass the normal IO path. There are various tricks
> we can do in there to speed things up, like batched doorbell rings. And
> if we kill that last alloc/free per IO, then I'd really be curious to
> know why rw_page is faster. Seems it should be possible to fix that up
> instead.

Here's some perf data of just the kernel from two runs with a simple
swap testing program. I'm a novice at interpreting this for comparison,
so I'm not sure if this shows what we're looking for. The test ran for
the same amount of time in both cases, but perf couted ~16% fewer events
when using rw_page.

With rw_page disabled:

      7.33%  swap     [kernel.kallsyms]  [k] page_fault
      5.13%  swap     [kernel.kallsyms]  [k] clear_page_c
      4.46%  swap     [kernel.kallsyms]  [k] __radix_tree_lookup
      4.36%  swap     [kernel.kallsyms]  [k] do_raw_spin_lock
      2.63%  swap     [kernel.kallsyms]  [k] handle_mm_fault
      2.17%  swap     [kernel.kallsyms]  [k] get_page_from_freelist
      1.77%  swap     [kernel.kallsyms]  [k] __swap_duplicate
      1.53%  swap     [nvme]             [k] nvme_queue_rq
      1.38%  swap     [kernel.kallsyms]  [k] intel_pmu_disable_all
      1.37%  swap     [kernel.kallsyms]  [k] put_page_testzero
      1.19%  swap     [kernel.kallsyms]  [k] __do_page_fault
      1.05%  swap     [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
      0.99%  swap     [kernel.kallsyms]  [k] __free_one_page
      0.97%  swap     [kernel.kallsyms]  [k] swap_info_get
      0.90%  swap     [kernel.kallsyms]  [k] __alloc_pages_nodemask
      0.80%  swap     [kernel.kallsyms]  [k] radix_tree_insert
      0.78%  swap     [kernel.kallsyms]  [k] test_and_set_bit.constprop.90
      0.74%  swap     [kernel.kallsyms]  [k] __bt_get
      0.71%  swap     [kernel.kallsyms]  [k] sg_init_table
      0.71%  swap     [kernel.kallsyms]  [k] list_del
      0.70%  swap     [kernel.kallsyms]  [k] ____cache_alloc
      0.67%  swap     [kernel.kallsyms]  [k] __schedule
      0.66%  swap     [kernel.kallsyms]  [k] round_jiffies_common
      0.63%  swap     [kernel.kallsyms]  [k] __wait_on_bit
      0.61%  swap     [kernel.kallsyms]  [k] __rmqueue
      0.60%  swap     [kernel.kallsyms]  [k] vmacache_find
      0.54%  swap     [kernel.kallsyms]  [k] __blk_bios_map_sg
      0.54%  swap     [kernel.kallsyms]  [k] blk_mq_start_request
      0.53%  swap     [kernel.kallsyms]  [k] unmap_single_vma
      0.52%  swap     [kernel.kallsyms]  [k] __update_tg_runnable_avg.isra.23
      0.52%  swap     [kernel.kallsyms]  [k] __blk_mq_alloc_request
      0.51%  swap     [kernel.kallsyms]  [k] swiotlb_map_sg_attrs
      0.49%  swap     [nvme]             [k] nvme_alloc_iod
      0.49%  swap     [kernel.kallsyms]  [k] update_cfs_shares
      0.47%  swap     [kernel.kallsyms]  [k] __add_to_swap_cache
      0.46%  swap     [kernel.kallsyms]  [k] update_curr
      0.46%  swap     [kernel.kallsyms]  [k] swap_entry_free
      0.45%  swap     [kernel.kallsyms]  [k] swapin_readahead
      0.45%  swap     [kernel.kallsyms]  [k] __call_rcu.constprop.62
      0.44%  swap     [kernel.kallsyms]  [k] page_waitqueue
      0.44%  swap     [kernel.kallsyms]  [k] tag_get
      0.43%  swap     [kernel.kallsyms]  [k] next_zones_zonelist
      0.43%  swap     [kernel.kallsyms]  [k] kmem_cache_alloc
      0.42%  swap     [nvme]             [k] nvme_process_cq

With rw_page enabled:

      8.33%  swap     [kernel.kallsyms]  [k] page_fault
      6.36%  swap     [kernel.kallsyms]  [k] clear_page_c
      5.15%  swap     [kernel.kallsyms]  [k] do_raw_spin_lock
      5.10%  swap     [kernel.kallsyms]  [k] __radix_tree_lookup
      3.01%  swap     [kernel.kallsyms]  [k] handle_mm_fault
      2.57%  swap     [kernel.kallsyms]  [k] get_page_from_freelist
      2.06%  swap     [kernel.kallsyms]  [k] __swap_duplicate
      1.57%  swap     [kernel.kallsyms]  [k] put_page_testzero
      1.44%  swap     [kernel.kallsyms]  [k] intel_pmu_disable_all
      1.37%  swap     [kernel.kallsyms]  [k] test_and_set_bit.constprop.90
      1.20%  swap     [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
      1.19%  swap     [kernel.kallsyms]  [k] __free_one_page
      1.15%  swap     [kernel.kallsyms]  [k] radix_tree_insert
      1.15%  swap     [kernel.kallsyms]  [k] __do_page_fault
      1.07%  swap     [kernel.kallsyms]  [k] swap_info_get
      0.89%  swap     [kernel.kallsyms]  [k] __alloc_pages_nodemask
      0.85%  swap     [kernel.kallsyms]  [k] list_del
      0.81%  swap     [kernel.kallsyms]  [k] __bt_get
      0.78%  swap     [nvme]             [k] nvme_rw_page
      0.74%  swap     [kernel.kallsyms]  [k] __rmqueue
      0.74%  swap     [kernel.kallsyms]  [k] __wait_on_bit
      0.69%  swap     [kernel.kallsyms]  [k] __schedule
      0.63%  swap     [kernel.kallsyms]  [k] unmap_single_vma
      0.62%  swap     [kernel.kallsyms]  [k] vmacache_find
      0.60%  swap     [kernel.kallsyms]  [k] update_cfs_shares
      0.59%  swap     [kernel.kallsyms]  [k] tag_get
      0.55%  swap     [kernel.kallsyms]  [k] update_curr
      0.53%  swap     [kernel.kallsyms]  [k] __update_tg_runnable_avg.isra.23
      0.51%  swap     [kernel.kallsyms]  [k] next_zones_zonelist
      0.51%  swap     [kernel.kallsyms]  [k] __radix_tree_create
      0.50%  swap     [kernel.kallsyms]  [k] __blk_mq_alloc_request
      0.50%  swap     [kernel.kallsyms]  [k] __call_rcu.constprop.62
      0.49%  swap     [kernel.kallsyms]  [k] page_waitqueue
      0.48%  swap     [kernel.kallsyms]  [k] swap_entry_free
      0.47%  swap     [kernel.kallsyms]  [k] __add_to_swap_cache
      0.46%  swap     [kernel.kallsyms]  [k] down_read_trylock
      0.44%  swap     [kernel.kallsyms]  [k] up_read
      0.43%  swap     [kernel.kallsyms]  [k] __wake_up_bit
      0.43%  swap     [kernel.kallsyms]  [k] io_schedule
      0.42%  swap     [kernel.kallsyms]  [k] __mod_zone_page_state
      0.42%  swap     [kernel.kallsyms]  [k] do_wp_page
      0.39%  swap     [kernel.kallsyms]  [k] __inc_zone_state
      0.39%  swap     [kernel.kallsyms]  [k] dequeue_task_fair
      0.39%  swap     [kernel.kallsyms]  [k] prepare_to_wait

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 17:05           ` Keith Busch
@ 2014-11-14 20:53             ` Jens Axboe
  2014-11-14 22:59               ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-11-14 20:53 UTC (permalink / raw)


On 11/14/2014 10:05 AM, Keith Busch wrote:
> On Fri, 14 Nov 2014, Jens Axboe wrote:
>> For the cases where you do indeed end up submitting multiple, it's even
>> more of a shame to bypass the normal IO path. There are various tricks
>> we can do in there to speed things up, like batched doorbell rings. And
>> if we kill that last alloc/free per IO, then I'd really be curious to
>> know why rw_page is faster. Seems it should be possible to fix that up
>> instead.
> 
> Here's some perf data of just the kernel from two runs with a simple
> swap testing program. I'm a novice at interpreting this for comparison,
> so I'm not sure if this shows what we're looking for. The test ran for
> the same amount of time in both cases, but perf couted ~16% fewer events
> when using rw_page.
> 
> With rw_page disabled:
> 
>      7.33%  swap     [kernel.kallsyms]  [k] page_fault
>      5.13%  swap     [kernel.kallsyms]  [k] clear_page_c
>      4.46%  swap     [kernel.kallsyms]  [k] __radix_tree_lookup
>      4.36%  swap     [kernel.kallsyms]  [k] do_raw_spin_lock
>      2.63%  swap     [kernel.kallsyms]  [k] handle_mm_fault
>      2.17%  swap     [kernel.kallsyms]  [k] get_page_from_freelist
>      1.77%  swap     [kernel.kallsyms]  [k] __swap_duplicate
>      1.53%  swap     [nvme]             [k] nvme_queue_rq
>      1.38%  swap     [kernel.kallsyms]  [k] intel_pmu_disable_all
>      1.37%  swap     [kernel.kallsyms]  [k] put_page_testzero
>      1.19%  swap     [kernel.kallsyms]  [k] __do_page_fault
>      1.05%  swap     [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
>      0.99%  swap     [kernel.kallsyms]  [k] __free_one_page
>      0.97%  swap     [kernel.kallsyms]  [k] swap_info_get
>      0.90%  swap     [kernel.kallsyms]  [k] __alloc_pages_nodemask
>      0.80%  swap     [kernel.kallsyms]  [k] radix_tree_insert
>      0.78%  swap     [kernel.kallsyms]  [k] test_and_set_bit.constprop.90
>      0.74%  swap     [kernel.kallsyms]  [k] __bt_get
>      0.71%  swap     [kernel.kallsyms]  [k] sg_init_table
>      0.71%  swap     [kernel.kallsyms]  [k] list_del
>      0.70%  swap     [kernel.kallsyms]  [k] ____cache_alloc
>      0.67%  swap     [kernel.kallsyms]  [k] __schedule
>      0.66%  swap     [kernel.kallsyms]  [k] round_jiffies_common
>      0.63%  swap     [kernel.kallsyms]  [k] __wait_on_bit
>      0.61%  swap     [kernel.kallsyms]  [k] __rmqueue
>      0.60%  swap     [kernel.kallsyms]  [k] vmacache_find
>      0.54%  swap     [kernel.kallsyms]  [k] __blk_bios_map_sg
>      0.54%  swap     [kernel.kallsyms]  [k] blk_mq_start_request
>      0.53%  swap     [kernel.kallsyms]  [k] unmap_single_vma
>      0.52%  swap     [kernel.kallsyms]  [k]
> __update_tg_runnable_avg.isra.23
>      0.52%  swap     [kernel.kallsyms]  [k] __blk_mq_alloc_request
>      0.51%  swap     [kernel.kallsyms]  [k] swiotlb_map_sg_attrs
>      0.49%  swap     [nvme]             [k] nvme_alloc_iod
>      0.49%  swap     [kernel.kallsyms]  [k] update_cfs_shares
>      0.47%  swap     [kernel.kallsyms]  [k] __add_to_swap_cache
>      0.46%  swap     [kernel.kallsyms]  [k] update_curr
>      0.46%  swap     [kernel.kallsyms]  [k] swap_entry_free
>      0.45%  swap     [kernel.kallsyms]  [k] swapin_readahead
>      0.45%  swap     [kernel.kallsyms]  [k] __call_rcu.constprop.62
>      0.44%  swap     [kernel.kallsyms]  [k] page_waitqueue
>      0.44%  swap     [kernel.kallsyms]  [k] tag_get
>      0.43%  swap     [kernel.kallsyms]  [k] next_zones_zonelist
>      0.43%  swap     [kernel.kallsyms]  [k] kmem_cache_alloc
>      0.42%  swap     [nvme]             [k] nvme_process_cq
> 
> With rw_page enabled:
> 
>      8.33%  swap     [kernel.kallsyms]  [k] page_fault
>      6.36%  swap     [kernel.kallsyms]  [k] clear_page_c
>      5.15%  swap     [kernel.kallsyms]  [k] do_raw_spin_lock
>      5.10%  swap     [kernel.kallsyms]  [k] __radix_tree_lookup
>      3.01%  swap     [kernel.kallsyms]  [k] handle_mm_fault
>      2.57%  swap     [kernel.kallsyms]  [k] get_page_from_freelist
>      2.06%  swap     [kernel.kallsyms]  [k] __swap_duplicate
>      1.57%  swap     [kernel.kallsyms]  [k] put_page_testzero
>      1.44%  swap     [kernel.kallsyms]  [k] intel_pmu_disable_all
>      1.37%  swap     [kernel.kallsyms]  [k] test_and_set_bit.constprop.90
>      1.20%  swap     [kernel.kallsyms]  [k] _raw_spin_lock_irqsave
>      1.19%  swap     [kernel.kallsyms]  [k] __free_one_page
>      1.15%  swap     [kernel.kallsyms]  [k] radix_tree_insert
>      1.15%  swap     [kernel.kallsyms]  [k] __do_page_fault
>      1.07%  swap     [kernel.kallsyms]  [k] swap_info_get
>      0.89%  swap     [kernel.kallsyms]  [k] __alloc_pages_nodemask
>      0.85%  swap     [kernel.kallsyms]  [k] list_del
>      0.81%  swap     [kernel.kallsyms]  [k] __bt_get
>      0.78%  swap     [nvme]             [k] nvme_rw_page
>      0.74%  swap     [kernel.kallsyms]  [k] __rmqueue
>      0.74%  swap     [kernel.kallsyms]  [k] __wait_on_bit
>      0.69%  swap     [kernel.kallsyms]  [k] __schedule
>      0.63%  swap     [kernel.kallsyms]  [k] unmap_single_vma
>      0.62%  swap     [kernel.kallsyms]  [k] vmacache_find
>      0.60%  swap     [kernel.kallsyms]  [k] update_cfs_shares
>      0.59%  swap     [kernel.kallsyms]  [k] tag_get
>      0.55%  swap     [kernel.kallsyms]  [k] update_curr
>      0.53%  swap     [kernel.kallsyms]  [k]
> __update_tg_runnable_avg.isra.23
>      0.51%  swap     [kernel.kallsyms]  [k] next_zones_zonelist
>      0.51%  swap     [kernel.kallsyms]  [k] __radix_tree_create
>      0.50%  swap     [kernel.kallsyms]  [k] __blk_mq_alloc_request
>      0.50%  swap     [kernel.kallsyms]  [k] __call_rcu.constprop.62
>      0.49%  swap     [kernel.kallsyms]  [k] page_waitqueue
>      0.48%  swap     [kernel.kallsyms]  [k] swap_entry_free
>      0.47%  swap     [kernel.kallsyms]  [k] __add_to_swap_cache
>      0.46%  swap     [kernel.kallsyms]  [k] down_read_trylock
>      0.44%  swap     [kernel.kallsyms]  [k] up_read
>      0.43%  swap     [kernel.kallsyms]  [k] __wake_up_bit
>      0.43%  swap     [kernel.kallsyms]  [k] io_schedule
>      0.42%  swap     [kernel.kallsyms]  [k] __mod_zone_page_state
>      0.42%  swap     [kernel.kallsyms]  [k] do_wp_page
>      0.39%  swap     [kernel.kallsyms]  [k] __inc_zone_state
>      0.39%  swap     [kernel.kallsyms]  [k] dequeue_task_fair
>      0.39%  swap     [kernel.kallsyms]  [k] prepare_to_wait

It's hard (impossible) to tell from just this, we'd need performance
data to go with it, too. The number of events is a very vague hint, I
would not put any value into that.

If you can describe your workload, I'd love to just run it and see what
happens here!

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
       [not found] ` <CANvN+ekQTdNgPe33iaM_9=2Hjrfds2B2R3d3XK06K9n=SY+ZKA@mail.gmail.com>
@ 2014-11-14 22:50   ` Keith Busch
  2014-11-14 22:56     ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-11-14 22:50 UTC (permalink / raw)


On Fri, 14 Nov 2014, Andrey Kuzmin wrote:
> On Nov 14, 2014 3:13 AM, "Keith Busch" <keith.busch@intel.com> wrote:
> > +static void pgwr_completion(struct nvme_queue *nvmeq, void *ctx,
> > +? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct nvme_completion *cqe)
> > +{
> > +? ? ? ?struct request *req = ctx;
> > +? ? ? ?struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
> > +? ? ? ?struct page *page = req->special;
> > +? ? ? ?u16 status = le16_to_cpup(&cqe->status) >> 1;
> > +
> > +? ? ? ?dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma, PAGE_CACHE_SIZE, DMA_TO_DEVICE);
> > +? ? ? ?page_endio(page, WRITE, status != NVME_SC_SUCCESS);
> > +? ? ? ?blk_put_request(req);
> > +}
> > +
> 
> As you have access to the nvme_cmd_info - and thus command opcode - in the
> completion handler, having separate completion handlers for read and write
> operations looks like unnecessary code duplication. Just my .02 :).

But nvme_cmd_info does not contain the opcode. I do have the struct
request here, and I can certainly pull the data direction from its
cmd_flags, so thanks for the suggestion!

Now I wonder if there's something else unused in a request that I
can repurpose to save the dma_addr_t in so I don't add it in the
nvme_cmd_info...

> Best regards,
> Andrey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 22:50   ` Keith Busch
@ 2014-11-14 22:56     ` Jens Axboe
  2014-11-14 23:04       ` Keith Busch
  0 siblings, 1 reply; 14+ messages in thread
From: Jens Axboe @ 2014-11-14 22:56 UTC (permalink / raw)


On 11/14/2014 03:50 PM, Keith Busch wrote:
> On Fri, 14 Nov 2014, Andrey Kuzmin wrote:
>> On Nov 14, 2014 3:13 AM, "Keith Busch" <keith.busch@intel.com> wrote:
>> > +static void pgwr_completion(struct nvme_queue *nvmeq, void *ctx,
>> > +                                               struct
>> nvme_completion *cqe)
>> > +{
>> > +       struct request *req = ctx;
>> > +       struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
>> > +       struct page *page = req->special;
>> > +       u16 status = le16_to_cpup(&cqe->status) >> 1;
>> > +
>> > +       dma_unmap_page(nvmeq->q_dmadev, cmd_rq->dma,
>> PAGE_CACHE_SIZE, DMA_TO_DEVICE);
>> > +       page_endio(page, WRITE, status != NVME_SC_SUCCESS);
>> > +       blk_put_request(req);
>> > +}
>> > +
>>
>> As you have access to the nvme_cmd_info - and thus command opcode - in
>> the
>> completion handler, having separate completion handlers for read and
>> write
>> operations looks like unnecessary code duplication. Just my .02 :).
> 
> But nvme_cmd_info does not contain the opcode. I do have the struct
> request here, and I can certainly pull the data direction from its
> cmd_flags, so thanks for the suggestion!
> 
> Now I wonder if there's something else unused in a request that I
> can repurpose to save the dma_addr_t in so I don't add it in the
> nvme_cmd_info...

For the rw_page path, you don't use ->special to store the iod. So you
could use that... That might break for 32-bit platforms and 64-bit DMA,
though. If you really want to get nasty, ->__cmd is 16 bytes of unused
goodness as well, though I do want to make that one dynamically
allocated for the queues that need it.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 20:53             ` Jens Axboe
@ 2014-11-14 22:59               ` Keith Busch
  0 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2014-11-14 22:59 UTC (permalink / raw)

On Fri, 14 Nov 2014, Jens Axboe wrote:
> It's hard (impossible) to tell from just this, we'd need performance
> data to go with it, too. The number of events is a very vague hint, I
> would not put any value into that.

Oh good. After generating the report I couldn't figure out why I thought
it would have been useful in the first place!

> If you can describe your workload, I'd love to just run it and see what
> happens here!

Sure, I made this program up real quick, so there's probably something
better out there. It just allocates millions of pages in an array then
writes and reads a byte from each page to force page faults and swapping,
then does it all again a few more times.

This is the basic verion. I've another one that does this with lots of
threads accessing pages in random order.

It allocates 40GB worth of pages, so that might not be appropriate for
all machines.

swap.c:
---
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char ** argv)
{
 	unsigned long i, j, num_pages;
 	unsigned char **mem;
 	volatile unsigned char k;

 	num_pages = (1024 * 1024 * 1024 * 40ULL) / getpagesize();
 	mem = (unsigned char **)calloc(num_pages, sizeof(*mem));
 	if (!mem)
 		return 1;

 	for (i = 0; i < num_pages; i++) {
 		mem[i] = malloc(getpagesize());
 		if (!mem[i])
 			return 1;
 	}

 	for (j = 0; j < 8; j++) {
 		for (i = 0; i < num_pages; i++)
 			mem[i][0] = i;
 		for (i = 0; i < num_pages; i++)
 			k = mem[i][0];
 	}
 	return 0;
}
--

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 22:56     ` Jens Axboe
@ 2014-11-14 23:04       ` Keith Busch
  2014-11-14 23:30         ` Jens Axboe
  0 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2014-11-14 23:04 UTC (permalink / raw)


On Fri, 14 Nov 2014, Jens Axboe wrote:
> On 11/14/2014 03:50 PM, Keith Busch wrote:
>> But nvme_cmd_info does not contain the opcode. I do have the struct
>> request here, and I can certainly pull the data direction from its
>> cmd_flags, so thanks for the suggestion!
>>
>> Now I wonder if there's something else unused in a request that I
>> can repurpose to save the dma_addr_t in so I don't add it in the
>> nvme_cmd_info...
>
> For the rw_page path, you don't use ->special to store the iod. So you
> could use that... That might break for 32-bit platforms and 64-bit DMA,
> though. If you really want to get nasty, ->__cmd is 16 bytes of unused
> goodness as well, though I do want to make that one dynamically
> allocated for the queues that need it.

Ah, but I already got req->special pointing to the page. Maybe we can
union another field with a new "special_2" field.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] NVMe: Add rw_page support
  2014-11-14 23:04       ` Keith Busch
@ 2014-11-14 23:30         ` Jens Axboe
  0 siblings, 0 replies; 14+ messages in thread
From: Jens Axboe @ 2014-11-14 23:30 UTC (permalink / raw)


On 2014-11-14 16:04, Keith Busch wrote:
> On Fri, 14 Nov 2014, Jens Axboe wrote:
>> On 11/14/2014 03:50 PM, Keith Busch wrote:
>>> But nvme_cmd_info does not contain the opcode. I do have the struct
>>> request here, and I can certainly pull the data direction from its
>>> cmd_flags, so thanks for the suggestion!
>>>
>>> Now I wonder if there's something else unused in a request that I
>>> can repurpose to save the dma_addr_t in so I don't add it in the
>>> nvme_cmd_info...
>>
>> For the rw_page path, you don't use ->special to store the iod. So you
>> could use that... That might break for 32-bit platforms and 64-bit DMA,
>> though. If you really want to get nasty, ->__cmd is 16 bytes of unused
>> goodness as well, though I do want to make that one dynamically
>> allocated for the queues that need it.
>
> Ah, but I already got req->special pointing to the page. Maybe we can
> union another field with a new "special_2" field.

Since you are the request allocator, you can use ->end_io_data as well.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-11-14 23:30 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-11-14  0:05 [PATCH] NVMe: Add rw_page support Keith Busch
2014-11-14  1:29 ` Jens Axboe
2014-11-14 14:58   ` Matthew Wilcox
2014-11-14 15:07     ` Jens Axboe
2014-11-14 15:52       ` Matthew Wilcox
2014-11-14 16:32         ` Jens Axboe
2014-11-14 17:05           ` Keith Busch
2014-11-14 20:53             ` Jens Axboe
2014-11-14 22:59               ` Keith Busch
2014-11-14 14:55 ` Matthew Wilcox
     [not found] ` <CANvN+ekQTdNgPe33iaM_9=2Hjrfds2B2R3d3XK06K9n=SY+ZKA@mail.gmail.com>
2014-11-14 22:50   ` Keith Busch
2014-11-14 22:56     ` Jens Axboe
2014-11-14 23:04       ` Keith Busch
2014-11-14 23:30         ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.