* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
[not found] <mailman.3257.1377730793.1059.linux-nvme@lists.infradead.org>
@ 2013-09-02 6:49 ` Huhaiyan (haiyan)
2013-09-03 15:53 ` Matthew Wilcox
0 siblings, 1 reply; 7+ messages in thread
From: Huhaiyan (haiyan) @ 2013-09-02 6:49 UTC
We use two pointers in struct nvme_queue, one for the sq tail doorbell and one for the cq head doorbell, to avoid calculating the cq head doorbell address in nvme_process_cq().
This change can reduce latency for admin/io commands.
Signed-off-by: Haiyan Hu <huhaiyan@huawei.com>
---
drivers/block/nvme-core.c | 19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7de80bb..80cf07c 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,7 +74,8 @@ struct nvme_queue {
wait_queue_head_t sq_full;
wait_queue_t sq_cong_wait;
struct bio_list sq_cong;
- u32 __iomem *q_db;
+ u32 __iomem *sq_tail_db;
+ u32 __iomem *cq_head_db;
u16 q_depth;
u16 cq_vector;
u16 sq_head;
@@ -252,7 +253,7 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
if (++tail == nvmeq->q_depth)
tail = 0;
- writel(tail, nvmeq->q_db);
+ writel(tail, nvmeq->sq_tail_db);
nvmeq->sq_tail = tail;
spin_unlock_irqrestore(&nvmeq->q_lock, flags);
@@ -618,7 +619,7 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ writel(nvmeq->sq_tail, nvmeq->sq_tail_db);
return 0;
}
@@ -635,7 +636,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ writel(nvmeq->sq_tail, nvmeq->sq_tail_db);
return 0;
}
@@ -728,7 +729,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
nvme_start_io_acct(bio);
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ writel(nvmeq->sq_tail, nvmeq->sq_tail_db);
return 0;
@@ -772,7 +773,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
return 0;
- writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+ writel(head, nvmeq->cq_head_db);
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
@@ -1084,7 +1085,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
init_waitqueue_head(&nvmeq->sq_full);
init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
bio_list_init(&nvmeq->sq_cong);
- nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+ nvmeq->sq_tail_db = &dev->dbs[qid << (dev->db_stride + 1)];
+ nvmeq->cq_head_db = nvmeq->sq_tail_db + (1 << dev->db_stride);
nvmeq->q_depth = depth;
nvmeq->cq_vector = vector;
@@ -1690,7 +1692,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
iounmap(dev->bar);
dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size);
dev->dbs = ((void __iomem *)dev->bar) + 4096;
- dev->queues[0]->q_db = dev->dbs;
+ dev->queues[0]->sq_tail_db = dev->dbs;
+ dev->queues[0]->cq_head_db = dev->dbs + (1 << dev->db_stride);
}
vecs = nr_io_queues;
--
1.7.6
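For context, here is the doorbell layout behind this patch, as a minimal sketch (per the NVMe spec; the helper names are illustrative, not part of the driver). BAR0 + 4096 holds an array of 32-bit doorbell registers, and each queue pair owns two of them, one stride apart, which is why cq_head_db is simply sq_tail_db + (1 << db_stride):

/* db_stride is the controller's CAP.DSTRD field; dbs points at BAR0 + 4096 */
static u32 __iomem *sq_tail_db(u32 __iomem *dbs, int qid, int db_stride)
{
	/* SQ tail doorbell of queue qid lives at index (2 * qid) << DSTRD */
	return dbs + ((2 * qid) << db_stride);
}

static u32 __iomem *cq_head_db(u32 __iomem *dbs, int qid, int db_stride)
{
	/* CQ head doorbell sits one stride past the SQ tail doorbell */
	return sq_tail_db(dbs, qid, db_stride) + (1 << db_stride);
}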
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
[not found] <A103C806EC8D6E46A0D950DAA4D118D1400E0067@szxeml558-mbx.china.huawei.com>
@ 2013-09-02 7:00 ` Qiang Huang
0 siblings, 0 replies; 7+ messages in thread
From: Qiang Huang @ 2013-09-02 7:00 UTC
Hi Haiyan,
You overlooked Zefan's inline comments.
Also, the format of the patch looks odd; you'd better use
# git format-patch -1 -s
to create patches, and use an email client, such as Thunderbird, that does not mangle them.
You can find more information in Documentation/SubmittingPatches.
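For example, a typical flow looks like this (a sketch; the checkpatch step assumes you run it from the top of a kernel tree):

# git format-patch -1 -s                 # emit 0001-*.patch with your Signed-off-by
# ./scripts/checkpatch.pl 0001-*.patch   # catch style problems before posting
# git send-email 0001-*.patch            # send it without whitespace mangling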
On 2013/9/2 14:36, Huhaiyan (haiyan) wrote:
> We use two pointers in struct nvme_queue, one for the sq tail doorbell and
> one for the cq head doorbell, to avoid calculating the cq head doorbell
> address in nvme_process_cq().
>
> This change can reduce latency for admin/io commands.
>
> [...]
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
2013-09-02 6:49 ` [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq() Huhaiyan (haiyan)
@ 2013-09-03 15:53 ` Matthew Wilcox
2013-09-04 9:15 ` Haiyan Hu
0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-03 15:53 UTC
On Mon, Sep 02, 2013 at 06:49:34AM +0000, Huhaiyan (haiyan) wrote:
> We use two pointers in struct nvme_queue, one for the sq tail doorbell and one for the cq head doorbell, to avoid calculating the cq head doorbell address in nvme_process_cq().
> This change can reduce latency for admin/io commands.
Have you been able to measure a difference? If so, can you share
relative numbers?
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
2013-09-03 15:53 ` Matthew Wilcox
@ 2013-09-04 9:15 ` Haiyan Hu
2013-09-04 16:51 ` Matthew Wilcox
0 siblings, 1 reply; 7+ messages in thread
From: Haiyan Hu @ 2013-09-04 9:15 UTC
On 2013/9/3 23:53, Matthew Wilcox wrote:
> On Mon, Sep 02, 2013 at 06:49:34AM +0000, Huhaiyan (haiyan) wrote:
>> We use two pointers in struct nvme_queue, one for the sq tail doorbell and one for the cq head doorbell, to avoid calculating the cq head doorbell address in nvme_process_cq().
>> This change can reduce latency for admin/io commands.
>
> Have you been able to measure a difference? If so, can you share
> relative numbers?
>
Sorry for my inaccurate description.
Our test data shows that this patch does not change command processing latency.
But I think it can improve code efficiency, if only a little.
Also, the format of the patch I sent was corrupted;
I will resend it later.
Thank you.
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
2013-09-04 9:15 ` Haiyan Hu
@ 2013-09-04 16:51 ` Matthew Wilcox
2013-09-05 8:55 ` Haiyan Hu
0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-04 16:51 UTC
On Wed, Sep 04, 2013 at 05:15:51PM +0800, Haiyan Hu wrote:
> On 2013/9/3 23:53, Matthew Wilcox wrote:
> > Have you been able to measure a difference? If so, can you share
> > relative numbers?
>
> Sorry for my inaccurate description.
> Our test data shows that this patch does not change command processing latency.
> But I think it can improve code efficiency, if only a little.
Right, so it's a trade-off. You want to add 8 bytes to the queue data
structure in order to save a few instructions from being executed at
queue processing time. We need to quantify the savings (since we know
the costs). I approximated the code by replacing:
- writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+ writel(head, nvmeq->q_db);
Here's the code before:
2ca: 48 8b 43 08 mov 0x8(%rbx),%rax
2ce: 48 8b 93 88 00 00 00 mov 0x88(%rbx),%rdx
2d5: 8b 48 40 mov 0x40(%rax),%ecx
2d8: b8 01 00 00 00 mov $0x1,%eax
2dd: d3 e0 shl %cl,%eax
2df: 48 98 cltq
2e1: 48 8d 14 82 lea (%rdx,%rax,4),%rdx
2e5: 41 0f b7 c4 movzwl %r12w,%eax
2e9: 89 02 mov %eax,(%rdx)
Here's the code after:
2ca: 48 8b 93 88 00 00 00 mov 0x88(%rbx),%rdx
2d1: 41 0f b7 c4 movzwl %r12w,%eax
2d5: 89 02 mov %eax,(%rdx)
That's 6 instructions saved, and to be fair they have some pretty tight
dependencies. Still, on a 3GHz processor, that's maybe 2 nanoseconds.
If we're doing a million IOPS, we could save 2ms/s, or 0.2%. Pretty tough
to measure, let alone justify.
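Spelling that estimate out: 6 dependent instructions is roughly 6 cycles, and 6 cycles / 3 GHz = 2 ns per completion; 2 ns * 1,000,000 IOPS = 2 ms of CPU time per second of I/O, i.e. 0.2%.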
Here's an alternative that doesn't require adding 8 bytes to the
nvme_queue. Instead of storing the shift in db_stride, we can store the
actual stride. Then there is less calculation to be done at
completion time, and the code looks like this:
- writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
2ca: 48 8b 43 08 mov 0x8(%rbx),%rax
2ce: 48 63 50 40 movslq 0x40(%rax),%rdx
2d2: 48 8b 83 88 00 00 00 mov 0x88(%rbx),%rax
2d9: 48 8d 14 90 lea (%rax,%rdx,4),%rdx
2dd: 41 0f b7 c4 movzwl %r12w,%eax
2e1: 89 02 mov %eax,(%rdx)
That saves half the instructions, for no increased data structure usage.
One other microoptimisation we can do is change the type of db_stride from
int to unsigned. Then the compiler produces:
2ca: 48 8b 43 08 mov 0x8(%rbx),%rax
2ce: 8b 50 40 mov 0x40(%rax),%edx
2d1: 48 8b 83 88 00 00 00 mov 0x88(%rbx),%rax
2d8: 48 8d 14 90 lea (%rax,%rdx,4),%rdx
2dc: 41 0f b7 c4 movzwl %r12w,%eax
2e0: 89 02 mov %eax,(%rdx)
which uses one fewer byte (at address 2ce, it uses mov instead of movslq).
Would you like to send a patch which changes the type of db_stride to
unsigned and changes the value stored there to be 1 << the current value?
Or you can try to persuade me again that the tradeoff is worth adding
the extra 8 bytes to the data structure :-)
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
2013-09-04 16:51 ` Matthew Wilcox
@ 2013-09-05 8:55 ` Haiyan Hu
2013-09-05 19:28 ` Matthew Wilcox
0 siblings, 1 reply; 7+ messages in thread
From: Haiyan Hu @ 2013-09-05 8:55 UTC
On 2013/9/5 0:51, Matthew Wilcox wrote:
> [...]
> Would you like to send a patch which changes the type of db_stride to
> unsigned and changes the value stored there to be 1 << the current value?
>
> Or you can try to persuade me again that the tradeoff is worth adding
> the extra 8 bytes to the data structure :-)
Your analysis is more comprehensive. Of course, I'd like to send the new patch
you advised. Do you think the updated variable should be named "db_stride_shift"?
Thank you.
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
2013-09-05 8:55 ` Haiyan Hu
@ 2013-09-05 19:28 ` Matthew Wilcox
0 siblings, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-05 19:28 UTC
On Thu, Sep 05, 2013 at 04:55:43PM +0800, Haiyan Hu wrote:
> Your analysis is more comprehensive. Of course, I'd like to send the new patch
> you advised. Do you think the updated variable should be named "db_stride_shift"?
I think that might have been a better name for the old meaning :-) I think
we should probably keep the name as-is, but change the usage. Unless
someone can come up with a better name ... ?
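For reference, a minimal sketch of the change agreed here (keep the name db_stride, make it unsigned, and store 1 << CAP.DSTRD at init time); the context lines are assumed from the era's nvme-core.c and may differ slightly:

-	int db_stride;
+	unsigned db_stride;

-	dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap));
+	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));

-	nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];

-	writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);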