linux-nvme.lists.infradead.org archive mirror
* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
       [not found] <mailman.3257.1377730793.1059.linux-nvme@lists.infradead.org>
@ 2013-09-02  6:49 ` Huhaiyan (haiyan)
  2013-09-03 15:53   ` Matthew Wilcox
  0 siblings, 1 reply; 7+ messages in thread
From: Huhaiyan (haiyan) @ 2013-09-02  6:49 UTC (permalink / raw)


We use two pointers in struct nvme_queue, one for the sq tail doorbell and the other for the cq head doorbell, so that nvme_process_cq() does not have to calculate the cq head doorbell address.
This change can reduce latency for admin/io commands.

Signed-off-by: Haiyan Hu <huhaiyan at huawei.com>

---
drivers/block/nvme-core.c |   19 +++++++++++--------
1 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 7de80bb..80cf07c 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -74,7 +74,8 @@ struct nvme_queue {
       wait_queue_head_t sq_full;
       wait_queue_t sq_cong_wait;
       struct bio_list sq_cong;
-        u32 __iomem *q_db;
+       u32 __iomem *sq_tail_db;
+       u32 __iomem *cq_head_db;
       u16 q_depth;
       u16 cq_vector;
       u16 sq_head;
@@ -252,7 +253,7 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
       memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
       if (++tail == nvmeq->q_depth)
                tail = 0;
-        writel(tail, nvmeq->q_db);
+       writel(tail, nvmeq->sq_tail_db);
       nvmeq->sq_tail = tail;
       spin_unlock_irqrestore(&nvmeq->q_lock, flags);

@@ -618,7 +619,7 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,

        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
-        writel(nvmeq->sq_tail, nvmeq->q_db);
+       writel(nvmeq->sq_tail, nvmeq->sq_tail_db);

        return 0;
}
@@ -635,7 +636,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,

        if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
-        writel(nvmeq->sq_tail, nvmeq->q_db);
+       writel(nvmeq->sq_tail, nvmeq->sq_tail_db);

        return 0;
}
@@ -728,7 +729,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
       nvme_start_io_acct(bio);
       if (++nvmeq->sq_tail == nvmeq->q_depth)
                nvmeq->sq_tail = 0;
-        writel(nvmeq->sq_tail, nvmeq->q_db);
+       writel(nvmeq->sq_tail, nvmeq->sq_tail_db);

        return 0;

@@ -772,7 +773,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
       if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
                return 0;

-        writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+       writel(head, nvmeq->cq_head_db);
       nvmeq->cq_head = head;
       nvmeq->cq_phase = phase;

@@ -1084,7 +1085,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
       init_waitqueue_head(&nvmeq->sq_full);
       init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
       bio_list_init(&nvmeq->sq_cong);
-        nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+       nvmeq->sq_tail_db = &dev->dbs[qid << (dev->db_stride + 1)];
+       nvmeq->cq_head_db = nvmeq->sq_tail_db + (1 << dev->db_stride);
       nvmeq->q_depth = depth;
       nvmeq->cq_vector = vector;

@@ -1690,7 +1692,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
                iounmap(dev->bar);
                dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size);
                dev->dbs = ((void __iomem *)dev->bar) + 4096;
-                 dev->queues[0]->q_db = dev->dbs;
+                dev->queues[0]->sq_tail_db = dev->dbs;
+               dev->queues[0]->cq_head_db = dev->dbs + (1 << dev->db_stride);
       }

        vecs = nr_io_queues;
--
1.7.6
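
For context on the arithmetic the patch caches: an NVMe controller exposes one submission-queue tail doorbell and one completion-queue head doorbell per queue, spaced in BAR0 according to CAP.DSTRD (held as the shift db_stride in this driver). The user-space sketch below is illustrative only; the index math mirrors the two assignments in nvme_alloc_queue() above, and the helper names and everything else are invented for the demo.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustration only: the dword indices into the doorbell region that
 * nvme_alloc_queue() caches in sq_tail_db and cq_head_db.  db_stride
 * is the CAP.DSTRD shift, exactly as the 2013 driver stores it.
 */
static uint32_t sq_tail_index(unsigned int qid, unsigned int db_stride)
{
	return qid << (db_stride + 1);		/* &dev->dbs[qid << (db_stride + 1)] */
}

static uint32_t cq_head_index(unsigned int qid, unsigned int db_stride)
{
	return sq_tail_index(qid, db_stride) + (1u << db_stride);
}

int main(void)
{
	unsigned int db_stride = 0;		/* DSTRD = 0: doorbells 4 bytes apart */
	unsigned int qid;

	for (qid = 0; qid <= 2; qid++)
		printf("qid %u: sq tail -> dbs[%u], cq head -> dbs[%u]\n",
		       qid, sq_tail_index(qid, db_stride),
		       cq_head_index(qid, db_stride));
	return 0;
}

With DSTRD = 0 this prints dbs[0]/dbs[1] for the admin queue and dbs[2]/dbs[3] for qid 1, which are exactly the addresses the patch keeps in sq_tail_db and cq_head_db.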

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
       [not found] <A103C806EC8D6E46A0D950DAA4D118D1400E0067@szxeml558-mbx.china.huawei.com>
@ 2013-09-02  7:00 ` Qiang Huang
  0 siblings, 0 replies; 7+ messages in thread
From: Qiang Huang @ 2013-09-02  7:00 UTC (permalink / raw)


Hi Haiyan,

You overlooked Zefan's inline comments.

Also, the formatting of the patch looks odd; you should use
# git format-patch -1 -s
to create patches, and switch to an email client such as Thunderbird.

You can get more information from Documentation/SubmittingPatches.

On 2013/9/2 14:36, Huhaiyan (haiyan) wrote:
> We use two pointers in struct nvme_queue, one for the sq tail doorbell and the other for the cq head doorbell, so that nvme_process_cq() does not have to calculate the cq head doorbell address.
> 
> This change can reduce latency for admin/io commands.
> 
> Signed-off-by: Haiyan Hu <huhaiyan at huawei.com>
> 
> ---
> drivers/block/nvme-core.c |   19 +++++++++++--------
> 1 files changed, 11 insertions(+), 8 deletions(-)
> 
> [...]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
  2013-09-02  6:49 ` [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq() Huhaiyan (haiyan)
@ 2013-09-03 15:53   ` Matthew Wilcox
  2013-09-04  9:15     ` Haiyan Hu
  0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-03 15:53 UTC (permalink / raw)


On Mon, Sep 02, 2013 at 06:49:34AM +0000, Huhaiyan (haiyan) wrote:
> We use two pointers in struct nvme_queue, one for the sq tail doorbell and the other for the cq head doorbell, so that nvme_process_cq() does not have to calculate the cq head doorbell address.
> This change can reduce latency for admin/io commands.

Have you been able to measure a difference?  If so, can you share
relative numbers?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
  2013-09-03 15:53   ` Matthew Wilcox
@ 2013-09-04  9:15     ` Haiyan Hu
  2013-09-04 16:51       ` Matthew Wilcox
  0 siblings, 1 reply; 7+ messages in thread
From: Haiyan Hu @ 2013-09-04  9:15 UTC (permalink / raw)


On 2013/9/3 23:53, Matthew Wilcox wrote:
> On Mon, Sep 02, 2013 at 06:49:34AM +0000, Huhaiyan (haiyan) wrote:
>> We use two pointers in struct nvme_queue, one for the sq tail doorbell and the other for the cq head doorbell, so that nvme_process_cq() does not have to calculate the cq head doorbell address.
>> This change can reduce latency for admin/io commands.
> 
> Have you been able to measure a difference?  If so, can you share
> relative numbers?
> 
Sorry for my inaccurate description.
Our test data shows that this patch does not change cmd process latency.
But I think it can improve code efficiency, maybe a little.

In addition, the format of the patch I sent is corrupted.
I will resend it later.

Thank you.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
  2013-09-04  9:15     ` Haiyan Hu
@ 2013-09-04 16:51       ` Matthew Wilcox
  2013-09-05  8:55         ` Haiyan Hu
  0 siblings, 1 reply; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-04 16:51 UTC (permalink / raw)


On Wed, Sep 04, 2013 at 05:15:51PM +0800, Haiyan Hu wrote:
> On 2013/9/3 23:53, Matthew Wilcox wrote:
> > Have you been able to measure a difference?  If so, can you share
> > relative numbers?
> 
> Sorry for my inaccurate description.
> Our test data shows that this patch does not change cmd process latency.
> But I think it can improve code efficiency, maybe a little.

Right, so it's a trade-off.  You want to add 8 bytes to the queue data
structure in order to save a few instructions from being executed at
queue processing time.  We need to quantify the savings (since we know
the costs).  I approximated the code by replacing:

-       writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+       writel(head, nvmeq->q_db);

Here's the code before:

     2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
     2ce:       48 8b 93 88 00 00 00    mov    0x88(%rbx),%rdx
     2d5:       8b 48 40                mov    0x40(%rax),%ecx
     2d8:       b8 01 00 00 00          mov    $0x1,%eax
     2dd:       d3 e0                   shl    %cl,%eax
     2df:       48 98                   cltq   
     2e1:       48 8d 14 82             lea    (%rdx,%rax,4),%rdx
     2e5:       41 0f b7 c4             movzwl %r12w,%eax
     2e9:       89 02                   mov    %eax,(%rdx)

Here's the code after:

     2ca:       48 8b 93 88 00 00 00    mov    0x88(%rbx),%rdx
     2d1:       41 0f b7 c4             movzwl %r12w,%eax
     2d5:       89 02                   mov    %eax,(%rdx)

That's 6 instructions saved, and to be fair they have some pretty tight
dependencies.  Still, on a 3GHz processor, that's maybe 2 nanoseconds.
If we're doing a million IOPS, we could save 2ms/s, or 0.2%.  Pretty tough
to measure, let alone justify.
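
(As a quick check on the arithmetic above — a throwaway user-space calculation, not driver code; the 3 GHz clock and roughly one instruction per cycle are the assumptions stated in the paragraph.)

#include <stdio.h>

int main(void)
{
	double insns_saved = 6.0;	/* instructions removed from the completion path */
	double clock_hz    = 3e9;	/* ~3 GHz core, ~1 instruction per cycle assumed */
	double iops        = 1e6;	/* one million completions per second */

	double ns_per_completion = insns_saved / clock_hz * 1e9;
	double cpu_per_second    = insns_saved / clock_hz * iops;

	printf("~%.1f ns saved per completion\n", ns_per_completion);
	printf("~%.1f ms of CPU per second, i.e. %.1f%%\n",
	       cpu_per_second * 1e3, cpu_per_second * 100.0);
	return 0;
}

This prints ~2.0 ns and ~2.0 ms/s (0.2%), matching the estimate above.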

Here's an alternative that doesn't require adding 8 bytes to the
nvme_queue.  Instead of storing the shift in db_stride, we can store the
actual stride.  Then there is less calculation to be done at
completion time, and the code looks like this:

-       writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);

     2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
     2ce:       48 63 50 40             movslq 0x40(%rax),%rdx
     2d2:       48 8b 83 88 00 00 00    mov    0x88(%rbx),%rax
     2d9:       48 8d 14 90             lea    (%rax,%rdx,4),%rdx
     2dd:       41 0f b7 c4             movzwl %r12w,%eax
     2e1:       89 02                   mov    %eax,(%rdx)

That saves half the instructions, for no increased data structure usage.

One other microoptimisation we can do is change the type of db_stride from
int to unsigned.  Then the compiler produces:

     2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
     2ce:       8b 50 40                mov    0x40(%rax),%edx
     2d1:       48 8b 83 88 00 00 00    mov    0x88(%rbx),%rax
     2d8:       48 8d 14 90             lea    (%rax,%rdx,4),%rdx
     2dc:       41 0f b7 c4             movzwl %r12w,%eax
     2e0:       89 02                   mov    %eax,(%rdx)

which uses one fewer byte (at address 2ce, it uses mov instead of movslq).


Would you like to send a patch which changes the type of db_stride to
unsigned and changes the value stored there to be 1 << the current value?

Or you can try to persuade me again that the tradeoff is worth adding
the extra 8 bytes to the data structure :-)
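
A minimal sketch of the follow-up being suggested here — keep db_stride unsigned and store 1 << CAP.DSTRD in it rather than the raw shift — using mock structures in place of the real driver types, so it is an illustration of the idea rather than the patch that was eventually sent:

#include <stdio.h>
#include <stdint.h>

/* Mock stand-ins for the driver structures; only the fields used here. */
struct mock_dev {
	unsigned int db_stride;	/* now 1 << DSTRD (dwords between doorbells), not the shift */
	uint32_t *dbs;		/* start of the doorbell region */
};

int main(void)
{
	uint32_t doorbells[8] = { 0 };
	struct mock_dev dev = { .db_stride = 1 /* DSTRD = 0 */, .dbs = doorbells };
	unsigned int qid = 1, head = 42;

	/* was: &dev->dbs[qid << (dev->db_stride + 1)] */
	uint32_t *q_db = &dev.dbs[qid * 2 * dev.db_stride];
	/* was: writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)) */
	uint32_t *cq_db = q_db + dev.db_stride;

	*cq_db = head;		/* stands in for the writel() */
	printf("cq head doorbell is dbs[%ld], wrote %u\n",
	       (long)(cq_db - doorbells), *cq_db);
	return 0;
}

The per-queue setup (qid * 2 * db_stride) still happens once at allocation time; only the completion path changes, and it now indexes the cq doorbell with a plain add, matching the shorter assembly listings above.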

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
  2013-09-04 16:51       ` Matthew Wilcox
@ 2013-09-05  8:55         ` Haiyan Hu
  2013-09-05 19:28           ` Matthew Wilcox
  0 siblings, 1 reply; 7+ messages in thread
From: Haiyan Hu @ 2013-09-05  8:55 UTC (permalink / raw)


On 2013/9/5 0:51, Matthew Wilcox wrote:
> On Wed, Sep 04, 2013 at 05:15:51PM +0800, Haiyan Hu wrote:
>> On 2013/9/3 23:53, Matthew Wilcox wrote:
>>> Have you been able to measure a difference?  If so, can you share
>>> relative numbers?
>>
>> Sorry for my inaccurate description.
>> Our test data shows that this patch does not change cmd process latency.
>> But I think it can improve code efficiency, maybe a little.
> 
> Right, so it's a trade-off.  You want to add 8 bytes to the queue data
> structure in order to save a few instructions from being executed at
> queue processing time.  We need to quantify the savings (since we know
> the costs).  I approximated the code by replacing:
> 
> -       writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
> +       writel(head, nvmeq->q_db);
> 
> Here's the code before:
> 
>      2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
>      2ce:       48 8b 93 88 00 00 00    mov    0x88(%rbx),%rdx
>      2d5:       8b 48 40                mov    0x40(%rax),%ecx
>      2d8:       b8 01 00 00 00          mov    $0x1,%eax
>      2dd:       d3 e0                   shl    %cl,%eax
>      2df:       48 98                   cltq   
>      2e1:       48 8d 14 82             lea    (%rdx,%rax,4),%rdx
>      2e5:       41 0f b7 c4             movzwl %r12w,%eax
>      2e9:       89 02                   mov    %eax,(%rdx)
> 
> Here's the code after:
> 
>      2ca:       48 8b 93 88 00 00 00    mov    0x88(%rbx),%rdx
>      2d1:       41 0f b7 c4             movzwl %r12w,%eax
>      2d5:       89 02                   mov    %eax,(%rdx)
> 
> That's 6 instructions saved, and to be fair they have some pretty tight
> dependencies.  Still, on a 3GHz processor, that's maybe 2 nanoseconds.
> If we're doing a million IOPS, we could save 2ms/s, or 0.2%.  Pretty tough
> to measure, let alone justify.
> 
> Here's an alternative that doesn't require adding 8 bytes to the
> nvme_queue.  Instead of storing the shift in db_stride, we can store the
> actual stride.  Then there is less calculation to be done at
> completion time, and the code looks like this:
> 
> -       writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
> +       writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
> 
>      2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
>      2ce:       48 63 50 40             movslq 0x40(%rax),%rdx
>      2d2:       48 8b 83 88 00 00 00    mov    0x88(%rbx),%rax
>      2d9:       48 8d 14 90             lea    (%rax,%rdx,4),%rdx
>      2dd:       41 0f b7 c4             movzwl %r12w,%eax
>      2e1:       89 02                   mov    %eax,(%rdx)
> 
> That saves half the instructions, for no increased data structure usage.
> 
> One other microoptimisation we can do is change the type of db_stride from
> int to unsigned.  Then the compiler produces:
> 
>      2ca:       48 8b 43 08             mov    0x8(%rbx),%rax
>      2ce:       8b 50 40                mov    0x40(%rax),%edx
>      2d1:       48 8b 83 88 00 00 00    mov    0x88(%rbx),%rax
>      2d8:       48 8d 14 90             lea    (%rax,%rdx,4),%rdx
>      2dc:       41 0f b7 c4             movzwl %r12w,%eax
>      2e0:       89 02                   mov    %eax,(%rdx)
> 
> which uses one fewer byte (at address 2ce, it uses mov instead of movslq).
> 
> 
> Would you like to send a patch which changes the type of db_stride to
> unsigned and changes the value stored there to be 1 << the current value?
> 
> Or you can try to persuade me again that the tradeoff is worth adding
> the extra 8 bytes to the data structure :-)
> 
Your analysis is more comprehensive than mine. Of course, I'd like to send a new patch
as you advised. Do you think the updated variable should be named "db_stride_shift"?

Thank you.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq()
  2013-09-05  8:55         ` Haiyan Hu
@ 2013-09-05 19:28           ` Matthew Wilcox
  0 siblings, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2013-09-05 19:28 UTC (permalink / raw)


On Thu, Sep 05, 2013 at 04:55:43PM +0800, Haiyan Hu wrote:
> Your analysis is more comprehensive than mine. Of course, I'd like to send a new patch
> as you advised. Do you think the updated variable should be named "db_stride_shift"?

I think that might have been a better name for the old meaning :-)  I think
we should probably keep the name as-is, but change the usage.  Unless
someone can come up with a better name ... ?

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2013-09-05 19:28 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <mailman.3257.1377730793.1059.linux-nvme@lists.infradead.org>
2013-09-02  6:49 ` [PATCH] NVMe: Avoid calculating cq head doorbell in nvme_process_cq() Huhaiyan (haiyan)
2013-09-03 15:53   ` Matthew Wilcox
2013-09-04  9:15     ` Haiyan Hu
2013-09-04 16:51       ` Matthew Wilcox
2013-09-05  8:55         ` Haiyan Hu
2013-09-05 19:28           ` Matthew Wilcox
     [not found] <A103C806EC8D6E46A0D950DAA4D118D1400E0067@szxeml558-mbx.china.huawei.com>
2013-09-02  7:00 ` Qiang Huang
