* [PATCH 1/4] nvme: use a work item to submit async event requests
2016-02-29 14:59 roolup of my pending NVMe patches for 4.6 Christoph Hellwig
@ 2016-02-29 14:59 ` Christoph Hellwig
2016-02-29 14:59 ` [PATCH 2/4] nvme: don't poll the CQ from the kthread Christoph Hellwig
` (3 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2016-02-29 14:59 UTC (permalink / raw)
Use a dedicated work item to submit async event requests instead of the
global kthread. This simplifies the code and reduces the latencies to
resubmit a request once an even notification happened.
Signed-off-by: Christoph Hellwig <hch at lst.de>
Reviewed-by: Keith Busch <keith.busch at intel.com>
Reviewed-by: Sagi Grimberg <sagig at mellanox.com>
Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
---
drivers/nvme/host/pci.c | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fec7479..21b0be4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -100,6 +100,7 @@ struct nvme_dev {
struct work_struct reset_work;
struct work_struct scan_work;
struct work_struct remove_work;
+ struct work_struct async_work;
struct mutex shutdown_lock;
bool subsystem;
void __iomem *cmb;
@@ -281,8 +282,11 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
u16 status = le16_to_cpu(cqe->status) >> 1;
u32 result = le32_to_cpu(cqe->result);
- if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
+ if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) {
++dev->ctrl.event_limit;
+ queue_work(nvme_workq, &dev->async_work);
+ }
+
if (status != NVME_SC_SUCCESS)
return;
@@ -816,15 +820,22 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return 0;
}
-static void nvme_submit_async_event(struct nvme_dev *dev)
+static void nvme_async_event_work(struct work_struct *work)
{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work);
+ struct nvme_queue *nvmeq = dev->queues[0];
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
- c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;
- __nvme_submit_cmd(dev->queues[0], &c);
+ spin_lock_irq(&nvmeq->q_lock);
+ while (dev->ctrl.event_limit > 0) {
+ c.common.command_id = NVME_AQ_BLKMQ_DEPTH +
+ --dev->ctrl.event_limit;
+ __nvme_submit_cmd(nvmeq, &c);
+ }
+ spin_unlock_irq(&nvmeq->q_lock);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1358,9 +1369,6 @@ static int nvme_kthread(void *data)
continue;
spin_lock_irq(&nvmeq->q_lock);
nvme_process_cq(nvmeq);
-
- while (i == 0 && dev->ctrl.event_limit > 0)
- nvme_submit_async_event(dev);
spin_unlock_irq(&nvmeq->q_lock);
}
}
@@ -1929,6 +1937,7 @@ static void nvme_reset_work(struct work_struct *work)
goto free_tags;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
+ queue_work(nvme_workq, &dev->async_work);
result = nvme_dev_list_add(dev);
if (result)
@@ -2062,6 +2071,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+ INIT_WORK(&dev->async_work, nvme_async_event_work);
mutex_init(&dev->shutdown_lock);
init_completion(&dev->ioq_wait);
@@ -2115,6 +2125,7 @@ static void nvme_remove(struct pci_dev *pdev)
spin_unlock(&dev_list_lock);
pci_set_drvdata(pdev, NULL);
+ flush_work(&dev->async_work);
flush_work(&dev->reset_work);
flush_work(&dev->scan_work);
nvme_remove_namespaces(&dev->ctrl);
--
2.1.4
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 3/4] nvme: replace the kthread with a per-device watchdog timer
2016-02-29 14:59 roolup of my pending NVMe patches for 4.6 Christoph Hellwig
2016-02-29 14:59 ` [PATCH 1/4] nvme: use a work item to submit async event requests Christoph Hellwig
2016-02-29 14:59 ` [PATCH 2/4] nvme: don't poll the CQ from the kthread Christoph Hellwig
@ 2016-02-29 14:59 ` Christoph Hellwig
2016-02-29 14:59 ` [PATCH 4/4] nvme: return the whole CQE through the request passthrough interface Christoph Hellwig
2016-02-29 15:47 ` roolup of my pending NVMe patches for 4.6 Jens Axboe
4 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2016-02-29 14:59 UTC (permalink / raw)
The only work left in the kthread is the periodic health check for each
controller. There is no need to run this from process context or keep
a thread context around for it, so replace it with a simpler timer.
Signed-off-by: Christoph Hellwig <hch at lst.de>
Reviewed-by: Keith Busch <keith.busch at intel.com>
Reviewed-by: Sagi Grimberg <sagig at mellanox.com>
Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
---
drivers/nvme/host/pci.c | 112 ++++++++++--------------------------------------
1 file changed, 23 insertions(+), 89 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10839f7..a623360 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -27,7 +27,6 @@
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
-#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -39,6 +38,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
+#include <linux/timer.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>
@@ -64,11 +64,7 @@ static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
-static LIST_HEAD(dev_list);
-static DEFINE_SPINLOCK(dev_list_lock);
-static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
-static wait_queue_head_t nvme_kthread_wait;
struct nvme_dev;
struct nvme_queue;
@@ -82,7 +78,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
- struct list_head node;
struct nvme_queue **queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
@@ -101,6 +96,7 @@ struct nvme_dev {
struct work_struct scan_work;
struct work_struct remove_work;
struct work_struct async_work;
+ struct timer_list watchdog_timer;
struct mutex shutdown_lock;
bool subsystem;
void __iomem *cmb;
@@ -1334,36 +1330,26 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
return result;
}
-static int nvme_kthread(void *data)
+static void nvme_watchdog_timer(unsigned long data)
{
- struct nvme_dev *dev, *next;
+ struct nvme_dev *dev = (struct nvme_dev *)data;
+ u32 csts = readl(dev->bar + NVME_REG_CSTS);
- while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
- spin_lock(&dev_list_lock);
- list_for_each_entry_safe(dev, next, &dev_list, node) {
- u32 csts = readl(dev->bar + NVME_REG_CSTS);
-
- /*
- * Skip controllers currently under reset.
- */
- if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
- continue;
-
- if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
- csts & NVME_CSTS_CFS) {
- if (queue_work(nvme_workq, &dev->reset_work)) {
- dev_warn(dev->ctrl.device,
- "Failed status: %x, reset controller\n",
- readl(dev->bar + NVME_REG_CSTS));
- }
- continue;
- }
+ /*
+ * Skip controllers currently under reset.
+ */
+ if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) &&
+ ((csts & NVME_CSTS_CFS) ||
+ (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) {
+ if (queue_work(nvme_workq, &dev->reset_work)) {
+ dev_warn(dev->dev,
+ "Failed status: 0x%x, reset controller.\n",
+ csts);
}
- spin_unlock(&dev_list_lock);
- schedule_timeout(round_jiffies_relative(HZ));
+ return;
}
- return 0;
+
+ mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
}
static int nvme_create_io_queues(struct nvme_dev *dev)
@@ -1777,56 +1763,12 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
}
}
-static int nvme_dev_list_add(struct nvme_dev *dev)
-{
- bool start_thread = false;
-
- spin_lock(&dev_list_lock);
- if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
- start_thread = true;
- nvme_thread = NULL;
- }
- list_add(&dev->node, &dev_list);
- spin_unlock(&dev_list_lock);
-
- if (start_thread) {
- nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
- wake_up_all(&nvme_kthread_wait);
- } else
- wait_event_killable(nvme_kthread_wait, nvme_thread);
-
- if (IS_ERR_OR_NULL(nvme_thread))
- return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
-
- return 0;
-}
-
-/*
-* Remove the node from the device list and check
-* for whether or not we need to stop the nvme_thread.
-*/
-static void nvme_dev_list_remove(struct nvme_dev *dev)
-{
- struct task_struct *tmp = NULL;
-
- spin_lock(&dev_list_lock);
- list_del_init(&dev->node);
- if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
- tmp = nvme_thread;
- nvme_thread = NULL;
- }
- spin_unlock(&dev_list_lock);
-
- if (tmp)
- kthread_stop(tmp);
-}
-
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
int i;
u32 csts = -1;
- nvme_dev_list_remove(dev);
+ del_timer_sync(&dev->watchdog_timer);
mutex_lock(&dev->shutdown_lock);
if (dev->bar) {
@@ -1927,9 +1869,7 @@ static void nvme_reset_work(struct work_struct *work)
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
queue_work(nvme_workq, &dev->async_work);
- result = nvme_dev_list_add(dev);
- if (result)
- goto remove;
+ mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ));
/*
* Keep the controller around but remove all namespaces if we don't have
@@ -1946,8 +1886,6 @@ static void nvme_reset_work(struct work_struct *work)
clear_bit(NVME_CTRL_RESETTING, &dev->flags);
return;
- remove:
- nvme_dev_list_remove(dev);
free_tags:
nvme_dev_remove_admin(dev);
blk_put_queue(dev->ctrl.admin_q);
@@ -2055,11 +1993,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
- INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
INIT_WORK(&dev->async_work, nvme_async_event_work);
+ setup_timer(&dev->watchdog_timer, nvme_watchdog_timer,
+ (unsigned long)dev);
mutex_init(&dev->shutdown_lock);
init_completion(&dev->ioq_wait);
@@ -2108,9 +2047,7 @@ static void nvme_remove(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- spin_lock(&dev_list_lock);
- list_del_init(&dev->node);
- spin_unlock(&dev_list_lock);
+ del_timer_sync(&dev->watchdog_timer);
pci_set_drvdata(pdev, NULL);
flush_work(&dev->async_work);
@@ -2223,8 +2160,6 @@ static int __init nvme_init(void)
{
int result;
- init_waitqueue_head(&nvme_kthread_wait);
-
nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
if (!nvme_workq)
return -ENOMEM;
@@ -2239,7 +2174,6 @@ static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
destroy_workqueue(nvme_workq);
- BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
_nvme_check_size();
}
--
2.1.4
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH 4/4] nvme: return the whole CQE through the request passthrough interface
2016-02-29 14:59 roolup of my pending NVMe patches for 4.6 Christoph Hellwig
` (2 preceding siblings ...)
2016-02-29 14:59 ` [PATCH 3/4] nvme: replace the kthread with a per-device watchdog timer Christoph Hellwig
@ 2016-02-29 14:59 ` Christoph Hellwig
2016-02-29 15:42 ` Keith Busch
2016-02-29 15:47 ` roolup of my pending NVMe patches for 4.6 Jens Axboe
4 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2016-02-29 14:59 UTC (permalink / raw)
Both LighNVM and NVMe over Fabrics need to look at more than just the
status and result field.
Signed-off-by: Christoph Hellwig <hch at lst.de>
Reviewed-by: Matias Bj?rling <m at bjorling.me>
Reviewed-by: Jay Freyensee <james.p.freyensee at intel.com>
Reviewed-by: Sagi Grimberg <sagig at mellanox.com>
Signed-off-by: Sagi Grimberg <sagig at mellanox.com>
---
drivers/nvme/host/core.c | 27 +++++++++++++++++++--------
drivers/nvme/host/nvme.h | 3 ++-
drivers/nvme/host/pci.c | 11 +++--------
3 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 07b7ec69..66fd3d9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -132,7 +132,6 @@ struct request *nvme_alloc_request(struct request_queue *q,
req->cmd = (unsigned char *)cmd;
req->cmd_len = sizeof(struct nvme_command);
- req->special = (void *)0;
return req;
}
@@ -143,7 +142,8 @@ EXPORT_SYMBOL_GPL(nvme_alloc_request);
* if the result is positive, it's an NVM Express status code
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
- void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
+ struct nvme_completion *cqe, void *buffer, unsigned bufflen,
+ unsigned timeout)
{
struct request *req;
int ret;
@@ -153,6 +153,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
return PTR_ERR(req);
req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+ req->special = cqe;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
@@ -161,8 +162,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
}
blk_execute_rq(req->q, NULL, req, 0);
- if (result)
- *result = (u32)(uintptr_t)req->special;
ret = req->errors;
out:
blk_mq_free_request(req);
@@ -172,7 +171,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
- return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
+ return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -182,6 +181,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
u32 *result, unsigned timeout)
{
bool write = cmd->common.opcode & 1;
+ struct nvme_completion cqe;
struct nvme_ns *ns = q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
struct request *req;
@@ -194,6 +194,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
return PTR_ERR(req);
req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+ req->special = &cqe;
if (ubuffer && bufflen) {
ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
@@ -248,7 +249,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
blk_execute_rq(req->q, disk, req, 0);
ret = req->errors;
if (result)
- *result = (u32)(uintptr_t)req->special;
+ *result = le32_to_cpu(cqe.result);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
@@ -329,6 +330,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
dma_addr_t dma_addr, u32 *result)
{
struct nvme_command c;
+ struct nvme_completion cqe;
+ int ret;
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_get_features;
@@ -336,13 +339,18 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
c.features.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
- return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+ ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
+ if (ret >= 0)
+ *result = le32_to_cpu(cqe.result);
+ return ret;
}
int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
dma_addr_t dma_addr, u32 *result)
{
struct nvme_command c;
+ struct nvme_completion cqe;
+ int ret;
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_set_features;
@@ -350,7 +358,10 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
- return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+ ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0);
+ if (ret >= 0)
+ *result = le32_to_cpu(cqe.result);
+ return ret;
}
int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 63ba8a5..2ac7539 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -248,7 +248,8 @@ void nvme_requeue_req(struct request *req);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buf, unsigned bufflen);
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
- void *buffer, unsigned bufflen, u32 *result, unsigned timeout);
+ struct nvme_completion *cqe, void *buffer, unsigned bufflen,
+ unsigned timeout);
int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
void __user *ubuffer, unsigned bufflen, u32 *result,
unsigned timeout);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a623360..d47b087 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -748,10 +748,8 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
}
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
- if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
- u32 result = le32_to_cpu(cqe.result);
- req->special = (void *)(uintptr_t)result;
- }
+ if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special)
+ memcpy(req->special, &cqe, sizeof(cqe));
blk_mq_complete_request(req, status >> 1);
}
@@ -901,13 +899,10 @@ static void abort_endio(struct request *req, int error)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = iod->nvmeq;
- u32 result = (u32)(uintptr_t)req->special;
u16 status = req->errors;
- dev_warn(nvmeq->dev->ctrl.device,
- "Abort status:%x result:%x", status, result);
+ dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
atomic_inc(&nvmeq->dev->ctrl.abort_limit);
-
blk_mq_free_request(req);
}
--
2.1.4
^ permalink raw reply related [flat|nested] 9+ messages in thread