[PATCH 5/7 v2] nvme-fabrics: Add host FC transport support

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support
@ 2016-10-07 23:09 James Smart
  2016-10-12  9:24 ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: James Smart @ 2016-10-07 23:09 UTC (permalink / raw)



Add nvme-fabrics host FC transport support

Implements the FC-NVME T11 definition of how nvme fabric capsules are
performed on an FC fabric. Utilizes a lower-layer API to FC host adapters
to send/receive FC-4 LS operations and FCP operations that comprise NVME
over FC operation.

The T11 definitions for FC-4 Link Services are implemented which create
NVMeOF connections.  Implements the hooks with blk-mq to then submit admin
and io requests to the different connections.

The changes made since the prior patch:
- Revise Kconfig for changes in select/depends
- Remove todo on Association LS: pull values from opts struct
- Slight restructure of nvme_fc_create_io_queues() and
  nvme_fc_init_io_queues() for their return values
- Checked and corrected BLK_MQ_xx return values in nvme_fc_start_fcp_op()
- Remove the todo's around dma_map_page(). No need to use dma_map_page()
- Reworked teardown paths with ref counting
- Teardown now invokes callbackes on completion of unregister functions
- set sqid in fcp request structs
- Add blk poll support.
- Remove map_queue from templates
- Removal of fabric name for nport qualifier. Not necessary in T11.
- LS's moved to controller linked list rather than remoteport
- Modifications from prior reviews:
-  Create inline xx_t_yy() wrappers for container_of's
-  fix nvme_fc_timeout() to use reinit_completion
-  fix nvme_fc_map_data() to declare dma dir to avoid line splits
-  fix nvme_fc_shutdown_ctlr() to remove previously done
     nvme_stop_keep_alive()
-  minor parens changes

Signed-off-by: James Smart <james.smart at broadcom.com>
---
 drivers/nvme/host/Kconfig  |   16 +
 drivers/nvme/host/Makefile |    3 +
 drivers/nvme/host/fc.c     | 2304 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2323 insertions(+)
 create mode 100644 drivers/nvme/host/fc.c

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index f7d37a6..aacac94 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -43,3 +43,19 @@ config NVME_RDMA
 	  from https://github.com/linux-nvme/nvme-cli.
 
 	  If unsure, say N.
+
+config NVME_FC
+	tristate "NVM Express over Fabrics FC host driver"
+	depends on BLOCK
+	select NVME_CORE
+	select NVME_FABRICS
+	select SG_POOL
+	help
+	  This provides support for the NVMe over Fabrics protocol using
+	  the FC transport.  This allows you to use remote block devices
+	  exported using the NVMe protocol set.
+
+	  To configure a NVMe over Fabrics controller use the nvme-cli tool
+	  from https://github.com/linux-nvme/nvme-cli.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 47abcec..f1a7d94 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_NVME_CORE)			+= nvme-core.o
 obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
 obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
 obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
+obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
 
 nvme-core-y				:= core.o
 nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI)	+= scsi.o
@@ -12,3 +13,5 @@ nvme-y					+= pci.o
 nvme-fabrics-y				+= fabrics.o
 
 nvme-rdma-y				+= rdma.o
+
+nvme-fc-y				+= fc.o
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
new file mode 100644
index 0000000..35ea164
--- /dev/null
+++ b/drivers/nvme/host/fc.c
@@ -0,0 +1,2304 @@
+/*
+ * Copyright (c) 2016 Avago Technologies.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful.
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
+ * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
+ * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
+ * See the GNU General Public License for more details, a copy of which
+ * can be found in the file COPYING included with this package
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/parser.h>
+#include <uapi/scsi/fc/fc_fs.h>
+#include <uapi/scsi/fc/fc_els.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+#include <linux/nvme-fc-driver.h>
+#include <linux/nvme-fc.h>
+
+
+/* *************************** Data Structures/Defines ****************** */
+
+
+/*
+ * We handle AEN commands ourselves and don't even let the
+ * block layer know about them.
+ */
+#define NVME_FC_NR_AEN_COMMANDS	1
+#define NVME_FC_AQ_BLKMQ_DEPTH	\
+	(NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
+#define AEN_CMDID_BASE		(NVME_FC_AQ_BLKMQ_DEPTH + 1)
+#define IS_AEN_COMMAND(command_id) \
+	((command_id) >= AEN_CMDID_BASE)
+
+enum nvme_fc_queue_flags {
+	NVME_FC_Q_CONNECTED = (1 << 0),
+};
+
+#define NVMEFC_QUEUE_DELAY	3		/* ms units */
+
+struct nvme_fc_queue {
+	struct nvme_fc_ctrl	*ctrl;
+	struct device		*dev;
+	struct blk_mq_hw_ctx	*hctx;
+	void			*lldd_handle;
+	int			queue_size;
+	size_t			cmnd_capsule_len;
+	u32			qnum;
+	u32			rqcnt;
+	u32			seqno;
+	u16			sqhd;
+
+	u64			connection_id;
+	atomic_t		csn;
+
+	unsigned long		flags;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+struct nvmefc_ls_req_op {
+	struct nvmefc_ls_req	ls_req;
+
+	struct nvme_fc_ctrl	*ctrl;
+	struct nvme_fc_queue	*queue;
+	struct request		*rq;
+
+	int			ls_error;
+	struct completion	ls_done;
+	struct list_head	lsreq_list;	/* ctrl->ls_req_list */
+};
+
+enum nvme_fcpop_state {
+	FCPOP_STATE_UNINIT	= 0,
+	FCPOP_STATE_IDLE	= 1,
+	FCPOP_STATE_ACTIVE	= 2,
+	FCPOP_STATE_ABORTED	= 3,
+};
+
+struct nvme_fc_fcp_op {
+	struct nvmefc_fcp_req	fcp_req;
+
+	struct nvme_fc_ctrl	*ctrl;
+	struct nvme_fc_queue	*queue;
+	struct request		*rq;
+
+	atomic_t		state;
+	u32			rqno;
+	u32			nents;
+	struct completion	abort_done;
+
+	struct nvme_fc_cmd_iu	cmd_iu;
+	struct nvme_fc_ersp_iu	rsp_iu;
+};
+
+struct nvme_fc_lport {
+	struct nvme_fc_local_port	localport;
+
+	u32				endp_cnt;
+	struct list_head		port_list;	/* nvme_fc_port_list */
+	struct list_head		endp_list;
+	struct device			*dev;	/* physical device for dma */
+	struct nvme_fc_port_template	*ops;
+	struct kref			ref;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+struct nvme_fc_rport {
+	struct nvme_fc_remote_port	remoteport;
+
+	struct list_head		endp_list; /* for lport->endp_list */
+	struct list_head		ctrl_list;
+	spinlock_t			lock;
+	struct kref			ref;
+} __aligned(sizeof(u64));	/* alignment for other things alloc'd with */
+
+enum nvme_fcctrl_state {
+	FCCTRL_INIT		= 0,
+	FCCTRL_ACTIVE		= 1,
+	FCCTRL_DELETING		= 2,
+	FCCTRL_DELETED		= 3,
+};
+
+struct nvme_fc_ctrl {
+	spinlock_t		lock;
+	struct nvme_fc_queue	*queues;
+	u32			queue_count;
+
+	struct device		*dev;
+	struct nvme_fc_lport	*lport;
+	struct nvme_fc_rport	*rport;
+	u32			cnum;
+
+	u64			association_id;
+
+	u64			cap;
+
+	struct list_head	ctrl_list;	/* rport->ctrl_list */
+	struct list_head	ls_req_list;
+
+	struct blk_mq_tag_set	admin_tag_set;
+	struct blk_mq_tag_set	tag_set;
+
+	struct work_struct	delete_work;
+	struct kref		ref;
+	atomic_t		myref;
+	int			state;
+
+	struct nvme_fc_fcp_op	aen_ops[NVME_FC_NR_AEN_COMMANDS];
+
+	struct nvme_ctrl	ctrl;
+};
+
+static inline struct nvme_fc_ctrl *
+to_fc_ctrl(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_fc_ctrl, ctrl);
+}
+
+static inline struct nvme_fc_lport *
+localport_to_lport(struct nvme_fc_local_port *portptr)
+{
+	return container_of(portptr, struct nvme_fc_lport, localport);
+}
+
+static inline struct nvme_fc_rport *
+remoteport_to_rport(struct nvme_fc_remote_port *portptr)
+{
+	return container_of(portptr, struct nvme_fc_rport, remoteport);
+}
+
+static inline struct nvmefc_ls_req_op *
+ls_req_to_lsop(struct nvmefc_ls_req *lsreq)
+{
+	return container_of(lsreq, struct nvmefc_ls_req_op, ls_req);
+}
+
+static inline struct nvme_fc_fcp_op *
+fcp_req_to_fcp_op(struct nvmefc_fcp_req *fcpreq)
+{
+	return container_of(fcpreq, struct nvme_fc_fcp_op, fcp_req);
+}
+
+
+
+/* *************************** Globals **************************** */
+
+
+static DEFINE_SPINLOCK(nvme_fc_lock);
+
+static LIST_HEAD(nvme_fc_lport_list);
+static u32 nvme_fc_local_port_cnt;
+static u32 nvme_fc_ctrl_cnt;
+
+static struct workqueue_struct *nvme_fc_wq;
+
+
+
+/* *********************** FC-NVME Port Management ************************ */
+
+static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *);
+static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
+			struct nvme_fc_queue *, unsigned int);
+
+
+/**
+ * nvme_fc_register_localport - transport entry point called by an
+ *                              LLDD to register the existence of a NVME
+ *                              host FC port.
+ * @pinfo:     pointer to information about the port to be registered
+ * @template:  LLDD entrypoints and operational parameters for the port
+ * @dev:       physical hardware device node port corresponds to. Will be
+ *             used for DMA mappings
+ * @lport_p:   pointer to a local port pointer. Upon success, the routine
+ *             will allocate a nvme_fc_local_port structure and place its
+ *             address in the local port pointer. Upon failure, local port
+ *             pointer will be set to 0.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_register_localport(struct nvme_fc_port_info *pinfo,
+			struct nvme_fc_port_template *template,
+			struct device *dev,
+			struct nvme_fc_local_port **portptr)
+{
+	struct nvme_fc_lport *newrec;
+	unsigned long flags;
+	int ret;
+
+	if (!template->localport_delete || !template->remoteport_delete ||
+	    !template->ls_req || !template->fcp_io ||
+	    !template->ls_abort || !template->fcp_abort ||
+	    !template->max_hw_queues || !template->max_sgl_segments ||
+	    !template->max_dif_sgl_segments || !template->dma_boundary) {
+		ret = -EINVAL;
+		goto out_reghost_failed;
+	}
+
+	newrec = kmalloc((sizeof(*newrec) + template->local_priv_sz),
+			 GFP_KERNEL);
+	if (!newrec) {
+		ret = -ENOMEM;
+		goto out_reghost_failed;
+	}
+
+	INIT_LIST_HEAD(&newrec->port_list);
+	INIT_LIST_HEAD(&newrec->endp_list);
+	kref_init(&newrec->ref);
+	newrec->ops = template;
+	newrec->dev = dev;
+	newrec->endp_cnt = 0;
+	newrec->localport.private = &newrec[1];
+	newrec->localport.node_name = pinfo->node_name;
+	newrec->localport.port_name = pinfo->port_name;
+	newrec->localport.port_role = pinfo->port_role;
+	newrec->localport.port_id = pinfo->port_id;
+	newrec->localport.port_state = FC_OBJSTATE_ONLINE;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	newrec->localport.port_num = nvme_fc_local_port_cnt++;
+	list_add_tail(&newrec->port_list, &nvme_fc_lport_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	if (dev)
+		dma_set_seg_boundary(dev, template->dma_boundary);
+
+	*portptr = &newrec->localport;
+	return 0;
+
+out_reghost_failed:
+	*portptr = NULL;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_register_localport);
+
+static void
+nvme_fc_free_lport(struct kref *ref)
+{
+	struct nvme_fc_lport *lport =
+		container_of(ref, struct nvme_fc_lport, ref);
+	unsigned long flags;
+
+	WARN_ON(lport->localport.port_state != FC_OBJSTATE_DELETED);
+	WARN_ON(!list_empty(&lport->endp_list));
+
+	/* remove from transport list */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_del(&lport->port_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	/* let the LLDD know we've finished tearing it down */
+	lport->ops->localport_delete(&lport->localport);
+
+	kfree(lport);
+}
+
+static void
+nvme_fc_lport_put(struct nvme_fc_lport *lport)
+{
+	kref_put(&lport->ref, nvme_fc_free_lport);
+}
+
+static int
+nvme_fc_lport_get(struct nvme_fc_lport *lport)
+{
+	return kref_get_unless_zero(&lport->ref);
+}
+
+/**
+ * nvme_fc_unregister_localport - transport entry point called by an
+ *                              LLDD to deregister/remove a previously
+ *                              registered a NVME host FC port.
+ * @localport: pointer to the (registered) local port that is to be
+ *             deregistered.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr)
+{
+	struct nvme_fc_lport *lport = localport_to_lport(portptr);
+	unsigned long flags;
+	u32 lnum;
+
+	if (!portptr)
+		return -EINVAL;
+
+	lnum = portptr->port_num;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+
+	if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+		spin_unlock_irqrestore(&nvme_fc_lock, flags);
+		return -EINVAL;
+	}
+	portptr->port_state = FC_OBJSTATE_DELETED;
+
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	nvme_fc_lport_put(lport);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport);
+
+/**
+ * nvme_fc_register_remoteport - transport entry point called by an
+ *                              LLDD to register the existence of a NVME
+ *                              subsystem FC port on its fabric.
+ * @localport: pointer to the (registered) local port that the remote
+ *             subsystem port is connected to.
+ * @pinfo:     pointer to information about the port to be registered
+ * @rport_p:   pointer to a remote port pointer. Upon success, the routine
+ *             will allocate a nvme_fc_remote_port structure and place its
+ *             address in the remote port pointer. Upon failure, remote port
+ *             pointer will be set to 0.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
+				struct nvme_fc_port_info *pinfo,
+				struct nvme_fc_remote_port **portptr)
+{
+	struct nvme_fc_lport *lport = localport_to_lport(localport);
+	struct nvme_fc_rport *newrec;
+	unsigned long flags;
+	int ret;
+
+	newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz),
+			 GFP_KERNEL);
+	if (!newrec) {
+		ret = -ENOMEM;
+		goto out_reghost_failed;
+	}
+
+	INIT_LIST_HEAD(&newrec->endp_list);
+	INIT_LIST_HEAD(&newrec->ctrl_list);
+	kref_init(&newrec->ref);
+	spin_lock_init(&newrec->lock);
+	newrec->remoteport.localport = &lport->localport;
+	newrec->remoteport.private = &newrec[1];
+	newrec->remoteport.port_role = pinfo->port_role;
+	newrec->remoteport.node_name = pinfo->node_name;
+	newrec->remoteport.port_name = pinfo->port_name;
+	newrec->remoteport.port_id = pinfo->port_id;
+	newrec->remoteport.port_state = FC_OBJSTATE_ONLINE;
+
+	nvme_fc_lport_get(lport);
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	newrec->remoteport.port_num = lport->endp_cnt++;
+	list_add_tail(&newrec->endp_list, &lport->endp_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	*portptr = &newrec->remoteport;
+	return 0;
+
+out_reghost_failed:
+	*portptr = NULL;
+	return ret;
+
+}
+EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
+
+static void
+nvme_fc_free_rport(struct kref *ref)
+{
+	struct nvme_fc_rport *rport =
+		container_of(ref, struct nvme_fc_rport, ref);
+	struct nvme_fc_lport *lport =
+			localport_to_lport(rport->remoteport.localport);
+	unsigned long flags;
+
+	WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED);
+	WARN_ON(!list_empty(&rport->ctrl_list));
+
+	/* remove from lport list */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_del(&rport->endp_list);
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	/* let the LLDD know we've finished tearing it down */
+	lport->ops->remoteport_delete(&rport->remoteport);
+
+	kfree(rport);
+
+	nvme_fc_lport_put(lport);
+}
+
+static void
+nvme_fc_rport_put(struct nvme_fc_rport *rport)
+{
+	kref_put(&rport->ref, nvme_fc_free_rport);
+}
+
+static int
+nvme_fc_rport_get(struct nvme_fc_rport *rport)
+{
+	return kref_get_unless_zero(&rport->ref);
+}
+
+/**
+ * nvme_fc_unregister_remoteport - transport entry point called by an
+ *                              LLDD to deregister/remove a previously
+ *                              registered a NVME subsystem FC port.
+ * @remoteport: pointer to the (registered) remote port that is to be
+ *              deregistered.
+ *
+ * Returns:
+ * a completion status. Must be 0 upon success; a negative errno
+ * (ex: -ENXIO) upon failure.
+ */
+int
+nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
+{
+	struct nvme_fc_rport *rport = remoteport_to_rport(portptr);
+	struct nvme_fc_lport *lport = localport_to_lport(portptr->localport);
+	struct nvme_fc_ctrl *ctrl;
+	unsigned long flags;
+	u32 lnum, pnum;
+
+	if (!portptr)
+		return -EINVAL;
+
+	spin_lock_irqsave(&rport->lock, flags);
+
+	if (portptr->port_state != FC_OBJSTATE_ONLINE) {
+		spin_unlock_irqrestore(&rport->lock, flags);
+		return -EINVAL;
+	}
+	portptr->port_state = FC_OBJSTATE_DELETED;
+
+	lnum = lport->localport.port_num;
+	pnum = portptr->port_num;
+
+	/* tear down all associations to the remote port */
+	list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+		__nvme_fc_del_ctrl(ctrl);
+	}
+
+	spin_unlock_irqrestore(&rport->lock, flags);
+
+	nvme_fc_rport_put(rport);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport);
+
+
+
+/* *********************** FC-NVME LS Handling **************************** */
+
+static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
+static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
+
+
+static void
+nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status)
+{
+	struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
+
+	lsop->ls_error = status;
+	complete(&lsop->ls_done);
+}
+
+
+static int
+nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
+{
+	struct nvmefc_ls_req *lsreq = &lsop->ls_req;
+	struct fcnvme_ls_rjt *rjt = lsreq->rspaddr;
+	unsigned long flags;
+	int ret;
+
+	nvme_fc_ctrl_get(ctrl);
+
+	lsreq->done = nvme_fc_send_ls_req_done;
+	lsop->ctrl = ctrl;
+	INIT_LIST_HEAD(&lsop->lsreq_list);
+	init_completion(&lsop->ls_done);
+
+	lsreq->rqstdma = dma_map_single(ctrl->dev, lsreq->rqstaddr,
+				  lsreq->rqstlen + lsreq->rsplen,
+				  DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(ctrl->dev, lsreq->rqstdma)) {
+		nvme_fc_ctrl_put(ctrl);
+		dev_err(ctrl->dev,
+			"els request command failed EFAULT.\n");
+		return -EFAULT;
+	}
+	lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	list_add_tail(&lsop->lsreq_list, &ctrl->ls_req_list);
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	ret = ctrl->lport->ops->ls_req(&ctrl->lport->localport,
+					&ctrl->rport->remoteport, lsreq);
+	if (ret) {
+		lsop->ls_error = ret;
+		goto iodone;
+	}
+
+	/*
+	 * No timeout/not interruptible as we need the struct to exist until
+	 * the lldd calls us back. Thus mandate wait until driver calls back.
+	 * lldd responsible for the timeout action
+	 */
+	wait_for_completion(&lsop->ls_done);
+
+iodone:
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	list_del(&lsop->lsreq_list);
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	dma_unmap_single(ctrl->dev, lsreq->rqstdma,
+				  (lsreq->rqstlen + lsreq->rsplen),
+				  DMA_BIDIRECTIONAL);
+
+	nvme_fc_ctrl_put(ctrl);
+
+	if (lsop->ls_error) {
+		dev_err(ctrl->dev,
+			"ls request command failed (%d).\n", lsop->ls_error);
+		return lsop->ls_error;
+	}
+
+	/* ACC or RJT payload ? */
+	if (rjt->w0.ls_cmd == FCNVME_LS_RJT)
+		return -ENXIO;
+
+	return 0;
+}
+
+/* Validation Error indexes into the string table below */
+enum {
+	VERR_NO_ERROR		= 0,
+	VERR_LSACC		= 1,
+	VERR_LSDESC_RQST	= 2,
+	VERR_LSDESC_RQST_LEN	= 3,
+	VERR_ASSOC_ID		= 4,
+	VERR_ASSOC_ID_LEN	= 5,
+	VERR_CONN_ID		= 6,
+	VERR_CONN_ID_LEN	= 7,
+	VERR_CR_ASSOC		= 8,
+	VERR_CR_ASSOC_ACC_LEN	= 9,
+	VERR_CR_CONN		= 10,
+	VERR_CR_CONN_ACC_LEN	= 11,
+	VERR_DISCONN		= 12,
+	VERR_DISCONN_ACC_LEN	= 13,
+};
+
+static char *validation_errors[] = {
+	"OK",
+	"Not LS_ACC",
+	"Not LSDESC_RQST",
+	"Bad LSDESC_RQST Length",
+	"Not Association ID",
+	"Bad Association ID Length",
+	"Not Connection ID",
+	"Bad Connection ID Length",
+	"Not CR_ASSOC Rqst",
+	"Bad CR_ASSOC ACC Length",
+	"Not CR_CONN Rqst",
+	"Bad CR_CONN ACC Length",
+	"Not Disconnect Rqst",
+	"Bad Disconnect ACC Length",
+};
+
+static int
+nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, u16 qsize, u16 ersp_ratio)
+{
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+	struct fcnvme_ls_cr_assoc_rqst *assoc_rqst;
+	struct fcnvme_ls_cr_assoc_acc *assoc_acc;
+	int ret, fcret = 0;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*assoc_rqst) + sizeof(*assoc_acc)), GFP_KERNEL);
+	if (!lsop) {
+		ret = -ENOMEM;
+		goto out_no_memory;
+	}
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	assoc_rqst = (struct fcnvme_ls_cr_assoc_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	assoc_acc = (struct fcnvme_ls_cr_assoc_acc *)&assoc_rqst[1];
+
+	assoc_rqst->w0.ls_cmd = FCNVME_LS_CREATE_ASSOCIATION;
+	assoc_rqst->desc_list_len =
+			cpu_to_be32(sizeof(struct fcnvme_lsdesc_cr_assoc_cmd));
+
+	assoc_rqst->assoc_cmd.desc_tag =
+			cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD);
+	assoc_rqst->assoc_cmd.desc_len =
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_cr_assoc_cmd);
+
+	assoc_rqst->assoc_cmd.ersp_ratio = cpu_to_be16(ersp_ratio);
+	assoc_rqst->assoc_cmd.sqsize = cpu_to_be16(qsize);
+	/* Linux supports only Dynamic controllers */
+	assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff);
+	memcpy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id,
+		min_t(size_t, FCNVME_ASSOC_HOSTID_LEN, sizeof(uuid_be)));
+	strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn,
+		min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE));
+	strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn,
+		min(FCNVME_ASSOC_SUBNQN_LEN, NVMF_NQN_SIZE));
+
+	lsop->queue = queue;
+	lsreq->rqstaddr = assoc_rqst;
+	lsreq->rqstlen = sizeof(*assoc_rqst);
+	lsreq->rspaddr = assoc_acc;
+	lsreq->rsplen = sizeof(*assoc_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	ret = nvme_fc_send_ls_req(ctrl, lsop);
+	if (ret)
+		goto out_free_buffer;
+
+	/* process connect LS completion */
+
+	/* validate the ACC response */
+	if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
+		fcret = VERR_LSACC;
+	if (assoc_acc->hdr.desc_list_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_ls_cr_assoc_acc))
+		fcret = VERR_CR_ASSOC_ACC_LEN;
+	if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+		fcret = VERR_LSDESC_RQST;
+	else if (assoc_acc->hdr.rqst.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_rqst))
+		fcret = VERR_LSDESC_RQST_LEN;
+	else if (assoc_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_ASSOCIATION)
+		fcret = VERR_CR_ASSOC;
+	else if (assoc_acc->associd.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_ASSOC_ID))
+		fcret = VERR_ASSOC_ID;
+	else if (assoc_acc->associd.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_assoc_id))
+		fcret = VERR_ASSOC_ID_LEN;
+	else if (assoc_acc->connectid.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CONN_ID))
+		fcret = VERR_CONN_ID;
+	else if (assoc_acc->connectid.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_conn_id))
+		fcret = VERR_CONN_ID_LEN;
+
+	if (fcret) {
+		ret = -EBADF;
+		dev_err(ctrl->dev,
+			"q %d connect failed: %s\n",
+			queue->qnum, validation_errors[fcret]);
+	} else {
+		ctrl->association_id =
+			be64_to_cpu(assoc_acc->associd.association_id);
+		queue->connection_id =
+			be64_to_cpu(assoc_acc->connectid.connection_id);
+		set_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+	}
+
+out_free_buffer:
+	kfree(lsop);
+out_no_memory:
+	if (ret)
+		dev_err(ctrl->dev,
+			"queue %d connect command failed (%d).\n",
+			queue->qnum, ret);
+	return ret;
+}
+
+static int
+nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
+			u16 qsize, u16 ersp_ratio)
+{
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+	struct fcnvme_ls_cr_conn_rqst *conn_rqst;
+	struct fcnvme_ls_cr_conn_acc *conn_acc;
+	int ret, fcret = 0;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*conn_rqst) + sizeof(*conn_acc)), GFP_KERNEL);
+	if (!lsop) {
+		ret = -ENOMEM;
+		goto out_no_memory;
+	}
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	conn_rqst = (struct fcnvme_ls_cr_conn_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	conn_acc = (struct fcnvme_ls_cr_conn_acc *)&conn_rqst[1];
+
+	conn_rqst->w0.ls_cmd = FCNVME_LS_CREATE_CONNECTION;
+	conn_rqst->desc_list_len = cpu_to_be32(
+				sizeof(struct fcnvme_lsdesc_assoc_id) +
+				sizeof(struct fcnvme_lsdesc_cr_conn_cmd));
+
+	conn_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+	conn_rqst->associd.desc_len =
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_assoc_id);
+	conn_rqst->associd.association_id = cpu_to_be64(ctrl->association_id);
+	conn_rqst->connect_cmd.desc_tag =
+			cpu_to_be32(FCNVME_LSDESC_CREATE_CONN_CMD);
+	conn_rqst->connect_cmd.desc_len =
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_cr_conn_cmd);
+	conn_rqst->connect_cmd.ersp_ratio = cpu_to_be16(ersp_ratio);
+	conn_rqst->connect_cmd.qid  = cpu_to_be16(queue->qnum);
+	conn_rqst->connect_cmd.sqsize = cpu_to_be16(qsize);
+
+	lsop->queue = queue;
+	lsreq->rqstaddr = conn_rqst;
+	lsreq->rqstlen = sizeof(*conn_rqst);
+	lsreq->rspaddr = conn_acc;
+	lsreq->rsplen = sizeof(*conn_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	ret = nvme_fc_send_ls_req(ctrl, lsop);
+	if (ret)
+		goto out_free_buffer;
+
+	/* process connect LS completion */
+
+	/* validate the ACC response */
+	if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
+		fcret = VERR_LSACC;
+	if (conn_acc->hdr.desc_list_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_ls_cr_conn_acc))
+		fcret = VERR_CR_CONN_ACC_LEN;
+	if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+		fcret = VERR_LSDESC_RQST;
+	else if (conn_acc->hdr.rqst.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_rqst))
+		fcret = VERR_LSDESC_RQST_LEN;
+	else if (conn_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_CREATE_CONNECTION)
+		fcret = VERR_CR_CONN;
+	else if (conn_acc->connectid.desc_tag !=
+			cpu_to_be32(FCNVME_LSDESC_CONN_ID))
+		fcret = VERR_CONN_ID;
+	else if (conn_acc->connectid.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_conn_id))
+		fcret = VERR_CONN_ID_LEN;
+
+	if (fcret) {
+		ret = -EBADF;
+		dev_err(ctrl->dev,
+			"q %d connect failed: %s\n",
+			queue->qnum, validation_errors[fcret]);
+	} else {
+		queue->connection_id =
+			be64_to_cpu(conn_acc->connectid.connection_id);
+		set_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+	}
+
+out_free_buffer:
+	kfree(lsop);
+out_no_memory:
+	if (ret)
+		dev_err(ctrl->dev,
+			"queue %d connect command failed (%d).\n",
+			queue->qnum, ret);
+	return ret;
+}
+
+static int
+nvme_fc_disconnect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue)
+{
+	struct fcnvme_ls_disconnect_rqst *discon_rqst;
+	struct fcnvme_ls_disconnect_acc *discon_acc;
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req *lsreq;
+	int ret, qid = queue->qnum, fcret = 0;
+
+	lsop = kzalloc((sizeof(*lsop) +
+			 ctrl->lport->ops->lsrqst_priv_sz +
+			 sizeof(*discon_rqst) + sizeof(*discon_acc)),
+			GFP_KERNEL);
+	if (!lsop) {
+		ret = -ENOMEM;
+		goto out_no_memory;
+	}
+	lsreq = &lsop->ls_req;
+
+	lsreq->private = (void *)&lsop[1];
+	discon_rqst = (struct fcnvme_ls_disconnect_rqst *)
+			(lsreq->private + ctrl->lport->ops->lsrqst_priv_sz);
+	discon_acc = (struct fcnvme_ls_disconnect_acc *)&discon_rqst[1];
+
+	discon_rqst->w0.ls_cmd = FCNVME_LS_DISCONNECT;
+	discon_rqst->desc_list_len = cpu_to_be32(
+				sizeof(struct fcnvme_lsdesc_assoc_id) +
+				sizeof(struct fcnvme_lsdesc_disconn_cmd));
+
+	discon_rqst->associd.desc_tag = cpu_to_be32(FCNVME_LSDESC_ASSOC_ID);
+	discon_rqst->associd.desc_len =
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_assoc_id);
+
+	discon_rqst->associd.association_id = cpu_to_be64(ctrl->association_id);
+
+	discon_rqst->discon_cmd.desc_tag = cpu_to_be32(
+						FCNVME_LSDESC_DISCONN_CMD);
+	discon_rqst->discon_cmd.desc_len =
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_disconn_cmd);
+	discon_rqst->discon_cmd.scope = FCNVME_DISCONN_CONNECTION;
+	discon_rqst->discon_cmd.id = cpu_to_be64(queue->connection_id);
+
+	lsreq->rqstaddr = discon_rqst;
+	lsreq->rqstlen = sizeof(*discon_rqst);
+	lsreq->rspaddr = discon_acc;
+	lsreq->rsplen = sizeof(*discon_acc);
+	lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
+
+	ret = nvme_fc_send_ls_req(ctrl, lsop);
+	if (ret)
+		goto out_free_buffer;
+
+	/* validate the ACC response */
+	if (discon_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
+		fcret = VERR_LSACC;
+	if (discon_acc->hdr.desc_list_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_ls_disconnect_acc))
+		fcret = VERR_DISCONN_ACC_LEN;
+	if (discon_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+		fcret = VERR_LSDESC_RQST;
+	else if (discon_acc->hdr.rqst.desc_len !=
+			FCNVME_LSDESC_LEN(struct fcnvme_lsdesc_rqst))
+		fcret = VERR_LSDESC_RQST_LEN;
+	else if (discon_acc->hdr.rqst.w0.ls_cmd != FCNVME_LS_DISCONNECT)
+		fcret = VERR_DISCONN;
+
+	if (fcret) {
+		/* Log the error but don't report failure */
+		dev_err(ctrl->dev,
+			"queue %d disconnect failed: %s\n",
+			qid, validation_errors[fcret]);
+	}
+
+	queue->connection_id = 0;
+	if (!qid)
+		ctrl->association_id = 0;
+	clear_bit(NVME_FC_Q_CONNECTED, &queue->flags);
+
+out_free_buffer:
+	kfree(lsop);
+out_no_memory:
+	if (ret)
+		dev_err(ctrl->dev,
+			"queue %d connect command failed (%d).\n", qid, ret);
+	return ret;
+}
+
+
+
+/* *********************** NVME Ctrl Routines **************************** */
+
+
+static int
+nvme_fc_reinit_request(void *data, struct request *rq)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+
+	memset(cmdiu, 0, sizeof(*cmdiu));
+	cmdiu->scsi_id = NVME_CMD_SCSI_ID;
+	cmdiu->fc_id = NVME_CMD_FC_ID;
+	cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32));
+	memset(&op->rsp_iu, 0, sizeof(op->rsp_iu));
+
+	return 0;
+}
+
+static void
+__nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl,
+		struct nvme_fc_fcp_op *op)
+{
+	dma_unmap_single(ctrl->lport->dev, op->fcp_req.rspdma,
+				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
+	dma_unmap_single(ctrl->lport->dev, op->fcp_req.cmddma,
+				sizeof(op->cmd_iu), DMA_TO_DEVICE);
+
+	atomic_set(&op->state, FCPOP_STATE_UNINIT);
+}
+
+static void
+nvme_fc_exit_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+
+	return __nvme_fc_exit_request(data, op);
+}
+
+static void
+nvme_fc_exit_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
+	int i;
+
+	for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+		if (atomic_read(&aen_op->state) == FCPOP_STATE_UNINIT)
+			continue;
+		__nvme_fc_exit_request(ctrl, aen_op);
+	}
+}
+
+void
+nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
+{
+	struct nvme_fc_fcp_op *op = fcp_req_to_fcp_op(req);
+	struct request *rq = op->rq;
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	struct nvme_fc_queue *queue = op->queue;
+	struct nvme_completion *cqe = &op->rsp_iu.cqe;
+	int status, llddstatus = freq->status;
+
+	dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
+				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
+
+	/*
+	 * if we aborted the op, the routines that invoked the abort
+	 * set the desired status on the request (namely DNR bits as
+	 * well as NVME_SC_ABORT_REQ). Thus, use those values for
+	 * the llddstatus which gets put into cQEs.
+	 */
+	if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
+		llddstatus = rq->errors;
+
+	/*
+	 * If successful and ERSP, use the returned CQE
+	 *
+	 * Otherwise, there isn't a CQE or it may not have valid content.
+	 * FC-NVME will need to fudge one up. We also need to fudge up
+	 * CQE's for LLDD/transport errors.
+	 */
+
+	/*
+	 * if length of the received response is 0 or 12 and llddstatus is 0,
+	 * then a successful response is assumed.  But, need to create a CQE.
+	 */
+	if (!llddstatus && (!freq->rcv_rsplen ||
+			  (freq->rcv_rsplen == NVME_FC_SIZEOF_ZEROS_RSP))) {
+
+		memset(cqe, 0, sizeof(*cqe));
+		cqe->sq_head = cpu_to_le16(queue->sqhd);
+		cqe->command_id = cpu_to_le16(op->rqno);
+		goto validation_done;
+	}
+
+	/* successful ersp */
+	if ((!llddstatus) &&
+			(freq->rcv_rsplen == sizeof(struct nvme_fc_ersp_iu))) {
+		/* validate it */
+		if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) !=
+				(freq->rcv_rsplen/4)))
+			llddstatus = NVME_SC_FC_FORMAT;
+		else if (unlikely(op->rqno != le16_to_cpu(cqe->command_id)))
+			llddstatus = NVME_SC_FC_CMDID_MISMATCH;
+		else {
+			/* passed validation, use the cqe */
+			/* TODO: fix sqhd - deal with out of order */
+			queue->sqhd = le16_to_cpu(cqe->sq_head);
+			queue->seqno = be32_to_cpu(op->rsp_iu.rsn);
+			goto validation_done;
+		}
+		/* if error - will fall thru below */
+
+	/* if a bad length */
+	} else if (!llddstatus)
+		llddstatus = NVME_SC_FC_FORMAT;
+
+	/* we have validation errors or a lldd/transport error */
+	memset(cqe, 0, sizeof(*cqe));
+	cqe->sq_head = cpu_to_le16(queue->sqhd);
+	cqe->command_id = cpu_to_le16(op->rqno);
+	cqe->status = cpu_to_le16(llddstatus << 1);
+
+validation_done:
+
+	if (!queue->qnum && IS_AEN_COMMAND(cqe->command_id)) {
+		nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
+		return;
+	}
+
+	status = le16_to_cpu(cqe->status);
+	status >>= 1;
+
+	if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
+		memcpy(rq->special, cqe, sizeof(*cqe));
+
+	blk_mq_complete_request(rq, status);
+}
+
+static int
+__nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
+		struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op,
+		struct request *rq, u32 rqno)
+{
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	int ret = 0;
+
+	memset(op, 0, sizeof(*op));
+	op->fcp_req.cmdaddr = &op->cmd_iu;
+	op->fcp_req.cmdlen = sizeof(op->cmd_iu);
+	op->fcp_req.rspaddr = &op->rsp_iu;
+	op->fcp_req.rsplen = sizeof(op->rsp_iu);
+	op->fcp_req.done = nvme_fc_fcpio_done;
+	op->fcp_req.first_sgl = (struct scatterlist *)&op[1];
+	op->fcp_req.private = &op->fcp_req.first_sgl[SG_CHUNK_SIZE];
+	op->ctrl = ctrl;
+	op->queue = queue;
+	op->rq = rq;
+	op->rqno = rqno;
+	init_completion(&op->abort_done);
+
+	cmdiu->scsi_id = NVME_CMD_SCSI_ID;
+	cmdiu->fc_id = NVME_CMD_FC_ID;
+	cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32));
+
+	op->fcp_req.cmddma = dma_map_single(ctrl->lport->dev,
+				&op->cmd_iu, sizeof(op->cmd_iu), DMA_TO_DEVICE);
+	if (dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
+		dev_err(ctrl->dev,
+			"FCP Op failed - cmdiu dma mapping failed.\n");
+		ret = EFAULT;
+		goto out_on_error;
+	}
+	op->fcp_req.rspdma = dma_map_single(ctrl->lport->dev,
+				&op->rsp_iu, sizeof(op->rsp_iu),
+				DMA_FROM_DEVICE);
+	if (dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
+		dev_err(ctrl->dev,
+			"FCP Op failed - rspiu dma mapping failed.\n");
+		ret = EFAULT;
+	}
+
+	atomic_set(&op->state, FCPOP_STATE_IDLE);
+out_on_error:
+	return ret;
+}
+
+static int
+nvme_fc_init_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1];
+
+	return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
+}
+
+static int
+nvme_fc_init_admin_request(void *data, struct request *rq,
+				unsigned int hctx_idx, unsigned int rq_idx,
+				unsigned int numa_node)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_queue *queue = &ctrl->queues[0];
+
+	return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
+}
+
+static int
+nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_fcp_op *aen_op;
+	struct nvme_fc_cmd_iu *cmdiu;
+	struct nvme_command *sqe;
+	int i, ret;
+
+	aen_op = ctrl->aen_ops;
+	for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) {
+		cmdiu = &aen_op->cmd_iu;
+		sqe = &cmdiu->sqe;
+		ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0],
+				aen_op, (struct request *)NULL,
+				(AEN_CMDID_BASE + i));
+		if (ret)
+			return ret;
+
+		memset(sqe, 0, sizeof(*sqe));
+		sqe->common.opcode = nvme_admin_async_event;
+		sqe->common.command_id = AEN_CMDID_BASE + i;
+	}
+	return 0;
+}
+
+
+static inline void
+__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl,
+		unsigned int qidx)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[qidx];
+
+	hctx->driver_data = queue;
+	queue->hctx = hctx;
+}
+
+static int
+nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+
+	__nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1);
+
+	return 0;
+}
+
+static int
+nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_fc_ctrl *ctrl = data;
+
+	__nvme_fc_init_hctx(hctx, ctrl, hctx_idx);
+
+	return 0;
+}
+
+static void
+nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size)
+{
+	struct nvme_fc_queue *queue;
+
+	queue = &ctrl->queues[idx];
+	memset(queue, 0, sizeof(*queue));
+	queue->ctrl = ctrl;
+	queue->qnum = idx;
+	atomic_set(&queue->csn, 1);
+	queue->dev = ctrl->dev;
+
+	if (idx > 0)
+		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+	else
+		queue->cmnd_capsule_len = sizeof(struct nvme_command);
+
+	queue->queue_size = queue_size;
+
+	/*
+	 * Considered whether we should allocate buffers for all SQEs
+	 * and CQEs and dma map them - mapping their respective entries
+	 * into the request structures (kernel vm addr and dma address)
+	 * thus the driver could use the buffers/mappings directly.
+	 * It only makes sense if the LLDD would use them for its
+	 * messaging api. It's very unlikely most adapter api's would use
+	 * a native NVME sqe/cqe. More reasonable if FC-NVME IU payload
+	 * structures were used instead.
+	 */
+}
+
+static void
+nvme_fc_free_queue(struct nvme_fc_queue *queue)
+{
+	struct nvme_fc_ctrl *ctrl = queue->ctrl;
+
+	if (!test_and_clear_bit(NVME_FC_Q_CONNECTED, &queue->flags))
+		return;
+
+	nvme_fc_disconnect_queue(ctrl, queue);
+}
+
+static void
+__nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, unsigned int qidx)
+{
+	if (ctrl->lport->ops->delete_queue)
+		ctrl->lport->ops->delete_queue(&ctrl->lport->localport, qidx,
+				queue->lldd_handle);
+	queue->lldd_handle = NULL;
+}
+
+static void
+nvme_fc_destroy_admin_queue(struct nvme_fc_ctrl *ctrl)
+{
+	__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+	blk_cleanup_queue(ctrl->ctrl.admin_q);
+	blk_mq_free_tag_set(&ctrl->admin_tag_set);
+	nvme_fc_free_queue(&ctrl->queues[0]);
+}
+
+static void
+nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_fc_free_queue(&ctrl->queues[i]);
+}
+
+static int
+__nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl,
+	struct nvme_fc_queue *queue, unsigned int qidx, u16 qsize)
+{
+	int ret = 0;
+
+	queue->lldd_handle = NULL;
+	if (ctrl->lport->ops->create_queue)
+		ret = ctrl->lport->ops->create_queue(&ctrl->lport->localport,
+				qidx, qsize, &queue->lldd_handle);
+
+	return ret;
+}
+
+static void
+nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[ctrl->queue_count - 1];
+	int i;
+
+	for (i = ctrl->queue_count - 1; i >= 1; i--, queue--)
+		__nvme_fc_delete_hw_queue(ctrl, queue, i);
+}
+
+static int
+nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
+{
+	struct nvme_fc_queue *queue = &ctrl->queues[1];
+	int i, j, ret;
+
+	for (i = 1; i < ctrl->queue_count; i++, queue++) {
+		ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize);
+		if (ret) {
+			for (j = i-1; j >= 0; j--)
+				__nvme_fc_delete_hw_queue(ctrl,
+						&ctrl->queues[j], j);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int
+nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
+{
+	int i, ret = 0;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize,
+					(qsize / 5));
+		if (ret)
+			break;
+		ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static void
+nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize);
+}
+
+static void
+nvme_fc_ctrl_free(struct kref *ref)
+{
+	struct nvme_fc_ctrl *ctrl =
+		container_of(ref, struct nvme_fc_ctrl, ref);
+	unsigned long flags;
+
+	WARN_ON(ctrl->state != FCCTRL_DELETED && ctrl->state != FCCTRL_INIT);
+	WARN_ON(!list_empty(&ctrl->ls_req_list));
+
+	if (ctrl->state != FCCTRL_INIT) {
+		if (ctrl->ctrl.tagset) {
+			blk_cleanup_queue(ctrl->ctrl.connect_q);
+			blk_mq_free_tag_set(&ctrl->tag_set);
+			nvme_fc_delete_hw_io_queues(ctrl);
+			nvme_fc_free_io_queues(ctrl);
+		}
+
+		nvme_fc_exit_aen_ops(ctrl);
+
+		nvme_fc_destroy_admin_queue(ctrl);
+
+		/* remove from rport list */
+		spin_lock_irqsave(&ctrl->rport->lock, flags);
+		list_del(&ctrl->ctrl_list);
+		spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+	}
+
+	put_device(ctrl->dev);
+	nvme_fc_rport_put(ctrl->rport);
+	kfree(ctrl->queues);
+	kfree(ctrl);
+}
+
+static void
+nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl)
+{
+	kref_put(&ctrl->ref, nvme_fc_ctrl_free);
+}
+
+static int
+nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
+{
+	return kref_get_unless_zero(&ctrl->ref);
+}
+
+/*
+ * All accesses from nvme core layer done - can now free the
+ * controller. Called after last nvme_put_ctrl() call
+ */
+static void
+nvme_fc_free_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+	unsigned long flags;
+
+	WARN_ON(nctrl != &ctrl->ctrl);
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	/*
+	 * not sure why we have to check for double entries, but other
+	 * transport do
+	 */
+	if (ctrl->state != FCCTRL_DELETING && ctrl->state != FCCTRL_INIT) {
+		spin_unlock_irqrestore(&ctrl->lock, flags);
+		return;
+	}
+
+	if (ctrl->state != FCCTRL_INIT)
+		ctrl->state = FCCTRL_DELETED;
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	nvme_fc_ctrl_put(ctrl);
+}
+
+
+static int
+__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
+{
+	int state;
+
+	state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
+	if (state != FCPOP_STATE_ACTIVE) {
+		atomic_set(&op->state, state);
+		return 1; /* fail */
+	}
+
+	ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
+					&ctrl->rport->remoteport,
+					op->queue->lldd_handle,
+					&op->fcp_req);
+
+	return 0;
+}
+
+enum blk_eh_timer_return
+nvme_fc_timeout(struct request *rq, bool reserved)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	int ret;
+
+	if (reserved)
+		return BLK_EH_RESET_TIMER;
+
+	ret = __nvme_fc_abort_op(ctrl, op);
+	if (ret)
+		/* io wasn't active to abort consider it done */
+		return BLK_EH_HANDLED;
+
+	/* fail with DNR on cmd timeout */
+	rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+
+	wait_for_completion(&op->abort_done);
+
+	/* TODO: force a controller reset */
+
+	reinit_completion(&op->abort_done);
+
+	return BLK_EH_HANDLED;
+}
+
+static int
+nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
+		struct nvme_fc_fcp_op *op)
+{
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+	u32 map_len = nvme_map_len(rq);
+	enum dma_data_direction dir;
+	int ret;
+
+	freq->sg_cnt = 0;
+
+	if (!map_len)
+		return 0;
+
+	freq->sg_table.sgl = freq->first_sgl;
+	ret = sg_alloc_table_chained(&freq->sg_table, rq->nr_phys_segments,
+			freq->sg_table.sgl);
+	if (ret)
+		return -ENOMEM;
+
+	op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
+	WARN_ON(op->nents > rq->nr_phys_segments);
+	dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+	freq->sg_cnt = dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
+				op->nents, dir);
+	if (unlikely(freq->sg_cnt <= 0)) {
+		sg_free_table_chained(&freq->sg_table, true);
+		freq->sg_cnt = 0;
+		return -EFAULT;
+	}
+
+	/*
+	 * TODO: blk_integrity_rq(rq)  for DIF
+	 */
+	return 0;
+}
+
+static void
+nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
+		struct nvme_fc_fcp_op *op)
+{
+	struct nvmefc_fcp_req *freq = &op->fcp_req;
+
+	if (!freq->sg_cnt)
+		return;
+
+	dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
+				((rq_data_dir(rq) == WRITE) ?
+					DMA_TO_DEVICE : DMA_FROM_DEVICE));
+
+	nvme_cleanup_cmd(rq);
+
+	sg_free_table_chained(&freq->sg_table, true);
+
+	freq->sg_cnt = 0;
+}
+
+static int
+nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
+	struct nvme_fc_fcp_op *op, u32 data_len,
+	enum nvmefc_fcp_datadir	io_dir)
+{
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	struct nvme_command *sqe = &cmdiu->sqe;
+	u32 csn;
+	int ret;
+
+	nvme_fc_ctrl_get(ctrl);
+
+	/* format the FC-NVME CMD IU and fcp_req */
+	cmdiu->connection_id = cpu_to_be64(queue->connection_id);
+	csn = atomic_inc_return(&queue->csn);
+	cmdiu->csn = cpu_to_be32(csn);
+	cmdiu->data_len = cpu_to_be32(data_len);
+	switch (io_dir) {
+	case NVMEFC_FCP_WRITE:
+		cmdiu->flags = FCNVME_CMD_FLAGS_WRITE;
+		break;
+	case NVMEFC_FCP_READ:
+		cmdiu->flags = FCNVME_CMD_FLAGS_READ;
+		break;
+	case NVMEFC_FCP_NODATA:
+		cmdiu->flags = 0;
+		break;
+	}
+	op->fcp_req.payload_length = data_len;
+	op->fcp_req.io_dir = io_dir;
+	op->fcp_req.transferred_length = 0;
+	op->fcp_req.rcv_rsplen = 0;
+	op->fcp_req.status = 0;
+	op->fcp_req.sqid = cpu_to_le16(queue->qnum);
+
+	/*
+	 * validate per fabric rules, set fields mandated by fabric spec
+	 * as well as those by FC-NVME spec.
+	 */
+	WARN_ON_ONCE(sqe->common.metadata);
+	WARN_ON_ONCE(sqe->common.dptr.prp1);
+	WARN_ON_ONCE(sqe->common.dptr.prp2);
+	sqe->common.flags |= NVME_CMD_SGL_METABUF;
+
+	/*
+	 * format SQE DPTR field per FC-NVME rules
+	 *    type=data block descr; subtype=offset;
+	 *    offset is currently 0.
+	 */
+	sqe->rw.dptr.sgl.type = NVME_SGL_FMT_OFFSET;
+	sqe->rw.dptr.sgl.length = cpu_to_le32(data_len);
+	sqe->rw.dptr.sgl.addr = 0;
+
+	/* odd that we set the command_id - should come from nvme-fabrics */
+	WARN_ON_ONCE(sqe->common.command_id != cpu_to_le16(op->rqno));
+
+	if (op->rq) {				/* skipped on aens */
+		ret = nvme_fc_map_data(ctrl, op->rq, op);
+		if (ret < 0) {
+			dev_err(queue->ctrl->ctrl.device,
+			     "Failed to map data (%d)\n", ret);
+			nvme_cleanup_cmd(op->rq);
+			nvme_fc_ctrl_put(ctrl);
+			return (ret == -ENOMEM || ret == -EAGAIN) ?
+				BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
+		}
+	}
+
+	dma_sync_single_for_device(ctrl->lport->dev, op->fcp_req.cmddma,
+				  sizeof(op->cmd_iu), DMA_TO_DEVICE);
+
+	atomic_set(&op->state, FCPOP_STATE_ACTIVE);
+
+	if (op->rq)
+		blk_mq_start_request(op->rq);
+
+	ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
+					&ctrl->rport->remoteport,
+					queue->lldd_handle, &op->fcp_req);
+
+	if (ret) {
+		dev_err(ctrl->dev,
+			"Send nvme command failed - lldd returned %d.\n", ret);
+		if (op->rq) {			/* normal request */
+			nvme_fc_unmap_data(ctrl, op->rq, op);
+			nvme_cleanup_cmd(op->rq);
+		}
+		/* else - aen. no cleanup needed */
+
+		nvme_fc_ctrl_put(ctrl);
+
+		if (ret != -EBUSY)
+			return BLK_MQ_RQ_QUEUE_ERROR;
+
+		blk_mq_stop_hw_queues(op->rq->q);
+		blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
+		return BLK_MQ_RQ_QUEUE_BUSY;
+	}
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static int
+nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
+			const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_fc_queue *queue = hctx->driver_data;
+	struct nvme_fc_ctrl *ctrl = queue->ctrl;
+	struct request *rq = bd->rq;
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
+	struct nvme_command *sqe = &cmdiu->sqe;
+	enum nvmefc_fcp_datadir	io_dir;
+	u32 data_len;
+	int ret;
+
+	ret = nvme_setup_cmd(ns, rq, sqe);
+	if (ret)
+		return ret;
+
+	data_len = nvme_map_len(rq);
+	if (data_len)
+		io_dir = ((rq_data_dir(rq) == WRITE) ?
+					NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);
+	else
+		io_dir = NVMEFC_FCP_NODATA;
+
+	return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
+}
+
+static struct blk_mq_tags *
+nvme_fc_tagset(struct nvme_fc_queue *queue)
+{
+	if (queue->qnum == 0)
+		return queue->ctrl->admin_tag_set.tags[queue->qnum];
+
+	return queue->ctrl->tag_set.tags[queue->qnum - 1];
+}
+
+static int
+nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+
+{
+	struct nvme_fc_queue *queue = hctx->driver_data;
+	struct nvme_fc_ctrl *ctrl = queue->ctrl;
+	struct request *req;
+	struct nvme_fc_fcp_op *op;
+
+	req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
+	if (!req) {
+		dev_err(queue->ctrl->ctrl.device,
+			 "tag 0x%x on QNum %#x not found\n",
+			tag, queue->qnum);
+		return 0;
+	}
+
+	op = blk_mq_rq_to_pdu(req);
+
+	if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
+		 (ctrl->lport->ops->poll_queue))
+		ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
+						 queue->lldd_handle);
+
+	return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
+}
+
+static void
+nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg);
+	struct nvme_fc_fcp_op *aen_op;
+	int ret;
+
+	if (aer_idx > NVME_FC_NR_AEN_COMMANDS)
+		return;
+
+	aen_op = &ctrl->aen_ops[aer_idx];
+
+	ret =  nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0,
+					NVMEFC_FCP_NODATA);
+	if (ret)
+		dev_err(ctrl->ctrl.device,
+			"failed async event work [%d]\n", aer_idx);
+}
+
+static void
+nvme_fc_complete_rq(struct request *rq)
+{
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+	struct nvme_fc_ctrl *ctrl = op->ctrl;
+	int error = 0, state;
+
+	state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
+
+	nvme_cleanup_cmd(rq);
+
+	nvme_fc_unmap_data(ctrl, rq, op);
+
+	if (state == FCPOP_STATE_ABORTED)
+		complete(&op->abort_done);
+
+	if (unlikely(rq->errors)) {
+		if (nvme_req_needs_retry(rq, rq->errors)) {
+			nvme_requeue_req(rq);
+			return;
+		}
+
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
+			error = rq->errors;
+		else
+			error = nvme_error_status(rq->errors);
+	}
+
+	nvme_fc_ctrl_put(ctrl);
+
+	blk_mq_end_request(rq, error);
+}
+
+static struct blk_mq_ops nvme_fc_mq_ops = {
+	.queue_rq	= nvme_fc_queue_rq,
+	.complete	= nvme_fc_complete_rq,
+	.init_request	= nvme_fc_init_request,
+	.exit_request	= nvme_fc_exit_request,
+	.reinit_request	= nvme_fc_reinit_request,
+	.init_hctx	= nvme_fc_init_hctx,
+	.poll		= nvme_fc_poll,
+	.timeout	= nvme_fc_timeout,
+};
+
+static struct blk_mq_ops nvme_fc_admin_mq_ops = {
+	.queue_rq	= nvme_fc_queue_rq,
+	.complete	= nvme_fc_complete_rq,
+	.init_request	= nvme_fc_init_admin_request,
+	.exit_request	= nvme_fc_exit_request,
+	.reinit_request	= nvme_fc_reinit_request,
+	.init_hctx	= nvme_fc_init_admin_hctx,
+	.timeout	= nvme_fc_timeout,
+};
+
+static int
+nvme_fc_configure_admin_queue(struct nvme_fc_ctrl *ctrl)
+{
+	u32 segs;
+	int error;
+
+	nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH);
+
+	error = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0],
+				NVME_FC_AQ_BLKMQ_DEPTH,
+				(NVME_FC_AQ_BLKMQ_DEPTH / 4));
+	if (error)
+		return error;
+
+	memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
+	ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
+	ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH;
+	ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
+	ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+					(SG_CHUNK_SIZE *
+						sizeof(struct scatterlist)) +
+					ctrl->lport->ops->fcprqst_priv_sz;
+	ctrl->admin_tag_set.driver_data = ctrl;
+	ctrl->admin_tag_set.nr_hw_queues = 1;
+	ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
+
+	error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
+	if (error)
+		goto out_free_queue;
+
+	ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+	if (IS_ERR(ctrl->ctrl.admin_q)) {
+		error = PTR_ERR(ctrl->ctrl.admin_q);
+		goto out_free_tagset;
+	}
+
+	error = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0,
+				NVME_FC_AQ_BLKMQ_DEPTH);
+	if (error)
+		goto out_cleanup_queue;
+
+	error = nvmf_connect_admin_queue(&ctrl->ctrl);
+	if (error)
+		goto out_delete_hw_queue;
+
+	error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (error) {
+		dev_err(ctrl->ctrl.device,
+			"prop_get NVME_REG_CAP failed\n");
+		goto out_delete_hw_queue;
+	}
+
+	ctrl->ctrl.sqsize =
+		min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
+
+	error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
+	if (error)
+		goto out_delete_hw_queue;
+
+	segs = min_t(u32, NVME_FC_MAX_SEGMENTS,
+			ctrl->lport->ops->max_sgl_segments);
+	ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9);
+
+	error = nvme_init_identify(&ctrl->ctrl);
+	if (error)
+		goto out_delete_hw_queue;
+
+	nvme_start_keep_alive(&ctrl->ctrl);
+
+	return 0;
+
+out_delete_hw_queue:
+	__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
+out_cleanup_queue:
+	blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_free_tagset:
+	blk_mq_free_tag_set(&ctrl->admin_tag_set);
+out_free_queue:
+	nvme_fc_free_queue(&ctrl->queues[0]);
+	return error;
+}
+
+static void
+nvme_fc_kill_request(struct request *req, void *data, bool reserved)
+{
+	struct nvme_ctrl *nctrl = data;
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
+	int status;
+
+	if (!blk_mq_request_started(req))
+		return;
+
+	status = __nvme_fc_abort_op(ctrl, op);
+	if (status) {
+		/* io wasn't active to abort consider it done */
+		/* assume completion path already completing in parallel */
+		return;
+	}
+
+	status = NVME_SC_ABORT_REQ;
+	if (blk_queue_dying(req->q))
+		status |= NVME_SC_DNR;
+	req->errors = status;
+}
+
+
+/*
+ * Called to terminate an association.
+ * Maybe called with association fully in place or partially in place.
+ */
+static void
+nvme_fc_shutdown_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	if (ctrl->queue_count > 1) {
+		nvme_stop_queues(&ctrl->ctrl);
+		blk_mq_tagset_busy_iter(&ctrl->tag_set,
+					nvme_fc_kill_request, &ctrl->ctrl);
+	}
+
+	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
+		nvme_shutdown_ctrl(&ctrl->ctrl);
+
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
+				nvme_fc_kill_request, &ctrl->ctrl);
+}
+
+
+static void
+__nvme_fc_remove_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	nvme_stop_keep_alive(&ctrl->ctrl);
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_fc_shutdown_ctrl(ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+}
+
+static void
+nvme_fc_del_ctrl_work(struct work_struct *work)
+{
+	struct nvme_fc_ctrl *ctrl =
+			container_of(work, struct nvme_fc_ctrl, delete_work);
+
+	__nvme_fc_remove_ctrl(ctrl);
+}
+
+static int
+__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl)
+{
+	ctrl->state = FCCTRL_DELETING;
+
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
+		return -EBUSY;
+
+	if (!queue_work(nvme_fc_wq, &ctrl->delete_work))
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Request from nvme core layer to delete the controller
+ */
+static int
+nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+	struct nvme_fc_rport *rport = ctrl->rport;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rport->lock, flags);
+	ret = __nvme_fc_del_ctrl(ctrl);
+	spin_unlock_irqrestore(&rport->lock, flags);
+	if (ret)
+		return ret;
+
+	flush_work(&ctrl->delete_work);
+
+	return 0;
+}
+
+static int
+nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl)
+{
+	return -EIO;
+}
+
+static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
+	.name			= "fc",
+	.module			= THIS_MODULE,
+	.is_fabrics		= true,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.reset_ctrl		= nvme_fc_reset_nvme_ctrl,
+	.free_ctrl		= nvme_fc_free_nvme_ctrl,
+	.submit_async_event	= nvme_fc_submit_async_event,
+	.delete_ctrl		= nvme_fc_del_nvme_ctrl,
+	.get_subsysnqn		= nvmf_get_subsysnqn,
+	.get_address		= nvmf_get_address,
+};
+
+static int
+nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
+{
+	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+	int ret;
+
+	ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
+	if (ret) {
+		dev_info(ctrl->ctrl.device,
+			"set_queue_count failed: %d\n", ret);
+		return ret;
+	}
+
+	ctrl->queue_count = opts->nr_io_queues + 1;
+	if (!opts->nr_io_queues)
+		return 0;
+
+	dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n",
+			opts->nr_io_queues);
+
+	nvme_fc_init_io_queues(ctrl);
+
+	memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
+	ctrl->tag_set.ops = &nvme_fc_mq_ops;
+	ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
+	ctrl->tag_set.reserved_tags = 1; /* fabric connect */
+	ctrl->tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
+					(SG_CHUNK_SIZE *
+						sizeof(struct scatterlist)) +
+					ctrl->lport->ops->fcprqst_priv_sz;
+	ctrl->tag_set.driver_data = ctrl;
+	ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
+	ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
+
+	ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
+	if (ret)
+		return ret;
+
+	ctrl->ctrl.tagset = &ctrl->tag_set;
+
+	ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
+	if (IS_ERR(ctrl->ctrl.connect_q)) {
+		ret = PTR_ERR(ctrl->ctrl.connect_q);
+		goto out_free_tag_set;
+	}
+
+	ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+	if (ret)
+		goto out_cleanup_blk_queue;
+
+	ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.opts->queue_size);
+	if (ret)
+		goto out_delete_hw_queues;
+
+	return 0;
+
+out_delete_hw_queues:
+	nvme_fc_delete_hw_io_queues(ctrl);
+out_cleanup_blk_queue:
+	nvme_stop_keep_alive(&ctrl->ctrl);
+	blk_cleanup_queue(ctrl->ctrl.connect_q);
+out_free_tag_set:
+	blk_mq_free_tag_set(&ctrl->tag_set);
+	nvme_fc_free_io_queues(ctrl);
+
+	/* force put free routine to ignore io queues */
+	ctrl->ctrl.tagset = NULL;
+
+	return ret;
+}
+
+
+static struct nvme_ctrl *
+__nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
+	struct nvme_fc_lport *lport, struct nvme_fc_rport *rport)
+{
+	struct nvme_fc_ctrl *ctrl;
+	unsigned long flags;
+	int ret;
+	bool changed;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return ERR_PTR(-ENOMEM);
+	ctrl->ctrl.opts = opts;
+	INIT_LIST_HEAD(&ctrl->ctrl_list);
+	INIT_LIST_HEAD(&ctrl->ls_req_list);
+	ctrl->lport = lport;
+	ctrl->rport = rport;
+	ctrl->dev = lport->dev;
+	ctrl->state = FCCTRL_INIT;
+
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	ctrl->cnum = nvme_fc_ctrl_cnt++;
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
+	if (ret)
+		goto out_free_ctrl;
+
+	get_device(ctrl->dev);
+	kref_init(&ctrl->ref);
+	atomic_set(&ctrl->myref, 1);
+
+	INIT_WORK(&ctrl->delete_work, nvme_fc_del_ctrl_work);
+	spin_lock_init(&ctrl->lock);
+
+	/* io queue count */
+	ctrl->queue_count = min_t(unsigned int,
+				opts->nr_io_queues,
+				lport->ops->max_hw_queues);
+	opts->nr_io_queues = ctrl->queue_count;	/* so opts has valid value */
+	ctrl->queue_count++;	/* +1 for admin queue */
+
+	ctrl->ctrl.sqsize = opts->queue_size - 1;
+	ctrl->ctrl.kato = opts->kato;
+
+	ret = -ENOMEM;
+	ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue),
+				GFP_KERNEL);
+	if (!ctrl->queues)
+		goto out_uninit_ctrl;
+
+	ret = nvme_fc_configure_admin_queue(ctrl);
+	if (ret)
+		goto out_uninit_ctrl;
+
+	/* sanity checks */
+
+	/* FC-NVME supports 64-byte SQE only */
+	if (ctrl->ctrl.ioccsz != 4) {
+		dev_err(ctrl->ctrl.device, "ioccsz %d is not supported!\n",
+				ctrl->ctrl.ioccsz);
+		goto out_remove_admin_queue;
+	}
+	/* FC-NVME supports 16-byte CQE only */
+	if (ctrl->ctrl.iorcsz != 1) {
+		dev_err(ctrl->ctrl.device, "iorcsz %d is not supported!\n",
+				ctrl->ctrl.iorcsz);
+		goto out_remove_admin_queue;
+	}
+	/* FC-NVME does not have other data in the capsule */
+	if (ctrl->ctrl.icdoff) {
+		dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n",
+				ctrl->ctrl.icdoff);
+		goto out_remove_admin_queue;
+	}
+
+	/* FC-NVME supports normal SGL Data Block Descriptors */
+
+	if (opts->queue_size > ctrl->ctrl.maxcmd) {
+		/* warn if maxcmd is lower than queue_size */
+		dev_warn(ctrl->ctrl.device,
+			"queue_size %zu > ctrl maxcmd %u, reducing "
+			"to queue_size\n",
+			opts->queue_size, ctrl->ctrl.maxcmd);
+		opts->queue_size = ctrl->ctrl.maxcmd;
+	}
+
+	ret = nvme_fc_init_aen_ops(ctrl);
+	if (ret)
+		goto out_exit_aen_ops;
+
+	if (ctrl->queue_count > 1) {
+		ret = nvme_fc_create_io_queues(ctrl);
+		if (ret)
+			goto out_exit_aen_ops;
+	}
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	ctrl->state = FCCTRL_ACTIVE;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
+	WARN_ON_ONCE(!changed);
+
+	dev_info(ctrl->ctrl.device,
+		"NVME-FC{%d}: new ctrl: NQN \"%s\" (%p)\n",
+		ctrl->cnum, ctrl->ctrl.opts->subsysnqn, &ctrl);
+
+	kref_get(&ctrl->ctrl.kref);
+
+	spin_lock_irqsave(&rport->lock, flags);
+	list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
+	spin_unlock_irqrestore(&rport->lock, flags);
+
+	if (opts->nr_io_queues) {
+		nvme_queue_scan(&ctrl->ctrl);
+		nvme_queue_async_events(&ctrl->ctrl);
+	}
+
+	return &ctrl->ctrl;
+
+out_exit_aen_ops:
+	nvme_fc_exit_aen_ops(ctrl);
+out_remove_admin_queue:
+	nvme_stop_keep_alive(&ctrl->ctrl);
+	nvme_fc_destroy_admin_queue(ctrl);
+out_uninit_ctrl:
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+	if (ret > 0)
+		ret = -EIO;
+	/* exit via here will follow ref point callbacks to free */
+	return ERR_PTR(ret);
+
+out_free_ctrl:
+	/* exit via here doesn't follow ref points */
+	nvme_fc_rport_put(rport);
+	kfree(ctrl);
+	return ERR_PTR(ret);
+}
+
+enum {
+	FCT_TRADDR_ERR		= 0,
+	FCT_TRADDR_WWNN		= 1 << 0,
+	FCT_TRADDR_WWPN		= 1 << 1,
+};
+
+struct nvmet_fc_traddr {
+	u64	nn;
+	u64	pn;
+};
+
+static const match_table_t traddr_opt_tokens = {
+	{ FCT_TRADDR_WWNN,	"nn-%s"		},
+	{ FCT_TRADDR_WWPN,	"pn-%s"		},
+	{ FCT_TRADDR_ERR,	NULL		}
+};
+
+static int
+nvme_fc_parse_address(struct nvmet_fc_traddr *traddr, char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *o, *p;
+	int token, ret = 0;
+	u64 token64;
+
+	options = o = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	while ((p = strsep(&o, ":\n")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, traddr_opt_tokens, args);
+		switch (token) {
+		case FCT_TRADDR_WWNN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->nn = token64;
+			break;
+		case FCT_TRADDR_WWPN:
+			if (match_u64(args, &token64)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			traddr->pn = token64;
+			break;
+		default:
+			pr_warn("unknown traddr token or missing value '%s'\n",
+					p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+out:
+	kfree(options);
+	return ret;
+}
+
+static struct nvme_ctrl *
+nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
+{
+	struct nvme_fc_lport *lport;
+	struct nvme_fc_rport *rport;
+	struct nvmet_fc_traddr laddr = { 0L, 0L };
+	struct nvmet_fc_traddr raddr = { 0L, 0L };
+	unsigned long flags;
+	int ret;
+
+	ret = nvme_fc_parse_address(&raddr, opts->traddr);
+	if (ret || !raddr.nn || !raddr.pn)
+		return ERR_PTR(-EINVAL);
+
+	ret = nvme_fc_parse_address(&laddr, opts->host_traddr);
+	if (ret || !laddr.nn || !laddr.pn)
+		return ERR_PTR(-EINVAL);
+
+	/* find the host and remote ports to connect together */
+	spin_lock_irqsave(&nvme_fc_lock, flags);
+	list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
+		if ((lport->localport.node_name != laddr.nn) ||
+		    (lport->localport.port_name != laddr.pn))
+			continue;
+
+		list_for_each_entry(rport, &lport->endp_list, endp_list) {
+			if ((rport->remoteport.node_name != raddr.nn) ||
+			    (rport->remoteport.port_name != raddr.pn))
+				continue;
+
+			nvme_fc_rport_get(rport);
+
+			spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+			return __nvme_fc_create_ctrl(dev, opts, lport, rport);
+		}
+	}
+	spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+	return ERR_PTR(-ENOENT);
+}
+
+
+static struct nvmf_transport_ops nvme_fc_transport = {
+	.name		= "fc",
+	.required_opts	= NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
+	.allowed_opts	= NVMF_OPT_RECONNECT_DELAY,
+	.create_ctrl	= nvme_fc_create_ctrl,
+};
+
+static int __init nvme_fc_init_module(void)
+{
+	nvme_fc_wq = create_workqueue("nvme_fc_wq");
+	if (!nvme_fc_wq)
+		return -ENOMEM;
+
+	nvmf_register_transport(&nvme_fc_transport);
+	return 0;
+}
+
+static void __exit nvme_fc_exit_module(void)
+{
+	/* sanity check - all lports should be removed */
+	if (!list_empty(&nvme_fc_lport_list))
+		pr_warn("%s: localport list not empty\n", __func__);
+
+	nvmf_unregister_transport(&nvme_fc_transport);
+
+	destroy_workqueue(nvme_fc_wq);
+}
+
+module_init(nvme_fc_init_module);
+module_exit(nvme_fc_exit_module);
+
+MODULE_LICENSE("GPL v2");
+
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support
  2016-10-07 23:09 [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support James Smart
@ 2016-10-12  9:24 ` Christoph Hellwig
  2016-10-12 17:43   ` J Freyensee
  2016-10-12 19:54   ` James Smart
  0 siblings, 2 replies; 5+ messages in thread
From: Christoph Hellwig @ 2016-10-12  9:24 UTC (permalink / raw)


> +	  To configure a NVMe over Fabrics controller use the nvme-cli tool
> +	  from https://github.com/linux-nvme/nvme-cli.

Talking about nvme-cli, can you post the FC support patches for that
as well?

> +#define NVME_FC_NR_AEN_COMMANDS	1
> +#define NVME_FC_AQ_BLKMQ_DEPTH	\
> +	(NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
> +#define AEN_CMDID_BASE		(NVME_FC_AQ_BLKMQ_DEPTH + 1)
> +#define IS_AEN_COMMAND(command_id) \
> +	((command_id) >= AEN_CMDID_BASE)

This should be an inline.  But given that it's only used once I'd
actually prefer to just kill the helper and opencode it like in the
other NVMe drivers.

> +enum nvme_fcctrl_state {
> +        FCCTRL_INIT             = 0,
> +	FCCTRL_ACTIVE           = 1,
> +	FCCTRL_DELETING         = 2,
> +	FCCTRL_DELETED          = 3,
> +};

Please integrate the FC controller state with the generic controller
state in core.c


> +static LIST_HEAD(nvme_fc_lport_list);
> +static u32 nvme_fc_local_port_cnt;
> +static u32 nvme_fc_ctrl_cnt;

Any reason not to use the ida allocator for the port number?
a 32-bit integer can actually overflow fairly easily, so having
a proper allocator and reusing ids instead of wrapping around would
seem a lot safer.

> +{
> +	struct nvme_fc_lport *lport = localport_to_lport(portptr);
> +	unsigned long flags;
> +	u32 lnum;
> +
> +	if (!portptr)
> +		return -EINVAL;
> +
> +	lnum = portptr->port_num;
> +
> +	spin_lock_irqsave(&nvme_fc_lock, flags);

Just curious: where and why do you call into the NVMe code
from IRQ conect?

> +	/*
> +	 * If successful and ERSP, use the returned CQE
> +	 *
> +	 * Otherwise, there isn't a CQE or it may not have valid content.
> +	 * FC-NVME will need to fudge one up. We also need to fudge up
> +	 * CQE's for LLDD/transport errors.
> +	 */

So the big question here is why?  So far the model we use in the core
NVMe code is that as long as the lower layer returns a negative Linux
errno the CQE isn't valid, and only if we return a valid NVMe status
we need a CQE.  I suspect if you followed that convention you could
avoid a lot of these synthetic CQE games.

> +			/* passed validation, use the cqe */
> +			/* TODO: fix sqhd - deal with out of order */
> +			queue->sqhd = le16_to_cpu(cqe->sq_head);
> +			queue->seqno = be32_to_cpu(op->rsp_iu.rsn);
> +			goto validation_done;

We don't really care about the sq_head at all in Linux, so I don't
think we need to bother with maintaining the SQ head.

> +static int
> +__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
> +{
> +	int state;
> +
> +	state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
> +	if (state != FCPOP_STATE_ACTIVE) {
> +		atomic_set(&op->state, state);
> +		return 1; /* fail */
> +	}
> +
> +	ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
> +					&ctrl->rport->remoteport,
> +					op->queue->lldd_handle,
> +					&op->fcp_req);
> +
> +	return 0;

Not complaining about the code, but we really need to figure out how
to fit transport aborts into the NVMe architecture.  We can take this
offline or to the NVMe working group, though.

> +enum blk_eh_timer_return
> +nvme_fc_timeout(struct request *rq, bool reserved)
> +{
> +	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> +	struct nvme_fc_ctrl *ctrl = op->ctrl;
> +	int ret;
> +
> +	if (reserved)
> +		return BLK_EH_RESET_TIMER;
> +
> +	ret = __nvme_fc_abort_op(ctrl, op);
> +	if (ret)
> +		/* io wasn't active to abort consider it done */
> +		return BLK_EH_HANDLED;
> +
> +	/* fail with DNR on cmd timeout */
> +	rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
> +
> +	wait_for_completion(&op->abort_done);

The blk timeout handle isn't really supposed to be blocking.  The
idea is to perform the EH action and then complete things in the
completion handler of the abort command.

> +	/* find the host and remote ports to connect together */
> +	spin_lock_irqsave(&nvme_fc_lock, flags);
> +	list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
> +		if ((lport->localport.node_name != laddr.nn) ||
> +		    (lport->localport.port_name != laddr.pn))

Nitpick: no need for the inner braces.

> +			if ((rport->remoteport.node_name != raddr.nn) ||
> +			    (rport->remoteport.port_name != raddr.pn))

Same here.

> +				continue;
> +
> +			nvme_fc_rport_get(rport);

nvme_fc_rport_get is a kref_get_unless_zero, so you need to check
the return value.

> +static void __exit nvme_fc_exit_module(void)
> +{
> +	/* sanity check - all lports should be removed */
> +	if (!list_empty(&nvme_fc_lport_list))
> +		pr_warn("%s: localport list not empty\n", __func__);

Nitpick: use WARN_ON?

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support
  2016-10-12  9:24 ` Christoph Hellwig
@ 2016-10-12 17:43   ` J Freyensee
  2016-10-12 19:54   ` James Smart
  1 sibling, 0 replies; 5+ messages in thread
From: J Freyensee @ 2016-10-12 17:43 UTC (permalink / raw)

On Wed, 2016-10-12@02:24 -0700, Christoph Hellwig wrote:
> > 
> > +	??To configure a NVMe over Fabrics controller use the
> > nvme-cli tool
> > +	??from https://github.com/linux-nvme/nvme-cli.
> 
> Talking about nvme-cli, can you post the FC support patches for that
> as well?

Yes, this would be good, esp since Christoph and I have been writing
man pages and mention "fc" in one of the parameters.

(In fact, providing an fc example in the man pages would go a long way
to enabling fc people using nvme-cli).

I'll try and give your series a look after this patchset is resubmitted
with Christoph's feedback.

J

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support
  2016-10-12  9:24 ` Christoph Hellwig
  2016-10-12 17:43   ` J Freyensee
@ 2016-10-12 19:54   ` James Smart
  2016-10-13  9:51     ` Christoph Hellwig
  1 sibling, 1 reply; 5+ messages in thread
From: James Smart @ 2016-10-12 19:54 UTC (permalink / raw)

On 10/12/2016 2:24 AM, Christoph Hellwig wrote:
>> +	  To configure a NVMe over Fabrics controller use the nvme-cli tool
>> +	  from https://github.com/linux-nvme/nvme-cli.
> Talking about nvme-cli, can you post the FC support patches for that
> as well?

Yes - getting to it. I also need to ensure that we have the udev hooks 
to force auto-connections.

>
>> +#define NVME_FC_NR_AEN_COMMANDS	1
>> +#define NVME_FC_AQ_BLKMQ_DEPTH	\
>> +	(NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS)
>> +#define AEN_CMDID_BASE		(NVME_FC_AQ_BLKMQ_DEPTH + 1)
>> +#define IS_AEN_COMMAND(command_id) \
>> +	((command_id) >= AEN_CMDID_BASE)
> This should be an inline.  But given that it's only used once I'd
> actually prefer to just kill the helper and opencode it like in the
> other NVMe drivers.

ok

>
>> +enum nvme_fcctrl_state {
>> +        FCCTRL_INIT             = 0,
>> +	FCCTRL_ACTIVE           = 1,
>> +	FCCTRL_DELETING         = 2,
>> +	FCCTRL_DELETED          = 3,
>> +};
> Please integrate the FC controller state with the generic controller
> state in core.c

I'm not sure about this. I believe there are things about the fc nvme 
ctlr struct, which has to wait on interactions from the FC LLDD before 
it cleans things up - that are not part of the nvme ctrl state.  Thus 
the distinction between DELETING (e.g. stopping, waiting for nvme to 
call us back after it's last put) vs DELETED (we've received NVME's 
callback, so can now act on our own ref counts).

I will look at it again.

I've noticed RDMA and loop cut a lot of corners on teardown of commands, 
etc - as they are mostly memory references with very quick hw context 
teardown that isn't necessarily the case with FC where we have real hw 
contexts in the LLDD and perhaps line traffic. I would assume most of 
the references in the LLDD come back to the blk request thus nvme ctrl, 
so your assumption that they are close in state to each other should be 
true, but I need to review the paths again.

>
>
>> +static LIST_HEAD(nvme_fc_lport_list);
>> +static u32 nvme_fc_local_port_cnt;
>> +static u32 nvme_fc_ctrl_cnt;
> Any reason not to use the ida allocator for the port number?
> a 32-bit integer can actually overflow fairly easily, so having
> a proper allocator and reusing ids instead of wrapping around would
> seem a lot safer.

No reason - just that it didn't seem necessary.  I'll look at ida.

>
>> +{
>> +	struct nvme_fc_lport *lport = localport_to_lport(portptr);
>> +	unsigned long flags;
>> +	u32 lnum;
>> +
>> +	if (!portptr)
>> +		return -EINVAL;
>> +
>> +	lnum = portptr->port_num;
>> +
>> +	spin_lock_irqsave(&nvme_fc_lock, flags);
> Just curious: where and why do you call into the NVMe code
> from IRQ conect?

Well, its not specified in the api. It's up to the context of the LLD 
when it calls the nvmefc_fcp_req->done routine, which is 
nvme_fc_fcpio_done() routine, which you're looking at here. The routine 
does make the blk_mq_complete_request() call without changing context.  
So whether it's true IRQ or not - depends on the driver and whether it 
calls from the real IRQ or from a DPC-like thread.  Any recommendations ?

>
>> +	/*
>> +	 * If successful and ERSP, use the returned CQE
>> +	 *
>> +	 * Otherwise, there isn't a CQE or it may not have valid content.
>> +	 * FC-NVME will need to fudge one up. We also need to fudge up
>> +	 * CQE's for LLDD/transport errors.
>> +	 */
> So the big question here is why?  So far the model we use in the core
> NVMe code is that as long as the lower layer returns a negative Linux
> errno the CQE isn't valid, and only if we return a valid NVMe status
> we need a CQE.  I suspect if you followed that convention you could
> avoid a lot of these synthetic CQE games.

FC-NVME uses existing hw definitions for FCP RSP's which allow a short 
all zeros's response frame to be passed on the wire for good completions 
(lots of specific rules in order to do so).  As such we don't have CQE 
content, so the transport has to create it (SQ ID, SQHD, Command Id, and 
status=0).  So we can't avoid this fudging. NVME fabrics group was well 
aware of this.

As for negative responses from LLDD/transport errors - true, although 
the fudging is minimal. At best there's 1 synthetic CQE game to be avoided.

>
>> +			/* passed validation, use the cqe */
>> +			/* TODO: fix sqhd - deal with out of order */
>> +			queue->sqhd = le16_to_cpu(cqe->sq_head);
>> +			queue->seqno = be32_to_cpu(op->rsp_iu.rsn);
>> +			goto validation_done;
> We don't really care about the sq_head at all in Linux, so I don't
> think we need to bother with maintaining the SQ head.

I know. It doesn't care as it always sizes the SQs to the maximum number 
of outstanding commands possible (io request q tags count). So it 
doesn't care about sqhd pacing vs posting cmds to any queue. But I had 
heard that there was potentially a desire to change that. I would hate 
to have the nvme layer change and not know the transport had to as well.

FC-NVME also has a requirement to deliver ERSP's in order so that SQHD 
goes base in "order". So it's goodness as that linux doesn't care as I 
can ignore this requirement. But, I will at least want a comment in the 
code to state why that functionality isn't present.

>
>> +static int
>> +__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
>> +{
>> +	int state;
>> +
>> +	state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
>> +	if (state != FCPOP_STATE_ACTIVE) {
>> +		atomic_set(&op->state, state);
>> +		return 1; /* fail */
>> +	}
>> +
>> +	ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
>> +					&ctrl->rport->remoteport,
>> +					op->queue->lldd_handle,
>> +					&op->fcp_req);
>> +
>> +	return 0;
> Not complaining about the code, but we really need to figure out how
> to fit transport aborts into the NVMe architecture.  We can take this
> offline or to the NVMe working group, though.

Agree.  Its a tricky subject as simply changing completion status isn't 
great as the target side may not be aware of that change.

>
>> +enum blk_eh_timer_return
>> +nvme_fc_timeout(struct request *rq, bool reserved)
>> +{
>> +	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
>> +	struct nvme_fc_ctrl *ctrl = op->ctrl;
>> +	int ret;
>> +
>> +	if (reserved)
>> +		return BLK_EH_RESET_TIMER;
>> +
>> +	ret = __nvme_fc_abort_op(ctrl, op);
>> +	if (ret)
>> +		/* io wasn't active to abort consider it done */
>> +		return BLK_EH_HANDLED;
>> +
>> +	/* fail with DNR on cmd timeout */
>> +	rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
>> +
>> +	wait_for_completion(&op->abort_done);
> The blk timeout handle isn't really supposed to be blocking.  The
> idea is to perform the EH action and then complete things in the
> completion handler of the abort command.

Ok - will address.

>
>> +	/* find the host and remote ports to connect together */
>> +	spin_lock_irqsave(&nvme_fc_lock, flags);
>> +	list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
>> +		if ((lport->localport.node_name != laddr.nn) ||
>> +		    (lport->localport.port_name != laddr.pn))
> Nitpick: no need for the inner braces.
>
>> +			if ((rport->remoteport.node_name != raddr.nn) ||
>> +			    (rport->remoteport.port_name != raddr.pn))
> Same here.
>
>> +				continue;
>> +
>> +			nvme_fc_rport_get(rport);
> nvme_fc_rport_get is a kref_get_unless_zero, so you need to check
> the return value.

ok

>
>> +static void __exit nvme_fc_exit_module(void)
>> +{
>> +	/* sanity check - all lports should be removed */
>> +	if (!list_empty(&nvme_fc_lport_list))
>> +		pr_warn("%s: localport list not empty\n", __func__);
> Nitpick: use WARN_ON?

sure.

-- james

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support
  2016-10-12 19:54   ` James Smart
@ 2016-10-13  9:51     ` Christoph Hellwig
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2016-10-13  9:51 UTC (permalink / raw)


On Wed, Oct 12, 2016@12:54:02PM -0700, James Smart wrote:
> > > +enum nvme_fcctrl_state {
> > > +        FCCTRL_INIT             = 0,
> > > +	FCCTRL_ACTIVE           = 1,
> > > +	FCCTRL_DELETING         = 2,
> > > +	FCCTRL_DELETED          = 3,
> > > +};
> > Please integrate the FC controller state with the generic controller
> > state in core.c
> 
> I'm not sure about this. I believe there are things about the fc nvme ctlr
> struct, which has to wait on interactions from the FC LLDD before it cleans
> things up - that are not part of the nvme ctrl state.  Thus the distinction
> between DELETING (e.g. stopping, waiting for nvme to call us back after it's
> last put) vs DELETED (we've received NVME's callback, so can now act on our
> own ref counts).

I would much prefer to just have an additional state in the common
state machine, with comments on the specific, e.g.
NVME_CTRL_RECONNECTING is not used by PCIe.  That being said the state
machine could use some more documentation, I'll look into it.

> I've noticed RDMA and loop cut a lot of corners on teardown of commands, etc
> - as they are mostly memory references with very quick hw context teardown
> that isn't necessarily the case with FC where we have real hw contexts in
> the LLDD and perhaps line traffic. I would assume most of the references in
> the LLDD come back to the blk request thus nvme ctrl, so your assumption
> that they are close in state to each other should be true, but I need to
> review the paths again.

I'd be happy to get a review of the RDMA code in that respect.  Note
that PCIe shouldn't be all that different from FC in this respect.

> Well, its not specified in the api. It's up to the context of the LLD when
> it calls the nvmefc_fcp_req->done routine, which is nvme_fc_fcpio_done()
> routine, which you're looking at here. The routine does make the
> blk_mq_complete_request() call without changing context.  So whether it's
> true IRQ or not - depends on the driver and whether it calls from the real
> IRQ or from a DPC-like thread.  Any recommendations ?

But nvme_fc_fcpio_done doesn't actually use any locks, that's why
I'm wondering.  I think it's perfectly fine to call the fast path
completion handler from irq context, but I'd try to avoid dragging
irq safe locks into the rest of the code.

> FC-NVME uses existing hw definitions for FCP RSP's which allow a short all
> zeros's response frame to be passed on the wire for good completions (lots
> of specific rules in order to do so).  As such we don't have CQE content, so
> the transport has to create it (SQ ID, SQHD, Command Id, and status=0).  So
> we can't avoid this fudging. NVME fabrics group was well aware of this.

I'm fine with that concept.  I think you're doing way to much work in
the implementation, though..

Unless it's a REQ_TYPE_DRV_PRIV request, and rq->special points to the
CQE there is no point in faking up a CQE at all, as no one is going to
look at it ever. This is especially true for the fast path case you
mention above.  See the untested incremental patch at the end of the
mail on how I'd handle this.

> FC-NVME also has a requirement to deliver ERSP's in order so that SQHD goes
> base in "order". So it's goodness as that linux doesn't care as I can ignore
> this requirement. But, I will at least want a comment in the code to state
> why that functionality isn't present.

Sure, comments are always good..


diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 35ea164..67984572 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -974,7 +974,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
 	struct nvme_fc_queue *queue = op->queue;
 	struct nvme_completion *cqe = &op->rsp_iu.cqe;
-	int status, llddstatus = freq->status;
+	int status;
 
 	dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
 				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
@@ -983,73 +983,65 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	 * if we aborted the op, the routines that invoked the abort
 	 * set the desired status on the request (namely DNR bits as
 	 * well as NVME_SC_ABORT_REQ). Thus, use those values for
-	 * the llddstatus which gets put into cQEs.
+	 * the status which gets put into CQEs.
 	 */
 	if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
-		llddstatus = rq->errors;
-
-	/*
-	 * If successful and ERSP, use the returned CQE
-	 *
-	 * Otherwise, there isn't a CQE or it may not have valid content.
-	 * FC-NVME will need to fudge one up. We also need to fudge up
-	 * CQE's for LLDD/transport errors.
-	 */
-
-	/*
-	 * if length of the received response is 0 or 12 and llddstatus is 0,
-	 * then a successful response is assumed.  But, need to create a CQE.
-	 */
-	if (!llddstatus && (!freq->rcv_rsplen ||
-			  (freq->rcv_rsplen == NVME_FC_SIZEOF_ZEROS_RSP))) {
-
-		memset(cqe, 0, sizeof(*cqe));
-		cqe->sq_head = cpu_to_le16(queue->sqhd);
-		cqe->command_id = cpu_to_le16(op->rqno);
-		goto validation_done;
-	}
-
-	/* successful ersp */
-	if ((!llddstatus) &&
-			(freq->rcv_rsplen == sizeof(struct nvme_fc_ersp_iu))) {
-		/* validate it */
+		status = rq->errors;
+	else
+		status = freq->status;
+
+	if (status)
+		goto done;
+
+	switch (freq->rcv_rsplen) {
+	case 0:
+	case NVME_FC_SIZEOF_ZEROS_RSP:
+		/* fasth path for successful operations without result */
+		if (unlikely(rq->cmd_type == REQ_TYPE_DRV_PRIV &&
+				rq->special)) {
+			struct nvme_completion *cqe = rq->special;
+
+			memset(cqe, 0, sizeof(*cqe));
+			cqe->sq_head = cpu_to_le16(queue->sqhd);
+			cqe->command_id = cpu_to_le16(op->rqno);
+			cqe->status = cpu_to_le16(status << 1);
+		}
+		break;
+	case sizeof(struct nvme_fc_ersp_iu):
+		/* successful ersp */
 		if (unlikely(be16_to_cpu(op->rsp_iu.iu_len) !=
-				(freq->rcv_rsplen/4)))
-			llddstatus = NVME_SC_FC_FORMAT;
-		else if (unlikely(op->rqno != le16_to_cpu(cqe->command_id)))
-			llddstatus = NVME_SC_FC_CMDID_MISMATCH;
-		else {
-			/* passed validation, use the cqe */
-			/* TODO: fix sqhd - deal with out of order */
-			queue->sqhd = le16_to_cpu(cqe->sq_head);
-			queue->seqno = be32_to_cpu(op->rsp_iu.rsn);
-			goto validation_done;
+				freq->rcv_rsplen / 4)) {
+			status = -EIO;
+			goto done;
 		}
-		/* if error - will fall thru below */
 
-	/* if a bad length */
-	} else if (!llddstatus)
-		llddstatus = NVME_SC_FC_FORMAT;
+		if (unlikely(op->rqno != le16_to_cpu(cqe->command_id))) {
+			status = -EIO;
+			break;
+		}
 
-	/* we have validation errors or a lldd/transport error */
-	memset(cqe, 0, sizeof(*cqe));
-	cqe->sq_head = cpu_to_le16(queue->sqhd);
-	cqe->command_id = cpu_to_le16(op->rqno);
-	cqe->status = cpu_to_le16(llddstatus << 1);
+		/* passed validation, use the cqe */
+		/* TODO: fix sqhd - deal with out of order */
+		queue->sqhd = le16_to_cpu(cqe->sq_head);
+		queue->seqno = be32_to_cpu(op->rsp_iu.rsn);
+	
+		if (!queue->qnum && IS_AEN_COMMAND(cqe->command_id)) {
+			nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
+			return;
+		}
 
-validation_done:
+		if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
+			memcpy(rq->special, cqe, sizeof(*cqe));
 
-	if (!queue->qnum && IS_AEN_COMMAND(cqe->command_id)) {
-		nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
-		return;
+		status = le16_to_cpu(cqe->status) >> 1;
+		break;
+	default:
+		/* invalid size */
+		status = -EIO;
+		break;
 	}
 
-	status = le16_to_cpu(cqe->status);
-	status >>= 1;
-
-	if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
-		memcpy(rq->special, cqe, sizeof(*cqe));
-
+done:
 	blk_mq_complete_request(rq, status);
 }
 

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2016-10-13  9:51 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-10-07 23:09 [PATCH 5/7 v2] nvme-fabrics: Add host FC transport support James Smart
2016-10-12  9:24 ` Christoph Hellwig
2016-10-12 17:43   ` J Freyensee
2016-10-12 19:54   ` James Smart
2016-10-13  9:51     ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.