* [PATCH v2 1/3] NVMe: Rename nvme.c to nvme-core.c @ 2012-10-31 17:52 Vishal Verma 2012-10-31 17:52 ` [PATCH v2 2/3] NVMe: Move dev, ns, iod structs to nvme.c Vishal Verma 2012-10-31 17:52 ` [PATCH v2 3/3] NVMe: Add nvme-scsi.c Vishal Verma 0 siblings, 2 replies; 3+ messages in thread From: Vishal Verma @ 2012-10-31 17:52 UTC (permalink / raw) In preparation for adding nvme-scsi.c It is preferable to retain the module name 'nvme' Signed-off-by: Vishal Verma <vishal.l.verma at intel.com> --- drivers/block/Makefile | 1 + drivers/block/nvme-core.c | 1804 +++++++++++++++++++++++++++++++++++++++++++++ drivers/block/nvme.c | 1804 --------------------------------------------- 3 files changed, 1805 insertions(+), 1804 deletions(-) create mode 100644 drivers/block/nvme-core.c delete mode 100644 drivers/block/nvme.c diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 349539a..3f8a5e8 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,4 +41,5 @@ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o +nvme-y := nvme-core.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c new file mode 100644 index 0000000..7df794f --- /dev/null +++ b/drivers/block/nvme-core.c @@ -0,0 +1,1804 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/nvme.h> +#include <linux/bio.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kdev_t.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/pci.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/version.h> + +#define NVME_Q_DEPTH 1024 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define NVME_MINORS 64 +#define NVME_IO_TIMEOUT (5 * HZ) +#define ADMIN_TIMEOUT (60 * HZ) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u32 max_hw_sectors; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + wait_queue_head_t sq_full; + wait_queue_t sq_cong_wait; + struct bio_list sq_cong; + u32 __iomem *q_db; + u16 q_depth; + u16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 cq_phase; + unsigned long cmdid_data[]; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); +} + +typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, + struct nvme_completion *); + +struct nvme_cmd_info { + nvme_completion_fn fn; + void *ctx; + unsigned long timeout; +}; + +static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +{ + return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; +} + +/** + * alloc_cmdid() - Allocate a Command ID + * @nvmeq: The queue that will be used for this command + * @ctx: A pointer that will be passed to the handler + * @handler: The function to call on completion + * + * Allocate a Command ID for a queue. The data passed in will + * be passed to the completion handler. This is implemented by using + * the bottom two bits of the ctx pointer to store the handler ID. + * Passing in a pointer that's not 4-byte aligned will cause a BUG. + * We can change this if it becomes a problem. + * + * May be called with local interrupts disabled and the q_lock held, + * or with interrupts enabled and no locks held. + */ +static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + int cmdid; + + do { + cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); + if (cmdid >= depth) + return -EBUSY; + } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + + info[cmdid].fn = handler; + info[cmdid].ctx = ctx; + info[cmdid].timeout = jiffies + timeout; + return cmdid; +} + +static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, + nvme_completion_fn handler, unsigned timeout) +{ + int cmdid; + wait_event_killable(nvmeq->sq_full, + (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); + return (cmdid < 0) ? -EINTR : cmdid; +} + +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) +#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) + +static void special_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_FLUSH) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(&dev->pci_dev->dev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(&dev->pci_dev->dev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + + dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + + if (cmdid >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_COMPLETED; + clear_bit(cmdid, nvmeq->cmdid_data); + wake_up(&nvmeq->sq_full); + return ctx; +} + +static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, + nvme_completion_fn *fn) +{ + void *ctx; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + if (fn) + *fn = info[cmdid].fn; + ctx = info[cmdid].ctx; + info[cmdid].fn = special_completion; + info[cmdid].ctx = CMD_CTX_CANCELLED; + return ctx; +} + +static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +{ + return dev->queues[get_cpu() + 1]; +} + +static void put_nvmeq(struct nvme_queue *nvmeq) +{ + put_cpu(); +} + +/** + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + u16 tail; + spin_lock_irqsave(&nvmeq->q_lock, flags); + tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (++tail == nvmeq->q_depth) + tail = 0; + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + + return 0; +} + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; + +static __le64 **iod_list(struct nvme_iod *iod) +{ + return ((void *)iod) + iod->offset; +} + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size) +{ + unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static struct nvme_iod * +nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(struct scatterlist) * nseg, gfp); + + if (iod) { + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + } + + return iod; +} + +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +{ + const int last_prp = PAGE_SIZE / 8 - 1; + int i; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + kfree(iod); +} + +static void requeue_bio(struct nvme_dev *dev, struct bio *bio) +{ + struct nvme_queue *nvmeq = get_nvmeq(dev); + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + put_nvmeq(nvmeq); + wake_up_process(nvme_thread); +} + +static void bio_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_iod *iod = ctx; + struct bio *bio = iod->private; + u16 status = le16_to_cpup(&cqe->status) >> 1; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + nvme_free_iod(dev, iod); + if (status) { + bio_endio(bio, -EIO); + } else if (bio->bi_vcnt > bio->bi_idx) { + requeue_bio(dev, bio); + } else { + bio_endio(bio, 0); + } +} + +/* length is in bytes. gfp flags indicates whether we may sleep. */ +static int nvme_setup_prps(struct nvme_dev *dev, + struct nvme_common_command *cmd, struct nvme_iod *iod, + int total_len, gfp_t gfp) +{ + struct dma_pool *pool; + int length = total_len; + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + int offset = offset_in_page(dma_addr); + __le64 *prp_list; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma; + int nprps, i; + + cmd->prp1 = cpu_to_le64(dma_addr); + length -= (PAGE_SIZE - offset); + if (length <= 0) + return total_len; + + dma_len -= (PAGE_SIZE - offset); + if (dma_len) { + dma_addr += (PAGE_SIZE - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= PAGE_SIZE) { + cmd->prp2 = cpu_to_le64(dma_addr); + return total_len; + } + + nprps = DIV_ROUND_UP(length, PAGE_SIZE); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + cmd->prp2 = cpu_to_le64(dma_addr); + iod->npages = -1; + return (total_len - length) + PAGE_SIZE; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + cmd->prp2 = cpu_to_le64(prp_dma); + i = 0; + for (;;) { + if (i == PAGE_SIZE / 8) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= PAGE_SIZE; + dma_addr += PAGE_SIZE; + length -= PAGE_SIZE; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return total_len; +} + +/* NVMe scatterlists require no holes in the virtual address */ +#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ + (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) + +static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, + struct bio *bio, enum dma_data_direction dma_dir, int psegs) +{ + struct bio_vec *bvec, *bvprv = NULL; + struct scatterlist *sg = NULL; + int i, old_idx, length = 0, nsegs = 0; + + sg_init_table(iod->sg, psegs); + old_idx = bio->bi_idx; + bio_for_each_segment(bvec, bio, i) { + if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { + sg->length += bvec->bv_len; + } else { + if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) + break; + sg = sg ? sg + 1 : iod->sg; + sg_set_page(sg, bvec->bv_page, bvec->bv_len, + bvec->bv_offset); + nsegs++; + } + length += bvec->bv_len; + bvprv = bvec; + } + bio->bi_idx = i; + iod->nents = nsegs; + sg_mark_end(sg); + if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { + bio->bi_idx = old_idx; + return -ENOMEM; + } + return length; +} + +static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +{ + int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, + special_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + return cmdid; + + return nvme_submit_flush(nvmeq, ns, cmdid); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio) +{ + struct nvme_command *cmnd; + struct nvme_iod *iod; + enum dma_data_direction dma_dir; + int cmdid, length, result = -ENOMEM; + u16 control; + u32 dsmgmt; + int psegs = bio_phys_segments(ns->queue, bio); + + if ((bio->bi_rw & REQ_FLUSH) && psegs) { + result = nvme_submit_flush_data(nvmeq, ns); + if (result) + return result; + } + + iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); + if (!iod) + goto nomem; + iod->private = bio; + + result = -EBUSY; + cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); + if (unlikely(cmdid < 0)) + goto free_iod; + + if ((bio->bi_rw & REQ_FLUSH) && !psegs) + return nvme_submit_flush(nvmeq, ns, cmdid); + + control = 0; + if (bio->bi_rw & REQ_FUA) + control |= NVME_RW_FUA; + if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + dsmgmt = 0; + if (bio->bi_rw & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + if (bio_data_dir(bio)) { + cmnd->rw.opcode = nvme_cmd_write; + dma_dir = DMA_TO_DEVICE; + } else { + cmnd->rw.opcode = nvme_cmd_read; + dma_dir = DMA_FROM_DEVICE; + } + + result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); + if (result < 0) + goto free_iod; + length = result; + + cmnd->rw.command_id = cmdid; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, + GFP_ATOMIC); + cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + + bio->bi_sector += length >> 9; + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; + + free_iod: + nvme_free_iod(nvmeq->dev, iod); + nomem: + return result; +} + +/* + * NB: return value of non-zero would mean that we were a stacking driver. + * make_request must always succeed. + */ +static int nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + int result = -EBUSY; + + spin_lock_irq(&nvmeq->q_lock); + if (bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + } + + spin_unlock_irq(&nvmeq->q_lock); + put_nvmeq(nvmeq); + + return 0; +} + +static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, phase; + + head = nvmeq->cq_head; + phase = nvmeq->cq_phase; + + for (;;) { + void *ctx; + nvme_completion_fn fn; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != phase) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + phase = !phase; + } + + ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + fn(nvmeq->dev, ctx, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + return IRQ_NONE; + + writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); + nvmeq->cq_head = head; + nvmeq->cq_phase = phase; + + return IRQ_HANDLED; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + result = nvme_process_cq(nvmeq); + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + +static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +{ + spin_lock_irq(&nvmeq->q_lock); + cancel_cmdid(nvmeq, cmdid, NULL); + spin_unlock_irq(&nvmeq->q_lock); +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_dev *dev, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, + struct nvme_command *cmd, u32 *result, unsigned timeout) +{ + int cmdid; + struct sync_cmd_info cmdinfo; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, + timeout); + if (cmdid < 0) + return cmdid; + cmd->common.command_id = cmdid; + + set_current_state(TASK_KILLABLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); + + if (cmdinfo.status == -EINTR) { + nvme_abort_command(nvmeq, cmdid); + return -EINTR; + } + + if (result) + *result = cmdinfo.result; + + return cmdinfo.status; +} + +static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + int status; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + int status; + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + status = nvme_submit_admin_cmd(dev, &c, NULL); + if (status) + return -EIO; + return 0; +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +static int nvme_set_features(struct nvme_dev *dev, unsigned fid, + unsigned dword11, dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +/** + * nvme_cancel_ios - Cancel outstanding I/Os + * @queue: The queue to cancel I/Os on + * @timeout: True to only cancel I/Os which have timed out + */ +static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +{ + int depth = nvmeq->q_depth - 1; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + unsigned long now = jiffies; + int cmdid; + + for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { + void *ctx; + nvme_completion_fn fn; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, + }; + + if (timeout && !time_after(now, info[cmdid].timeout)) + continue; + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + ctx = cancel_cmdid(nvmeq, cmdid, &fn); + fn(nvmeq->dev, ctx, &cqe); + } +} + +static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static void nvme_free_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + int vector = dev->entry[nvmeq->cq_vector].vector; + + spin_lock_irq(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + spin_unlock_irq(&nvmeq->q_lock); + + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); + + /* Don't tell the adapter to delete the admin queue */ + if (qid) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + nvme_free_queue_mem(nvmeq); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int vector) +{ + struct device *dmadev = &dev->pci_dev->dev; + unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * + sizeof(struct nvme_cmd_info)); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + init_waitqueue_head(&nvmeq->sq_full); + init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); + bio_list_init(&nvmeq->sq_cong); + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; + nvmeq->q_depth = depth; + nvmeq->cq_vector = vector; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, + name, nvmeq); + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_DISABLED | IRQF_SHARED, name, nvmeq); +} + +static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, + int qid, int cq_size, int vector) +{ + int result; + struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + + if (!nvmeq) + return ERR_PTR(-ENOMEM); + + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + goto free_nvmeq; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = queue_request_irq(dev, nvmeq, "nvme"); + if (result < 0) + goto release_sq; + + return nvmeq; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + free_nvmeq: + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); + return ERR_PTR(result); +} + +static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result = 0; + u32 aqa; + u64 cap; + unsigned long timeout; + struct nvme_queue *nvmeq; + + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; + dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + + writel(0, &dev->bar->cc); + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(dev->ctrl_config, &dev->bar->cc); + + cap = readq(&dev->bar->cap); + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + dev->db_stride = NVME_CAP_STRIDE(cap); + + while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { + msleep(100); + if (fatal_signal_pending(current)) + result = -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting initialisation\n"); + result = -ENODEV; + } + } + + if (result) { + nvme_free_queue_mem(nvmeq); + return result; + } + + result = queue_request_irq(dev, nvmeq, "nvme admin"); + dev->queues[0] = nvmeq; + return result; +} + +static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) +{ + int i, err, count, nents, offset; + struct scatterlist *sg; + struct page **pages; + struct nvme_iod *iod; + + if (addr & 3) + return ERR_PTR(-EINVAL); + if (!length) + return ERR_PTR(-EINVAL); + + offset = offset_in_page(addr); + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + + iod = nvme_alloc_iod(count, length, GFP_KERNEL); + sg = iod->sg; + sg_init_table(sg, count); + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(int, length, PAGE_SIZE - offset), offset); + length -= (PAGE_SIZE - offset); + offset = 0; + } + sg_mark_end(&sg[i - 1]); + iod->nents = count; + + err = -ENOMEM; + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!nents) + goto free_iod; + + kfree(pages); + return iod; + + free_iod: + kfree(iod); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); + kfree(pages); + return ERR_PTR(err); +} + +static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod) +{ + int i; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq; + struct nvme_user_io io; + struct nvme_command c; + unsigned length; + int status; + struct nvme_iod *iod; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + length = (io.nblocks + 1) << ns->lba_shift; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); + break; + default: + return -EINVAL; + } + + if (IS_ERR(iod)) + return PTR_ERR(iod); + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); + c.rw.reftag = io.reftag; + c.rw.apptag = io.apptag; + c.rw.appmask = io.appmask; + /* XXX: metadata */ + length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); + + nvmeq = get_nvmeq(dev); + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep preemption + * disabled. We may be preempted at any point, and be rescheduled + * to a different CPU. That will cause cacheline bouncing, but no + * additional races since q_lock already protects against other CPUs. + */ + put_nvmeq(nvmeq); + if (length != (io.nblocks + 1) << ns->lba_shift) + status = -ENOMEM; + else + status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); + + nvme_unmap_user_pages(dev, io.opcode & 1, iod); + nvme_free_iod(dev, iod); + return status; +} + +static int nvme_user_admin_cmd(struct nvme_dev *dev, + struct nvme_admin_cmd __user *ucmd) +{ + struct nvme_admin_cmd cmd; + struct nvme_command c; + int status, length; + struct nvme_iod *uninitialized_var(iod); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + length = cmd.data_len; + if (cmd.data_len) { + iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, + length); + if (IS_ERR(iod)) + return PTR_ERR(iod); + length = nvme_setup_prps(dev, &c.common, iod, length, + GFP_KERNEL); + } + + if (length != cmd.data_len) + status = -ENOMEM; + else + status = nvme_submit_admin_cmd(dev, &c, &cmd.result); + + if (cmd.data_len) { + nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); + nvme_free_iod(dev, iod); + } + + if (!status && copy_to_user(&ucmd->result, &cmd.result, + sizeof(cmd.result))) + status = -EFAULT; + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, +}; + +static void nvme_resubmit_bios(struct nvme_queue *nvmeq) +{ + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + if (nvme_submit_bio_queue(nvmeq, ns, bio)) { + bio_list_add_head(&nvmeq->sq_cong, bio); + break; + } + if (bio_list_empty(&nvmeq->sq_cong)) + remove_wait_queue(&nvmeq->sq_full, + &nvmeq->sq_cong_wait); + } +} + +static int nvme_kthread(void *data) +{ + struct nvme_dev *dev; + + while (!kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + int i; + for (i = 0; i < dev->queue_count; i++) { + struct nvme_queue *nvmeq = dev->queues[i]; + if (!nvmeq) + continue; + spin_lock_irq(&nvmeq->q_lock); + if (nvme_process_cq(nvmeq)) + printk("process_cq did something\n"); + nvme_cancel_ios(nvmeq, true); + nvme_resubmit_bios(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + } + } + spin_unlock(&dev_list_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + return 0; +} + +static DEFINE_IDA(nvme_index_ida); + +static int nvme_get_ns_idx(void) +{ + int index, error; + + do { + if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) + return -1; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_index_ida, &index); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + index = -1; + return index; +} + +static void nvme_put_ns_idx(int index) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_index_ida, index); + spin_unlock(&dev_list_lock); +} + +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, + struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int lbaf; + + if (rt->attributes & NVME_LBART_ATTRIB_HIDE) + return NULL; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + ns->queue = blk_alloc_queue(GFP_KERNEL); + if (!ns->queue) + goto out_free_ns; + ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); +/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ + blk_queue_make_request(ns->queue, nvme_make_request); + ns->dev = dev; + ns->queue->queuedata = ns; + + disk = alloc_disk(NVME_MINORS); + if (!disk) + goto out_free_queue; + ns->ns_id = nsid; + ns->disk = disk; + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (dev->max_hw_sectors) + blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + + disk->major = nvme_major; + disk->minors = NVME_MINORS; + disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = &dev->pci_dev->dev; + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + return ns; + + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); + return NULL; +} + +static void nvme_ns_free(struct nvme_ns *ns) +{ + int index = ns->disk->first_minor / NVME_MINORS; + put_disk(ns->disk); + nvme_put_ns_idx(index); + blk_cleanup_queue(ns->queue); + kfree(ns); +} + +static int set_queue_count(struct nvme_dev *dev, int count) +{ + int status; + u32 result; + u32 q_count = (count - 1) | ((count - 1) << 16); + + status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return -EIO; + return min(result & 0xffff, result >> 16) + 1; +} + +static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) +{ + int result, cpu, i, nr_io_queues, db_bar_size, q_depth; + + nr_io_queues = num_online_cpus(); + result = set_queue_count(dev, nr_io_queues); + if (result < 0) + return result; + if (result < nr_io_queues) + nr_io_queues = result; + + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + + db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); + if (db_bar_size > 8192) { + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), + db_bar_size); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->queues[0]->q_db = dev->dbs; + } + + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].entry = i; + for (;;) { + result = pci_enable_msix(dev->pci_dev, dev->entry, + nr_io_queues); + if (result == 0) { + break; + } else if (result > 0) { + nr_io_queues = result; + continue; + } else { + nr_io_queues = 1; + break; + } + } + + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); + /* XXX: handle failure here */ + + cpu = cpumask_first(cpu_online_mask); + for (i = 0; i < nr_io_queues; i++) { + irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); + cpu = cpumask_next(cpu, cpu_online_mask); + } + + q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, + NVME_Q_DEPTH); + for (i = 0; i < nr_io_queues; i++) { + dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); + if (IS_ERR(dev->queues[i + 1])) + return PTR_ERR(dev->queues[i + 1]); + dev->queue_count++; + } + + for (; i < num_possible_cpus(); i++) { + int target = i % rounddown_pow_of_two(dev->queue_count - 1); + dev->queues[i + 1] = dev->queues[target + 1]; + } + + return 0; +} + +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_free_queue(dev, i); +} + +static int __devinit nvme_dev_add(struct nvme_dev *dev) +{ + int res, nn, i; + struct nvme_ns *ns, *next; + struct nvme_id_ctrl *ctrl; + struct nvme_id_ns *id_ns; + void *mem; + dma_addr_t dma_addr; + + res = nvme_setup_io_queues(dev); + if (res) + return res; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, + GFP_KERNEL); + + res = nvme_identify(dev, 0, 1, dma_addr); + if (res) { + res = -EIO; + goto out_free; + } + + ctrl = mem; + nn = le32_to_cpup(&ctrl->nn); + memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + if (ctrl->mdts) { + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); + } + + id_ns = mem; + for (i = 1; i <= nn; i++) { + res = nvme_identify(dev, i, 0, dma_addr); + if (res) + continue; + + if (id_ns->ncap == 0) + continue; + + res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, + dma_addr + 4096, NULL); + if (res) + continue; + + ns = nvme_alloc_ns(dev, i, mem, mem + 4096); + if (ns) + list_add_tail(&ns->list, &dev->namespaces); + } + list_for_each_entry(ns, &dev->namespaces, list) + add_disk(ns->disk); + + goto out; + + out_free: + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + nvme_ns_free(ns); + } + + out: + dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); + return res; +} + +static int nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + del_gendisk(ns->disk); + nvme_ns_free(ns); + } + + nvme_free_queues(dev); + + return 0; +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + struct device *dmadev = &dev->pci_dev->dev; + dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_dev *dev) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + dev->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_dev *dev) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, dev->instance); + spin_unlock(&dev_list_lock); +} + +static int __devinit nvme_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + int bars, result = -ENOMEM; + struct nvme_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), + GFP_KERNEL); + if (!dev->entry) + goto free; + dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), + GFP_KERNEL); + if (!dev->queues) + goto free; + + if (pci_enable_device_mem(pdev)) + goto free; + pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable; + + INIT_LIST_HEAD(&dev->namespaces); + dev->pci_dev = pdev; + pci_set_drvdata(pdev, dev); + dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + result = nvme_set_instance(dev); + if (result) + goto disable; + + dev->entry[0].vector = pdev->irq; + + result = nvme_setup_prp_pools(dev); + if (result) + goto disable_msix; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) { + result = -ENOMEM; + goto disable_msix; + } + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + dev->queue_count++; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + result = nvme_dev_add(dev); + if (result) + goto delete; + + return 0; + + delete: + spin_lock(&dev_list_lock); + list_del(&dev->node); + spin_unlock(&dev_list_lock); + + nvme_free_queues(dev); + unmap: + iounmap(dev->bar); + disable_msix: + pci_disable_msix(pdev); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + disable: + pci_disable_device(pdev); + pci_release_regions(pdev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void __devexit nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_remove(dev); + pci_disable_msix(pdev); + iounmap(dev->bar); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); + pci_disable_device(pdev); + pci_release_regions(pdev); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +/* These functions are yet to be implemented */ +#define nvme_error_detected NULL +#define nvme_dump_registers NULL +#define nvme_link_reset NULL +#define nvme_slot_reset NULL +#define nvme_error_resume NULL +#define nvme_suspend NULL +#define nvme_resume NULL + +static struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .mmio_enabled = nvme_dump_registers, + .link_reset = nvme_link_reset, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = __devexit_p(nvme_remove), + .suspend = nvme_suspend, + .resume = nvme_resume, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + if (IS_ERR(nvme_thread)) + return PTR_ERR(nvme_thread); + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + goto kill_kthread; + else if (result > 0) + nvme_major = result; + + result = pci_register_driver(&nvme_driver); + if (result) + goto unregister_blkdev; + return 0; + + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + kill_kthread: + kthread_stop(nvme_thread); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + unregister_blkdev(nvme_major, "nvme"); + kthread_stop(nvme_thread); +} + +MODULE_AUTHOR("Matthew Wilcox <willy at linux.intel.com>"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("0.8"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c deleted file mode 100644 index 7df794f..0000000 --- a/drivers/block/nvme.c +++ /dev/null @@ -1,1804 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include <linux/nvme.h> -#include <linux/bio.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/idr.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kthread.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/poison.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/version.h> - -#define NVME_Q_DEPTH 1024 -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) -#define ADMIN_TIMEOUT (60 * HZ) - -static int nvme_major; -module_param(nvme_major, int, 0); - -static int use_threaded_interrupts; -module_param(use_threaded_interrupts, int, 0); - -static DEFINE_SPINLOCK(dev_list_lock); -static LIST_HEAD(dev_list); -static struct task_struct *nvme_thread; - -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - -/* - * An NVM Express queue. Each device has at least two (one for admin - * commands and one for I/O commands). - */ -struct nvme_queue { - struct device *q_dmadev; - struct nvme_dev *dev; - spinlock_t q_lock; - struct nvme_command *sq_cmds; - volatile struct nvme_completion *cqes; - dma_addr_t sq_dma_addr; - dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - u32 __iomem *q_db; - u16 q_depth; - u16 cq_vector; - u16 sq_head; - u16 sq_tail; - u16 cq_head; - u16 cq_phase; - unsigned long cmdid_data[]; -}; - -/* - * Check we didin't inadvertently grow the command struct - */ -static inline void _nvme_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); - BUILD_BUG_ON(sizeof(struct nvme_features) != 64); - BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); - BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); -} - -typedef void (*nvme_completion_fn)(struct nvme_dev *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - unsigned long timeout; -}; - -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) -{ - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; -} - -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. - */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; - - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); - - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - return cmdid; -} - -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) -{ - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? -EINTR : cmdid; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) - -static void special_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_FLUSH) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(&dev->pci_dev->dev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(&dev->pci_dev->dev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - - dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) -{ - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - - if (cmdid >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); - return ctx; -} - -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) -{ - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) -{ - return dev->queues[get_cpu() + 1]; -} - -static void put_nvmeq(struct nvme_queue *nvmeq) -{ - put_cpu(); -} - -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * - * Safe to use from interrupt context - */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - tail = nvmeq->sq_tail; - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq->q_depth) - tail = 0; - writel(tail, nvmeq->q_db); - nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - - return 0; -} - -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - -static __le64 **iod_list(struct nvme_iod *iod) -{ - return ((void *)iod) + iod->offset; -} - -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size) -{ - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); -} - -static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) { - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - } - - return iod; -} - -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) -{ - const int last_prp = PAGE_SIZE / 8 - 1; - int i; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma = iod->first_dma; - - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); - for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; - } - kfree(iod); -} - -static void requeue_bio(struct nvme_dev *dev, struct bio *bio) -{ - struct nvme_queue *nvmeq = get_nvmeq(dev); - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - put_nvmeq(nvmeq); - wake_up_process(nvme_thread); -} - -static void bio_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; - u16 status = le16_to_cpup(&cqe->status) >> 1; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_free_iod(dev, iod); - if (status) { - bio_endio(bio, -EIO); - } else if (bio->bi_vcnt > bio->bi_idx) { - requeue_bio(dev, bio); - } else { - bio_endio(bio, 0); - } -} - -/* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) -{ - struct dma_pool *pool; - int length = total_len; - struct scatterlist *sg = iod->sg; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = offset_in_page(dma_addr); - __le64 *prp_list; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma; - int nprps, i; - - cmd->prp1 = cpu_to_le64(dma_addr); - length -= (PAGE_SIZE - offset); - if (length <= 0) - return total_len; - - dma_len -= (PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= PAGE_SIZE) { - cmd->prp2 = cpu_to_le64(dma_addr); - return total_len; - } - - nprps = DIV_ROUND_UP(length, PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->npages = 0; - } else { - pool = dev->prp_page_pool; - iod->npages = 1; - } - - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) { - cmd->prp2 = cpu_to_le64(dma_addr); - iod->npages = -1; - return (total_len - length) + PAGE_SIZE; - } - list[0] = prp_list; - iod->first_dma = prp_dma; - cmd->prp2 = cpu_to_le64(prp_dma); - i = 0; - for (;;) { - if (i == PAGE_SIZE / 8) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) - return total_len - length; - list[iod->npages++] = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - BUG_ON(dma_len < 0); - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - return total_len; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec *bvec, *bvprv = NULL; - struct scatterlist *sg = NULL; - int i, old_idx, length = 0, nsegs = 0; - - sg_init_table(iod->sg, psegs); - old_idx = bio->bi_idx; - bio_for_each_segment(bvec, bio, i) { - if (bvprv && BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) { - sg->length += bvec->bv_len; - } else { - if (bvprv && BIOVEC_NOT_VIRT_MERGEABLE(bvprv, bvec)) - break; - sg = sg ? sg + 1 : iod->sg; - sg_set_page(sg, bvec->bv_page, bvec->bv_len, - bvec->bv_offset); - nsegs++; - } - length += bvec->bv_len; - bvprv = bvec; - } - bio->bi_idx = i; - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(dev, iod->sg, iod->nents, dma_dir) == 0) { - bio->bi_idx = old_idx; - return -ENOMEM; - } - return length; -} - -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.command_id = cmdid; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) -{ - int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, - special_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - return nvme_submit_flush(nvmeq, ns, cmdid); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) -{ - struct nvme_command *cmnd; - struct nvme_iod *iod; - enum dma_data_direction dma_dir; - int cmdid, length, result = -ENOMEM; - u16 control; - u32 dsmgmt; - int psegs = bio_phys_segments(ns->queue, bio); - - if ((bio->bi_rw & REQ_FLUSH) && psegs) { - result = nvme_submit_flush_data(nvmeq, ns); - if (result) - return result; - } - - iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); - if (!iod) - goto nomem; - iod->private = bio; - - result = -EBUSY; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - goto free_iod; - - if ((bio->bi_rw & REQ_FLUSH) && !psegs) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) - control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - memset(cmnd, 0, sizeof(*cmnd)); - if (bio_data_dir(bio)) { - cmnd->rw.opcode = nvme_cmd_write; - dma_dir = DMA_TO_DEVICE; - } else { - cmnd->rw.opcode = nvme_cmd_read; - dma_dir = DMA_FROM_DEVICE; - } - - result = nvme_map_bio(nvmeq->q_dmadev, iod, bio, dma_dir, psegs); - if (result < 0) - goto free_iod; - length = result; - - cmnd->rw.command_id = cmdid; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length, - GFP_ATOMIC); - cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); - cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1); - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - - bio->bi_sector += length >> 9; - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; - - free_iod: - nvme_free_iod(nvmeq->dev, iod); - nomem: - return result; -} - -/* - * NB: return value of non-zero would mean that we were a stacking driver. - * make_request must always succeed. - */ -static int nvme_make_request(struct request_queue *q, struct bio *bio) -{ - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - spin_lock_irq(&nvmeq->q_lock); - if (bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); - - return 0; -} - -static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) -{ - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - for (;;) { - void *ctx; - nvme_completion_fn fn; - struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) - break; - nvmeq->sq_head = le16_to_cpu(cqe.sq_head); - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); - fn(nvmeq->dev, ctx, &cqe); - } - - /* If the controller ignores the cq head doorbell and continuously - * writes to the queue, it is theoretically possible to wrap around - * the queue twice and mistakenly return IRQ_NONE. Linux only - * requires that 0.1% of your interrupts are handled, so this isn't - * a big problem. - */ - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) - return IRQ_NONE; - - writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; - - return IRQ_HANDLED; -} - -static irqreturn_t nvme_irq(int irq, void *data) -{ - irqreturn_t result; - struct nvme_queue *nvmeq = data; - spin_lock(&nvmeq->q_lock); - result = nvme_process_cq(nvmeq); - spin_unlock(&nvmeq->q_lock); - return result; -} - -static irqreturn_t nvme_irq_check(int irq, void *data) -{ - struct nvme_queue *nvmeq = data; - struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; - if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) - return IRQ_NONE; - return IRQ_WAKE_THREAD; -} - -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) -{ - spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); - spin_unlock_irq(&nvmeq->q_lock); -} - -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_dev *dev, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) -{ - int cmdid; - struct sync_cmd_info cmdinfo; - - cmdinfo.task = current; - cmdinfo.status = -EINTR; - - cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion, - timeout); - if (cmdid < 0) - return cmdid; - cmd->common.command_id = cmdid; - - set_current_state(TASK_KILLABLE); - nvme_submit_cmd(nvmeq, cmd); - schedule(); - - if (cmdinfo.status == -EINTR) { - nvme_abort_command(nvmeq, cmdid); - return -EINTR; - } - - if (result) - *result = cmdinfo.result; - - return cmdinfo.status; -} - -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) -{ - return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); -} - -static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) -{ - int status; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(id); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - int status; - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; - - memset(&c, 0, sizeof(c)); - c.create_cq.opcode = nvme_admin_create_cq; - c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); - c.create_cq.cqid = cpu_to_le16(qid); - c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - int status; - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; - - memset(&c, 0, sizeof(c)); - c.create_sq.opcode = nvme_admin_create_sq; - c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); - c.create_sq.sqid = cpu_to_le16(qid); - c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_sq.sq_flags = cpu_to_le16(flags); - c.create_sq.cqid = cpu_to_le16(qid); - - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; -} - -static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); -} - -static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); -} - -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, - dma_addr_t dma_addr) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.prp1 = cpu_to_le64(dma_addr); - c.identify.cns = cpu_to_le32(cns); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) -{ - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; - - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, - }; - - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq->dev, ctx, &cqe); - } -} - -static void nvme_free_queue_mem(struct nvme_queue *nvmeq) -{ - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); -} - -static void nvme_free_queue(struct nvme_dev *dev, int qid) -{ - struct nvme_queue *nvmeq = dev->queues[qid]; - int vector = dev->entry[nvmeq->cq_vector].vector; - - spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - spin_unlock_irq(&nvmeq->q_lock); - - irq_set_affinity_hint(vector, NULL); - free_irq(vector, nvmeq); - - /* Don't tell the adapter to delete the admin queue */ - if (qid) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } - - nvme_free_queue_mem(nvmeq); -} - -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int vector) -{ - struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * - sizeof(struct nvme_cmd_info)); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); - if (!nvmeq) - return NULL; - - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); - if (!nvmeq->cqes) - goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); - - nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - goto free_cqdma; - - nvmeq->q_dmadev = dmadev; - nvmeq->dev = dev; - spin_lock_init(&nvmeq->q_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); - bio_list_init(&nvmeq->sq_cong); - nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; - nvmeq->q_depth = depth; - nvmeq->cq_vector = vector; - - return nvmeq; - - free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); - free_nvmeq: - kfree(nvmeq); - return NULL; -} - -static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, - const char *name) -{ - if (use_threaded_interrupts) - return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, - nvme_irq_check, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, - name, nvmeq); - return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, - IRQF_DISABLED | IRQF_SHARED, name, nvmeq); -} - -static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, - int qid, int cq_size, int vector) -{ - int result; - struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); - - if (!nvmeq) - return ERR_PTR(-ENOMEM); - - result = adapter_alloc_cq(dev, qid, nvmeq); - if (result < 0) - goto free_nvmeq; - - result = adapter_alloc_sq(dev, qid, nvmeq); - if (result < 0) - goto release_cq; - - result = queue_request_irq(dev, nvmeq, "nvme"); - if (result < 0) - goto release_sq; - - return nvmeq; - - release_sq: - adapter_delete_sq(dev, qid); - release_cq: - adapter_delete_cq(dev, qid); - free_nvmeq: - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); - return ERR_PTR(result); -} - -static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) -{ - int result = 0; - u32 aqa; - u64 cap; - unsigned long timeout; - struct nvme_queue *nvmeq; - - dev->dbs = ((void __iomem *)dev->bar) + 4096; - - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); - if (!nvmeq) - return -ENOMEM; - - aqa = nvmeq->q_depth - 1; - aqa |= aqa << 16; - - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(0, &dev->bar->cc); - writel(aqa, &dev->bar->aqa); - writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); - - cap = readq(&dev->bar->cap); - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - dev->db_stride = NVME_CAP_STRIDE(cap); - - while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) { - msleep(100); - if (fatal_signal_pending(current)) - result = -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device not ready; aborting initialisation\n"); - result = -ENODEV; - } - } - - if (result) { - nvme_free_queue_mem(nvmeq); - return result; - } - - result = queue_request_irq(dev, nvmeq, "nvme admin"); - dev->queues[0] = nvmeq; - return result; -} - -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length) -{ - int i, err, count, nents, offset; - struct scatterlist *sg; - struct page **pages; - struct nvme_iod *iod; - - if (addr & 3) - return ERR_PTR(-EINVAL); - if (!length) - return ERR_PTR(-EINVAL); - - offset = offset_in_page(addr); - count = DIV_ROUND_UP(offset + length, PAGE_SIZE); - pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - err = get_user_pages_fast(addr, count, 1, pages); - if (err < count) { - count = err; - err = -EFAULT; - goto put_pages; - } - - iod = nvme_alloc_iod(count, length, GFP_KERNEL); - sg = iod->sg; - sg_init_table(sg, count); - for (i = 0; i < count; i++) { - sg_set_page(&sg[i], pages[i], - min_t(int, length, PAGE_SIZE - offset), offset); - length -= (PAGE_SIZE - offset); - offset = 0; - } - sg_mark_end(&sg[i - 1]); - iod->nents = count; - - err = -ENOMEM; - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (!nents) - goto free_iod; - - kfree(pages); - return iod; - - free_iod: - kfree(iod); - put_pages: - for (i = 0; i < count; i++) - put_page(pages[i]); - kfree(pages); - return ERR_PTR(err); -} - -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod) -{ - int i; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - for (i = 0; i < iod->nents; i++) - put_page(sg_page(&iod->sg[i])); -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_queue *nvmeq; - struct nvme_user_io io; - struct nvme_command c; - unsigned length; - int status; - struct nvme_iod *iod; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - length = (io.nblocks + 1) << ns->lba_shift; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length); - break; - default: - return -EINVAL; - } - - if (IS_ERR(iod)) - return PTR_ERR(iod); - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le16(io.dsmgmt); - c.rw.reftag = io.reftag; - c.rw.apptag = io.apptag; - c.rw.appmask = io.appmask; - /* XXX: metadata */ - length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL); - - nvmeq = get_nvmeq(dev); - /* - * Since nvme_submit_sync_cmd sleeps, we can't keep preemption - * disabled. We may be preempted at any point, and be rescheduled - * to a different CPU. That will cause cacheline bouncing, but no - * additional races since q_lock already protects against other CPUs. - */ - put_nvmeq(nvmeq); - if (length != (io.nblocks + 1) << ns->lba_shift) - status = -ENOMEM; - else - status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); - - nvme_unmap_user_pages(dev, io.opcode & 1, iod); - nvme_free_iod(dev, iod); - return status; -} - -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) -{ - struct nvme_admin_cmd cmd; - struct nvme_command c; - int status, length; - struct nvme_iod *uninitialized_var(iod); - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - length = cmd.data_len; - if (cmd.data_len) { - iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, - length); - if (IS_ERR(iod)) - return PTR_ERR(iod); - length = nvme_setup_prps(dev, &c.common, iod, length, - GFP_KERNEL); - } - - if (length != cmd.data_len) - status = -ENOMEM; - else - status = nvme_submit_admin_cmd(dev, &c, &cmd.result); - - if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); - nvme_free_iod(dev, iod); - } - - if (!status && copy_to_user(&ucmd->result, &cmd.result, - sizeof(cmd.result))) - status = -EFAULT; - - return status; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, -}; - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - if (bio_list_empty(&nvmeq->sq_cong)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static int nvme_kthread(void *data) -{ - struct nvme_dev *dev; - - while (!kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - int i; - for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; - if (!nvmeq) - continue; - spin_lock_irq(&nvmeq->q_lock); - if (nvme_process_cq(nvmeq)) - printk("process_cq did something\n"); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - } - } - spin_unlock(&dev_list_lock); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } - return 0; -} - -static DEFINE_IDA(nvme_index_ida); - -static int nvme_get_ns_idx(void) -{ - int index, error; - - do { - if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL)) - return -1; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_index_ida, &index); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - index = -1; - return index; -} - -static void nvme_put_ns_idx(int index) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_index_ida, index); - spin_unlock(&dev_list_lock); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; - - ns = kzalloc(sizeof(*ns), GFP_KERNEL); - if (!ns) - return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); - if (!ns->queue) - goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ - blk_queue_make_request(ns->queue, nvme_make_request); - ns->dev = dev; - ns->queue->queuedata = ns; - - disk = alloc_disk(NVME_MINORS); - if (!disk) - goto out_free_queue; - ns->ns_id = nsid; - ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); - - disk->major = nvme_major; - disk->minors = NVME_MINORS; - disk->first_minor = NVME_MINORS * nvme_get_ns_idx(); - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = &dev->pci_dev->dev; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - return ns; - - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); - return NULL; -} - -static void nvme_ns_free(struct nvme_ns *ns) -{ - int index = ns->disk->first_minor / NVME_MINORS; - put_disk(ns->disk); - nvme_put_ns_idx(index); - blk_cleanup_queue(ns->queue); - kfree(ns); -} - -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status) - return -EIO; - return min(result & 0xffff, result >> 16) + 1; -} - -static int __devinit nvme_setup_io_queues(struct nvme_dev *dev) -{ - int result, cpu, i, nr_io_queues, db_bar_size, q_depth; - - nr_io_queues = num_online_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result < 0) - return result; - if (result < nr_io_queues) - nr_io_queues = result; - - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, dev->queues[0]); - - db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); - if (db_bar_size > 8192) { - iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), - db_bar_size); - dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->queues[0]->q_db = dev->dbs; - } - - for (i = 0; i < nr_io_queues; i++) - dev->entry[i].entry = i; - for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, - nr_io_queues); - if (result == 0) { - break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 1; - break; - } - } - - result = queue_request_irq(dev, dev->queues[0], "nvme admin"); - /* XXX: handle failure here */ - - cpu = cpumask_first(cpu_online_mask); - for (i = 0; i < nr_io_queues; i++) { - irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu)); - cpu = cpumask_next(cpu, cpu_online_mask); - } - - q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, - NVME_Q_DEPTH); - for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); - if (IS_ERR(dev->queues[i + 1])) - return PTR_ERR(dev->queues[i + 1]); - dev->queue_count++; - } - - for (; i < num_possible_cpus(); i++) { - int target = i % rounddown_pow_of_two(dev->queue_count - 1); - dev->queues[i + 1] = dev->queues[target + 1]; - } - - return 0; -} - -static void nvme_free_queues(struct nvme_dev *dev) -{ - int i; - - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_free_queue(dev, i); -} - -static int __devinit nvme_dev_add(struct nvme_dev *dev) -{ - int res, nn, i; - struct nvme_ns *ns, *next; - struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; - void *mem; - dma_addr_t dma_addr; - - res = nvme_setup_io_queues(dev); - if (res) - return res; - - mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, - GFP_KERNEL); - - res = nvme_identify(dev, 0, 1, dma_addr); - if (res) { - res = -EIO; - goto out_free; - } - - ctrl = mem; - nn = le32_to_cpup(&ctrl->nn); - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) { - int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - } - - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; - - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - continue; - - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - - goto out; - - out_free: - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - nvme_ns_free(ns); - } - - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; -} - -static int nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - del_gendisk(ns->disk); - nvme_ns_free(ns); - } - - nvme_free_queues(dev); - - return 0; -} - -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - struct device *dmadev = &dev->pci_dev->dev; - dev->prp_page_pool = dma_pool_create("prp list page", dmadev, - PAGE_SIZE, PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, - 256, 256, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static int __devinit nvme_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - int bars, result = -ENOMEM; - struct nvme_dev *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); - if (!dev->entry) - goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); - if (!dev->queues) - goto free; - - if (pci_enable_device_mem(pdev)) - goto free; - pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable; - - INIT_LIST_HEAD(&dev->namespaces); - dev->pci_dev = pdev; - pci_set_drvdata(pdev, dev); - dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - result = nvme_set_instance(dev); - if (result) - goto disable; - - dev->entry[0].vector = pdev->irq; - - result = nvme_setup_prp_pools(dev); - if (result) - goto disable_msix; - - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) { - result = -ENOMEM; - goto disable_msix; - } - - result = nvme_configure_admin_queue(dev); - if (result) - goto unmap; - dev->queue_count++; - - spin_lock(&dev_list_lock); - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - result = nvme_dev_add(dev); - if (result) - goto delete; - - return 0; - - delete: - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - - nvme_free_queues(dev); - unmap: - iounmap(dev->bar); - disable_msix: - pci_disable_msix(pdev); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - disable: - pci_disable_device(pdev); - pci_release_regions(pdev); - free: - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); - return result; -} - -static void __devexit nvme_remove(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_remove(dev); - pci_disable_msix(pdev); - iounmap(dev->bar); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); - pci_disable_device(pdev); - pci_release_regions(pdev); - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); -} - -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL -#define nvme_suspend NULL -#define nvme_resume NULL - -static struct pci_error_handlers nvme_err_handler = { - .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, - .slot_reset = nvme_slot_reset, - .resume = nvme_error_resume, -}; - -/* Move to pci_ids.h later */ -#define PCI_CLASS_STORAGE_EXPRESS 0x010802 - -static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = { - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, - { 0, } -}; -MODULE_DEVICE_TABLE(pci, nvme_id_table); - -static struct pci_driver nvme_driver = { - .name = "nvme", - .id_table = nvme_id_table, - .probe = nvme_probe, - .remove = __devexit_p(nvme_remove), - .suspend = nvme_suspend, - .resume = nvme_resume, - .err_handler = &nvme_err_handler, -}; - -static int __init nvme_init(void) -{ - int result; - - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - if (IS_ERR(nvme_thread)) - return PTR_ERR(nvme_thread); - - result = register_blkdev(nvme_major, "nvme"); - if (result < 0) - goto kill_kthread; - else if (result > 0) - nvme_major = result; - - result = pci_register_driver(&nvme_driver); - if (result) - goto unregister_blkdev; - return 0; - - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); - kill_kthread: - kthread_stop(nvme_thread); - return result; -} - -static void __exit nvme_exit(void) -{ - pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); - kthread_stop(nvme_thread); -} - -MODULE_AUTHOR("Matthew Wilcox <willy at linux.intel.com>"); -MODULE_LICENSE("GPL"); -MODULE_VERSION("0.8"); -module_init(nvme_init); -module_exit(nvme_exit); -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH v2 2/3] NVMe: Move dev, ns, iod structs to nvme.c 2012-10-31 17:52 [PATCH v2 1/3] NVMe: Rename nvme.c to nvme-core.c Vishal Verma @ 2012-10-31 17:52 ` Vishal Verma 2012-10-31 17:52 ` [PATCH v2 3/3] NVMe: Add nvme-scsi.c Vishal Verma 1 sibling, 0 replies; 3+ messages in thread From: Vishal Verma @ 2012-10-31 17:52 UTC (permalink / raw) nvme-scsi.c uses nvme_dev, nvme_ns, and nvme_iod. Move the structure definitions to nvme.h Signed-off-by: Vishal Verma <vishal.l.verma at intel.com> --- drivers/block/nvme-core.c | 53 ----------------------------------- include/linux/nvme.h | 67 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 7df794f..f808b0a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -59,43 +59,6 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; /* - * Represents an NVM Express device. Each nvme_dev is a PCI function. - */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - -/* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). */ @@ -292,22 +255,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1837611..f4e0649 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -461,4 +461,71 @@ struct nvme_admin_cmd { #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#ifdef __KERNEL__ +#include <linux/pci.h> + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u16 vendor_id; + u32 max_hw_sectors; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; + u64 mode_select_num_blocks; + u32 mode_select_block_len; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; + +/** + * nvme_free_iod - frees an nvme_iod + * @dev: The device that the I/O was submitted to + * @iod: The memory to free + */ + +#endif + #endif /* _LINUX_NVME_H */ -- 1.7.0.4 ^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH v2 3/3] NVMe: Add nvme-scsi.c 2012-10-31 17:52 [PATCH v2 1/3] NVMe: Rename nvme.c to nvme-core.c Vishal Verma 2012-10-31 17:52 ` [PATCH v2 2/3] NVMe: Move dev, ns, iod structs to nvme.c Vishal Verma @ 2012-10-31 17:52 ` Vishal Verma 1 sibling, 0 replies; 3+ messages in thread From: Vishal Verma @ 2012-10-31 17:52 UTC (permalink / raw) Translates SCSI commands in SG_IO ioctl to NVMe commands. Uses the scsi-nvme translation spec from nvmexpress.org as reference. Signed-off-by: Vishal Verma <vishal.l.verma at intel.com> --- drivers/block/Makefile | 2 +- drivers/block/nvme-core.c | 37 +- drivers/block/nvme-scsi.c | 2943 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 40 + 4 files changed, 3005 insertions(+), 17 deletions(-) create mode 100644 drivers/block/nvme-scsi.c diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 3f8a5e8..358cb46 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,5 +41,5 @@ obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o -nvme-y := nvme-core.o +nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index f808b0a..51bd00a 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -40,12 +40,12 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/version.h> +#include <scsi/sg.h> #define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -93,6 +93,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); @@ -222,12 +223,12 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { return dev->queues[get_cpu() + 1]; } -static void put_nvmeq(struct nvme_queue *nvmeq) +void put_nvmeq(struct nvme_queue *nvmeq) { put_cpu(); } @@ -287,7 +288,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) return iod; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = PAGE_SIZE / 8 - 1; int i; @@ -335,9 +336,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -470,7 +470,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, special_completion, NVME_IO_TIMEOUT); @@ -671,8 +671,8 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -701,7 +701,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, return cmdinfo.status; } -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); @@ -774,7 +774,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, +int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, dma_addr_t dma_addr) { struct nvme_command c; @@ -788,7 +788,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -802,7 +802,7 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, return nvme_submit_admin_cmd(dev, &c, result); } -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -1017,7 +1017,7 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length) { int i, err, count, nents, offset; @@ -1073,7 +1073,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, return ERR_PTR(err); } -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod) { int i; @@ -1209,6 +1209,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); default: return -ENOTTY; } @@ -1466,6 +1470,7 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev) memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + dev->vendor_id = le16_to_cpup(&ctrl->vid); if (ctrl->mdts) { int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c new file mode 100644 index 0000000..b24f360 --- /dev/null +++ b/drivers/block/nvme-scsi.c @@ -0,0 +1,2943 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * Refer SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include <linux/nvme.h> +#include <linux/bio.h> +#include <linux/bitops.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kdev_t.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/pci.h> +#include <linux/poison.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/version.h> +#include <scsi/sg.h> +#include <scsi/scsi.h> + + +static int sg_version_num = 30534; /* 2 digits for each component */ + +#define SNTI_TRANSLATION_SUCCESS 0 +#define SNTI_INTERNAL_ERROR 1 + +/* VPD Page Codes */ +#define VPD_SUPPORTED_PAGES 0x00 +#define VPD_SERIAL_NUMBER 0x80 +#define VPD_DEVICE_IDENTIFIERS 0x83 +#define VPD_EXTENDED_INQUIRY 0x86 +#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 + +/* CDB offsets */ +#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 +#define REPORT_LUNS_SR_OFFSET 2 +#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 +#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 +#define REQUEST_SENSE_DESC_OFFSET 1 +#define REQUEST_SENSE_DESC_MASK 0x01 +#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 +#define INQUIRY_EVPD_BYTE_OFFSET 1 +#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 +#define INQUIRY_EVPD_BIT_MASK 1 +#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 +#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 +#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF +#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 +#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 +#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 +#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 +#define START_STOP_UNIT_CDB_START_OFFSET 4 +#define START_STOP_UNIT_CDB_START_MASK 0x1 +#define WRITE_BUFFER_CDB_MODE_OFFSET 1 +#define WRITE_BUFFER_CDB_MODE_MASK 0x1F +#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 +#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 +#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 +#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 +#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 +#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 +#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 +#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 +#define FORMAT_UNIT_PROT_INT_OFFSET 3 +#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 +#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 + +/* Misc. defines */ +#define NIBBLE_SHIFT 4 +#define FIXED_SENSE_DATA 0x70 +#define DESC_FORMAT_SENSE_DATA 0x72 +#define FIXED_SENSE_DATA_ADD_LENGTH 10 +#define LUN_ENTRY_SIZE 8 +#define LUN_DATA_HEADER_SIZE 8 +#define ALL_LUNS_RETURNED 0x02 +#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 +#define RESTRICTED_LUNS_RETURNED 0x00 +#define NVME_POWER_STATE_START_VALID 0x00 +#define NVME_POWER_STATE_ACTIVE 0x01 +#define NVME_POWER_STATE_IDLE 0x02 +#define NVME_POWER_STATE_STANDBY 0x03 +#define NVME_POWER_STATE_LU_CONTROL 0x07 +#define POWER_STATE_0 0 +#define POWER_STATE_1 1 +#define POWER_STATE_2 2 +#define POWER_STATE_3 3 +#define DOWNLOAD_SAVE_ACTIVATE 0x05 +#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E +#define ACTIVATE_DEFERRED_MICROCODE 0x0F +#define FORMAT_UNIT_IMMED_MASK 0x2 +#define FORMAT_UNIT_IMMED_OFFSET 1 +#define KELVIN_TEMP_FACTOR 273 +#define FIXED_FMT_SENSE_DATA_SIZE 18 +#define DESC_FMT_SENSE_DATA_SIZE 8 + +/* SCSI/NVMe defines and bit masks */ +#define INQ_STANDARD_INQUIRY_PAGE 0x00 +#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 +#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 +#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 +#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 +#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 +#define INQ_SERIAL_NUMBER_LENGTH 0x14 +#define INQ_NUM_SUPPORTED_VPD_PAGES 5 +#define VERSION_SPC_4 0x06 +#define ACA_UNSUPPORTED 0 +#define STANDARD_INQUIRY_LENGTH 36 +#define ADDITIONAL_STD_INQ_LENGTH 31 +#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C +#define RESERVED_FIELD 0 + +/* SCSI READ/WRITE Defines */ +#define IO_CDB_WP_MASK 0xE0 +#define IO_CDB_WP_SHIFT 5 +#define IO_CDB_FUA_MASK 0x8 +#define IO_6_CDB_LBA_OFFSET 0 +#define IO_6_CDB_LBA_MASK 0x001FFFFF +#define IO_6_CDB_TX_LEN_OFFSET 4 +#define IO_6_DEFAULT_TX_LEN 256 +#define IO_10_CDB_LBA_OFFSET 2 +#define IO_10_CDB_TX_LEN_OFFSET 7 +#define IO_10_CDB_WP_OFFSET 1 +#define IO_10_CDB_FUA_OFFSET 1 +#define IO_12_CDB_LBA_OFFSET 2 +#define IO_12_CDB_TX_LEN_OFFSET 6 +#define IO_12_CDB_WP_OFFSET 1 +#define IO_12_CDB_FUA_OFFSET 1 +#define IO_16_CDB_FUA_OFFSET 1 +#define IO_16_CDB_WP_OFFSET 1 +#define IO_16_CDB_LBA_OFFSET 2 +#define IO_16_CDB_TX_LEN_OFFSET 10 + +/* Mode Sense/Select defines */ +#define MODE_PAGE_INFO_EXCEP 0x1C +#define MODE_PAGE_CACHING 0x08 +#define MODE_PAGE_CONTROL 0x0A +#define MODE_PAGE_POWER_CONDITION 0x1A +#define MODE_PAGE_RETURN_ALL 0x3F +#define MODE_PAGE_BLK_DES_LEN 0x08 +#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 +#define MODE_PAGE_CACHING_LEN 0x14 +#define MODE_PAGE_CONTROL_LEN 0x0C +#define MODE_PAGE_POW_CND_LEN 0x28 +#define MODE_PAGE_INF_EXC_LEN 0x0C +#define MODE_PAGE_ALL_LEN 0x54 +#define MODE_SENSE6_MPH_SIZE 4 +#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 +#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 +#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 +#define MODE_SENSE_PAGE_CODE_OFFSET 2 +#define MODE_SENSE_PAGE_CODE_MASK 0x3F +#define MODE_SENSE_LLBAA_OFFSET 1 +#define MODE_SENSE_LLBAA_MASK 0x10 +#define MODE_SENSE_LLBAA_SHIFT 4 +#define MODE_SENSE_DBD_OFFSET 1 +#define MODE_SENSE_DBD_MASK 8 +#define MODE_SENSE_DBD_SHIFT 3 +#define MODE_SENSE10_MPH_SIZE 8 +#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 +#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 +#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 +#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 +#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 +#define MODE_SELECT_6_BD_OFFSET 3 +#define MODE_SELECT_10_BD_OFFSET 6 +#define MODE_SELECT_10_LLBAA_OFFSET 4 +#define MODE_SELECT_10_LLBAA_MASK 1 +#define MODE_SELECT_6_MPH_SIZE 4 +#define MODE_SELECT_10_MPH_SIZE 8 +#define CACHING_MODE_PAGE_WCE_MASK 0x04 +#define MODE_SENSE_BLK_DESC_ENABLED 0 +#define MODE_SENSE_BLK_DESC_COUNT 1 +#define MODE_SELECT_PAGE_CODE_MASK 0x3F +#define SHORT_DESC_BLOCK 8 +#define LONG_DESC_BLOCK 16 +#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 +#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A +#define MODE_PAGE_CACHING_LEN_FIELD 0x12 +#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A +#define MODE_SENSE_PC_CURRENT_VALUES 0 + +/* Log Sense defines */ +#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 +#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 +#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F +#define LOG_PAGE_TEMPERATURE_PAGE 0x0D +#define LOG_SENSE_CDB_SP_OFFSET 1 +#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 +#define LOG_SENSE_CDB_PC_OFFSET 2 +#define LOG_SENSE_CDB_PC_MASK 0xC0 +#define LOG_SENSE_CDB_PC_SHIFT 6 +#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 +#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F +#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 +#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 +#define LOG_INFO_EXCP_PAGE_LENGTH 0xC +#define REMAINING_TEMP_PAGE_LENGTH 0xC +#define LOG_TEMP_PAGE_LENGTH 0x10 +#define LOG_TEMP_UNKNOWN 0xFF +#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 + +/* Read Capacity defines */ +#define READ_CAP_10_RESP_SIZE 8 +#define READ_CAP_16_RESP_SIZE 32 + +/* NVMe Namespace and Command Defines */ +#define NVME_GET_SMART_LOG_PAGE 0x02 +#define NVME_GET_FEAT_TEMP_THRESH 0x04 +#define BYTES_TO_DWORDS 4 +#define NVME_MAX_FIRMWARE_SLOT 7 + +/* Report LUNs defines */ +#define REPORT_LUNS_FIRST_LUN_OFFSET 8 + +/* SCSI ADDITIONAL SENSE Codes */ + +#define SCSI_ASC_NO_SENSE 0x00 +#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 +#define SCSI_ASC_LUN_NOT_READY 0x04 +#define SCSI_ASC_WARNING 0x0B +#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 +#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D +#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 +#define SCSI_ASC_ILLEGAL_COMMAND 0x20 +#define SCSI_ASC_ILLEGAL_BLOCK 0x21 +#define SCSI_ASC_INVALID_CDB 0x24 +#define SCSI_ASC_INVALID_LUN 0x25 +#define SCSI_ASC_INVALID_PARAMETER 0x26 +#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 +#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 + +/* SCSI ADDITIONAL SENSE Code Qualifiers */ + +#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 +#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 +#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 +#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 +#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 +#define SCSI_ASCQ_INVALID_LUN_ID 0x09 + +/** + * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to + * enable DPOFUA support type 0x10 value. + */ +#define DEVICE_SPECIFIC_PARAMETER 0 +#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) + +/* MACROs to extract information from CDBs */ + +#define GET_OPCODE(cdb) cdb[0] + +#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) + +#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) + +#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ +(cdb[index + 1] << 8) | \ +(cdb[index + 2] << 0)) + +#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ +(cdb[index + 1] << 16) | \ +(cdb[index + 2] << 8) | \ +(cdb[index + 3] << 0)) + +#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ +(((u64)cdb[index + 1]) << 48) | \ +(((u64)cdb[index + 2]) << 40) | \ +(((u64)cdb[index + 3]) << 32) | \ +(((u64)cdb[index + 4]) << 24) | \ +(((u64)cdb[index + 5]) << 16) | \ +(((u64)cdb[index + 6]) << 8) | \ +(((u64)cdb[index + 7]) << 0)) + +/* Inquiry Helper Macros */ +#define GET_INQ_EVPD_BIT(cdb) \ +((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ +INQUIRY_EVPD_BIT_MASK) ? 1 : 0) + +#define GET_INQ_PAGE_CODE(cdb) \ +(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) + +#define GET_INQ_ALLOC_LENGTH(cdb) \ +(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) + +/* Report LUNs Helper Macros */ +#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) + +/* Read Capacity Helper Macros */ +#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) + +#define IS_READ_CAP_16(cdb) \ +((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) + +/* Request Sense Helper Macros */ +#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ +(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) + +/* Mode Sense Helper Macros */ +#define GET_MODE_SENSE_DBD(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ +MODE_SENSE_DBD_SHIFT) + +#define GET_MODE_SENSE_LLBAA(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ +MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) + +#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ +(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) + + +/* Struct to gather data that needs to be extracted from a SCSI CDB. + Not conforming to any particular CDB variant, but compatible with all. */ + +struct nvme_trans_io_cdb { + u8 fua; + u8 prot_info; + u64 lba; + u32 xfer_len; +}; + + +/* Internal Helper Functions */ + + +/* Copy data to userspace memory */ + +static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = from; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_to_user(__user sgl[i].iov_base, index, + xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + not_copied = copy_to_user(__user hdr->dxferp, from, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Copy data from userspace memory */ + +static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = to; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_from_user(index, + __user sgl[i].iov_base, xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + + not_copied = copy_from_user(to, __user hdr->dxferp, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Status/Sense Buffer Writeback */ + +static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, + u8 asc, u8 ascq) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 xfer_len; + u8 resp[DESC_FMT_SENSE_DATA_SIZE]; + + if (scsi_status_is_good(status)) { + hdr->status = SAM_STAT_GOOD; + hdr->masked_status = GOOD; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + hdr->sb_len_wr = 0; + } else { + hdr->status = status; + hdr->masked_status = status >> 1; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + + memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); + resp[0] = DESC_FORMAT_SENSE_DATA; + resp[1] = sense_key; + resp[2] = asc; + resp[3] = ascq; + + xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); + hdr->sb_len_wr = xfer_len; + if (copy_to_user(__user hdr->sbp, resp, xfer_len) > 0) + res = -EFAULT; + } + + return res; +} + +static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) +{ + u8 status, sense_key, asc, ascq; + int res = SNTI_TRANSLATION_SUCCESS; + + /* For non-nvme (Linux) errors, simply return the error code */ + if (nvme_sc < 0) + return nvme_sc; + + /* Mask DNR, More, and reserved fields */ + nvme_sc &= 0x7FF; + + switch (nvme_sc) { + /* Generic Command Status */ + case NVME_SC_SUCCESS: + status = SAM_STAT_GOOD; + sense_key = NO_SENSE; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_OPCODE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_COMMAND; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_FIELD: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_DATA_XFER_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_POWER_LOSS: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_WARNING; + ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case NVME_SC_INTERNAL: + status = SAM_STAT_CHECK_CONDITION; + sense_key = HARDWARE_ERROR; + asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_REQ: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_QUEUE: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_FAIL: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_MISSING: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_NS: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + case NVME_SC_LBA_RANGE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_BLOCK; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_CAP_EXCEEDED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_NS_NOT_READY: + status = SAM_STAT_CHECK_CONDITION; + sense_key = NOT_READY; + asc = SCSI_ASC_LUN_NOT_READY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Command Specific Status */ + case NVME_SC_INVALID_FORMAT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_FORMAT_COMMAND_FAILED; + ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case NVME_SC_BAD_ATTRIBUTES: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Media Errors */ + case NVME_SC_WRITE_FAULT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_READ_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_UNRECOVERED_READ_ERROR; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_GUARD_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; + break; + case NVME_SC_APPTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; + break; + case NVME_SC_REFTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; + break; + case NVME_SC_COMPARE_FAILED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MISCOMPARE; + asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ACCESS_DENIED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + + /* Unspecified/Default */ + case NVME_SC_CMDID_CONFLICT: + case NVME_SC_CMD_SEQ_ERROR: + case NVME_SC_CQ_INVALID: + case NVME_SC_QID_INVALID: + case NVME_SC_QUEUE_SIZE: + case NVME_SC_ABORT_LIMIT: + case NVME_SC_ABORT_MISSING: + case NVME_SC_ASYNC_LIMIT: + case NVME_SC_FIRMWARE_SLOT: + case NVME_SC_FIRMWARE_IMAGE: + case NVME_SC_INVALID_VECTOR: + case NVME_SC_INVALID_LOG_PAGE: + default: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + + res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); + + return res; +} + +/* INQUIRY Helper Functions */ + +static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + int xfer_len; + u8 resp_data_format = 0x02; + u8 protect; + u8 cmdque = 0x01 << 1; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify - use DPS value for PROTECT field */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + /* + * If nvme_sc was -ve, res will be -ve here. + * If nvme_sc was +ve, the status would bace been translated, and res + * can only be 0 or -ve. + * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc + * - If -ve, return because its a Linux error. + */ + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[2] = VERSION_SPC_4; + inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ + inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; + inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ + inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ + strncpy(&inq_response[8], "NVMe ", 8); + strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[32], dev->firmware_rev, 4); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ + inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ + inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; + inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; + inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; + inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_unit_serial_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ + inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ + strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u8 ieee[4]; + int xfer_len; + u32 tmp_id = cpu_to_be64(ns->ns_id); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + + /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */ + ieee[0] = id_ctrl->ieee[0] << 4; + ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; + ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; + ieee[3] = id_ctrl->ieee[2] >> 4; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ + inq_response[3] = 20; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 16; /* Designator Length */ + /* Designator start */ + inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ + inq_response[9] = ieee[2]; /* IEEE ID */ + inq_response[10] = ieee[1]; /* IEEE ID */ + inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */ + inq_response[12] = (dev->vendor_id & 0xFF00) >> 8; + inq_response[13] = (dev->vendor_id & 0x00FF); + inq_response[14] = dev->serial[0]; + inq_response[15] = dev->serial[1]; + inq_response[16] = dev->model[0]; + inq_response[17] = dev->model[1]; + memcpy(&inq_response[18], &tmp_id, sizeof(u32)); + /* Last 2 bytes are zero */ + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + struct nvme_id_ns *id_ns; + int xfer_len; + u8 microcode = 0x80; + u8 spt; + u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; + u8 grd_chk, app_chk, ref_chk, protect; + u8 uask_sup = 0x20; + u8 v_sup; + u8 luiclr = 0x01; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + spt = spt_lut[(id_ns->dpc) & 0x07] << 3; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + grd_chk = protect << 2; + app_chk = protect << 1; + ref_chk = protect; + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + v_sup = id_ctrl->vwc; + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; + inq_response[5] = uask_sup; + inq_response[6] = v_sup; + inq_response[7] = luiclr; + inq_response[8] = 0; + inq_response[9] = 0; + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ + inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ + inq_response[6] = 0x00; /* Form Factor */ + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + kfree(inq_response); + out_mem: + return res; +} + +/* LOG SENSE Helper Functions */ + +static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + + log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + + log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; + log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; + + xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, + struct sg_io_hdr *hdr, int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u8 temp_c; + u16 temp_k; + + log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c = temp_k - KELVIN_TEMP_FACTOR; + } + + log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; + /* Informational Exceptions Log Parameter 1 Start */ + /* Parameter Code=0x0000 bytes 4,5 */ + log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ + log_response[7] = 0x04; /* PARAMETER LENGTH */ + /* Add sense Code and qualifier = 0x00 each */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[10] = temp_c; + + xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u32 feature_resp; + u8 temp_c_cur, temp_c_thresh; + u16 temp_k; + + log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_TEMP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c_cur = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; + } + + /* Get Features for Temp Threshold */ + res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + &feature_resp); + if (res != NVME_SC_SUCCESS) + temp_c_thresh = LOG_TEMP_UNKNOWN; + else + temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; + + log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_TEMP_PAGE_LENGTH; + /* Temperature Log Parameter 1 (Temperature) Start */ + /* Parameter Code = 0x0000 */ + log_response[6] = 0x01; /* Format and Linking = 01b */ + log_response[7] = 0x02; /* Parameter Length */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[9] = temp_c_cur; + /* Temperature Log Parameter 2 (Reference Temperature) Start */ + log_response[11] = 0x01; /* Parameter Code = 0x0001 */ + log_response[12] = 0x01; /* Format and Linking = 01b */ + log_response[13] = 0x02; /* Parameter Length */ + /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ + log_response[15] = temp_c_thresh; + + xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +/* MODE SENSE Helper Functions */ + +static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, + u16 mode_data_length, u16 blk_desc_len) +{ + /* Quick check to make sure I don't stomp on my own memory... */ + if ((cdb10 && len < 8) || (!cdb10 && len < 4)) + return SNTI_INTERNAL_ERROR; + + if (cdb10) { + resp[0] = (mode_data_length & 0xFF00) >> 8; + resp[1] = (mode_data_length & 0x00FF); + /* resp[2] and [3] are zero */ + resp[4] = llbaa; + resp[5] = RESERVED_FIELD; + resp[6] = (blk_desc_len & 0xFF00) >> 8; + resp[7] = (blk_desc_len & 0x00FF); + } else { + resp[0] = (mode_data_length & 0x00FF); + /* resp[1] and [2] are zero */ + resp[3] = (blk_desc_len & 0x00FF); + } + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len, u8 llbaa) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + u32 lba_length; + + if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + + if (llbaa == 0) { + u32 tmp_cap = cpu_to_be32(id_ns->ncap); + /* Byte 4 is reserved */ + u32 tmp_len = cpu_to_be32(lba_length) & 0x00FFFFFF; + + memcpy(resp, &tmp_cap, sizeof(u32)); + memcpy(&resp[4], &tmp_len, sizeof(u32)); + } else { + u64 tmp_cap = cpu_to_be64(id_ns->ncap); + u32 tmp_len = cpu_to_be32(lba_length); + + memcpy(resp, &tmp_cap, sizeof(u64)); + /* Bytes 8, 9, 10, 11 are reserved */ + memcpy(&resp[12], &tmp_len, sizeof(u32)); + } + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_fill_control_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_CONTROL_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_CONTROL; + resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; + resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, + * D_SENSE=1, GLTSD=1, RLEC=0 */ + resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ + /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ + resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ + /* resp[6] and [7] are obsolete, thus zero */ + resp[8] = 0xFF; /* Busy timeout period = 0xffff */ + resp[9] = 0xFF; + /* Bytes 10,11: Extended selftest completion time = 0x0000 */ + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_caching_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + u32 feature_resp; + u8 vwc; + + if (len < MODE_PAGE_CACHING_LEN) + return SNTI_INTERNAL_ERROR; + + nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + &feature_resp); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + vwc = feature_resp & 0x00000001; + + resp[0] = MODE_PAGE_CACHING; + resp[1] = MODE_PAGE_CACHING_LEN_FIELD; + resp[2] = vwc << 2; + + out: + return res; +} + +static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_POW_CND_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_POWER_CONDITION; + resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_INF_EXC_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_INFO_EXCEP; + resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; + resp[2] = 0x88; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 mode_pages_offset_1 = 0; + u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; + + mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; + mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; + mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; + + res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], + MODE_PAGE_CACHING_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], + MODE_PAGE_CONTROL_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], + MODE_PAGE_POW_CND_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + MODE_PAGE_INF_EXC_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) +{ + if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { + /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ + return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; + } else { + return 0; + } +} + +static int nvme_trans_mode_page_create(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd, + u16 alloc_len, u8 cdb10, + int (*mode_page_fill_func) + (struct nvme_ns *, + struct sg_io_hdr *hdr, u8 *, int), + u16 mode_pages_tot_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *response; + u8 dbd, llbaa; + u16 resp_size; + int mph_size; + u16 mode_pages_offset_1; + u16 blk_desc_len, blk_desc_offset, mode_data_length; + + dbd = GET_MODE_SENSE_DBD(cmd); + llbaa = GET_MODE_SENSE_LLBAA(cmd); + mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); + + resp_size = mph_size + blk_desc_len + mode_pages_tot_len; + /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ + mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; + + blk_desc_offset = mph_size; + mode_pages_offset_1 = blk_desc_offset + blk_desc_len; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(response, 0, resp_size); + + res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, + llbaa, mode_data_length, blk_desc_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + if (blk_desc_len > 0) { + res = nvme_trans_fill_blk_desc(ns, hdr, + &response[blk_desc_offset], + blk_desc_len, llbaa); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + } + res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], + mode_pages_tot_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + out_free: + kfree(response); + out_mem: + return res; +} + +/* Read Capacity Helper Functions */ + +static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, + u8 cdb16) +{ + u8 flbas; + u32 lba_length; + u64 rlba; + u8 prot_en; + u8 p_type_lut[4] = {0, 0, 1, 2}; + u64 tmp_rlba; + u32 tmp_rlba_32; + u32 tmp_len; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + rlba = le64_to_cpup(&id_ns->nsze) - 1; + (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); + + if (!cdb16) { + if (rlba > 0xFFFFFFFF) + rlba = 0xFFFFFFFF; + tmp_rlba_32 = cpu_to_be32(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba_32, sizeof(u32)); + memcpy(&response[4], &tmp_len, sizeof(u32)); + } else { + tmp_rlba = cpu_to_be64(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba, sizeof(u64)); + memcpy(&response[8], &tmp_len, sizeof(u32)); + response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; + /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ + /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ + /* Bytes 16-31 - Reserved */ + } +} + +/* Start Stop Unit Helper Functions */ + +static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 pc, u8 pcmod, u8 start) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int lowest_pow_st; /* max npss = lowest power consumption */ + unsigned ps_desired = 0; + + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + lowest_pow_st = id_ctrl->npss - 1; + + switch (pc) { + case NVME_POWER_STATE_START_VALID: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0 && start == 0x1) + ps_desired = POWER_STATE_0; + if (pcmod == 0 && start == 0x0) + ps_desired = lowest_pow_st; + break; + case NVME_POWER_STATE_ACTIVE: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0) + ps_desired = POWER_STATE_0; + break; + case NVME_POWER_STATE_IDLE: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ + /* min of desired state and (lps-1) because lps is STOP */ + if (pcmod == 0x0) + ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1)); + else if (pcmod == 0x1) + ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1)); + else if (pcmod == 0x2) + ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_STANDBY: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ + if (pcmod == 0x0) + ps_desired = max(0, (lowest_pow_st - 2)); + else if (pcmod == 0x1) + ps_desired = max(0, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_LU_CONTROL: + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +/* Write Buffer Helper Functions */ +/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */ + +static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 opcode, u32 tot_len, u32 offset, + u8 buffer_id) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + struct nvme_command c; + struct nvme_iod *iod = NULL; + unsigned length; + + memset(&c, 0, sizeof(c)); + c.common.opcode = opcode; + if (opcode == nvme_admin_download_fw) { + if (hdr->iovec_count > 0) { + /* Assuming SGL is not allowed for this command */ + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + iod = nvme_map_user_pages(dev, DMA_TO_DEVICE, + (unsigned long)hdr->dxferp, tot_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + length = nvme_setup_prps(dev, &c.common, iod, tot_len, + GFP_KERNEL); + if (length != tot_len) { + res = -ENOMEM; + goto out_unmap; + } + + c.dlfw.numd = (tot_len/BYTES_TO_DWORDS) - 1; + c.dlfw.offset = offset/BYTES_TO_DWORDS; + } else if (opcode == nvme_admin_activate_fw) { + c.common.cdw10[0] = buffer_id; + /* AA=01b Replace & activate at reset */ + c.common.cdw10[0] |= 0x00000008; + } + + nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_unmap; + if (nvme_sc) + res = nvme_sc; + + out_unmap: + if (opcode == nvme_admin_download_fw) { + nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod); + nvme_free_iod(dev, iod); + } + out: + return res; +} + +/* Mode Select Helper Functions */ + +static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, + u16 *bd_len, u8 *llbaa) +{ + if (cdb10) { + /* 10 Byte CDB */ + *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] && + MODE_SELECT_10_LLBAA_MASK; + } else { + /* 6 Byte CDB */ + *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; + } +} + +static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, + u16 idx, u16 bd_len, u8 llbaa) +{ + u16 bd_num; + + bd_num = bd_len / ((llbaa == 0) ? + SHORT_DESC_BLOCK : LONG_DESC_BLOCK); + /* Store block descriptor info if a FORMAT UNIT comes later */ + /* TODO Saving 1st BD info; what to do if multiple BD received? */ + if (llbaa == 0) { + /* Standard Block Descriptor - spc4r34 7.5.5.1 */ + ns->mode_select_num_blocks = + (parm_list[idx + 1] << 16) + + (parm_list[idx + 2] << 8) + + (parm_list[idx + 3]); + + ns->mode_select_block_len = + (parm_list[idx + 5] << 16) + + (parm_list[idx + 6] << 8) + + (parm_list[idx + 7]); + } else { + /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ + ns->mode_select_num_blocks = + (((u64)parm_list[idx + 0]) << 56) + + (((u64)parm_list[idx + 1]) << 48) + + (((u64)parm_list[idx + 2]) << 40) + + (((u64)parm_list[idx + 3]) << 32) + + (((u64)parm_list[idx + 4]) << 24) + + (((u64)parm_list[idx + 5]) << 16) + + (((u64)parm_list[idx + 6]) << 8) + + ((u64)parm_list[idx + 7]); + + ns->mode_select_block_len = + (parm_list[idx + 12] << 24) + + (parm_list[idx + 13] << 16) + + (parm_list[idx + 14] << 8) + + (parm_list[idx + 15]); + } +} + +static u16 nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *mode_page, u8 page_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + unsigned dword11; + + switch (page_code) { + case MODE_PAGE_CACHING: + dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); + nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, + 0, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + break; + if (nvme_sc) { + res = nvme_sc; + break; + } + break; + case MODE_PAGE_CONTROL: + break; + case MODE_PAGE_POWER_CONDITION: + /* Verify the OS is not trying to set timers */ + if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + + return res; +} + +static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u16 parm_list_len, u8 pf, + u8 sp, u8 cdb10) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u16 bd_len; + u8 llbaa = 0; + u16 index, saved_index; + u8 page_code; + u16 mp_size; + + /* Get parm list from data-in/out buffer */ + parm_list = kmalloc(parm_list_len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + + res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); + index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); + + if (bd_len != 0) { + /* Block Descriptors present, parse */ + nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); + index += bd_len; + } + saved_index = index; + + /* Multiple mode pages may be present; iterate through all */ + /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + if ((page_code != MODE_PAGE_CACHING) && + (page_code != MODE_PAGE_CONTROL) && + (page_code != MODE_PAGE_POWER_CONDITION)) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + index += mp_size; + } while (index < parm_list_len); + + /* In 2nd Iteration, do the NVME Commands */ + index = saved_index; + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], + page_code); + if (res != SNTI_TRANSLATION_SUCCESS) + break; + index += mp_size; + } while (index < parm_list_len); + + out_mem: + kfree(parm_list); + out: + return res; +} + +/* Format Unit Helper Functions */ + +static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + + /* + * SCSI Expects a MODE SELECT would have been issued prior to + * a FORMAT UNIT, and the block size and number would be used + * from the block descriptor in it. If a MODE SELECT had not + * been issued, FORMAT shall use the current values for both. + */ + + if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + if (ns->mode_select_num_blocks == 0) + ns->mode_select_num_blocks = id_ns->ncap; + if (ns->mode_select_block_len == 0) { + flbas = (id_ns->flbas) & 0x0F; + ns->mode_select_block_len = + (1 << (id_ns->lbaf[flbas].ds)); + } + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + mem, dma_addr); + } + out: + return res; +} + +static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, + u8 format_prot_info, u8 *nvme_pf_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u8 pf_usage, pf_code; + + parm_list = kmalloc(len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + res = nvme_trans_copy_from_user(hdr, parm_list, len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & + FORMAT_UNIT_IMMED_MASK) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + + if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && + (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & + FORMAT_UNIT_PROT_FIELD_USAGE_MASK; + pf_code = (pf_usage << 2) | format_prot_info; + switch (pf_code) { + case 0: + *nvme_pf_code = 0; + break; + case 2: + *nvme_pf_code = 1; + break; + case 3: + *nvme_pf_code = 2; + break; + case 7: + *nvme_pf_code = 3; + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out_mem: + kfree(parm_list); + out: + return res; +} + +static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 prot_info) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 i; + u8 flbas, nlbaf; + u8 selected_lbaf = 0xFF; + u32 cdw10 = 0; + struct nvme_command c; + + /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + nlbaf = id_ns->nlbaf; + + for (i = 0; i < nlbaf; i++) { + if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { + selected_lbaf = i; + break; + } + } + if (selected_lbaf > 0x0F) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + if (ns->mode_select_num_blocks != id_ns->ncap) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + cdw10 |= prot_info << 5; + cdw10 |= selected_lbaf & 0x0F; + memset(&c, 0, sizeof(c)); + c.format.opcode = nvme_admin_format_nvm; + c.format.nsid = ns->ns_id; + c.format.cdw10 = cpu_to_le32(cdw10); + + nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +/* Read/Write Helper Functions */ + +static inline void nvme_trans_get_io_cdb6(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = 0; + cdb_info->prot_info = 0; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) & + IO_6_CDB_LBA_MASK; + cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET); + + /* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */ + if (cdb_info->xfer_len == 0) + cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN; +} + +static inline void nvme_trans_get_io_cdb10(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET); +} + +static inline void nvme_trans_get_io_cdb12(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET); +} + +static inline void nvme_trans_get_io_cdb16(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET); +} + +static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, + u32 max_blocks) +{ + /* If using iovecs, send one nvme command per vector */ + if (hdr->iovec_count > 0) + return hdr->iovec_count; + else if (cdb_info->xfer_len > max_blocks) + return ((cdb_info->xfer_len - 1) / max_blocks) + 1; + else + return 1; +} + +static u16 nvme_trans_io_get_control(struct nvme_ns *ns, + struct nvme_trans_io_cdb *cdb_info) +{ + u16 control = 0; + + /* When Protection information support is added, implement here */ + + if (cdb_info->fua > 0) + control |= NVME_RW_FUA; + + return control; +} + +static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, u8 is_write) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + u32 num_cmds; + struct nvme_iod *iod; + u64 unit_len; + u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ + u32 retcode; + u32 i = 0; + u64 nvme_offset = 0; + void *next_mapping_addr; + struct nvme_command c; + u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); + u16 control; + u32 max_blocks = (dev->max_hw_sectors << 9) >> ns->lba_shift; + + num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); + + /* + * This loop handles two cases. + * First, when an SGL is used in the form of an iovec list: + * - Use iov_base as the next mapping address for the nvme command_id + * - Use iov_len as the data transfer length for the command. + * Second, when we have a single buffer + * - If larger than max_blocks, split into chunks, offset + * each nvme command accordingly. + */ + for (i = 0; i < num_cmds; i++) { + memset(&c, 0, sizeof(c)); + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + unit_len = sgl[i].iov_len; + unit_num_blocks = unit_len >> ns->lba_shift; + next_mapping_addr = sgl[i].iov_base; + } else { + unit_num_blocks = min((u64)max_blocks, + (cdb_info->xfer_len - nvme_offset)); + unit_len = unit_num_blocks << ns->lba_shift; + next_mapping_addr = hdr->dxferp + + ((1 << ns->lba_shift) * nvme_offset); + } + + c.rw.opcode = opcode; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(cdb_info->lba) + nvme_offset; + c.rw.length = cpu_to_le16(unit_num_blocks - 1); + control = nvme_trans_io_get_control(ns, cdb_info); + c.rw.control = cpu_to_le16(control); + + iod = nvme_map_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + (unsigned long)next_mapping_addr, unit_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + retcode = nvme_setup_prps(dev, &c.common, iod, unit_len, + GFP_KERNEL); + if (retcode != unit_len) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = -ENOMEM; + goto out; + } + + nvme_offset += unit_num_blocks; + + nvmeq = get_nvmeq(dev); + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep + * preemption disabled. We may be preempted at any + * point, and be rescheduled to a different CPU. That + * will cause cacheline bouncing, but no additional + * races since q_lock already protects against other + * CPUs. + */ + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, + NVME_IO_TIMEOUT); + if (nvme_sc != NVME_SC_SUCCESS) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = nvme_trans_status_code(hdr, nvme_sc); + goto out; + } + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + } + res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + + out: + return res; +} + + +/* SCSI Command Translation Functions */ + +static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_trans_io_cdb cdb_info; + u8 opcode = cmd[0]; + u64 xfer_bytes; + u64 sum_iov_len = 0; + struct sg_iovec *sgl; + int i; + + /* Extract Fields from CDB */ + switch (opcode) { + case WRITE_6: + case READ_6: + nvme_trans_get_io_cdb6(cmd, &cdb_info); + break; + case WRITE_10: + case READ_10: + nvme_trans_get_io_cdb10(cmd, &cdb_info); + break; + case WRITE_12: + case READ_12: + nvme_trans_get_io_cdb12(cmd, &cdb_info); + break; + case WRITE_16: + case READ_16: + nvme_trans_get_io_cdb16(cmd, &cdb_info); + break; + default: + /* Will never really reach here */ + res = SNTI_INTERNAL_ERROR; + goto out; + } + + /* Calculate total length of transfer (in bytes) */ + if (hdr->iovec_count > 0) { + sgl = hdr->dxferp; + for (i = 0; i < hdr->iovec_count; i++) { + sum_iov_len += sgl[i].iov_len; + /* IO vector sizes should be multiples of block size */ + if (sgl[i].iov_len % (1 << ns->lba_shift) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + } + } else { + sum_iov_len = hdr->dxfer_len; + } + + /* As Per sg ioctl howto, if the lengths differ, use the lower one */ + xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); + + /* If block count and actual data buffer size dont match, error out */ + if ((xfer_bytes / (1 << ns->lba_shift)) != cdb_info.xfer_len) { + res = -EINVAL; + goto out; + } + + /* Check for 0 length transfer - it is not illegal */ + if (cdb_info.xfer_len == 0) + goto out; + + /* Send NVMe IO Command(s) */ + res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 evpd; + u8 page_code; + int alloc_len; + u8 *inq_response; + + evpd = GET_INQ_EVPD_BIT(cmd); + page_code = GET_INQ_PAGE_CODE(cmd); + alloc_len = GET_INQ_ALLOC_LENGTH(cmd); + + inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + if (evpd == 0) { + if (page_code == INQ_STANDARD_INQUIRY_PAGE) { + res = nvme_trans_standard_inquiry_page(ns, hdr, + inq_response, alloc_len); + } else { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } else { + switch (page_code) { + case VPD_SUPPORTED_PAGES: + res = nvme_trans_supported_vpd_pages(ns, hdr, + inq_response, alloc_len); + break; + case VPD_SERIAL_NUMBER: + res = nvme_trans_unit_serial_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_DEVICE_IDENTIFIERS: + res = nvme_trans_device_id_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_EXTENDED_INQUIRY: + res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); + break; + case VPD_BLOCK_DEV_CHARACTERISTICS: + res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + } + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 sp; + u8 pc; + u8 page_code; + + sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET); + if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET); + page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK; + pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; + if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET); + switch (page_code) { + case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: + res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); + break; + case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: + res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); + break; + case LOG_PAGE_TEMPERATURE_PAGE: + res = nvme_trans_log_temperature(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 cdb10 = 0; + u16 parm_list_len; + u8 page_format; + u8 save_pages; + + page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET); + page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK; + + save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET); + save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK; + + if (GET_OPCODE(cmd) == MODE_SELECT) { + parm_list_len = GET_U8_FROM_CDB(cmd, + MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET); + } else { + parm_list_len = GET_U16_FROM_CDB(cmd, + MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET); + cdb10 = 1; + } + + if (parm_list_len != 0) { + /* + * According to SPC-4 r24, a paramter list length field of 0 + * shall not be considered an error + */ + res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, + page_format, save_pages, cdb10); + } + + return res; +} + +static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 cdb10 = 0; + u8 page_code; + u8 pc; + + if (GET_OPCODE(cmd) == MODE_SENSE) { + alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET); + } else { + alloc_len = GET_U16_FROM_CDB(cmd, + MODE_SENSE10_ALLOC_LEN_OFFSET); + cdb10 = 1; + } + + pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) & + MODE_SENSE_PAGE_CONTROL_MASK; + if (pc != MODE_SENSE_PC_CURRENT_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) & + MODE_SENSE_PAGE_CODE_MASK; + switch (page_code) { + case MODE_PAGE_CACHING: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_caching_page, + MODE_PAGE_CACHING_LEN); + break; + case MODE_PAGE_CONTROL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_control_page, + MODE_PAGE_CONTROL_LEN); + break; + case MODE_PAGE_POWER_CONDITION: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_pow_cnd_page, + MODE_PAGE_POW_CND_LEN); + break; + case MODE_PAGE_INFO_EXCEP: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_inf_exc_page, + MODE_PAGE_INF_EXC_LEN); + break; + case MODE_PAGE_RETURN_ALL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_all_pages, + MODE_PAGE_ALL_LEN); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len = READ_CAP_10_RESP_SIZE; + u32 resp_size = READ_CAP_10_RESP_SIZE; + u32 xfer_len; + u8 cdb16; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 *response; + + cdb16 = IS_READ_CAP_16(cmd); + if (cdb16) { + alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); + resp_size = READ_CAP_16_RESP_SIZE; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + nvme_trans_fill_read_cap(response, id_ns, cdb16); + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len, xfer_len, resp_size; + u8 select_report; + u8 *response; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + u32 ll_length, lun_id; + u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; + u32 tmp_len; + + alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); + select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); + + if ((select_report != ALL_LUNS_RETURNED) && + (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && + (select_report != RESTRICTED_LUNS_RETURNED)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } else { + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + ll_length = id_ctrl->nn * LUN_ENTRY_SIZE; + resp_size = ll_length + LUN_DATA_HEADER_SIZE; + + if (alloc_len < resp_size) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_dma; + } + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + + /* The first LUN ID will always be 0 per the SAM spec */ + for (lun_id = 0; lun_id < id_ctrl->nn; lun_id++) { + /* + * Set the LUN Id and then increment to the next LUN + * location in the parameter data. + */ + u64 tmp_id = cpu_to_be64(lun_id); + memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); + lun_id_offset += LUN_ENTRY_SIZE; + } + tmp_len = cpu_to_be32(ll_length); + memcpy(response, &tmp_len, sizeof(u32)); + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 alloc_len, xfer_len, resp_size; + u8 desc_format; + u8 *response; + + alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); + desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); + desc_format &= REQUEST_SENSE_DESC_MASK; + + resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : + (FIXED_FMT_SENSE_DATA_SIZE)); + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out; + } + memset(response, 0, resp_size); + + if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { + /* Descriptor Format Sense Data */ + response[0] = DESC_FORMAT_SENSE_DATA; + response[1] = NO_SENSE; + /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ + response[2] = SCSI_ASC_NO_SENSE; + response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ + } else { + /* Fixed Format Sense Data */ + response[0] = FIXED_SENSE_DATA; + /* Byte 1 = Obsolete */ + response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ + /* Bytes 3-6 - Information - set to zero */ + response[7] = FIXED_SENSE_DATA_ADD_LENGTH; + /* Bytes 8-11 - Cmd Specific Information - set to zero */ + response[12] = SCSI_ASC_NO_SENSE; + response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* Byte 14 = Field Replaceable Unit Code = 0 */ + /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out: + return res; +} + +static int nvme_trans_security_protocol(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + u8 immed, pcmod, pc, no_flush, start; + + immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); + pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); + pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); + no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); + start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); + + immed &= START_STOP_UNIT_CDB_IMMED_MASK; + pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; + pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; + no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK; + start &= START_STOP_UNIT_CDB_START_MASK; + + if (immed != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } else { + if (no_flush == 0) { + /* Issue NVME FLUSH command prior to START STOP UNIT */ + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + put_nvmeq(nvmeq); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + } + /* Setup the expected power state transition */ + res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); + } + + out: + return res; +} + +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) + res = nvme_sc; + + out: + return res; +} + +static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 parm_hdr_len = 0; + u8 nvme_pf_code = 0; + u8 format_prot_info, long_list, format_data; + + format_prot_info = GET_U8_FROM_CDB(cmd, + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); + long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); + format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); + + format_prot_info = (format_prot_info & + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; + long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK; + format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; + + if (format_data != 0) { + if (format_prot_info != 0) { + if (long_list == 0) + parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; + else + parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; + } + } else if (format_data == 0 && format_prot_info != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + /* Get parm header from data-in/out buffer */ + /* + * According to the translation spec, the only fields in the parameter + * list we are concerned with are in the header. So allocate only that. + */ + if (parm_hdr_len > 0) { + res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, + format_prot_info, &nvme_pf_code); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + } + + /* Attempt to activate any previously downloaded firmware image */ + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); + + /* Determine Block size and count and send format command */ + res = nvme_trans_fmt_set_blk_size_count(ns, hdr); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); + + out: + return res; +} + +static int nvme_trans_test_unit_ready(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_dev *dev = ns->dev; + + if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + NOT_READY, SCSI_ASC_LUN_NOT_READY, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + else + res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); + + return res; +} + +static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u32 buffer_offset, parm_list_length; + u8 buffer_id, mode; + + parm_list_length = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); + if (parm_list_length % BYTES_TO_DWORDS != 0) { + /* NVMe expects Firmware file to be a whole number of DWORDS */ + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); + if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & + WRITE_BUFFER_CDB_MODE_MASK; + buffer_offset = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); + + switch (mode) { + case DOWNLOAD_SAVE_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case DOWNLOAD_SAVE_DEFER_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case ACTIVATE_DEFERRED_MICROCODE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) +{ + u8 cmd[BLK_MAX_CDB]; + int retcode; + unsigned int opcode; + + if (hdr->cmdp == NULL) + return -EMSGSIZE; + if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + + opcode = cmd[0]; + + switch (opcode) { + case READ_6: + case READ_10: + case READ_12: + case READ_16: + retcode = nvme_trans_io(ns, hdr, 0, cmd); + break; + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + retcode = nvme_trans_io(ns, hdr, 1, cmd); + break; + case INQUIRY: + retcode = nvme_trans_inquiry(ns, hdr, cmd); + break; + case LOG_SENSE: + retcode = nvme_trans_log_sense(ns, hdr, cmd); + break; + case MODE_SELECT: + case MODE_SELECT_10: + retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + case MODE_SENSE: + case MODE_SENSE_10: + retcode = nvme_trans_mode_sense(ns, hdr, cmd); + break; + case READ_CAPACITY: + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + break; + case SERVICE_ACTION_IN: + if (IS_READ_CAP_16(cmd)) + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + else + goto out; + break; + case REPORT_LUNS: + retcode = nvme_trans_report_luns(ns, hdr, cmd); + break; + case REQUEST_SENSE: + retcode = nvme_trans_request_sense(ns, hdr, cmd); + break; + case SECURITY_PROTOCOL_IN: + case SECURITY_PROTOCOL_OUT: + retcode = nvme_trans_security_protocol(ns, hdr, cmd); + break; + case START_STOP: + retcode = nvme_trans_start_stop(ns, hdr, cmd); + break; + case SYNCHRONIZE_CACHE: + retcode = nvme_trans_synchronize_cache(ns, hdr, cmd); + break; + case FORMAT_UNIT: + retcode = nvme_trans_format_unit(ns, hdr, cmd); + break; + case TEST_UNIT_READY: + retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); + break; + case WRITE_BUFFER: + retcode = nvme_trans_write_buffer(ns, hdr, cmd); + break; + default: + out: + retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + return retcode; +} + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) +{ + struct sg_io_hdr hdr; + int retcode; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) + return -EFAULT; + if (hdr.interface_id != 'S') + return -EINVAL; + if (hdr.cmd_len > BLK_MAX_CDB) + return -EINVAL; + + retcode = nvme_scsi_translate(ns, &hdr); + if (retcode < 0) + return retcode; + if (retcode > 0) + retcode = SNTI_TRANSLATION_SUCCESS; + if (copy_to_user(__user u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) + return -EFAULT; + + return retcode; +} + +int nvme_sg_get_version_num(int __user *ip) +{ + return put_user(sg_version_num, ip); +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f4e0649..90c1db8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -362,6 +362,16 @@ struct nvme_download_firmware { __u32 rsvd12[4]; }; +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -372,6 +382,7 @@ struct nvme_command { struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; }; }; @@ -388,6 +399,7 @@ enum { NVME_SC_FUSED_FAIL = 0x9, NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, @@ -504,6 +516,8 @@ struct nvme_ns { u32 mode_select_block_len; }; +#define NVME_IO_TIMEOUT (5 * HZ) + /* * The nvme_iod describes the data in an I/O, including the list of PRP * entries. You can't see it in this data structure because C doesn't let @@ -525,6 +539,32 @@ struct nvme_iod { * @dev: The device that the I/O was submitted to * @iod: The memory to free */ +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); + +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp); +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length); +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod); +struct nvme_queue *get_nvmeq(struct nvme_dev *dev); +void put_nvmeq(struct nvme_queue *nvmeq); +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout); +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); +int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, + u32 *result); +int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, + dma_addr_t dma_addr); +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result); +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result); + +struct sg_io_hdr; + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); +int nvme_sg_get_version_num(int __user *ip); #endif -- 1.7.0.4 Note that this patchset is dependent on: NVMe: Define SMART log NVMe: Add result to nvme_get_features NVMe: Set result from user admin command ^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2012-10-31 17:52 UTC | newest] Thread overview: 3+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2012-10-31 17:52 [PATCH v2 1/3] NVMe: Rename nvme.c to nvme-core.c Vishal Verma 2012-10-31 17:52 ` [PATCH v2 2/3] NVMe: Move dev, ns, iod structs to nvme.c Vishal Verma 2012-10-31 17:52 ` [PATCH v2 3/3] NVMe: Add nvme-scsi.c Vishal Verma
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).