All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stephen Bates <sbates@raithlin.com>
To: linux-kernel@vger.kernel.org, linux-nvdimm@lists.01.org,
	linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-mm@kvack.org
Cc: dan.j.williams@intel.com, ross.zwisler@linux.intel.com,
	willy@linux.intel.com, jgunthorpe@obsidianresearch.com,
	haggaie@mellanox.com, hch@infradead.org, axboe@fb.com,
	corbet@lwn.net, jim.macdonald@everspin.com, sbates@raithin.com,
	logang@deltatee.com, Stephen Bates <sbates@raithlin.com>
Subject: [PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory.
Date: Tue, 18 Oct 2016 15:42:16 -0600	[thread overview]
Message-ID: <1476826937-20665-3-git-send-email-sbates@raithlin.com> (raw)
In-Reply-To: <1476826937-20665-1-git-send-email-sbates@raithlin.com>

Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX capable block devices.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 MAINTAINERS            |   7 ++
 drivers/block/Kconfig  |  27 ++++
 drivers/block/Makefile |   1 +
 drivers/block/iopmem.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 drivers/block/iopmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M:	Stephen Bates <sbates@raithlin.com>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+F:	drivers/block/iopmem.c
+F:	Documentation/blockdev/iopmem.txt
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.

+config BLK_DEV_IOPMEM
+	tristate "Persistent block device backed by PCIe Memory"
+	depends on ZONE_DEVICE
+	default n
+	help
+	  Say Y here if you want to include a generic device driver
+	  that can create a block device from persistent PCIe attached
+	  IO memory.
+
+	  To compile this driver as a module, choose M here: The
+	  module will be called iopmem. A block device will be created
+	  for each PCIe attached device that matches the vendor and
+	  device ID as specified in the module. Alternatively, this
+	  driver can be bound to any arbitrary PCIe function using the
+	  sysfs bind entry.
+
+	  This block device supports direct access (DAX) file systems
+	  and supports struct page backing for the IO Memory. This
+	  makes the underlying memory suitable for things like RDMA
+	  Memory Regions and Direct IO which is useful for PCIe
+	  peer-to-peer DMA operations.
+
+	  Note that persistence is only assured if the memory on the
+	  PCIe card has some form of power loss protection. This could
+	  be provided via some form of battery, a supercap/NAND combo
+	  or some exciting new persistent memory technology.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM)	+= iopmem.o

 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,333 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
+/* Index of the PCI BAR that is exposed as the block device's memory. */
+static const int BAR_ID = 4;
+
+/*
+ * PCI vendor/device IDs this driver binds to automatically. The table is
+ * never written by the driver, so it is declared const. The zeroed entry
+ * terminates the list.
+ */
+static const struct pci_device_id iopmem_id_table[] = {
+	{ PCI_DEVICE(0x11f8, 0xf115) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+/*
+ * Per-device driver state. One instance is allocated in iopmem_probe(),
+ * stored as the PCI driver data, and freed in iopmem_remove().
+ */
+struct iopmem_device {
+	/* Block layer objects backing the disk for this device */
+	struct request_queue *queue;
+	struct gendisk *disk;
+	/* struct device of the PCI function; a reference is held on it */
+	struct device *dev;
+
+	/* Number taken from iopmem_instance_ida; used in the disk name */
+	int instance;
+
+	/* One contiguous memory region per device */
+	/* Start address of the BAR, its kernel mapping, and its length in bytes */
+	phys_addr_t		phys_addr;
+	void			*virt_addr;
+	size_t			size;
+};
+
+  /*
+   * We can only access the iopmem device with full 32-bit word
+   * accesses, which cannot be guaranteed by the regular memcpy.
+   */
+
+/*
+ * Copy sz bytes from src to dst. Every complete u64 in the range is moved
+ * with a single u64 load and store. If sz is not a multiple of
+ * sizeof(u64), one more whole u64 is loaded from src into a temporary and
+ * only the remaining bytes of it are copied to dst.
+ */
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+	const u64 *in = src;
+	u64 *out = dst;
+	size_t nwords = sz / sizeof(*out);
+	size_t tail = sz % sizeof(*out);
+	size_t i;
+
+	for (i = 0; i < nwords; i++)
+		out[i] = in[i];
+
+	if (tail) {
+		/* Load a full word, then hand out just the bytes asked for */
+		u64 last = in[nwords];
+
+		memcpy(&out[nwords], &last, tail);
+	}
+}
+
+/*
+ * Copy len bytes, starting off bytes into page, to iopmem_addr. The page
+ * is mapped with kmap_atomic() only for the duration of the copy.
+ */
+static void write_iopmem(void *iopmem_addr, struct page *page,
+		       unsigned int off, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	const void *from = kaddr + off;
+
+	memcpy(iopmem_addr, from, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Copy len bytes from iopmem_addr into page, starting off bytes into the
+ * page. The copy goes through memcpy_from_iopmem() and the page is mapped
+ * with kmap_atomic() only while the copy runs.
+ */
+static void read_iopmem(struct page *page, unsigned int off,
+			void *iopmem_addr, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	void *to = kaddr + off;
+
+	memcpy_from_iopmem(to, iopmem_addr, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Transfer len bytes between page (at byte offset off) and the device
+ * memory at the given 512-byte sector. is_write selects the direction:
+ * true copies page -> device, false copies device -> page.
+ * flush_dcache_page() is called before a write and after a read.
+ */
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+			   unsigned int len, unsigned int off, bool is_write,
+			   sector_t sector)
+{
+	phys_addr_t byte_off = sector * 512;
+	void *dev_addr = iopmem->virt_addr + byte_off;
+
+	if (is_write) {
+		flush_dcache_page(page);
+		write_iopmem(dev_addr, page, off, len);
+		return;
+	}
+
+	read_iopmem(page, off, dev_addr, len);
+	flush_dcache_page(page);
+}
+
+/*
+ * make_request handler: service the bio synchronously by copying each of
+ * its segments to or from device memory, then complete it.
+ * Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct iopmem_device *iopmem = q->queuedata;
+	const bool is_write = op_is_write(bio_op(bio));
+	struct bvec_iter pos;
+	struct bio_vec seg;
+
+	bio_for_each_segment(seg, bio, pos)
+		iopmem_do_bvec(iopmem, seg.bv_page, seg.bv_len, seg.bv_offset,
+			       is_write, pos.bi_sector);
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * rw_page handler: transfer one whole page (PAGE_SIZE bytes from offset 0)
+ * to or from the device at sector, then signal completion of the page I/O
+ * with a zero status. Always returns 0.
+ */
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct request_queue *q = bdev->bd_queue;
+	struct iopmem_device *iopmem = q->queuedata;
+
+	iopmem_do_bvec(iopmem, page, PAGE_SIZE, 0, is_write, sector);
+	page_endio(page, is_write, 0);
+
+	return 0;
+}
+
+/*
+ * direct_access handler: report the kernel virtual address and the pfn
+ * that correspond to the given 512-byte sector of the device.
+ *
+ * Returns the number of bytes from that sector to the end of the region,
+ * or -ENODEV if the queue has no iopmem device attached. The size
+ * argument is not used.
+ */
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+			       void **kaddr, pfn_t *pfn, long size)
+{
+	struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+	resource_size_t byte_off = sector * 512;
+	phys_addr_t phys;
+
+	if (!iopmem)
+		return -ENODEV;
+
+	phys = iopmem->phys_addr + byte_off;
+	*kaddr = iopmem->virt_addr + byte_off;
+	*pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);
+
+	return iopmem->size - byte_off;
+}
+
+/*
+ * Block device operations for iopmem disks: whole-page I/O and DAX direct
+ * access. Regular bio I/O goes through iopmem_make_request(), which is
+ * installed on the queue in iopmem_attach_disk().
+ */
+static const struct block_device_operations iopmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		iopmem_rw_page,
+	.direct_access =	iopmem_direct_access,
+};
+
+/* Allocator for the per-device instance numbers used in disk names. */
+static DEFINE_IDA(iopmem_instance_ida);
+/* Serializes ida_get_new()/ida_remove() calls on iopmem_instance_ida. */
+static DEFINE_SPINLOCK(ida_lock);
+
+/*
+ * Allocate a unique instance number and store it in iopmem->instance.
+ *
+ * Returns 0 on success, -ENOMEM if the IDA could not preallocate memory,
+ * or the error reported by ida_get_new().
+ */
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+	int instance, error;
+
+	do {
+		/* Preallocate outside the spinlock: GFP_KERNEL may sleep */
+		if (!ida_pre_get(&iopmem_instance_ida, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&ida_lock);
+		error = ida_get_new(&iopmem_instance_ida, &instance);
+		spin_unlock(&ida_lock);
+
+		/* -EAGAIN asks for another ida_pre_get() and a retry */
+	} while (error == -EAGAIN);
+
+	/* Hand the real failure reason to the caller instead of masking it */
+	if (error)
+		return error;
+
+	iopmem->instance = instance;
+	return 0;
+}
+
+/*
+ * Return the instance number held by iopmem to iopmem_instance_ida,
+ * under ida_lock.
+ */
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+	const int id = iopmem->instance;
+
+	spin_lock(&ida_lock);
+	ida_remove(&iopmem_instance_ida, id);
+	spin_unlock(&ida_lock);
+}
+
+/*
+ * Configure iopmem->queue, then allocate a gendisk named
+ * "iopmem<instance>" whose capacity is the region size in 512-byte
+ * sectors, and register it under iopmem->dev.
+ *
+ * Returns 0 on success or -ENOMEM if the gendisk cannot be allocated.
+ */
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+	struct request_queue *queue = iopmem->queue;
+	int node = dev_to_node(iopmem->dev);
+	struct gendisk *gd;
+
+	/* Queue setup: bio handler, limits and feature flags */
+	blk_queue_write_cache(queue, true, true);
+	blk_queue_make_request(queue, iopmem_make_request);
+	blk_queue_physical_block_size(queue, PAGE_SIZE);
+	blk_queue_max_hw_sectors(queue, UINT_MAX);
+	blk_queue_bounce_limit(queue, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, queue);
+	queue->queuedata = iopmem;
+
+	gd = alloc_disk_node(0, node);
+	if (unlikely(!gd))
+		return -ENOMEM;
+
+	/* Describe the disk before making it visible */
+	gd->fops = &iopmem_fops;
+	gd->queue = queue;
+	gd->flags = GENHD_FL_EXT_DEVT;
+	sprintf(gd->disk_name, "iopmem%d", iopmem->instance);
+	set_capacity(gd, iopmem->size / 512);
+	iopmem->disk = gd;
+
+	device_add_disk(iopmem->dev, gd);
+	revalidate_disk(gd);
+
+	return 0;
+}
+
+/* Unregister the device's gendisk and drop the reference held on it. */
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *gd = iopmem->disk;
+
+	del_gendisk(gd);
+	put_disk(gd);
+}
+
+/*
+ * Bind to a PCI function: enable it, reserve BAR_ID, map the BAR with
+ * devm_memremap_pages() and register a disk on top of the mapping.
+ *
+ * Returns 0 on success or a negative errno. On failure the resources
+ * taken explicitly here are released again through the labels at the end
+ * of the function.
+ */
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct iopmem_device *iopmem;
+	struct device *dev;
+	int err;
+	int nid = dev_to_node(&pdev->dev);
+
+	/*
+	 * Propagate the failure code: returning 0 here would report a
+	 * successful bind with no driver data set, and iopmem_remove()
+	 * would later dereference a NULL pointer.
+	 */
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "unable to enable device!\n");
+		goto out;
+	}
+
+	/*
+	 * The driver may be bound by hand to a function that has no
+	 * memory BAR at BAR_ID; refuse it instead of mapping a bogus range.
+	 */
+	if (!(pci_resource_flags(pdev, BAR_ID) & IORESOURCE_MEM) ||
+	    !pci_resource_len(pdev, BAR_ID)) {
+		dev_err(&pdev->dev, "BAR %d is not a memory BAR\n", BAR_ID);
+		err = -ENODEV;
+		goto out_disable_device;
+	}
+
+	iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+	if (unlikely(!iopmem)) {
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+	iopmem->size = pci_resource_len(pdev, BAR_ID);
+	iopmem->dev = dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, iopmem);
+
+	err = iopmem_set_instance(iopmem);
+	if (err)
+		goto out_put_device;
+
+	dev_info(dev, "bar space 0x%llx len %llu\n",
+		(unsigned long long) iopmem->phys_addr,
+		(unsigned long long) iopmem->size);
+
+	if (!devm_request_mem_region(dev, iopmem->phys_addr,
+				     iopmem->size, dev_name(dev))) {
+		/* %pa already prints a leading "0x" */
+		dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+			 &iopmem->phys_addr, iopmem->size);
+		err = -EBUSY;
+		goto out_release_instance;
+	}
+
+	iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+	if (!iopmem->queue) {
+		err = -ENOMEM;
+		goto out_release_instance;
+	}
+
+	iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+				&iopmem->queue->q_usage_counter,
+				NULL, MEMREMAP_WC);
+	if (IS_ERR(iopmem->virt_addr)) {
+		/* Keep the specific reason the mapping failed */
+		err = PTR_ERR(iopmem->virt_addr);
+		goto out_free_queue;
+	}
+
+	err = iopmem_attach_disk(iopmem);
+	if (err)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+	iopmem_release_instance(iopmem);
+out_put_device:
+	put_device(&pdev->dev);
+	kfree(iopmem);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+/*
+ * Unbind from a PCI function: tear down the disk and queue, then release
+ * the instance number, the device reference and the driver state that
+ * iopmem_probe() set up, and finally disable the PCI device. The steps
+ * run in roughly the reverse order of probe.
+ */
+static void iopmem_remove(struct pci_dev *pdev)
+{
+	struct iopmem_device *iopmem = pci_get_drvdata(pdev);
+
+	/* Flag the queue as dying before the disk is unregistered */
+	blk_set_queue_dying(iopmem->queue);
+	iopmem_detach_disk(iopmem);
+	blk_cleanup_queue(iopmem->queue);
+	iopmem_release_instance(iopmem);
+	/* Drop the reference taken with get_device() in iopmem_probe() */
+	put_device(iopmem->dev);
+	kfree(iopmem);
+	pci_disable_device(pdev);
+}
+
+/*
+ * PCI driver descriptor: binds iopmem_probe()/iopmem_remove() to the
+ * devices listed in iopmem_id_table under the driver name "iopmem".
+ */
+static struct pci_driver iopmem_pci_driver = {
+	.name = "iopmem",
+	.id_table = iopmem_id_table,
+	.probe = iopmem_probe,
+	.remove = iopmem_remove,
+};
+
+/*
+ * Module init: register the PCI driver. Logs a message when registration
+ * succeeds and returns the result of pci_register_driver().
+ */
+static int __init iopmem_init(void)
+{
+	int rc = pci_register_driver(&iopmem_pci_driver);
+
+	if (!rc)
+		pr_info("iopmem: module loaded\n");
+
+	return rc;
+}
+
+/* Module exit: unregister the PCI driver and log the unload. */
+static void __exit iopmem_exit(void)
+{
+	pci_unregister_driver(&iopmem_pci_driver);
+	pr_info("iopmem: module unloaded\n");
+}
+
+/* Module metadata and the init/exit entry points */
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_LICENSE("GPL");
+module_init(iopmem_init);
+module_exit(iopmem_exit);
--
2.1.4

WARNING: multiple messages have this Message-ID (diff)
From: Stephen Bates <sbates@raithlin.com>
To: linux-kernel@vger.kernel.org, linux-nvdimm@lists.01.org,
	linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-mm@kvack.org
Cc: hch@infradead.org, sbates@raithin.com, haggaie@mellanox.com,
	axboe@fb.com, corbet@lwn.net, jim.macdonald@everspin.com,
	Stephen Bates <sbates@raithlin.com>,
	jgunthorpe@obsidianresearch.com
Subject: [PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory.
Date: Tue, 18 Oct 2016 15:42:16 -0600	[thread overview]
Message-ID: <1476826937-20665-3-git-send-email-sbates@raithlin.com> (raw)
In-Reply-To: <1476826937-20665-1-git-send-email-sbates@raithlin.com>

Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX capable block devices.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 MAINTAINERS            |   7 ++
 drivers/block/Kconfig  |  27 ++++
 drivers/block/Makefile |   1 +
 drivers/block/iopmem.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 drivers/block/iopmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M:	Stephen Bates <sbates@raithlin.com>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+F:	drivers/block/iopmem.c
+F:	Documentation/blockdev/iopmem.txt
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.

+config BLK_DEV_IOPMEM
+	tristate "Persistent block device backed by PCIe Memory"
+	depends on ZONE_DEVICE
+	default n
+	help
+	  Say Y here if you want to include a generic device driver
+	  that can create a block device from persistent PCIe attached
+	  IO memory.
+
+	  To compile this driver as a module, choose M here: The
+	  module will be called iopmem. A block device will be created
+	  for each PCIe attached device that matches the vendor and
+	  device ID as specified in the module. Alternatively, this
+	  driver can be bound to any arbitrary PCIe function using the
+	  sysfs bind entry.
+
+	  This block device supports direct access (DAX) file systems
+	  and supports struct page backing for the IO Memory. This
+	  makes the underlying memory suitable for things like RDMA
+	  Memory Regions and Direct IO which is useful for PCIe
+	  peer-to-peer DMA operations.
+
+	  Note that persistence is only assured if the memory on the
+	  PCIe card has some form of power loss protection. This could
+	  be provided via some form of battery, a supercap/NAND combo
+	  or some exciting new persistent memory technology.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM)	+= iopmem.o

 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,333 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
+/* Index of the PCI BAR that is exposed as the block device's memory. */
+static const int BAR_ID = 4;
+
+/*
+ * PCI vendor/device IDs this driver binds to automatically. The table is
+ * never written by the driver, so it is declared const. The zeroed entry
+ * terminates the list.
+ */
+static const struct pci_device_id iopmem_id_table[] = {
+	{ PCI_DEVICE(0x11f8, 0xf115) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+/*
+ * Per-device driver state. One instance is allocated in iopmem_probe(),
+ * stored as the PCI driver data, and freed in iopmem_remove().
+ */
+struct iopmem_device {
+	/* Block layer objects backing the disk for this device */
+	struct request_queue *queue;
+	struct gendisk *disk;
+	/* struct device of the PCI function; a reference is held on it */
+	struct device *dev;
+
+	/* Number taken from iopmem_instance_ida; used in the disk name */
+	int instance;
+
+	/* One contiguous memory region per device */
+	/* Start address of the BAR, its kernel mapping, and its length in bytes */
+	phys_addr_t		phys_addr;
+	void			*virt_addr;
+	size_t			size;
+};
+
+  /*
+   * We can only access the iopmem device with full 32-bit word
+   * accesses, which cannot be guaranteed by the regular memcpy.
+   */
+
+/*
+ * Copy sz bytes from src to dst. Every complete u64 in the range is moved
+ * with a single u64 load and store. If sz is not a multiple of
+ * sizeof(u64), one more whole u64 is loaded from src into a temporary and
+ * only the remaining bytes of it are copied to dst.
+ */
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+	const u64 *in = src;
+	u64 *out = dst;
+	size_t nwords = sz / sizeof(*out);
+	size_t tail = sz % sizeof(*out);
+	size_t i;
+
+	for (i = 0; i < nwords; i++)
+		out[i] = in[i];
+
+	if (tail) {
+		/* Load a full word, then hand out just the bytes asked for */
+		u64 last = in[nwords];
+
+		memcpy(&out[nwords], &last, tail);
+	}
+}
+
+/*
+ * Copy len bytes, starting off bytes into page, to iopmem_addr. The page
+ * is mapped with kmap_atomic() only for the duration of the copy.
+ */
+static void write_iopmem(void *iopmem_addr, struct page *page,
+		       unsigned int off, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	const void *from = kaddr + off;
+
+	memcpy(iopmem_addr, from, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Copy len bytes from iopmem_addr into page, starting off bytes into the
+ * page. The copy goes through memcpy_from_iopmem() and the page is mapped
+ * with kmap_atomic() only while the copy runs.
+ */
+static void read_iopmem(struct page *page, unsigned int off,
+			void *iopmem_addr, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	void *to = kaddr + off;
+
+	memcpy_from_iopmem(to, iopmem_addr, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Transfer len bytes between page (at byte offset off) and the device
+ * memory at the given 512-byte sector. is_write selects the direction:
+ * true copies page -> device, false copies device -> page.
+ * flush_dcache_page() is called before a write and after a read.
+ */
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+			   unsigned int len, unsigned int off, bool is_write,
+			   sector_t sector)
+{
+	phys_addr_t byte_off = sector * 512;
+	void *dev_addr = iopmem->virt_addr + byte_off;
+
+	if (is_write) {
+		flush_dcache_page(page);
+		write_iopmem(dev_addr, page, off, len);
+		return;
+	}
+
+	read_iopmem(page, off, dev_addr, len);
+	flush_dcache_page(page);
+}
+
+/*
+ * make_request handler: service the bio synchronously by copying each of
+ * its segments to or from device memory, then complete it.
+ * Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct iopmem_device *iopmem = q->queuedata;
+	const bool is_write = op_is_write(bio_op(bio));
+	struct bvec_iter pos;
+	struct bio_vec seg;
+
+	bio_for_each_segment(seg, bio, pos)
+		iopmem_do_bvec(iopmem, seg.bv_page, seg.bv_len, seg.bv_offset,
+			       is_write, pos.bi_sector);
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * rw_page handler: transfer one whole page (PAGE_SIZE bytes from offset 0)
+ * to or from the device at sector, then signal completion of the page I/O
+ * with a zero status. Always returns 0.
+ */
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct request_queue *q = bdev->bd_queue;
+	struct iopmem_device *iopmem = q->queuedata;
+
+	iopmem_do_bvec(iopmem, page, PAGE_SIZE, 0, is_write, sector);
+	page_endio(page, is_write, 0);
+
+	return 0;
+}
+
+/*
+ * direct_access handler: report the kernel virtual address and the pfn
+ * that correspond to the given 512-byte sector of the device.
+ *
+ * Returns the number of bytes from that sector to the end of the region,
+ * or -ENODEV if the queue has no iopmem device attached. The size
+ * argument is not used.
+ */
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+			       void **kaddr, pfn_t *pfn, long size)
+{
+	struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+	resource_size_t byte_off = sector * 512;
+	phys_addr_t phys;
+
+	if (!iopmem)
+		return -ENODEV;
+
+	phys = iopmem->phys_addr + byte_off;
+	*kaddr = iopmem->virt_addr + byte_off;
+	*pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);
+
+	return iopmem->size - byte_off;
+}
+
+/*
+ * Block device operations for iopmem disks: whole-page I/O and DAX direct
+ * access. Regular bio I/O goes through iopmem_make_request(), which is
+ * installed on the queue in iopmem_attach_disk().
+ */
+static const struct block_device_operations iopmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		iopmem_rw_page,
+	.direct_access =	iopmem_direct_access,
+};
+
+/* Allocator for the per-device instance numbers used in disk names. */
+static DEFINE_IDA(iopmem_instance_ida);
+/* Serializes ida_get_new()/ida_remove() calls on iopmem_instance_ida. */
+static DEFINE_SPINLOCK(ida_lock);
+
+/*
+ * Allocate a unique instance number and store it in iopmem->instance.
+ *
+ * Returns 0 on success, -ENOMEM if the IDA could not preallocate memory,
+ * or the error reported by ida_get_new().
+ */
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+	int instance, error;
+
+	do {
+		/* Preallocate outside the spinlock: GFP_KERNEL may sleep */
+		if (!ida_pre_get(&iopmem_instance_ida, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&ida_lock);
+		error = ida_get_new(&iopmem_instance_ida, &instance);
+		spin_unlock(&ida_lock);
+
+		/* -EAGAIN asks for another ida_pre_get() and a retry */
+	} while (error == -EAGAIN);
+
+	/* Hand the real failure reason to the caller instead of masking it */
+	if (error)
+		return error;
+
+	iopmem->instance = instance;
+	return 0;
+}
+
+/*
+ * Return the instance number held by iopmem to iopmem_instance_ida,
+ * under ida_lock.
+ */
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+	const int id = iopmem->instance;
+
+	spin_lock(&ida_lock);
+	ida_remove(&iopmem_instance_ida, id);
+	spin_unlock(&ida_lock);
+}
+
+/*
+ * Configure iopmem->queue, then allocate a gendisk named
+ * "iopmem<instance>" whose capacity is the region size in 512-byte
+ * sectors, and register it under iopmem->dev.
+ *
+ * Returns 0 on success or -ENOMEM if the gendisk cannot be allocated.
+ */
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+	struct request_queue *queue = iopmem->queue;
+	int node = dev_to_node(iopmem->dev);
+	struct gendisk *gd;
+
+	/* Queue setup: bio handler, limits and feature flags */
+	blk_queue_write_cache(queue, true, true);
+	blk_queue_make_request(queue, iopmem_make_request);
+	blk_queue_physical_block_size(queue, PAGE_SIZE);
+	blk_queue_max_hw_sectors(queue, UINT_MAX);
+	blk_queue_bounce_limit(queue, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, queue);
+	queue->queuedata = iopmem;
+
+	gd = alloc_disk_node(0, node);
+	if (unlikely(!gd))
+		return -ENOMEM;
+
+	/* Describe the disk before making it visible */
+	gd->fops = &iopmem_fops;
+	gd->queue = queue;
+	gd->flags = GENHD_FL_EXT_DEVT;
+	sprintf(gd->disk_name, "iopmem%d", iopmem->instance);
+	set_capacity(gd, iopmem->size / 512);
+	iopmem->disk = gd;
+
+	device_add_disk(iopmem->dev, gd);
+	revalidate_disk(gd);
+
+	return 0;
+}
+
+/* Unregister the device's gendisk and drop the reference held on it. */
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *gd = iopmem->disk;
+
+	del_gendisk(gd);
+	put_disk(gd);
+}
+
+/*
+ * Bind to a PCI function: enable it, reserve BAR_ID, map the BAR with
+ * devm_memremap_pages() and register a disk on top of the mapping.
+ *
+ * Returns 0 on success or a negative errno. On failure the resources
+ * taken explicitly here are released again through the labels at the end
+ * of the function.
+ */
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct iopmem_device *iopmem;
+	struct device *dev;
+	int err;
+	int nid = dev_to_node(&pdev->dev);
+
+	/*
+	 * Propagate the failure code: returning 0 here would report a
+	 * successful bind with no driver data set, and iopmem_remove()
+	 * would later dereference a NULL pointer.
+	 */
+	err = pci_enable_device_mem(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "unable to enable device!\n");
+		goto out;
+	}
+
+	/*
+	 * The driver may be bound by hand to a function that has no
+	 * memory BAR at BAR_ID; refuse it instead of mapping a bogus range.
+	 */
+	if (!(pci_resource_flags(pdev, BAR_ID) & IORESOURCE_MEM) ||
+	    !pci_resource_len(pdev, BAR_ID)) {
+		dev_err(&pdev->dev, "BAR %d is not a memory BAR\n", BAR_ID);
+		err = -ENODEV;
+		goto out_disable_device;
+	}
+
+	iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+	if (unlikely(!iopmem)) {
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+	iopmem->size = pci_resource_len(pdev, BAR_ID);
+	iopmem->dev = dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, iopmem);
+
+	err = iopmem_set_instance(iopmem);
+	if (err)
+		goto out_put_device;
+
+	dev_info(dev, "bar space 0x%llx len %llu\n",
+		(unsigned long long) iopmem->phys_addr,
+		(unsigned long long) iopmem->size);
+
+	if (!devm_request_mem_region(dev, iopmem->phys_addr,
+				     iopmem->size, dev_name(dev))) {
+		/* %pa already prints a leading "0x" */
+		dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+			 &iopmem->phys_addr, iopmem->size);
+		err = -EBUSY;
+		goto out_release_instance;
+	}
+
+	iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+	if (!iopmem->queue) {
+		err = -ENOMEM;
+		goto out_release_instance;
+	}
+
+	iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+				&iopmem->queue->q_usage_counter,
+				NULL, MEMREMAP_WC);
+	if (IS_ERR(iopmem->virt_addr)) {
+		/* Keep the specific reason the mapping failed */
+		err = PTR_ERR(iopmem->virt_addr);
+		goto out_free_queue;
+	}
+
+	err = iopmem_attach_disk(iopmem);
+	if (err)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+	iopmem_release_instance(iopmem);
+out_put_device:
+	put_device(&pdev->dev);
+	kfree(iopmem);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+/*
+ * Unbind from a PCI function: tear down the disk and queue, then release
+ * the instance number, the device reference and the driver state that
+ * iopmem_probe() set up, and finally disable the PCI device. The steps
+ * run in roughly the reverse order of probe.
+ */
+static void iopmem_remove(struct pci_dev *pdev)
+{
+	struct iopmem_device *iopmem = pci_get_drvdata(pdev);
+
+	/* Flag the queue as dying before the disk is unregistered */
+	blk_set_queue_dying(iopmem->queue);
+	iopmem_detach_disk(iopmem);
+	blk_cleanup_queue(iopmem->queue);
+	iopmem_release_instance(iopmem);
+	/* Drop the reference taken with get_device() in iopmem_probe() */
+	put_device(iopmem->dev);
+	kfree(iopmem);
+	pci_disable_device(pdev);
+}
+
+/*
+ * PCI driver descriptor: binds iopmem_probe()/iopmem_remove() to the
+ * devices listed in iopmem_id_table under the driver name "iopmem".
+ */
+static struct pci_driver iopmem_pci_driver = {
+	.name = "iopmem",
+	.id_table = iopmem_id_table,
+	.probe = iopmem_probe,
+	.remove = iopmem_remove,
+};
+
+/*
+ * Module init: register the PCI driver. Logs a message when registration
+ * succeeds and returns the result of pci_register_driver().
+ */
+static int __init iopmem_init(void)
+{
+	int rc = pci_register_driver(&iopmem_pci_driver);
+
+	if (!rc)
+		pr_info("iopmem: module loaded\n");
+
+	return rc;
+}
+
+/* Module exit: unregister the PCI driver and log the unload. */
+static void __exit iopmem_exit(void)
+{
+	pci_unregister_driver(&iopmem_pci_driver);
+	pr_info("iopmem: module unloaded\n");
+}
+
+/* Module metadata and the init/exit entry points */
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_LICENSE("GPL");
+module_init(iopmem_init);
+module_exit(iopmem_exit);
--
2.1.4
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

WARNING: multiple messages have this Message-ID (diff)
From: Stephen Bates <sbates@raithlin.com>
To: linux-kernel@vger.kernel.org, linux-nvdimm@lists.01.org,
	linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-mm@kvack.org
Cc: dan.j.williams@intel.com, ross.zwisler@linux.intel.com,
	willy@linux.intel.com, jgunthorpe@obsidianresearch.com,
	haggaie@mellanox.com, hch@infradead.org, axboe@fb.com,
	corbet@lwn.net, jim.macdonald@everspin.com, sbates@raithin.com,
	logang@deltatee.com, Stephen Bates <sbates@raithlin.com>
Subject: [PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory.
Date: Tue, 18 Oct 2016 15:42:16 -0600	[thread overview]
Message-ID: <1476826937-20665-3-git-send-email-sbates@raithlin.com> (raw)
In-Reply-To: <1476826937-20665-1-git-send-email-sbates@raithlin.com>

Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX capable block devices.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 MAINTAINERS            |   7 ++
 drivers/block/Kconfig  |  27 ++++
 drivers/block/Makefile |   1 +
 drivers/block/iopmem.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 drivers/block/iopmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M:	Stephen Bates <sbates@raithlin.com>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+F:	drivers/block/iopmem.c
+F:	Documentation/blockdev/iopmem.txt
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.

+config BLK_DEV_IOPMEM
+	tristate "Persistent block device backed by PCIe Memory"
+	depends on ZONE_DEVICE
+	default n
+	help
+	  Say Y here if you want to include a generic device driver
+	  that can create a block device from persistent PCIe attached
+	  IO memory.
+
+	  To compile this driver as a module, choose M here: The
+	  module will be called iopmem. A block device will be created
+	  for each PCIe attached device that matches the vendor and
+	  device ID as specified in the module. Alternatively, this
+	  driver can be bound to any arbitrary PCIe function using the
+	  sysfs bind entry.
+
+	  This block device supports direct access (DAX) file systems
+	  and supports struct page backing for the IO Memory. This
+	  makes the underlying memory suitable for things like RDMA
+	  Memory Regions and Direct IO which is useful for PCIe
+	  peer-to-peer DMA operations.
+
+	  Note that persistence is only assured if the memory on the
+	  PCIe card has some form of power loss protection. This could
+	  be provided via some form of battery, a supercap/NAND combo
+	  or some exciting new persistent memory technology.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM)	+= iopmem.o

 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,333 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
+/* Index of the PCI BAR whose resource range is used as the device memory. */
+static const int BAR_ID = 4;
+
+/*
+ * PCI vendor/device IDs this driver binds to by default.  Nothing in this
+ * file writes to the table after initialisation, so it is declared const.
+ */
+static const struct pci_device_id iopmem_id_table[] = {
+	{ PCI_DEVICE(0x11f8, 0xf115) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+/* Per-device state, allocated in iopmem_probe() and freed in iopmem_remove(). */
+struct iopmem_device {
+	/* Request queue allocated with blk_alloc_queue_node() in probe. */
+	struct request_queue *queue;
+	/* Disk allocated and registered by iopmem_attach_disk(). */
+	struct gendisk *disk;
+	/* &pdev->dev, with a reference held via get_device(). */
+	struct device *dev;
+
+	/* Id obtained from iopmem_instance_ida; used in the "iopmem%d" name. */
+	int instance;
+
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;	/* pci_resource_start() of the BAR */
+	void			*virt_addr;	/* devm_memremap_pages() result */
+	size_t			size;		/* BAR length in bytes */
+};
+
+  /*
+   * We can only access the iopmem device with full 32-bit word
+   * accesses which cannot be guaranteed by the regular memcpy
+   */
+
+/*
+ * Copy @sz bytes from @src to @dst one u64 at a time.
+ *
+ * All whole u64 words are copied with word-sized loads and stores.  If @sz
+ * is not a multiple of sizeof(u64), one further full u64 is loaded from
+ * @src and only its first (@sz % sizeof(u64)) bytes are stored to @dst.
+ */
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+	const u64 *in = src;
+	u64 *out = dst;
+	size_t nwords = sz / sizeof(u64);
+	size_t rem = sz % sizeof(u64);
+	size_t i;
+	u64 last;
+
+	for (i = 0; i < nwords; i++)
+		out[i] = in[i];
+
+	if (rem) {
+		/* Load a whole word, then keep only the bytes asked for. */
+		last = in[nwords];
+		memcpy(&out[nwords], &last, rem);
+	}
+}
+
+/*
+ * Copy @len bytes from @page, starting at byte offset @off, to the device
+ * memory at @iopmem_addr.  The page is mapped with kmap_atomic() for the
+ * duration of the copy, which is done with the regular memcpy().
+ */
+static void write_iopmem(void *iopmem_addr, struct page *page,
+		       unsigned int off, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	const void *from = kaddr + off;
+
+	memcpy(iopmem_addr, from, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Copy @len bytes from the device memory at @iopmem_addr into @page at
+ * byte offset @off.  The page is mapped with kmap_atomic() for the
+ * duration of the copy, which is done with memcpy_from_iopmem().
+ */
+static void read_iopmem(struct page *page, unsigned int off,
+			void *iopmem_addr, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	void *to = kaddr + off;
+
+	memcpy_from_iopmem(to, iopmem_addr, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Transfer one segment between @page and the device memory.
+ *
+ * @len and @off describe the segment inside @page; @sector (in 512-byte
+ * units) selects the position in the device memory.  For a write the page
+ * is flushed with flush_dcache_page() before the copy, for a read after it.
+ */
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+			   unsigned int len, unsigned int off, bool is_write,
+			   sector_t sector)
+{
+	phys_addr_t byte_off = sector * 512;
+	void *iopmem_addr = iopmem->virt_addr + byte_off;
+
+	if (is_write) {
+		flush_dcache_page(page);
+		write_iopmem(iopmem_addr, page, off, len);
+		return;
+	}
+
+	read_iopmem(page, off, iopmem_addr, len);
+	flush_dcache_page(page);
+}
+
+/*
+ * make_request function for the iopmem queue: every segment of @bio is
+ * copied synchronously by iopmem_do_bvec(), then the bio is completed.
+ * Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct iopmem_device *iopmem = q->queuedata;
+	const bool is_write = op_is_write(bio_op(bio));
+	struct bvec_iter pos;
+	struct bio_vec seg;
+
+	bio_for_each_segment(seg, bio, pos)
+		iopmem_do_bvec(iopmem, seg.bv_page, seg.bv_len, seg.bv_offset,
+			       is_write, pos.bi_sector);
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * rw_page operation: transfer one whole page (PAGE_SIZE bytes at page
+ * offset 0) to or from the device at @sector, then signal completion with
+ * page_endio().  Always returns 0.
+ */
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct request_queue *q = bdev->bd_queue;
+	struct iopmem_device *iopmem = q->queuedata;
+
+	iopmem_do_bvec(iopmem, page, PAGE_SIZE, 0, is_write, sector);
+	page_endio(page, is_write, 0);
+
+	return 0;
+}
+
+/*
+ * direct_access operation: report where @sector lives in the device memory.
+ *
+ * On success *@kaddr receives the kernel virtual address and *@pfn the
+ * page frame (flagged PFN_DEV | PFN_MAP) for byte offset @sector * 512.
+ * The return value is the number of bytes from that offset to the end of
+ * the region, or -ENODEV if the queue has no iopmem device attached.
+ * @size is not used.
+ */
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+			       void **kaddr, pfn_t *pfn, long size)
+{
+	struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+	resource_size_t byte_off;
+
+	if (iopmem == NULL)
+		return -ENODEV;
+
+	byte_off = sector * 512;
+	*kaddr = iopmem->virt_addr + byte_off;
+	*pfn = phys_to_pfn_t(iopmem->phys_addr + byte_off, PFN_DEV | PFN_MAP);
+
+	return iopmem->size - byte_off;
+}
+
+/*
+ * Block device operations installed on every iopmem disk: whole-page I/O
+ * through iopmem_rw_page() and direct access through
+ * iopmem_direct_access().
+ */
+static const struct block_device_operations iopmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		iopmem_rw_page,
+	.direct_access =	iopmem_direct_access,
+};
+
+static DEFINE_IDA(iopmem_instance_ida);
+static DEFINE_SPINLOCK(ida_lock);
+
+/*
+ * Allocate a new instance id from iopmem_instance_ida and store it in
+ * @iopmem->instance.
+ *
+ * Returns 0 on success, -ENOMEM if the IDA could not preallocate memory,
+ * or the error reported by ida_get_new() (other than -EAGAIN, which is
+ * retried).
+ */
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+	int instance, error;
+
+	do {
+		/* ida_pre_get() returns 0 when it could not allocate memory. */
+		if (!ida_pre_get(&iopmem_instance_ida, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&ida_lock);
+		error = ida_get_new(&iopmem_instance_ida, &instance);
+		spin_unlock(&ida_lock);
+
+		/* -EAGAIN: the preallocation was consumed; preload and retry. */
+	} while (error == -EAGAIN);
+
+	if (error)
+		return error;
+
+	iopmem->instance = instance;
+	return 0;
+}
+
+/*
+ * Give the instance id stored in @iopmem back to iopmem_instance_ida,
+ * holding ida_lock around the removal.
+ */
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+	const int id = iopmem->instance;
+
+	spin_lock(&ida_lock);
+	ida_remove(&iopmem_instance_ida, id);
+	spin_unlock(&ida_lock);
+}
+
+/*
+ * Configure @iopmem->queue, then allocate, fill in and register the
+ * gendisk named "iopmem<instance>" whose capacity is the region size in
+ * 512-byte sectors.
+ *
+ * Returns 0 on success or -ENOMEM if the disk cannot be allocated; in the
+ * failure case the queue is left for the caller to clean up.
+ */
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+	struct request_queue *queue = iopmem->queue;
+	struct gendisk *gd;
+
+	blk_queue_write_cache(queue, true, true);
+	blk_queue_make_request(queue, iopmem_make_request);
+	blk_queue_physical_block_size(queue, PAGE_SIZE);
+	blk_queue_max_hw_sectors(queue, UINT_MAX);
+	blk_queue_bounce_limit(queue, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, queue);
+	queue->queuedata = iopmem;
+
+	/* Allocate the disk on the same NUMA node as the device. */
+	gd = alloc_disk_node(0, dev_to_node(iopmem->dev));
+	if (unlikely(gd == NULL))
+		return -ENOMEM;
+
+	gd->fops = &iopmem_fops;
+	gd->queue = queue;
+	gd->flags = GENHD_FL_EXT_DEVT;
+	sprintf(gd->disk_name, "iopmem%d", iopmem->instance);
+	set_capacity(gd, iopmem->size / 512);
+	iopmem->disk = gd;
+
+	device_add_disk(iopmem->dev, gd);
+	revalidate_disk(gd);
+
+	return 0;
+}
+
+/* Unregister the gendisk of @iopmem and drop the reference held on it. */
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *gd = iopmem->disk;
+
+	del_gendisk(gd);
+	put_disk(gd);
+}
+
+/*
+ * PCI probe callback: turn BAR BAR_ID of @pdev into an iopmem block device.
+ *
+ * Enables the device, allocates the per-device state, reserves an instance
+ * id, requests and remaps the BAR, and finally attaches the disk.  @id is
+ * not used.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was set up here is undone through the out_* labels.
+ */
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct iopmem_device *iopmem;
+	struct device *dev;
+	int err;
+	int nid = dev_to_node(&pdev->dev);
+
+	/*
+	 * Keep the real error code: leaving err at 0 here would make a
+	 * failed enable look like a successful probe.
+	 */
+	err = pci_enable_device_mem(pdev);
+	if (err < 0) {
+		dev_err(&pdev->dev, "unable to enable device!\n");
+		goto out;
+	}
+
+	iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+	if (unlikely(!iopmem)) {
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+	iopmem->size = pci_resource_end(pdev, BAR_ID) - iopmem->phys_addr + 1;
+	iopmem->dev = dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, iopmem);
+
+	err = iopmem_set_instance(iopmem);
+	if (err)
+		goto out_put_device;
+
+	/* The length is unsigned, so print it with %llu. */
+	dev_info(dev, "bar space 0x%llx len %llu\n",
+		(unsigned long long) iopmem->phys_addr,
+		(unsigned long long) iopmem->size);
+
+	if (!devm_request_mem_region(dev, iopmem->phys_addr,
+				     iopmem->size, dev_name(dev))) {
+		/* %pa already emits a "0x" prefix; do not add another. */
+		dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+			 &iopmem->phys_addr, iopmem->size);
+		err = -EBUSY;
+		goto out_release_instance;
+	}
+
+	iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+	if (!iopmem->queue) {
+		err = -ENOMEM;
+		goto out_release_instance;
+	}
+
+	iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+				&iopmem->queue->q_usage_counter,
+				NULL, MEMREMAP_WC);
+	if (IS_ERR(iopmem->virt_addr)) {
+		err = -ENXIO;
+		goto out_free_queue;
+	}
+
+	err = iopmem_attach_disk(iopmem);
+	if (err)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+	iopmem_release_instance(iopmem);
+out_put_device:
+	put_device(&pdev->dev);
+	kfree(iopmem);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+/*
+ * PCI remove callback: tear down what iopmem_probe() set up for @pdev.
+ * The per-device state is looked up through the PCI driver data.
+ */
+static void iopmem_remove(struct pci_dev *pdev)
+{
+	struct iopmem_device *iopmem = pci_get_drvdata(pdev);
+
+	/* Mark the queue dying before the disk is removed. */
+	blk_set_queue_dying(iopmem->queue);
+	iopmem_detach_disk(iopmem);
+	blk_cleanup_queue(iopmem->queue);
+	iopmem_release_instance(iopmem);
+	/* Drops the reference taken with get_device() in iopmem_probe(). */
+	put_device(iopmem->dev);
+	kfree(iopmem);
+	pci_disable_device(pdev);
+}
+
+/* PCI driver description: matches iopmem_id_table, probe/remove as below. */
+static struct pci_driver iopmem_pci_driver = {
+	.name = "iopmem",
+	.id_table = iopmem_id_table,
+	.probe = iopmem_probe,
+	.remove = iopmem_remove,
+};
+
+/*
+ * Module init: register the PCI driver.  Returns the result of
+ * pci_register_driver(); the "module loaded" message is only logged when
+ * registration succeeded.
+ */
+static int __init iopmem_init(void)
+{
+	int rc = pci_register_driver(&iopmem_pci_driver);
+
+	if (!rc)
+		pr_info("iopmem: module loaded\n");
+
+	return rc;
+}
+
+/* Module exit: unregister the PCI driver, then log that the module is gone. */
+static void __exit iopmem_exit(void)
+{
+	pci_unregister_driver(&iopmem_pci_driver);
+	pr_info("iopmem: module unloaded\n");
+}
+
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_LICENSE("GPL");
+module_init(iopmem_init);
+module_exit(iopmem_exit);
--
2.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)
From: Stephen Bates <sbates@raithlin.com>
To: linux-kernel@vger.kernel.org, linux-nvdimm@ml01.01.org,
	linux-rdma@vger.kernel.org, linux-block@vger.kernel.org,
	linux-mm@kvack.org
Cc: dan.j.williams@intel.com, ross.zwisler@linux.intel.com,
	willy@linux.intel.com, jgunthorpe@obsidianresearch.com,
	haggaie@mellanox.com, hch@infradead.org, axboe@fb.com,
	corbet@lwn.net, jim.macdonald@everspin.com, sbates@raithin.com,
	logang@deltatee.com, Stephen Bates <sbates@raithlin.com>
Subject: [PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory.
Date: Tue, 18 Oct 2016 15:42:16 -0600	[thread overview]
Message-ID: <1476826937-20665-3-git-send-email-sbates@raithlin.com> (raw)
In-Reply-To: <1476826937-20665-1-git-send-email-sbates@raithlin.com>

Add a new block device driver that binds to PCIe devices and turns
PCIe BARs into DAX capable block devices.

Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
---
 MAINTAINERS            |   7 ++
 drivers/block/Kconfig  |  27 ++++
 drivers/block/Makefile |   1 +
 drivers/block/iopmem.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 368 insertions(+)
 create mode 100644 drivers/block/iopmem.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7..c379f9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6510,6 +6510,13 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/iommu/
 F:	drivers/iommu/

+IOPMEM BLOCK DEVICE DRIVER
+M:	Stephen Bates <sbates@raithlin.com>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+F:	drivers/block/iopmem.c
+F:	Documentation/blockdev/iopmem.txt
+
 IP MASQUERADING
 M:	Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 39dd30b..13ae1e7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -537,4 +537,31 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.

+config BLK_DEV_IOPMEM
+	tristate "Persistent block device backed by PCIe Memory"
+	depends on ZONE_DEVICE
+	default n
+	help
+	  Say Y here if you want to include a generic device driver
+	  that can create a block device from persistent PCIe attached
+	  IO memory.
+
+	  To compile this driver as a module, choose M here: The
+	  module will be called iopmem. A block device will be created
+	  for each PCIe attached device that matches the vendor and
+	  device ID as specified in the module. Alternatively, this
+	  driver can be bound to any arbitrary PCIe function using the
+	  sysfs bind entry.
+
+	  This block device supports direct access (DAX) file systems
+	  and supports struct page backing for the IO Memory. This
+	  makes the underlying memory suitable for things like RDMA
+	  Memory Regions and Direct IO which is useful for PCIe
+	  peer-to-peer DMA operations.
+
+	  Note that persistence is only assured if the memory on the
+	  PCIe card has some form of power loss protection. This could
+	  be provided via some form of battery, a supercap/NAND combo
+	  or some exciting new persistent memory technology.
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e..1f4f69b 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IOPMEM)	+= iopmem.o

 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/iopmem.c b/drivers/block/iopmem.c
new file mode 100644
index 0000000..4a1e693
--- /dev/null
+++ b/drivers/block/iopmem.c
@@ -0,0 +1,333 @@
+/*
+ * IOPMEM Block Device Driver
+ * Copyright (c) 2016, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/pmem.c.
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pfn_t.h>
+#include <linux/memremap.h>
+
+/* Index of the PCI BAR whose resource range is used as the device memory. */
+static const int BAR_ID = 4;
+
+/*
+ * PCI vendor/device IDs this driver binds to by default.  Nothing in this
+ * file writes to the table after initialisation, so it is declared const.
+ */
+static const struct pci_device_id iopmem_id_table[] = {
+	{ PCI_DEVICE(0x11f8, 0xf115) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, iopmem_id_table);
+
+/* Per-device state, allocated in iopmem_probe() and freed in iopmem_remove(). */
+struct iopmem_device {
+	/* Request queue allocated with blk_alloc_queue_node() in probe. */
+	struct request_queue *queue;
+	/* Disk allocated and registered by iopmem_attach_disk(). */
+	struct gendisk *disk;
+	/* &pdev->dev, with a reference held via get_device(). */
+	struct device *dev;
+
+	/* Id obtained from iopmem_instance_ida; used in the "iopmem%d" name. */
+	int instance;
+
+	/* One contiguous memory region per device */
+	phys_addr_t		phys_addr;	/* pci_resource_start() of the BAR */
+	void			*virt_addr;	/* devm_memremap_pages() result */
+	size_t			size;		/* BAR length in bytes */
+};
+
+  /*
+   * We can only access the iopmem device with full 32-bit word
+   * accesses which cannot be guaranteed by the regular memcpy
+   */
+
+/*
+ * Copy @sz bytes from @src to @dst one u64 at a time.
+ *
+ * All whole u64 words are copied with word-sized loads and stores.  If @sz
+ * is not a multiple of sizeof(u64), one further full u64 is loaded from
+ * @src and only its first (@sz % sizeof(u64)) bytes are stored to @dst.
+ */
+static void memcpy_from_iopmem(void *dst, const void *src, size_t sz)
+{
+	const u64 *in = src;
+	u64 *out = dst;
+	size_t nwords = sz / sizeof(u64);
+	size_t rem = sz % sizeof(u64);
+	size_t i;
+	u64 last;
+
+	for (i = 0; i < nwords; i++)
+		out[i] = in[i];
+
+	if (rem) {
+		/* Load a whole word, then keep only the bytes asked for. */
+		last = in[nwords];
+		memcpy(&out[nwords], &last, rem);
+	}
+}
+
+/*
+ * Copy @len bytes from @page, starting at byte offset @off, to the device
+ * memory at @iopmem_addr.  The page is mapped with kmap_atomic() for the
+ * duration of the copy, which is done with the regular memcpy().
+ */
+static void write_iopmem(void *iopmem_addr, struct page *page,
+		       unsigned int off, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	const void *from = kaddr + off;
+
+	memcpy(iopmem_addr, from, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Copy @len bytes from the device memory at @iopmem_addr into @page at
+ * byte offset @off.  The page is mapped with kmap_atomic() for the
+ * duration of the copy, which is done with memcpy_from_iopmem().
+ */
+static void read_iopmem(struct page *page, unsigned int off,
+			void *iopmem_addr, unsigned int len)
+{
+	void *kaddr = kmap_atomic(page);
+	void *to = kaddr + off;
+
+	memcpy_from_iopmem(to, iopmem_addr, len);
+	kunmap_atomic(kaddr);
+}
+
+/*
+ * Transfer one segment between @page and the device memory.
+ *
+ * @len and @off describe the segment inside @page; @sector (in 512-byte
+ * units) selects the position in the device memory.  For a write the page
+ * is flushed with flush_dcache_page() before the copy, for a read after it.
+ */
+static void iopmem_do_bvec(struct iopmem_device *iopmem, struct page *page,
+			   unsigned int len, unsigned int off, bool is_write,
+			   sector_t sector)
+{
+	phys_addr_t byte_off = sector * 512;
+	void *iopmem_addr = iopmem->virt_addr + byte_off;
+
+	if (is_write) {
+		flush_dcache_page(page);
+		write_iopmem(iopmem_addr, page, off, len);
+		return;
+	}
+
+	read_iopmem(page, off, iopmem_addr, len);
+	flush_dcache_page(page);
+}
+
+/*
+ * make_request function for the iopmem queue: every segment of @bio is
+ * copied synchronously by iopmem_do_bvec(), then the bio is completed.
+ * Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t iopmem_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct iopmem_device *iopmem = q->queuedata;
+	const bool is_write = op_is_write(bio_op(bio));
+	struct bvec_iter pos;
+	struct bio_vec seg;
+
+	bio_for_each_segment(seg, bio, pos)
+		iopmem_do_bvec(iopmem, seg.bv_page, seg.bv_len, seg.bv_offset,
+			       is_write, pos.bi_sector);
+
+	bio_endio(bio);
+	return BLK_QC_T_NONE;
+}
+
+/*
+ * rw_page operation: transfer one whole page (PAGE_SIZE bytes at page
+ * offset 0) to or from the device at @sector, then signal completion with
+ * page_endio().  Always returns 0.
+ */
+static int iopmem_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, bool is_write)
+{
+	struct request_queue *q = bdev->bd_queue;
+	struct iopmem_device *iopmem = q->queuedata;
+
+	iopmem_do_bvec(iopmem, page, PAGE_SIZE, 0, is_write, sector);
+	page_endio(page, is_write, 0);
+
+	return 0;
+}
+
+/*
+ * direct_access operation: report where @sector lives in the device memory.
+ *
+ * On success *@kaddr receives the kernel virtual address and *@pfn the
+ * page frame (flagged PFN_DEV | PFN_MAP) for byte offset @sector * 512.
+ * The return value is the number of bytes from that offset to the end of
+ * the region, or -ENODEV if the queue has no iopmem device attached.
+ * @size is not used.
+ */
+static long iopmem_direct_access(struct block_device *bdev, sector_t sector,
+			       void **kaddr, pfn_t *pfn, long size)
+{
+	struct iopmem_device *iopmem = bdev->bd_queue->queuedata;
+	resource_size_t byte_off;
+
+	if (iopmem == NULL)
+		return -ENODEV;
+
+	byte_off = sector * 512;
+	*kaddr = iopmem->virt_addr + byte_off;
+	*pfn = phys_to_pfn_t(iopmem->phys_addr + byte_off, PFN_DEV | PFN_MAP);
+
+	return iopmem->size - byte_off;
+}
+
+/*
+ * Block device operations installed on every iopmem disk: whole-page I/O
+ * through iopmem_rw_page() and direct access through
+ * iopmem_direct_access().
+ */
+static const struct block_device_operations iopmem_fops = {
+	.owner =		THIS_MODULE,
+	.rw_page =		iopmem_rw_page,
+	.direct_access =	iopmem_direct_access,
+};
+
+static DEFINE_IDA(iopmem_instance_ida);
+static DEFINE_SPINLOCK(ida_lock);
+
+/*
+ * Allocate a new instance id from iopmem_instance_ida and store it in
+ * @iopmem->instance.
+ *
+ * Returns 0 on success, -ENOMEM if the IDA could not preallocate memory,
+ * or the error reported by ida_get_new() (other than -EAGAIN, which is
+ * retried).
+ */
+static int iopmem_set_instance(struct iopmem_device *iopmem)
+{
+	int instance, error;
+
+	do {
+		/* ida_pre_get() returns 0 when it could not allocate memory. */
+		if (!ida_pre_get(&iopmem_instance_ida, GFP_KERNEL))
+			return -ENOMEM;
+
+		spin_lock(&ida_lock);
+		error = ida_get_new(&iopmem_instance_ida, &instance);
+		spin_unlock(&ida_lock);
+
+		/* -EAGAIN: the preallocation was consumed; preload and retry. */
+	} while (error == -EAGAIN);
+
+	if (error)
+		return error;
+
+	iopmem->instance = instance;
+	return 0;
+}
+
+/*
+ * Give the instance id stored in @iopmem back to iopmem_instance_ida,
+ * holding ida_lock around the removal.
+ */
+static void iopmem_release_instance(struct iopmem_device *iopmem)
+{
+	const int id = iopmem->instance;
+
+	spin_lock(&ida_lock);
+	ida_remove(&iopmem_instance_ida, id);
+	spin_unlock(&ida_lock);
+}
+
+/*
+ * Configure @iopmem->queue, then allocate, fill in and register the
+ * gendisk named "iopmem<instance>" whose capacity is the region size in
+ * 512-byte sectors.
+ *
+ * Returns 0 on success or -ENOMEM if the disk cannot be allocated; in the
+ * failure case the queue is left for the caller to clean up.
+ */
+static int iopmem_attach_disk(struct iopmem_device *iopmem)
+{
+	struct request_queue *queue = iopmem->queue;
+	struct gendisk *gd;
+
+	blk_queue_write_cache(queue, true, true);
+	blk_queue_make_request(queue, iopmem_make_request);
+	blk_queue_physical_block_size(queue, PAGE_SIZE);
+	blk_queue_max_hw_sectors(queue, UINT_MAX);
+	blk_queue_bounce_limit(queue, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_DAX, queue);
+	queue->queuedata = iopmem;
+
+	/* Allocate the disk on the same NUMA node as the device. */
+	gd = alloc_disk_node(0, dev_to_node(iopmem->dev));
+	if (unlikely(gd == NULL))
+		return -ENOMEM;
+
+	gd->fops = &iopmem_fops;
+	gd->queue = queue;
+	gd->flags = GENHD_FL_EXT_DEVT;
+	sprintf(gd->disk_name, "iopmem%d", iopmem->instance);
+	set_capacity(gd, iopmem->size / 512);
+	iopmem->disk = gd;
+
+	device_add_disk(iopmem->dev, gd);
+	revalidate_disk(gd);
+
+	return 0;
+}
+
+/* Unregister the gendisk of @iopmem and drop the reference held on it. */
+static void iopmem_detach_disk(struct iopmem_device *iopmem)
+{
+	struct gendisk *gd = iopmem->disk;
+
+	del_gendisk(gd);
+	put_disk(gd);
+}
+
+/*
+ * PCI probe callback: turn BAR BAR_ID of @pdev into an iopmem block device.
+ *
+ * Enables the device, allocates the per-device state, reserves an instance
+ * id, requests and remaps the BAR, and finally attaches the disk.  @id is
+ * not used.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything that
+ * was set up here is undone through the out_* labels.
+ */
+static int iopmem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct iopmem_device *iopmem;
+	struct device *dev;
+	int err;
+	int nid = dev_to_node(&pdev->dev);
+
+	/*
+	 * Keep the real error code: leaving err at 0 here would make a
+	 * failed enable look like a successful probe.
+	 */
+	err = pci_enable_device_mem(pdev);
+	if (err < 0) {
+		dev_err(&pdev->dev, "unable to enable device!\n");
+		goto out;
+	}
+
+	iopmem = kzalloc(sizeof(*iopmem), GFP_KERNEL);
+	if (unlikely(!iopmem)) {
+		err = -ENOMEM;
+		goto out_disable_device;
+	}
+
+	iopmem->phys_addr = pci_resource_start(pdev, BAR_ID);
+	iopmem->size = pci_resource_end(pdev, BAR_ID) - iopmem->phys_addr + 1;
+	iopmem->dev = dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, iopmem);
+
+	err = iopmem_set_instance(iopmem);
+	if (err)
+		goto out_put_device;
+
+	/* The length is unsigned, so print it with %llu. */
+	dev_info(dev, "bar space 0x%llx len %llu\n",
+		(unsigned long long) iopmem->phys_addr,
+		(unsigned long long) iopmem->size);
+
+	if (!devm_request_mem_region(dev, iopmem->phys_addr,
+				     iopmem->size, dev_name(dev))) {
+		/* %pa already emits a "0x" prefix; do not add another. */
+		dev_warn(dev, "could not reserve region [%pa:0x%zx]\n",
+			 &iopmem->phys_addr, iopmem->size);
+		err = -EBUSY;
+		goto out_release_instance;
+	}
+
+	iopmem->queue = blk_alloc_queue_node(GFP_KERNEL, nid);
+	if (!iopmem->queue) {
+		err = -ENOMEM;
+		goto out_release_instance;
+	}
+
+	iopmem->virt_addr = devm_memremap_pages(dev, &pdev->resource[BAR_ID],
+				&iopmem->queue->q_usage_counter,
+				NULL, MEMREMAP_WC);
+	if (IS_ERR(iopmem->virt_addr)) {
+		err = -ENXIO;
+		goto out_free_queue;
+	}
+
+	err = iopmem_attach_disk(iopmem);
+	if (err)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	blk_cleanup_queue(iopmem->queue);
+out_release_instance:
+	iopmem_release_instance(iopmem);
+out_put_device:
+	put_device(&pdev->dev);
+	kfree(iopmem);
+out_disable_device:
+	pci_disable_device(pdev);
+out:
+	return err;
+}
+
+/*
+ * PCI remove callback: tear down what iopmem_probe() set up for @pdev.
+ * The per-device state is looked up through the PCI driver data.
+ */
+static void iopmem_remove(struct pci_dev *pdev)
+{
+	struct iopmem_device *iopmem = pci_get_drvdata(pdev);
+
+	/* Mark the queue dying before the disk is removed. */
+	blk_set_queue_dying(iopmem->queue);
+	iopmem_detach_disk(iopmem);
+	blk_cleanup_queue(iopmem->queue);
+	iopmem_release_instance(iopmem);
+	/* Drops the reference taken with get_device() in iopmem_probe(). */
+	put_device(iopmem->dev);
+	kfree(iopmem);
+	pci_disable_device(pdev);
+}
+
+/* PCI driver description: matches iopmem_id_table, probe/remove as below. */
+static struct pci_driver iopmem_pci_driver = {
+	.name = "iopmem",
+	.id_table = iopmem_id_table,
+	.probe = iopmem_probe,
+	.remove = iopmem_remove,
+};
+
+/*
+ * Module init: register the PCI driver.  Returns the result of
+ * pci_register_driver(); the "module loaded" message is only logged when
+ * registration succeeded.
+ */
+static int __init iopmem_init(void)
+{
+	int rc = pci_register_driver(&iopmem_pci_driver);
+
+	if (!rc)
+		pr_info("iopmem: module loaded\n");
+
+	return rc;
+}
+
+/* Module exit: unregister the PCI driver, then log that the module is gone. */
+static void __exit iopmem_exit(void)
+{
+	pci_unregister_driver(&iopmem_pci_driver);
+	pr_info("iopmem: module unloaded\n");
+}
+
+MODULE_AUTHOR("Logan Gunthorpe <logang@deltatee.com>");
+MODULE_LICENSE("GPL");
+module_init(iopmem_init);
+module_exit(iopmem_exit);
--
2.1.4

  parent reply	other threads:[~2016-10-18 22:06 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-18 21:42 [PATCH 0/3] iopmem : A block device for PCIe memory Stephen Bates
2016-10-18 21:42 ` Stephen Bates
2016-10-18 21:42 ` Stephen Bates
2016-10-18 21:42 ` Stephen Bates
2016-10-18 21:42 ` Stephen Bates
2016-10-18 21:42 ` [PATCH 1/3] memremap.c : Add support for ZONE_DEVICE IO memory with struct pages Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-19 17:50   ` Dan Williams
2016-10-19 17:50     ` Dan Williams
2016-10-19 17:50     ` Dan Williams
2016-10-19 17:50     ` Dan Williams
2016-10-19 17:50     ` Dan Williams
2016-10-19 18:40     ` Stephen Bates
2016-10-19 18:40       ` Stephen Bates
2016-10-19 18:40       ` Stephen Bates
2016-10-19 20:01       ` Dan Williams
2016-10-19 20:01         ` Dan Williams
2016-10-19 20:01         ` Dan Williams
2016-10-19 20:01         ` Dan Williams
2016-10-19 20:01         ` Dan Williams
2016-10-25 11:54         ` Stephen Bates
2016-10-25 11:54           ` Stephen Bates
2016-10-25 11:54           ` Stephen Bates
2016-10-25 11:54           ` Stephen Bates
2016-10-18 21:42 ` Stephen Bates [this message]
2016-10-18 21:42   ` [PATCH 2/3] iopmem : Add a block device driver for PCIe attached IO memory Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-28  6:45   ` Christoph Hellwig
2016-10-28  6:45     ` Christoph Hellwig
2016-10-28  6:45     ` Christoph Hellwig
2016-10-28 19:22     ` Logan Gunthorpe
2016-10-28 19:22       ` Logan Gunthorpe
2016-10-28 19:22       ` Logan Gunthorpe
2016-10-18 21:42 ` [PATCH 3/3] iopmem : Add documentation for iopmem driver Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-18 21:42   ` Stephen Bates
2016-10-28  6:46   ` Christoph Hellwig
2016-10-28  6:46     ` Christoph Hellwig
2016-10-28  6:46     ` Christoph Hellwig
2016-10-19  3:51 ` [PATCH 0/3] iopmem : A block device for PCIe memory Dan Williams
2016-10-19  3:51   ` Dan Williams
2016-10-19  3:51   ` Dan Williams
2016-10-19  3:51   ` Dan Williams
2016-10-19 18:48   ` Stephen Bates
2016-10-19 18:48     ` Stephen Bates
2016-10-19 18:48     ` Stephen Bates
2016-10-19 18:48     ` Stephen Bates
2016-10-19 19:58     ` Dan Williams
2016-10-19 19:58       ` Dan Williams
2016-10-19 19:58       ` Dan Williams
2016-10-19 19:58       ` Dan Williams
2016-10-19 22:54       ` Stephen Bates
2016-10-19 22:54         ` Stephen Bates
2016-10-19 22:54         ` Stephen Bates
2016-10-19 22:54         ` Stephen Bates
2016-10-20 23:22     ` Dave Chinner
2016-10-20 23:22       ` Dave Chinner
2016-10-20 23:22       ` Dave Chinner
2016-10-20 23:22       ` Dave Chinner
2016-10-21  9:57       ` Christoph Hellwig
2016-10-21  9:57         ` Christoph Hellwig
2016-10-21  9:57         ` Christoph Hellwig
2016-10-21 11:12         ` Dave Chinner
2016-10-21 11:12           ` Dave Chinner
2016-10-21 11:12           ` Dave Chinner
2016-10-25 11:50           ` Stephen Bates
2016-10-25 11:50             ` Stephen Bates
2016-10-25 11:50             ` Stephen Bates
2016-10-25 21:19             ` Dave Chinner
2016-10-25 21:19               ` Dave Chinner
2016-10-25 21:19               ` Dave Chinner
2016-11-06 14:05               ` Stephen Bates
2016-11-06 14:05                 ` Stephen Bates
2016-11-06 14:05                 ` Stephen Bates
2016-11-06 14:05                 ` Stephen Bates
2016-10-27 10:22         ` Sagi Grimberg
2016-10-27 10:22           ` Sagi Grimberg
2016-10-27 10:22           ` Sagi Grimberg
2016-10-27 12:32           ` Christoph Hellwig
2016-10-27 12:32             ` Christoph Hellwig
2016-10-27 12:32             ` Christoph Hellwig
2016-10-27 12:32             ` Christoph Hellwig
2016-10-26  8:24   ` Haggai Eran
2016-10-26  8:24     ` Haggai Eran
2016-10-26  8:24     ` Haggai Eran
2016-10-26  8:24     ` Haggai Eran
2016-10-26  8:24     ` Haggai Eran
2016-10-26 13:39     ` Dan Williams
2016-10-26 13:39       ` Dan Williams
2016-10-26 13:39       ` Dan Williams
2016-10-26 13:39       ` Dan Williams
2016-10-26 13:39       ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1476826937-20665-3-git-send-email-sbates@raithlin.com \
    --to=sbates@raithlin.com \
    --cc=axboe@fb.com \
    --cc=corbet@lwn.net \
    --cc=dan.j.williams@intel.com \
    --cc=haggaie@mellanox.com \
    --cc=hch@infradead.org \
    --cc=jgunthorpe@obsidianresearch.com \
    --cc=jim.macdonald@everspin.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvdimm@lists.01.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=logang@deltatee.com \
    --cc=ross.zwisler@linux.intel.com \
    --cc=sbates@raithin.com \
    --cc=willy@linux.intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.