All of lore.kernel.org
 help / color / mirror / Atom feed
From: Boaz Harrosh <boaz@plexistor.com>
To: Ross Zwisler <ross.zwisler@linux.intel.com>,
	linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, Matthew Wilcox <willy@linux.intel.com>,
	Sagi Manole <sagi@plexistor.com>,
	Yigal Korman <yigal@plexistor.com>
Subject: [RFC 1/9] prd: Initial version of Persistent RAM Driver
Date: Wed, 13 Aug 2014 15:10:32 +0300	[thread overview]
Message-ID: <53EB55B8.3000004@plexistor.com> (raw)
In-Reply-To: <53EB5536.8020702@gmail.com>

From: Ross Zwisler <ross.zwisler@linux.intel.com>

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
---
 drivers/block/Kconfig  |  41 +++++
 drivers/block/Makefile |   1 +
 drivers/block/prd.c    | 398 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 440 insertions(+)
 create mode 100644 drivers/block/prd.c

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 014a1cf..463c45e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -403,6 +403,47 @@ config BLK_DEV_XIP
 	  will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_PMEM
+	tristate "Persistent memory block device support"
+	help
+	  Saying Y here will allow you to use a contiguous range of reserved
+	  memory as one or more block devices.  Memory for PRD should be
+	  reserved using the "memmap" kernel parameter.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called prd.
+
+	  Most normal users won't need this functionality, and can thus say N
+	  here.
+
+config BLK_DEV_PMEM_START
+	int "Offset in GiB of where to start claiming space"
+	default "0"
+	depends on BLK_DEV_PMEM
+	help
+	  Starting offset in GiB that PRD should use when claiming memory.  This
+	  memory needs to be reserved from the OS at boot time using the
+	  "memmap" kernel parameter.
+
+	  If you provide PRD with volatile memory it will act as a volatile
+	  RAM disk and your data will not be persistent.
+
+config BLK_DEV_PMEM_COUNT
+	int "Default number of PMEM disks"
+	default "4"
+	depends on BLK_DEV_PMEM
+	help
+	  Number of equal sized block devices that PRD should create.
+
+config BLK_DEV_PMEM_SIZE
+	int "Size in GiB of space to claim"
+	depends on BLK_DEV_PMEM
+	default "0"
+	help
+	  Amount of memory in GiB that PRD should use when creating block
+	  devices.  This memory needs to be reserved from the OS at
+	  boot time using the "memmap" kernel parameter.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 02b688d..6e94c61 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_PMEM)	+= prd.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
diff --git a/drivers/block/prd.c b/drivers/block/prd.c
new file mode 100644
index 0000000..7684197
--- /dev/null
+++ b/drivers/block/prd.c
@@ -0,0 +1,398 @@
+/*
+ * Persistent RAM Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * This driver is heavily based on drivers/block/brd.c.
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ */
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#define SECTOR_SHIFT		9	/* sectors are 512 (1 << 9) bytes */
+#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT) /* sectors per page */
+
+/*
+ * Driver-wide (file-local) physical address, mapping and total_size of the one
+ * contiguous memory region that is split into prd_count same-sized devices.
+ */
+static phys_addr_t	phys_addr;
+static void		*virt_addr;
+static size_t		total_size;
+
+struct prd_device {
+	int			prd_number;	/* index i given to prd_alloc() */
+
+	struct request_queue	*prd_queue;
+	struct gendisk		*prd_disk;
+	struct list_head	prd_list;	/* entry in prd_devices */
+
+	phys_addr_t		phys_addr;	/* start of this device's slice */
+	void			*virt_addr;	/* mapped address of the slice */
+	size_t			size;		/* slice length in bytes */
+};
+
+/*
+ * Translate (prd, sector) straight into a kernel virtual address.
+ * The sector need not be page aligned; the result is rounded down to the
+ * start of the page containing it, so it only points at the sector itself
+ * when the sector is page aligned.  An offset at or past prd->size is a BUG.
+ */
+static void *prd_lookup_pg_addr(struct prd_device *prd, sector_t sector)
+{
+	size_t pg_index = sector >> PAGE_SECTORS_SHIFT;
+	size_t byte_off = pg_index << PAGE_SHIFT;
+
+	BUG_ON(prd->size <= byte_off);
+	return prd->virt_addr + byte_off;
+}
+
+/* Map a page-aligned sector to its page frame number; unaligned is a BUG. */
+static unsigned long prd_lookup_pfn(struct prd_device *prd, sector_t sector)
+{
+	unsigned long base_pfn = prd->phys_addr >> PAGE_SHIFT;
+
+	BUG_ON(sector & (PAGE_SECTORS - 1));
+	return base_pfn + (sector >> PAGE_SECTORS_SHIFT);
+}
+
+/*
+ * Copy n bytes from src into the device, starting at sector (which need not
+ * be page aligned).  n must not exceed PAGE_SIZE, so at most two pages are hit.
+ */
+static void copy_to_prd(struct prd_device *prd, const void *src,
+			sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+	size_t head, tail;
+	void *dst;
+
+	BUG_ON(n > PAGE_SIZE);
+
+	/* first chunk: up to the end of the page that contains sector */
+	head = min_t(size_t, n, PAGE_SIZE - in_page);
+	dst = prd_lookup_pg_addr(prd, sector);
+	memcpy(dst + in_page, src, head);
+	if (head == n)
+		return;
+
+	/* remainder spills over into the start of the following page */
+	tail = n - head;
+	dst = prd_lookup_pg_addr(prd, sector + (head >> SECTOR_SHIFT));
+	memcpy(dst, src + head, tail);
+}
+
+/*
+ * Copy n bytes out of the device into dst, starting at sector (which need
+ * not be page aligned).  n must not exceed PAGE_SIZE.
+ */
+static void copy_from_prd(void *dst, struct prd_device *prd,
+			  sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+	size_t head, tail;
+	void *src;
+
+	BUG_ON(n > PAGE_SIZE);
+
+	/* first chunk: up to the end of the page that contains sector */
+	head = min_t(size_t, n, PAGE_SIZE - in_page);
+	src = prd_lookup_pg_addr(prd, sector);
+	memcpy(dst, src + in_page, head);
+
+	if (head == n)
+		return;
+
+	/* remainder comes from the start of the following page */
+	tail = n - head;
+	src = prd_lookup_pg_addr(prd, sector + (head >> SECTOR_SHIFT));
+	memcpy(dst + head, src, tail);
+}
+
+/* Copy one bio segment between page (at off) and the device at sector. */
+static void prd_do_bvec(struct prd_device *prd, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	void *kaddr = kmap_atomic(page);
+
+	if (rw != READ) {
+		/*
+		 * FIXME: Need more involved flushing to ensure that writes to
+		 * NVDIMMs are actually durable before returning.
+		 */
+		flush_dcache_page(page);
+		copy_to_prd(prd, kaddr + off, sector, len);
+	} else {
+		copy_from_prd(kaddr + off, prd, sector, len);
+		flush_dcache_page(page);
+	}
+	kunmap_atomic(kaddr);
+}
+
+/* Handle one bio synchronously: copy every segment, then complete the bio. */
+static void prd_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct prd_device *prd = disk->private_data;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	int rw, err = 0;
+
+	/* refuse I/O that would run past the end of the disk */
+	if (bio_end_sector(bio) > get_capacity(disk)) {
+		err = -EIO;
+		goto out;
+	}
+
+	BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+	/* read-ahead is serviced exactly like a normal read */
+	rw = bio_rw(bio);
+	if (rw == READA)
+		rw = READ;
+
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+
+		BUG_ON(len > PAGE_SIZE);
+		prd_do_bvec(prd, bvec.bv_page, len,
+			    bvec.bv_offset, rw, sector);
+		sector += len >> SECTOR_SHIFT;
+	}
+out:
+	bio_endio(bio, err);
+}
+
+static long prd_direct_access(struct block_device *bdev, sector_t sector,
+			      void **kaddr, unsigned long *pfn, long size)
+{
+	struct prd_device *prd = bdev->bd_disk->private_data;
+
+	if (!prd)
+		return -ENODEV;
+	/* NOTE(review): a bad sector hits BUG_ON() in the lookups below */
+	*kaddr = prd_lookup_pg_addr(prd, sector);
+	*pfn = prd_lookup_pfn(prd, sector);
+	/* size is echoed back unchanged; it is not clamped to the device end */
+	return size;
+}
+
+static const struct block_device_operations prd_fops = {
+	.owner =		THIS_MODULE,
+	.direct_access =	prd_direct_access,	/* the only op set here */
+};
+
+/* Module parameters; all but prd_major default to BLK_DEV_PMEM_* values */
+static int prd_start_gb = CONFIG_BLK_DEV_PMEM_START;
+module_param(prd_start_gb, int, S_IRUGO);
+MODULE_PARM_DESC(prd_start_gb, "Offset in GB of where to start claiming space");
+
+static int prd_size_gb = CONFIG_BLK_DEV_PMEM_SIZE;
+module_param(prd_size_gb,  int, S_IRUGO);
+MODULE_PARM_DESC(prd_size_gb,  "Total size in GB of space to claim for all disks");
+
+static int prd_major;	/* prd_init() stores register_blkdev()'s result if > 0 */
+module_param(prd_major, int, 0);
+MODULE_PARM_DESC(prd_major,  "Major number to request for this driver");
+
+static int prd_count = CONFIG_BLK_DEV_PMEM_COUNT;
+module_param(prd_count, int, S_IRUGO);
+MODULE_PARM_DESC(prd_count, "Number of prd devices to evenly split allocated space");
+
+static LIST_HEAD(prd_devices);		/* all devices, linked via prd_list */
+static DEFINE_MUTEX(prd_devices_mutex);	/* taken by prd_probe() */
+
+/* Allocate prd device i over slice i of the region; NULL on failure. */
+/* FIXME: move phys_addr, virt_addr, size calls up to caller */
+static struct prd_device *prd_alloc(int i)
+{
+	struct prd_device *prd;
+	struct gendisk *disk;
+	size_t disk_size;
+
+	if (i < 0 || i >= prd_count)
+		return NULL;	/* slice i would lie outside the region */
+	disk_size = total_size / prd_count;
+	prd = kzalloc(sizeof(*prd), GFP_KERNEL);
+	if (!prd)
+		return NULL;
+
+	prd->prd_number	= i;
+	prd->phys_addr = phys_addr + i * disk_size;
+	prd->virt_addr = virt_addr + i * disk_size;
+	prd->size = disk_size;
+
+	prd->prd_queue = blk_alloc_queue(GFP_KERNEL);
+	if (!prd->prd_queue)
+		goto out_free_dev;
+
+	blk_queue_make_request(prd->prd_queue, prd_make_request);
+	blk_queue_max_hw_sectors(prd->prd_queue, 1024);
+	blk_queue_bounce_limit(prd->prd_queue, BLK_BOUNCE_ANY);
+
+	disk = prd->prd_disk = alloc_disk(0);
+	if (!disk)
+		goto out_free_queue;
+	disk->major		= prd_major;
+	disk->first_minor	= 0;
+	disk->fops		= &prd_fops;
+	disk->private_data	= prd;
+	disk->queue		= prd->prd_queue;
+	disk->flags		= GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "pmem%d", i);
+	set_capacity(disk, disk_size >> SECTOR_SHIFT);
+	return prd;
+out_free_queue:
+	blk_cleanup_queue(prd->prd_queue);
+out_free_dev:
+	kfree(prd);
+	return NULL;
+}
+
+static void prd_free(struct prd_device *prd)
+{
+	put_disk(prd->prd_disk);		/* disk from alloc_disk() */
+	blk_cleanup_queue(prd->prd_queue);	/* queue from blk_alloc_queue() */
+	kfree(prd);				/* the device itself */
+}
+
+/* Find device i on prd_devices, or allocate, publish and enlist a new one. */
+static struct prd_device *prd_init_one(int i)
+{
+	struct prd_device *prd;
+
+	list_for_each_entry(prd, &prd_devices, prd_list) {
+		if (prd->prd_number == i)
+			return prd;
+	}
+
+	prd = prd_alloc(i);
+	if (!prd)
+		return NULL;
+	add_disk(prd->prd_disk);
+	list_add_tail(&prd->prd_list, &prd_devices);
+	return prd;
+}
+
+static void prd_del_one(struct prd_device *prd)
+{
+	list_del(&prd->prd_list);	/* unlink from prd_devices */
+	del_gendisk(prd->prd_disk);	/* counterpart of add_disk() */
+	prd_free(prd);			/* prd must not be used after this */
+}
+
+static struct kobject *prd_probe(dev_t dev, int *part, void *data)
+{
+	struct kobject *kobj = NULL;	/* result when no device is available */
+	struct prd_device *prd;
+
+	mutex_lock(&prd_devices_mutex);	/* held across lookup/creation */
+	prd = prd_init_one(MINOR(dev));
+	if (prd)
+		kobj = get_disk(prd->prd_disk);
+	mutex_unlock(&prd_devices_mutex);
+	return kobj;
+}
+
+/* Module init: claim and map the region, then create prd_count disks on it. */
+static int __init prd_init(void)
+{
+	int result, i;
+	struct prd_device *prd, *next;
+
+	/* reject nonsensical parameters before they reach the address math */
+	if (prd_start_gb < 0 || prd_size_gb <= 0 || prd_count <= 0)
+		return -EINVAL;
+
+	phys_addr  = (phys_addr_t) prd_start_gb * 1024 * 1024 * 1024;
+	total_size = (size_t)	   prd_size_gb  * 1024 * 1024 * 1024;
+
+	if (!request_mem_region_exclusive(phys_addr, total_size, "prd"))
+		return -ENOMEM;
+
+	virt_addr = ioremap_cache(phys_addr, total_size);
+	if (!virt_addr) {
+		result = -ENOMEM;
+		goto out_release;
+	}
+
+	result = register_blkdev(prd_major, "prd");
+	if (result < 0) {
+		result = -EIO;
+		goto out_unmap;
+	} else if (result > 0)
+		prd_major = result;
+
+	for (i = 0; i < prd_count; i++) {
+		prd = prd_alloc(i);
+		if (!prd) {
+			result = -ENOMEM;
+			goto out_free;
+		}
+		list_add_tail(&prd->prd_list, &prd_devices);
+	}
+
+	list_for_each_entry(prd, &prd_devices, prd_list)
+		add_disk(prd->prd_disk);
+
+	blk_register_region(MKDEV(prd_major, 0), 0,
+				  THIS_MODULE, prd_probe, NULL, NULL);
+
+	pr_info("prd: module loaded\n");
+	return 0;
+
+out_free:
+	list_for_each_entry_safe(prd, next, &prd_devices, prd_list) {
+		list_del(&prd->prd_list);
+		prd_free(prd);
+	}
+	unregister_blkdev(prd_major, "prd");
+out_unmap:
+	iounmap(virt_addr);
+out_release:
+	release_mem_region(phys_addr, total_size);
+	return result;
+}
+
+static void __exit prd_exit(void)
+{
+	struct prd_device *prd, *next;
+
+	blk_unregister_region(MKDEV(prd_major, 0), 0);
+	/* _safe walk: prd_del_one() unlinks and frees the entry it is given */
+	list_for_each_entry_safe(prd, next, &prd_devices, prd_list)
+		prd_del_one(prd);
+	/* release what prd_init() acquired, in reverse order */
+	unregister_blkdev(prd_major, "prd");
+	iounmap(virt_addr);
+	release_mem_region(phys_addr, total_size);
+
+	pr_info("prd: module unloaded\n");
+}
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL");
+module_init(prd_init);
+module_exit(prd_exit);
-- 
1.9.3


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2014-08-13 12:10 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-13 12:08 [RFC 0/9] pmem: Support for "struct page" with Persistent Memory storage Boaz Harrosh
2014-08-13 12:08 ` Boaz Harrosh
2014-08-13 12:10 ` Boaz Harrosh [this message]
2014-08-13 12:11 ` [RFC 2/9] prd: add support for rw_page() Boaz Harrosh
2014-08-13 12:12 ` [RFC 3/9] prd: Add getgeo to block ops Boaz Harrosh
2014-08-13 12:14 ` [RFC 4/9] SQUASHME: prd: Fixs to getgeo Boaz Harrosh
2014-08-20 22:10   ` Ross Zwisler
2014-08-21  9:47     ` Boaz Harrosh
2014-08-13 12:16 ` [RFC 5/9] SQUASHME: prd: Last fixes for partitions Boaz Harrosh
2014-08-14 13:04   ` Boaz Harrosh
2014-08-14 13:16     ` Matthew Wilcox
2014-08-14 13:16       ` Matthew Wilcox
2014-08-14 13:55       ` Boaz Harrosh
2014-08-14 13:55         ` Boaz Harrosh
2014-08-14 13:07   ` [PATCH 5/9 v2] " Boaz Harrosh
2014-08-25 20:10     ` Ross Zwisler
2014-08-26  8:18       ` Boaz Harrosh
2014-08-26 17:36         ` Boaz Harrosh
2014-08-26 17:36           ` Boaz Harrosh
2014-08-26 20:34           ` Ross Zwisler
2014-08-26 20:34             ` Ross Zwisler
2014-08-27  9:41             ` Boaz Harrosh
2014-08-27  4:38           ` Matthew Wilcox
2014-08-27  9:55             ` Boaz Harrosh
2014-08-27 12:46               ` Matthew Wilcox
2014-08-27 13:01                 ` Boaz Harrosh
2014-08-20 23:03   ` [RFC 5/9] " Ross Zwisler
2014-08-21 10:05     ` Boaz Harrosh
2014-08-21 10:05       ` Boaz Harrosh
2014-08-13 12:18 ` [RFC 6/9] SQUASHME: prd: Let each prd-device manage private memory region Boaz Harrosh
2014-08-21 16:57   ` Ross Zwisler
2014-08-13 12:20 ` [RFC 7/9] SQUASHME: prd: Support of multiple memory regions Boaz Harrosh
2014-08-25 23:02   ` Ross Zwisler
2014-08-25 23:02     ` Ross Zwisler
2014-08-13 12:21 ` [RFC 8/9] mm: export sparse_add/remove_one_section Boaz Harrosh
2014-08-13 12:26 ` [RFC 9/9] prd: Add support for page struct mapping Boaz Harrosh
2014-08-15 20:28   ` Toshi Kani
2014-08-17  9:17     ` Boaz Harrosh
2014-08-18 19:48       ` Toshi Kani
2014-08-19  8:40         ` Boaz Harrosh
2014-08-19 16:49           ` Toshi Kani
2014-08-22 14:36   ` Dave Hansen
2014-08-22 14:36     ` Dave Hansen
2014-09-09 16:16     ` Boaz Harrosh
2014-09-09 16:29       ` Dave Hansen
2014-08-20 20:13 ` [RFC 0/9] pmem: Support for "struct page" with Persistent Memory storage Ross Zwisler

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=53EB55B8.3000004@plexistor.com \
    --to=boaz@plexistor.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ross.zwisler@linux.intel.com \
    --cc=sagi@plexistor.com \
    --cc=willy@linux.intel.com \
    --cc=yigal@plexistor.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.