* [RFC 1/4] cleancache: make put_page async possible
From: Shaohua Li @ 2013-09-26 14:14 UTC
To: linux-mm; +Cc: sjenning, bob.liu, dan.magenheimer
[-- Attachment #1: cleancache-async-put_page.patch --]
[-- Type: text/plain, Size: 5061 bytes --]
Previously, put_page had to store the page synchronously. This patch makes it
possible for put_page to store the page asynchronously: put_page just takes an
extra page reference, stores the page from another context, and finally frees
the page at the proper time.

In the page reclaim code path, put_page is called with a page reference count
of 0. Since I need to take a page reference there, some page reference count
checks are relaxed.
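
To illustrate the protocol, here is a minimal sketch (not part of this patch)
of what an asynchronous backend is now allowed to do. queue_store() and
example_store_done() are hypothetical helpers; only page_cache_get() and
page_cache_release() are real:

/*
 * Illustrative sketch only: keep the page pinned while the backend
 * stores it asynchronously.  queue_store() and example_store_done()
 * are hypothetical helpers, not part of this series.
 */
static void example_put_page(int pool, struct cleancache_filekey key,
			     pgoff_t index, struct page *page)
{
	/* pin the page; reclaim has already dropped its references */
	page_cache_get(page);
	queue_store(pool, key, index, page);
}

static void example_store_done(struct page *page, int err)
{
	/* drop the pin once the backend has finished copying the page */
	page_cache_release(page);
}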
Signed-off-by: Shaohua Li <shli@kernel.org>
---
include/linux/mm.h | 5 -----
include/linux/pagemap.h | 1 -
mm/filemap.c | 7 ++++++-
mm/vmscan.c | 21 ++++++++++++++++-----
4 files changed, 22 insertions(+), 12 deletions(-)
Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h 2013-09-26 21:12:15.479396069 +0800
+++ linux/include/linux/mm.h 2013-09-26 21:12:15.471392582 +0800
@@ -414,11 +414,6 @@ static inline void get_page(struct page
if (unlikely(PageTail(page)))
if (likely(__get_page_tail(page)))
return;
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
atomic_inc(&page->_count);
}
Index: linux/include/linux/pagemap.h
===================================================================
--- linux.orig/include/linux/pagemap.h 2013-09-26 21:12:15.479396069 +0800
+++ linux/include/linux/pagemap.h 2013-09-26 21:12:15.475394311 +0800
@@ -210,7 +210,6 @@ static inline int page_freeze_refs(struc
static inline void page_unfreeze_refs(struct page *page, int count)
{
- VM_BUG_ON(page_count(page) != 0);
VM_BUG_ON(count == 0);
atomic_set(&page->_count, count);
Index: linux/mm/filemap.c
===================================================================
--- linux.orig/mm/filemap.c 2013-09-26 21:12:15.479396069 +0800
+++ linux/mm/filemap.c 2013-09-26 21:12:15.475394311 +0800
@@ -117,17 +117,22 @@ void __delete_from_page_cache(struct pag
struct address_space *mapping = page->mapping;
trace_mm_filemap_delete_from_page_cache(page);
+
+ radix_tree_delete(&mapping->page_tree, page->index);
+
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
* stale data around in the cleancache once our page is gone
+ * Do this after the page has been removed from the radix tree:
+ * put_page might take a page reference and we don't want to break
+ * the speculative page get protocol.
*/
if (PageUptodate(page) && PageMappedToDisk(page))
cleancache_put_page(page);
else
cleancache_invalidate_page(mapping, page);
- radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
Index: linux/mm/vmscan.c
===================================================================
--- linux.orig/mm/vmscan.c 2013-09-26 21:12:15.479396069 +0800
+++ linux/mm/vmscan.c 2013-09-26 21:12:15.475394311 +0800
@@ -570,8 +570,7 @@ cannot_free:
/*
* Attempt to detach a locked page from its ->mapping. If it is dirty or if
* someone else has a ref on the page, abort and return 0. If it was
- * successfully detached, return 1. Assumes the caller has a single ref on
- * this page.
+ * successfully detached, return 1.
*/
int remove_mapping(struct address_space *mapping, struct page *page)
{
@@ -581,7 +580,7 @@ int remove_mapping(struct address_space
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_unfreeze_refs(page, 1);
+ page_unfreeze_refs(page, 1 + page_count(page));
return 1;
}
return 0;
@@ -782,7 +781,7 @@ static unsigned long shrink_page_list(st
struct page *page;
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
- bool dirty, writeback;
+ bool dirty, writeback, free_page = true;
cond_resched();
@@ -1049,16 +1048,28 @@ static unsigned long shrink_page_list(st
goto keep_locked;
/*
+ * cleancache may have taken this page (async put); in that case it
+ * will free the page itself after the page is unlocked
+ */
+ free_page = page_count(page) == 0;
+
+ /*
* At this point, we have no other references and there is
* no way to pick any more up (removed from LRU, removed
* from pagecache). Can use non-atomic bitops now (and
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ if (free_page)
+ __clear_page_locked(page);
+ else
+ unlock_page(page);
free_it:
nr_reclaimed++;
+ if (!free_page)
+ continue;
+
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
--
* [RFC 2/4] cleancache: make get_page async possible
From: Shaohua Li @ 2013-09-26 14:14 UTC
To: linux-mm; +Cc: sjenning, bob.liu, dan.magenheimer
[-- Attachment #1: cleancache-async-get_page.patch --]
[-- Type: text/plain, Size: 6014 bytes --]
Make cleancache get_page support asynchronous page fetches. As with a normal
page read, cleancache unlocks the page once the fetch has finished.

I/O errors from cleancache get_page are not supported yet; that is, if an
asynchronous cleancache get_page fails, we cannot fall back to a normal page
read.
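
A minimal sketch of how the new callback is meant to be used (not part of this
patch; copy_cached_page() is a hypothetical backend helper):

static int example_get_page(int pool, struct cleancache_filekey key,
			    pgoff_t index, struct page *page,
			    void (*end_get_page)(struct page *, int))
{
	/* a synchronous backend completes the callback before returning */
	if (copy_cached_page(pool, key, index, page) == 0) {	/* hypothetical */
		end_get_page(page, 0);	/* the caller's callback marks uptodate and unlocks */
		return 0;
	}
	/*
	 * an asynchronous backend would instead return 0 here and call
	 * end_get_page() later from its I/O completion handler; the
	 * caller must not touch the page until then
	 */
	return -1;
}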
Signed-off-by: Shaohua Li <shli@kernel.org>
---
drivers/xen/tmem.c | 8 +++++---
fs/btrfs/extent_io.c | 10 ++++++++--
fs/mpage.c | 15 ++++++++++++---
include/linux/cleancache.h | 11 +++++++----
mm/cleancache.c | 5 +++--
5 files changed, 35 insertions(+), 14 deletions(-)
Index: linux/fs/btrfs/extent_io.c
===================================================================
--- linux.orig/fs/btrfs/extent_io.c 2013-09-26 21:21:14.530330681 +0800
+++ linux/fs/btrfs/extent_io.c 2013-09-26 21:21:14.522330771 +0800
@@ -2530,6 +2530,12 @@ readpage_ok:
bio_put(bio);
}
+static void extent_end_get_page(struct page *page, int err)
+{
+ SetPageUptodate(page);
+ unlock_page(page);
+}
+
/*
* this allocates from the btrfs_bioset. We're returning a bio right now
* but you can call btrfs_io_bio for the appropriate container_of magic
@@ -2770,10 +2776,10 @@ static int __do_readpage(struct extent_i
end = page_end;
if (!PageUptodate(page)) {
- if (cleancache_get_page(page) == 0) {
+ if (cleancache_get_page(page, extent_end_get_page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
unlock_extent(tree, start, end);
- goto out;
+ return 0;
}
}
Index: linux/fs/mpage.c
===================================================================
--- linux.orig/fs/mpage.c 2013-09-26 21:21:14.530330681 +0800
+++ linux/fs/mpage.c 2013-09-26 21:21:14.522330771 +0800
@@ -71,6 +71,14 @@ static void mpage_end_io(struct bio *bio
bio_put(bio);
}
+static void mpage_end_get_page(struct page *page, int err)
+{
+ /* We don't support IO error so far */
+ WARN_ON(err);
+ SetPageUptodate(page);
+ unlock_page(page);
+}
+
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
@@ -273,9 +281,10 @@ do_mpage_readpage(struct bio *bio, struc
}
if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
- cleancache_get_page(page) == 0) {
- SetPageUptodate(page);
- goto confused;
+ cleancache_get_page(page, mpage_end_get_page) == 0) {
+ if (bio)
+ bio = mpage_bio_submit(READ, bio);
+ goto out;
}
/*
Index: linux/include/linux/cleancache.h
===================================================================
--- linux.orig/include/linux/cleancache.h 2013-09-26 21:21:14.530330681 +0800
+++ linux/include/linux/cleancache.h 2013-09-26 21:21:14.526330726 +0800
@@ -25,7 +25,8 @@ struct cleancache_ops {
int (*init_fs)(size_t);
int (*init_shared_fs)(char *uuid, size_t);
int (*get_page)(int, struct cleancache_filekey,
- pgoff_t, struct page *);
+ pgoff_t, struct page *,
+ void (*end_get_page)(struct page *, int err));
void (*put_page)(int, struct cleancache_filekey,
pgoff_t, struct page *);
void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t);
@@ -37,7 +38,8 @@ extern struct cleancache_ops *
cleancache_register_ops(struct cleancache_ops *ops);
extern void __cleancache_init_fs(struct super_block *);
extern void __cleancache_init_shared_fs(char *, struct super_block *);
-extern int __cleancache_get_page(struct page *);
+extern int __cleancache_get_page(struct page *,
+ void (*end_get_page)(struct page *page, int err));
extern void __cleancache_put_page(struct page *);
extern void __cleancache_invalidate_page(struct address_space *, struct page *);
extern void __cleancache_invalidate_inode(struct address_space *);
@@ -84,12 +86,13 @@ static inline void cleancache_init_share
__cleancache_init_shared_fs(uuid, sb);
}
-static inline int cleancache_get_page(struct page *page)
+static inline int cleancache_get_page(struct page *page,
+ void (*end_get_page)(struct page *page, int err))
{
int ret = -1;
if (cleancache_enabled && cleancache_fs_enabled(page))
- ret = __cleancache_get_page(page);
+ ret = __cleancache_get_page(page, end_get_page);
return ret;
}
Index: linux/mm/cleancache.c
===================================================================
--- linux.orig/mm/cleancache.c 2013-09-26 21:21:14.530330681 +0800
+++ linux/mm/cleancache.c 2013-09-26 21:21:14.526330726 +0800
@@ -225,7 +225,8 @@ static int get_poolid_from_fake(int fake
* a backend is registered and whether the sb->cleancache_poolid
* is correct.
*/
-int __cleancache_get_page(struct page *page)
+int __cleancache_get_page(struct page *page,
+ void (*end_get_page)(struct page *page, int err))
{
int ret = -1;
int pool_id;
@@ -248,7 +249,7 @@ int __cleancache_get_page(struct page *p
if (pool_id >= 0)
ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ key, page->index, page, end_get_page);
if (ret == 0)
cleancache_succ_gets++;
else
Index: linux/drivers/xen/tmem.c
===================================================================
--- linux.orig/drivers/xen/tmem.c 2013-09-26 21:21:14.530330681 +0800
+++ linux/drivers/xen/tmem.c 2013-09-26 21:21:14.526330726 +0800
@@ -184,7 +184,8 @@ static void tmem_cleancache_put_page(int
}
static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key,
- pgoff_t index, struct page *page)
+ pgoff_t index, struct page *page,
+ void (*end_get_page)(struct page *, int))
{
u32 ind = (u32) index;
struct tmem_oid oid = *(struct tmem_oid *)&key;
@@ -197,9 +198,10 @@ static int tmem_cleancache_get_page(int
if (ind != index)
return -1;
ret = xen_tmem_get_page((u32)pool, oid, ind, pfn);
- if (ret == 1)
+ if (ret == 1) {
+ end_get_page(page, 0);
return 0;
- else
+ } else
return -1;
}
--
* [RFC 4/4] cleancache: SSD backed cleancache backend
From: Shaohua Li @ 2013-09-26 14:14 UTC
To: linux-mm; +Cc: sjenning, bob.liu, dan.magenheimer
[-- Attachment #1: cleancache-backend-ssd.patch --]
[-- Type: text/plain, Size: 25351 bytes --]
This is a cleancache backend which caches pages to a disk, usually an SSD. The
usage model is similar to Windows ReadyBoost: for example, the user plugs in a
USB drive and we use it to cache clean pages, reducing I/O to the hard disks.

The storage algorithm is quite simple so far. Pages are stored on disk
sequentially; if there is no space left, disk space is reclaimed sequentially
too, so writes should have very good performance (writes are aggregated as
well). Metadata is kept in memory, so this doesn't work well for large disks.
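
As a rough model of the allocation policy (a userspace toy, not the kernel
code in this patch), the allocator is just a wrapping cursor over page-sized
slots that reclaims whatever it runs into:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct slot { bool used; };

static size_t cursor = 1;	/* slot 0 is reserved: 0 doubles as "no entry" */

/* hand out the next slot; if it is occupied, reclaim it in place */
static size_t alloc_slot(struct slot *slots, size_t nslots)
{
	size_t got = cursor;

	if (slots[got].used)
		slots[got].used = false;	/* sequential reclaim */
	slots[got].used = true;
	cursor = (cursor + 1 < nslots) ? cursor + 1 : 1;
	return got;
}

int main(void)
{
	struct slot slots[4] = { 0 };
	int i;

	for (i = 0; i < 6; i++)		/* wraps after the last slot */
		printf("page %d -> slot %zu\n", i, alloc_slot(slots, 4));
	return 0;
}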
Signed-off-by: Shaohua Li <shli@kernel.org>
---
mm/Kconfig | 7
mm/Makefile | 1
mm/ssd-cleancache.c | 932 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 940 insertions(+)
Index: linux/mm/ssd-cleancache.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux/mm/ssd-cleancache.c 2013-09-26 21:38:45.417119257 +0800
@@ -0,0 +1,932 @@
+#include <linux/kernel.h>
+#include <linux/cleancache.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/hashtable.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+
+/* For each inode */
+struct cache_inode {
+ struct radix_tree_root pages; /* radix tree leaf stores disk location */
+ unsigned free:1;
+ spinlock_t pages_lock; /* protect above */
+
+ struct rb_node rb_node;
+ unsigned long rb_index;
+ atomic_t refcnt;
+
+ struct cache_fs *fs;
+};
+
+/* For each fs */
+struct cache_fs {
+ struct rb_root inodes;
+ bool valid;
+};
+
+struct io_slot {
+ struct cache_inode *inode;
+ pgoff_t index;
+ struct page *page;
+ sector_t sect;
+
+ void (*end_get_page)(struct page *page, int err);
+ void (*end_io)(struct io_slot *slot, int err);
+
+ struct list_head link;
+ struct hlist_node hash_link;
+
+ unsigned rw:1;
+ unsigned abort:1;
+};
+
+#define SSDCACHE_GFP (__GFP_NORETRY|__GFP_NOWARN)
+
+#define BATCH_IOPAGES_NR (128*1024/PAGE_SIZE)
+#define MAX_WRITE_PERCENTAGE 5
+static unsigned long max_pending_write_pages __read_mostly;
+
+static struct block_device *cache_bdev;
+static char *blkdev_name;
+module_param_named(cache_device_name, blkdev_name, charp, 0);
+
+struct cache_meta {
+ struct cache_inode *inode;
+ pgoff_t index;
+};
+
+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9)
+#define META_PAGE_ENTRY_NR (PAGE_SIZE / sizeof(struct cache_meta))
+static struct cache_meta *cache_meta;
+static unsigned long data_alloc_index = 1;
+static unsigned long data_total_pages;
+static DEFINE_SPINLOCK(meta_lock);
+
+static void ssdcache_prepare_reclaim_inode_page(struct cache_inode *inode);
+static void ssdcache_reclaim_inode_page(struct cache_inode *inode,
+ pgoff_t index);
+/* alloc can sleep, free cannot */
+static sector_t ssdcache_alloc_sector(struct cache_inode *inode,
+ pgoff_t index)
+{
+ sector_t sect;
+ struct cache_inode *reclaim_inode;
+ pgoff_t reclaim_index;
+ unsigned long flags;
+ bool reclaim_run = false;
+
+ spin_lock_irqsave(&meta_lock, flags);
+again:
+ /* we must skip sector 0, as 0 == NULL */
+ if (cache_meta[data_alloc_index].inode == NULL) {
+ cache_meta[data_alloc_index].inode = inode;
+ cache_meta[data_alloc_index].index = index;
+ sect = data_alloc_index << PAGE_SECTOR_SHIFT;
+ data_alloc_index = (data_alloc_index + 1) % data_total_pages;
+ if (data_alloc_index == 0)
+ data_alloc_index = 1;
+ spin_unlock_irqrestore(&meta_lock, flags);
+ return sect;
+ }
+
+ /* The slot is busy IO */
+ if (reclaim_run) {
+ data_alloc_index = (data_alloc_index + 1) % data_total_pages;
+ if (data_alloc_index == 0)
+ data_alloc_index = 1;
+ reclaim_run = false;
+ goto again;
+ }
+
+ /*
+ * We can make sure the inode is valid, because ssdcache_free_sector
+ * holds meta_lock too. If sector isn't freed, inode isn't freed
+ */
+ reclaim_inode = cache_meta[data_alloc_index].inode;
+ reclaim_index = cache_meta[data_alloc_index].index;
+ ssdcache_prepare_reclaim_inode_page(reclaim_inode);
+ spin_unlock_irqrestore(&meta_lock, flags);
+
+ ssdcache_reclaim_inode_page(reclaim_inode, reclaim_index);
+ reclaim_run = true;
+
+ spin_lock_irqsave(&meta_lock, flags);
+ goto again;
+}
+
+static void ssdcache_free_sector(sector_t sect)
+{
+ unsigned long flags;
+
+ pgoff_t index = sect >> PAGE_SECTOR_SHIFT;
+
+ spin_lock_irqsave(&meta_lock, flags);
+ BUG_ON(cache_meta[index].inode == NULL);
+ cache_meta[index].inode = NULL;
+ cache_meta[index].index = 0;
+ spin_unlock_irqrestore(&meta_lock, flags);
+}
+
+static void ssdcache_access_sector(sector_t sect)
+{
+ /* maybe a lru algorithm */
+}
+
+#define IOSLOT_HASH_BITS 8
+static DEFINE_HASHTABLE(io_slot_hashtbl, IOSLOT_HASH_BITS);
+static LIST_HEAD(write_io_slots);
+static unsigned long pending_write_nr, total_write_nr;
+static DEFINE_SPINLOCK(io_lock);
+
+static unsigned long ssdcache_io_slot_hash(struct cache_inode *inode,
+ pgoff_t index)
+{
+ return hash_ptr(inode, IOSLOT_HASH_BITS) ^
+ hash_long(index, IOSLOT_HASH_BITS);
+}
+
+static struct io_slot *__ssdcache_find_io_slot(struct cache_inode *inode,
+ pgoff_t index)
+{
+ struct io_slot *slot;
+
+ hash_for_each_possible(io_slot_hashtbl, slot, hash_link,
+ ssdcache_io_slot_hash(inode, index)) {
+ if (slot->inode == inode && slot->index == index)
+ return slot;
+ }
+ return NULL;
+}
+
+static struct io_slot *__ssdcache_get_io_slot(int rw)
+{
+ struct io_slot *slot;
+
+ if (rw == WRITE && total_write_nr >= max_pending_write_pages) {
+ return NULL;
+ }
+
+ slot = kmalloc(sizeof(*slot), SSDCACHE_GFP);
+ if (!slot) {
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&slot->link);
+ INIT_HLIST_NODE(&slot->hash_link);
+
+ slot->abort = 0;
+
+ if (rw == WRITE)
+ total_write_nr++;
+ return slot;
+}
+
+static void __ssdcache_put_io_slot(struct io_slot *slot)
+{
+ list_del(&slot->link);
+ hlist_del(&slot->hash_link);
+
+ if (slot->rw == WRITE)
+ total_write_nr--;
+ kfree(slot);
+}
+
+static void __ssdcache_io_slot_add_hash(struct io_slot *slot)
+{
+ hash_add(io_slot_hashtbl, &slot->hash_link,
+ ssdcache_io_slot_hash(slot->inode, slot->index));
+}
+
+static void ssdcache_wakeup_worker(void);
+static void __ssdcache_queue_io_slot_write(struct io_slot *slot)
+{
+ list_add_tail(&slot->link, &write_io_slots);
+ pending_write_nr++;
+ if (pending_write_nr >= BATCH_IOPAGES_NR)
+ ssdcache_wakeup_worker();
+}
+
+static void __ssdcache_io_slot_peek_write(struct list_head *list_head)
+{
+ if (pending_write_nr < BATCH_IOPAGES_NR)
+ return;
+ pending_write_nr = 0;
+
+ list_splice_init(&write_io_slots, list_head);
+}
+
+static void ssdcache_io_slot_end_bio(struct bio *bio, int err)
+{
+ struct io_slot *slot = bio->bi_private;
+
+ slot->end_io(slot, err);
+ bio_put(bio);
+}
+
+static int ssdcache_io_slot_submit(struct io_slot *slot)
+{
+ struct bio *bio;
+
+ bio = bio_alloc(SSDCACHE_GFP, 1);
+ if (!bio)
+ return -EINVAL;
+ bio->bi_sector = slot->sect;
+ bio->bi_io_vec[0].bv_page = slot->page;
+ bio->bi_io_vec[0].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[0].bv_offset = 0;
+ bio->bi_vcnt = 1;
+ bio->bi_size = PAGE_SIZE;
+ bio->bi_end_io = ssdcache_io_slot_end_bio;
+
+ bio->bi_bdev = cache_bdev;
+ bio->bi_private = slot;
+
+ submit_bio(slot->rw, bio);
+ return 0;
+}
+
+#define SSDCACHE_MAGIC 0x10293a656c656c09
+struct ssdcache_super {
+ char bootbits[1024];
+ uint64_t magic;
+} __attribute__((packed));
+
+static int ssdcache_io_init(void)
+{
+ fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
+ ssize_t old_blocksize;
+ sector_t max_sector;
+ struct buffer_head *bh;
+ struct ssdcache_super *super;
+ int error;
+
+ cache_bdev = blkdev_get_by_path(blkdev_name, mode, ssdcache_io_init);
+ if (IS_ERR(cache_bdev))
+ return PTR_ERR(cache_bdev);
+
+ old_blocksize = block_size(cache_bdev);
+ error = set_blocksize(cache_bdev, PAGE_SIZE);
+ if (error < 0) {
+ blkdev_put(cache_bdev, mode);
+ return error;
+ }
+
+ bh = __bread(cache_bdev, 0, PAGE_SIZE);
+ if (!bh)
+ goto error;
+ super = (struct ssdcache_super *)bh->b_data;
+ if (super->magic != cpu_to_le64(SSDCACHE_MAGIC)) {
+ printk(KERN_ERR"Wrong magic number in disk\n");
+ brelse(bh);
+ goto error;
+ }
+ brelse(bh);
+
+ max_sector = i_size_read(cache_bdev->bd_inode) >> 9;
+ max_sector = rounddown(max_sector,
+ META_PAGE_ENTRY_NR << PAGE_SECTOR_SHIFT);
+ data_total_pages = max_sector >> PAGE_SECTOR_SHIFT;
+ cache_meta = vzalloc(data_total_pages / META_PAGE_ENTRY_NR * PAGE_SIZE);
+ if (!cache_meta)
+ goto error;
+
+ max_pending_write_pages = totalram_pages * MAX_WRITE_PERCENTAGE / 100;
+ return 0;
+error:
+ set_blocksize(cache_bdev, old_blocksize);
+ blkdev_put(cache_bdev, mode);
+ return -ENOMEM;
+}
+
+#define MAX_INITIALIZABLE_FS 32
+static struct cache_fs cache_fs_array[MAX_INITIALIZABLE_FS];
+static int cache_fs_nr;
+static DEFINE_SPINLOCK(cache_fs_lock);
+
+static wait_queue_head_t io_wait;
+
+/*
+ * Cleancache ops types: G(et), P(ut), I(nvalidate), I(nvalidate)I(node).
+ * Since we make P async now, put has a sync part (P) and async part (AP).
+ *
+ * P, G and I hold the page lock, so they run exclusively
+ * AP can run at any time
+ * II doesn't hold any lock, so it can run at any time
+ */
+
+static struct cache_inode *ssdcache_get_inode(struct cache_fs *fs,
+ unsigned long index, bool create)
+{
+ struct cache_inode *inode;
+ struct rb_node **rb_link, *rb_parent, *rb_prev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache_fs_lock, flags);
+ rb_link = &fs->inodes.rb_node;
+ rb_prev = rb_parent = NULL;
+
+ while (*rb_link) {
+ rb_parent = *rb_link;
+ inode = rb_entry(rb_parent, struct cache_inode, rb_node);
+ if (inode->rb_index > index)
+ rb_link = &rb_parent->rb_left;
+ else if (inode->rb_index < index) {
+ rb_prev = rb_parent;
+ rb_link = &rb_parent->rb_right;
+ } else {
+ atomic_inc(&inode->refcnt);
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+ return inode;
+ }
+ }
+
+ if (!create) {
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+ return NULL;
+ }
+
+ inode = kmalloc(sizeof(*inode), SSDCACHE_GFP);
+ if (!inode) {
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+ return NULL;
+ }
+
+ INIT_RADIX_TREE(&inode->pages, SSDCACHE_GFP);
+ spin_lock_init(&inode->pages_lock);
+ inode->rb_index = index;
+ rb_link_node(&inode->rb_node, rb_parent, rb_link);
+ rb_insert_color(&inode->rb_node, &fs->inodes);
+ atomic_set(&inode->refcnt, 2);
+ inode->free = 0;
+ inode->fs = fs;
+
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+ return inode;
+}
+
+static void ssdcache_put_inode(struct cache_inode *inode)
+{
+ if (atomic_dec_and_test(&inode->refcnt)) {
+ BUG_ON(!inode->free);
+
+ kfree(inode);
+ }
+}
+
+/* put and optionally abort slot */
+static void ssdcache_put_abort_slot(struct io_slot *slot)
+{
+ struct cache_inode *inode = slot->inode;
+ unsigned long flags;
+
+ spin_lock_irqsave(&io_lock, flags);
+ if (slot->abort) {
+ spin_unlock(&io_lock);
+ spin_lock(&inode->pages_lock);
+
+ radix_tree_delete(&inode->pages, slot->index);
+
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ ssdcache_free_sector(slot->sect);
+
+ spin_lock_irqsave(&io_lock, flags);
+ }
+
+ __ssdcache_put_io_slot(slot);
+ spin_unlock_irqrestore(&io_lock, flags);
+}
+
+static void ssdcache_put_page_endio(struct io_slot *slot, int err)
+{
+ struct cache_inode *inode = slot->inode;
+
+ page_cache_release(slot->page);
+
+ /* if another P or II, abort is set */
+ ssdcache_put_abort_slot(slot);
+
+ ssdcache_put_inode(inode);
+}
+
+static void ssdcache_do_put_page(struct io_slot *slot)
+{
+ struct cache_inode *inode = slot->inode;
+ unsigned long flags;
+ sector_t sect, old_sect;
+ int err;
+
+ /* Make sure page reclaim isn't using this page */
+ lock_page(slot->page);
+ __clear_page_locked(slot->page);
+
+ sect = ssdcache_alloc_sector(inode, slot->index);
+
+ spin_lock_irqsave(&inode->pages_lock, flags);
+
+ old_sect = (sector_t)radix_tree_delete(&inode->pages, slot->index);
+
+ if (inode->free)
+ goto error;
+
+ err = radix_tree_insert(&inode->pages, slot->index, (void *)sect);
+ if (err)
+ goto error;
+
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ /* submit IO here */
+ slot->sect = sect;
+ err = ssdcache_io_slot_submit(slot);
+ if (err)
+ goto error_io;
+
+ if (old_sect)
+ ssdcache_free_sector(old_sect);
+ return;
+
+error_io:
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ radix_tree_delete(&inode->pages, slot->index);
+error:
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ if (old_sect)
+ ssdcache_free_sector(old_sect);
+ /* sect can't be freed by invalidate_inode while the io_slot exists */
+ ssdcache_free_sector(sect);
+
+ page_cache_release(slot->page);
+
+ spin_lock_irqsave(&io_lock, flags);
+ __ssdcache_put_io_slot(slot);
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ ssdcache_put_inode(inode);
+}
+
+static void ssdcache_put_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index, struct page *page)
+{
+ struct cache_fs *fs = cache_fs_array + pool_id;
+ struct cache_inode *inode;
+ unsigned long ino;
+ struct io_slot *slot;
+ sector_t sect;
+ unsigned long flags;
+
+ /* we don't support filehandle */
+ ino = key.u.ino;
+
+ inode = ssdcache_get_inode(fs, ino, true);
+ if (!inode)
+ return;
+
+ spin_lock_irqsave(&io_lock, flags);
+ slot = __ssdcache_find_io_slot(inode, index);
+ if (slot) {
+ /*
+ * AP -> P case. we ignore P and make AP abort. AP record
+ * should be deleted. AP and P are using different pages.
+ */
+ BUG_ON(slot->rw != WRITE);
+ slot->abort = 1;
+ spin_unlock_irqrestore(&io_lock, flags);
+ ssdcache_put_inode(inode);
+ return;
+ }
+
+ slot = __ssdcache_get_io_slot(WRITE);
+ if (!slot)
+ goto unlock;
+
+ slot->inode = inode;
+ slot->index = index;
+ slot->page = page;
+ slot->end_io = ssdcache_put_page_endio;
+ slot->rw = WRITE;
+ __ssdcache_io_slot_add_hash(slot);
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ sect = (sector_t)radix_tree_lookup(&inode->pages, index);
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ /* the page hasn't changed since the last put */
+ if (sect != 0) {
+ /* II could run here */
+ ssdcache_access_sector(sect);
+ ssdcache_put_abort_slot(slot);
+
+ ssdcache_put_inode(inode);
+ return;
+ }
+
+ page_cache_get(page);
+
+ spin_lock_irqsave(&io_lock, flags);
+ __ssdcache_queue_io_slot_write(slot);
+ spin_unlock_irqrestore(&io_lock, flags);
+ return;
+unlock:
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ sect = (sector_t)radix_tree_delete(&inode->pages, index);
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+ ssdcache_put_inode(inode);
+ if (sect != 0)
+ ssdcache_free_sector(sect);
+}
+
+
+static void ssdcache_get_page_endio(struct io_slot *slot, int err)
+{
+ void (*end_get_page)(struct page *, int) = slot->end_get_page;
+ struct cache_inode *inode = slot->inode;
+ struct page *page = slot->page;
+
+ /* if II, abort is set */
+ ssdcache_put_abort_slot(slot);
+
+ ssdcache_put_inode(inode);
+
+ end_get_page(page, 0);
+}
+
+static int ssdcache_get_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index, struct page *page,
+ void (*end_get_page)(struct page *page, int err))
+{
+ struct cache_fs *fs = cache_fs_array + pool_id;
+ struct cache_inode *inode;
+ struct io_slot *slot;
+ unsigned long ino;
+ sector_t sect;
+ unsigned long flags;
+ int err;
+
+ /* we don't support filehandle */
+ ino = key.u.ino;
+
+ inode = ssdcache_get_inode(fs, ino, false);
+ if (!inode)
+ return -EINVAL;
+
+ spin_lock_irqsave(&io_lock, flags);
+ slot = __ssdcache_find_io_slot(inode, index);
+ if (slot) {
+ /* AP -> P -> G case: the second P was ignored, so G must be ignored too */
+ if (slot->abort)
+ goto unlock_error;
+
+ /* AP -> G case */
+ copy_highpage(page, slot->page);
+ goto unlock_success;
+ }
+
+ slot = __ssdcache_get_io_slot(READ);
+ if (!slot)
+ goto unlock_error;
+
+ slot->inode = inode;
+ slot->index = index;
+ slot->page = page;
+
+ slot->end_io = ssdcache_get_page_endio;
+ slot->end_get_page = end_get_page;
+ slot->rw = READ;
+
+ __ssdcache_io_slot_add_hash(slot);
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ /* II can't free cache now */
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ sect = (sector_t)radix_tree_lookup(&inode->pages, index);
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+ if (sect == 0)
+ goto error_put_ioslot;
+
+ slot->sect = sect;
+
+ err = ssdcache_io_slot_submit(slot);
+ if (err)
+ goto error_put_ioslot;
+ return 0;
+
+unlock_success:
+ spin_unlock_irqrestore(&io_lock, flags);
+ ssdcache_put_inode(inode);
+ end_get_page(page, 0);
+ return 0;
+error_put_ioslot:
+ spin_lock_irqsave(&io_lock, flags);
+ /* II wants to abort the cache */
+ if (slot->abort && sect) {
+ spin_unlock(&io_lock);
+ spin_lock(&inode->pages_lock);
+
+ radix_tree_delete(&inode->pages, slot->index);
+
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ ssdcache_free_sector(sect);
+
+ spin_lock_irqsave(&io_lock, flags);
+ }
+ __ssdcache_put_io_slot(slot);
+unlock_error:
+ spin_unlock_irqrestore(&io_lock, flags);
+ ssdcache_put_inode(inode);
+ return -EINVAL;
+}
+
+static void ssdcache_prepare_reclaim_inode_page(struct cache_inode *inode)
+{
+ atomic_inc(&inode->refcnt);
+}
+
+static void __ssdcache_reclaim_inode_page(struct cache_inode *inode,
+ pgoff_t index, bool reclaim)
+{
+ struct io_slot *slot;
+ sector_t sect;
+ unsigned long flags;
+
+ spin_lock_irqsave(&io_lock, flags);
+ slot = __ssdcache_find_io_slot(inode, index);
+ if (slot) {
+ /* If reclaim, ignore it */
+ if (!reclaim) {
+ /* AP -> I case */
+ BUG_ON(slot->rw != WRITE);
+ slot->abort = 1;
+ }
+ spin_unlock_irqrestore(&io_lock, flags);
+ ssdcache_put_inode(inode);
+ return;
+ }
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ sect = (sector_t)radix_tree_delete(&inode->pages, index);
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ if (sect != 0)
+ ssdcache_free_sector(sect);
+
+ ssdcache_put_inode(inode);
+}
+
+static void ssdcache_reclaim_inode_page(struct cache_inode *inode,
+ pgoff_t index)
+{
+ __ssdcache_reclaim_inode_page(inode, index, true);
+}
+
+static void ssdcache_invalidate_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index)
+{
+ struct cache_fs *fs = cache_fs_array + pool_id;
+ struct cache_inode *inode;
+ unsigned long ino;
+
+ /* we don't support filehandle */
+ ino = key.u.ino;
+
+ inode = ssdcache_get_inode(fs, ino, false);
+ if (!inode)
+ return;
+
+ __ssdcache_reclaim_inode_page(inode, index, false);
+}
+
+#define RADIX_BATCH 8
+static int ssdcache_lookup_inode_caches(struct cache_inode *inode, pgoff_t start,
+ pgoff_t *index, sector_t *sects, ssize_t size)
+{
+ struct radix_tree_iter iter;
+ ssize_t cnt = 0;
+ void **slot;
+
+ radix_tree_for_each_slot(slot, &inode->pages, &iter, start) {
+ sects[cnt] = (sector_t)radix_tree_deref_slot(slot);
+ if (sects[cnt] == 0)
+ continue;
+ index[cnt] = iter.index;
+ cnt++;
+ if (cnt >= size)
+ break;
+ }
+ return cnt;
+}
+
+static void ssdcache_invalidate_inode(int pool_id, struct cleancache_filekey key)
+{
+ struct cache_fs *fs = cache_fs_array + pool_id;
+ struct cache_inode *inode;
+ unsigned long ino;
+ struct io_slot *slot;
+ unsigned long flags;
+ pgoff_t index[RADIX_BATCH];
+ sector_t sects[RADIX_BATCH];
+ pgoff_t start;
+ int cnt, i;
+
+ /* we don't support filehandle */
+ ino = key.u.ino;
+
+ inode = ssdcache_get_inode(fs, ino, false);
+ if (!inode)
+ return;
+
+ spin_lock_irqsave(&cache_fs_lock, flags);
+ /* Guarantee the inode can't be found any more */
+ rb_erase(&inode->rb_node, &inode->fs->inodes);
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+
+ /* II could run when G/P is running. So G/P should always add slot first */
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ /* Guarantee no new entry is added to radix tree */
+ inode->free = 1;
+ start = 0;
+
+again:
+ cnt = ssdcache_lookup_inode_caches(inode, start, index, sects, RADIX_BATCH);
+
+ for (i = 0; i < cnt; i++) {
+ start = index[i];
+
+ /*
+ * slot abort could delete radix entry too, but the duplication
+ * is not a problem
+ */
+ radix_tree_delete(&inode->pages, index[i]);
+ }
+ start++;
+ spin_unlock_irqrestore(&inode->pages_lock, flags);
+
+ spin_lock_irqsave(&io_lock, flags);
+ for (i = 0; i < cnt; i++) {
+ slot = __ssdcache_find_io_slot(inode, index[i]);
+ /*
+ * either the read or write endio will remove this radix entry and
+ * free the sector; the io_slot ensures we don't free the sector twice
+ */
+ if (slot) {
+ slot->abort = 1;
+ sects[i] = 0;
+ }
+ }
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ /*
+ * G, P or I could run here, but we never free a sector twice. If G or
+ * P is running, its io_slot exists and we skip freeing that sector. If I
+ * is running, it always deletes the radix tree entry first, so there is no duplication.
+ */
+ for (i = 0; i < cnt; i++) {
+ if (sects[i])
+ ssdcache_free_sector(sects[i]);
+ }
+
+ if (cnt) {
+ spin_lock_irqsave(&inode->pages_lock, flags);
+ goto again;
+ }
+
+ ssdcache_put_inode(inode);
+ ssdcache_put_inode(inode);
+ /* The inode might not be freed yet; it will be freed once G/P finish */
+}
+
+static void ssdcache_invalidate_fs(int pool_id)
+{
+ struct cache_fs *fs = cache_fs_array + pool_id;
+ struct cache_inode *inode;
+ struct rb_node *node;
+ unsigned long flags;
+ struct cleancache_filekey key;
+
+ while (1) {
+ spin_lock_irqsave(&cache_fs_lock, flags);
+ node = rb_first(&fs->inodes);
+ if (node) {
+ /* Get inode number with lock hold */
+ inode = rb_entry(node, struct cache_inode, rb_node);
+ key.u.ino = inode->rb_index;
+ }
+ spin_unlock_irqrestore(&cache_fs_lock, flags);
+
+ if (node == NULL)
+ return;
+
+ ssdcache_invalidate_inode(pool_id, key);
+ }
+}
+
+static int ssdcache_init_fs(size_t pagesize)
+{
+ int i;
+
+ if (pagesize != PAGE_SIZE)
+ return -EINVAL;
+
+ if (cache_fs_nr >= MAX_INITIALIZABLE_FS)
+ return -EINVAL;
+ cache_fs_nr++;
+
+ for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
+ if (!cache_fs_array[i].valid) {
+ cache_fs_array[i].inodes = RB_ROOT;
+
+ cache_fs_array[i].valid = true;
+ break;
+ }
+ }
+ return i;
+}
+
+static int ssdcache_init_shared_fs(char *uuid, size_t pagesize)
+{
+ /* shared pools are unsupported and map to private */
+ return ssdcache_init_fs(pagesize);
+}
+
+static struct cleancache_ops ssdcache_ops = {
+ .put_page = ssdcache_put_page,
+ .get_page = ssdcache_get_page,
+ .invalidate_page = ssdcache_invalidate_page,
+ .invalidate_inode = ssdcache_invalidate_inode,
+ .invalidate_fs = ssdcache_invalidate_fs,
+ .init_shared_fs = ssdcache_init_shared_fs,
+ .init_fs = ssdcache_init_fs
+};
+
+static void ssdcache_wakeup_worker(void)
+{
+ wake_up(&io_wait);
+}
+
+static int ssdcache_do_io(void *data)
+{
+ struct io_slot *slot;
+ DEFINE_WAIT(wait);
+ unsigned long flags;
+ LIST_HEAD(write_list);
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ while (!kthread_should_stop()) {
+ while (1) {
+ prepare_to_wait(&io_wait, &wait, TASK_INTERRUPTIBLE);
+
+ spin_lock_irqsave(&io_lock, flags);
+ __ssdcache_io_slot_peek_write(&write_list);
+ spin_unlock_irqrestore(&io_lock, flags);
+
+ if (!list_empty(&write_list) || kthread_should_stop())
+ break;
+ schedule();
+ }
+ finish_wait(&io_wait, &wait);
+
+ while (!list_empty(&write_list)) {
+ slot = list_first_entry(&write_list, struct io_slot,
+ link);
+ list_del_init(&slot->link);
+ ssdcache_do_put_page(slot);
+ }
+ }
+ blk_finish_plug(&plug);
+ return 0;
+}
+
+static int __init ssdcache_init(void)
+{
+ struct task_struct *tsk;
+
+ init_waitqueue_head(&io_wait);
+ tsk = kthread_run(ssdcache_do_io, NULL, "ssd_cleancache");
+ if (!tsk)
+ return -EINVAL;
+ if (ssdcache_io_init()) {
+ kthread_stop(tsk);
+ return -EINVAL;
+ }
+ cleancache_register_ops(&ssdcache_ops);
+ return 0;
+}
+
+module_init(ssdcache_init);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Shaohua Li <shli@kernel.org>");
+MODULE_DESCRIPTION("SSD backed cleancache backend");
Index: linux/mm/Kconfig
===================================================================
--- linux.orig/mm/Kconfig 2013-09-26 21:38:45.425119143 +0800
+++ linux/mm/Kconfig 2013-09-26 21:38:45.417119257 +0800
@@ -532,6 +532,13 @@ config ZSWAP
they have not be fully explored on the large set of potential
configurations and workloads that exist.
+config SSD_CLEANCACHE
+ depends on CLEANCACHE
+ tristate "Enable SSD backed cleancache backend"
+ default n
+ help
+ An SSD-backed cleancache backend
+
config MEM_SOFT_DIRTY
bool "Track memory changes"
depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
Index: linux/mm/Makefile
===================================================================
--- linux.orig/mm/Makefile 2013-09-26 21:38:45.425119143 +0800
+++ linux/mm/Makefile 2013-09-26 21:38:45.421119196 +0800
@@ -60,3 +60,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kme
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZBUD) += zbud.o
+obj-$(CONFIG_SSD_CLEANCACHE) += ssd-cleancache.o
--