diff for duplicates of <20111222155151.GA21469@ca-server1.us.oracle.com> diff --git a/a/1.txt b/N1/1.txt index 8b13789..2b52708 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -1 +1,2666 @@ +>From d23d411ef33f094c14855d48962d44aec854c500 Mon Sep 17 00:00:00 2001 +From: Dan Magenheimer <dan.magenheimer@oracle.com> +Date: Wed, 21 Dec 2011 14:01:59 -0700 +Subject: [PATCH v2 4/6] drivers/staging/ramster: ramster-specific changes to zcache/tmem +In tmem.[ch], new "repatriate" (provoke async get) and "localify" (handle +incoming data resulting from an async get) routines combine with a handful +of changes to existing pamops interfaces to allow the generic tmem code +to support asynchronous operations. Also, a new tmem_xhandle struct +groups together key information that must be passed to remote tmem stores. + +Zcache-main.c is augmented with a large amount of ramster-specific code +to handle remote operations and "foreign" pages on both ends of the +"remotify" protocol. New "foreign" pools are auto-created on demand. +A "selfshrinker" thread periodically repatriates remote persistent pages +when local memory conditions allow. For certain operations, a queue is +necessary to guarantee strict ordering as out-of-order puts/flushes can +cause strange race conditions. Pampd pointers now either point to local +memory OR describe a remote page; to allow the same 64 bits to describe +either, the LSB is used to differentiate. Some acrobatics must be performed +to ensure local memory is available to handle a remote persistent get, +or deal with the data directly anyway if the malloc failed. Lots +of ramster-specific statistics are available via sysfs. + +Note: Some debug ifdefs left in for now. 
+ +Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> + +--- + + drivers/staging/ramster/Kconfig | 17 +- + drivers/staging/ramster/Makefile | 5 +- + drivers/staging/ramster/tmem.c | 117 ++- + drivers/staging/ramster/tmem.h | 46 +- + drivers/staging/ramster/zcache-main.c | 1651 +++++++++++++++++++++++++++++---- + 5 files changed, 1636 insertions(+), 200 deletions(-) + +diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig +index 7fabcb2..5154693 100644 +--- a/drivers/staging/ramster/Kconfig ++++ b/drivers/staging/ramster/Kconfig +@@ -1,13 +1,14 @@ +-config ZCACHE +- tristate "Dynamic compression of swap pages and clean pagecache pages" +- depends on CLEANCACHE || FRONTSWAP ++config RAMSTER ++ tristate "Cross-machine RAM capacity sharing, aka peer-to-peer tmem" ++ depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS && !OCFS2_FS && !ZCACHE && !PREEMPT_VOLUNTARY && !HIGHMEM + select XVMALLOC + select LZO_COMPRESS + select LZO_DECOMPRESS + default n + help +- Zcache doubles RAM efficiency while providing a significant +- performance boosts on many workloads. Zcache uses lzo1x +- compression and an in-kernel implementation of transcendent +- memory to store clean page cache pages and swap in RAM, +- providing a noticeable reduction in disk I/O. ++ RAMster allows RAM on other machines in a cluster to be utilized ++ dynamically and symmetrically instead of swapping to a local swap ++ disk, thus improving performance on memory-constrained workloads ++ while minimizing total RAM across the cluster. RAMster, like ++ zcache, compresses swap pages into local RAM, but then remotifies ++ the compressed pages to another node in the RAMster cluster. 
+diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile +index 60daa27..e6c4a2e 100644 +--- a/drivers/staging/ramster/Makefile ++++ b/drivers/staging/ramster/Makefile +@@ -1,3 +1,2 @@ +-zcache-y := zcache-main.o tmem.o +- +-obj-$(CONFIG_ZCACHE) += zcache.o ++obj-$(CONFIG_RAMSTER) += zcache-main.o tmem.o ++obj-$(CONFIG_RAMSTER) += ramster_o2net.o cluster/ +diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c +index 1ca66ea..ed7d07b 100644 +--- a/drivers/staging/ramster/tmem.c ++++ b/drivers/staging/ramster/tmem.c +@@ -27,6 +27,7 @@ + #include <linux/list.h> + #include <linux/spinlock.h> + #include <linux/atomic.h> ++#include <linux/delay.h> + + #include "tmem.h" + +@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) + } + + static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, +- void *new_pampd) ++ void *new_pampd, bool no_free) + { + struct tmem_objnode **slot; + void *ret = NULL; +@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, + if ((slot != NULL) && (*slot != NULL)) { + void *old_pampd = *(void **)slot; + *(void **)slot = new_pampd; +- (*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0); ++ if (!no_free) ++ (*tmem_pamops.free)(old_pampd, obj->pool, ++ NULL, 0, false); + ret = new_pampd; + } + return ret; +@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj, + if (ht == 1) { + obj->pampd_count--; + (*tmem_pamops.free)(objnode->slots[i], +- obj->pool, NULL, 0); ++ obj->pool, NULL, 0, true); + objnode->slots[i] = NULL; + continue; + } +@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) + return; + if (obj->objnode_tree_height == 0) { + obj->pampd_count--; +- (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0); ++ (*tmem_pamops.free)(obj->objnode_tree_root, ++ obj->pool, NULL, 0, true); + } else { + tmem_objnode_node_destroy(obj, 
obj->objnode_tree_root, + obj->objnode_tree_height); +@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) + * always flushes for simplicity. + */ + int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, +- char *data, size_t size, bool raw, bool ephemeral) ++ char *data, size_t size, bool raw, int ephemeral) + { + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; + void *pampd = NULL, *pampd_del = NULL; +@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + /* if found, is a dup put, flush the old one */ + pampd_del = tmem_pampd_delete_from_obj(obj, index); + BUG_ON(pampd_del != pampd); +- (*tmem_pamops.free)(pampd, pool, oidp, index); ++ (*tmem_pamops.free)(pampd, pool, oidp, index, true); + if (obj->pampd_count == 0) { + objnew = obj; + objfound = NULL; +@@ -576,7 +580,7 @@ delete_and_free: + (void)tmem_pampd_delete_from_obj(obj, index); + free: + if (pampd) +- (*tmem_pamops.free)(pampd, pool, NULL, 0); ++ (*tmem_pamops.free)(pampd, pool, NULL, 0, true); + if (objnew) { + tmem_obj_free(objnew, hb); + (*tmem_hostops.obj_free)(objnew, pool); +@@ -586,6 +590,65 @@ out: + return ret; + } + ++void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, ++ uint32_t index, struct tmem_obj **ret_obj, ++ void **saved_hb) ++{ ++ struct tmem_hashbucket *hb; ++ struct tmem_obj *obj = NULL; ++ void *pampd = NULL; ++ ++ hb = &pool->hashbucket[tmem_oid_hash(oidp)]; ++ spin_lock(&hb->lock); ++ obj = tmem_obj_find(hb, oidp); ++ if (likely(obj != NULL)) ++ pampd = tmem_pampd_lookup_in_obj(obj, index); ++ *ret_obj = obj; ++ *saved_hb = (void *)hb; ++ /* note, hashbucket remains locked */ ++ return pampd; ++} ++ ++void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, ++ void *pampd, void *saved_hb, bool delete) ++{ ++ struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb; ++ ++ BUG_ON(!spin_is_locked(&hb->lock)); ++ if (pampd != NULL) { ++ 
BUG_ON(obj == NULL); ++ (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1); ++ } else if (delete) { ++ BUG_ON(obj == NULL); ++ (void)tmem_pampd_delete_from_obj(obj, index); ++ } ++ spin_unlock(&hb->lock); ++} ++ ++static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb, ++ struct tmem_pool *pool, struct tmem_oid *oidp, ++ uint32_t index, bool free, char *data) ++{ ++ void *old_pampd = *ppampd, *new_pampd = NULL; ++ bool intransit = false; ++ int ret = 0; ++ ++ ++ if (!is_ephemeral(pool)) ++ new_pampd = (*tmem_pamops.repatriate_preload)( ++ old_pampd, pool, oidp, index, &intransit); ++ if (intransit) ++ ret = -EAGAIN; ++ else if (new_pampd != NULL) ++ *ppampd = new_pampd; ++ /* must release the hb->lock else repatriate can't sleep */ ++ spin_unlock(&hb->lock); ++ if (!intransit) ++ ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool, ++ oidp, index, free, data); ++ return ret; ++} ++ + /* + * "Get" a page, e.g. if one can be found, copy the tmem page with the + * matching handle from PAM space to the kernel. By tmem definition, +@@ -607,14 +670,38 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + int ret = -1; + struct tmem_hashbucket *hb; + bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); +- bool lock_held = false; ++ bool lock_held = 0; ++ void **ppampd; + ++again: + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); +- lock_held = true; ++ lock_held = 1; + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; ++ ppampd = __tmem_pampd_lookup_in_obj(obj, index); ++ if (ppampd == NULL) ++ goto out; ++ if (tmem_pamops.is_remote(*ppampd)) { ++ ret = tmem_repatriate(ppampd, hb, pool, oidp, ++ index, free, data); ++ lock_held = 0; /* note hb->lock has been unlocked */ ++ if (ret == -EAGAIN) { ++ /* rare I think, but should cond_resched()??? 
*/ ++ usleep_range(10, 1000); ++ goto again; ++ } else if (ret != 0) { ++#if 1 ++ if (ret != -ENOENT) ++ pr_err("UNTESTED case in tmem_get, ret=%d\n", ++ ret); ++#endif ++ ret = -1; ++ goto out; ++ } ++ goto out; ++ } + if (free) + pampd = tmem_pampd_delete_from_obj(obj, index); + else +@@ -628,10 +715,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + obj = NULL; + } + } +- if (tmem_pamops.is_remote(pampd)) { +- lock_held = false; +- spin_unlock(&hb->lock); +- } + if (free) + ret = (*tmem_pamops.get_data_and_free)( + data, size, raw, pampd, pool, oidp, index); +@@ -668,7 +751,7 @@ int tmem_flush_page(struct tmem_pool *pool, + pampd = tmem_pampd_delete_from_obj(obj, index); + if (pampd == NULL) + goto out; +- (*tmem_pamops.free)(pampd, pool, oidp, index); ++ (*tmem_pamops.free)(pampd, pool, oidp, index, true); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); +@@ -682,8 +765,8 @@ out: + + /* + * If a page in tmem matches the handle, replace the page so that any +- * subsequent "get" gets the new page. Returns 0 if +- * there was a page to replace, else returns -1. ++ * subsequent "get" gets the new page. Returns the new page if ++ * there was a page to replace, else returns NULL. 
+ */ + int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, void *new_pampd) +@@ -697,7 +780,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; +- new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd); ++ new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0); + ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); + out: + spin_unlock(&hb->lock); +diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h +index ed147c4..47f1918 100644 +--- a/drivers/staging/ramster/tmem.h ++++ b/drivers/staging/ramster/tmem.h +@@ -9,7 +9,6 @@ + #ifndef _TMEM_H_ + #define _TMEM_H_ + +-#include <linux/types.h> + #include <linux/highmem.h> + #include <linux/hash.h> + #include <linux/atomic.h> +@@ -89,6 +88,31 @@ struct tmem_oid { + uint64_t oid[3]; + }; + ++struct tmem_xhandle { ++ uint8_t client_id; ++ uint8_t xh_data_cksum; ++ uint16_t xh_data_size; ++ uint16_t pool_id; ++ struct tmem_oid oid; ++ uint32_t index; ++ void *extra; ++}; ++ ++static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id, ++ struct tmem_pool *pool, ++ struct tmem_oid *oidp, ++ uint32_t index) ++{ ++ struct tmem_xhandle xh; ++ xh.client_id = client_id; ++ xh.xh_data_cksum = (uint8_t)-1; ++ xh.xh_data_size = (uint16_t)-1; ++ xh.pool_id = pool->pool_id; ++ xh.oid = *oidp; ++ xh.index = index; ++ return xh; ++} ++ + static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) + { + oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; +@@ -147,7 +171,11 @@ struct tmem_obj { + unsigned int objnode_tree_height; + unsigned long objnode_count; + long pampd_count; +- void *extra; /* for private use by pampd implementation */ ++ /* for current design of ramster, all pages belonging to ++ * an object reside on the same remotenode and extra is ++ * used to record the number of the remotenode so a ++ * flush-object operation can specify it */ ++ void *extra; /* for 
use by pampd implementation */ + DECL_SENTINEL + }; + +@@ -174,9 +202,14 @@ struct tmem_pamops { + int (*get_data_and_free)(char *, size_t *, bool, void *, + struct tmem_pool *, struct tmem_oid *, + uint32_t); +- void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t); ++ void (*free)(void *, struct tmem_pool *, ++ struct tmem_oid *, uint32_t, bool); + void (*free_obj)(struct tmem_pool *, struct tmem_obj *); + bool (*is_remote)(void *); ++ void *(*repatriate_preload)(void *, struct tmem_pool *, ++ struct tmem_oid *, uint32_t, bool *); ++ int (*repatriate)(void *, void *, struct tmem_pool *, ++ struct tmem_oid *, uint32_t, bool, void *); + void (*new_obj)(struct tmem_obj *); + int (*replace_in_obj)(void *, struct tmem_obj *); + }; +@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m); + + /* core tmem accessor functions */ + extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, +- char *, size_t, bool, bool); ++ char *, size_t, bool, int); + extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, + char *, size_t *, bool, int); + extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, + void *); ++extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *, ++ uint32_t index, struct tmem_obj **, ++ void **); ++extern void tmem_localify_finish(struct tmem_obj *, uint32_t index, ++ void *, void *, bool); + extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, + uint32_t index); + extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); +diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c +index cd0ed84..92fc7b2 100644 +--- a/drivers/staging/ramster/zcache-main.c ++++ b/drivers/staging/ramster/zcache-main.c +@@ -1,7 +1,7 @@ + /* + * zcache.c + * +- * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. ++ * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. 
+ * Copyright (c) 2010,2011, Nitin Gupta + * + * Zcache provides an in-kernel "host implementation" for transcendent memory +@@ -17,9 +17,11 @@ + * + * [1] For a definition of page-accessible memory (aka PAM), see: + * http://marc.info/?l=linux-mm&m=127811271605009 ++ * RAMSTER TODO: ++ * - handle remotifying of buddied pages (see zbud_remotify_zbpg) ++ * - kernel boot params: nocleancache/nofrontswap don't always work?!? + */ + +-#include <linux/module.h> + #include <linux/cpu.h> + #include <linux/highmem.h> + #include <linux/list.h> +@@ -30,11 +32,13 @@ + #include <linux/atomic.h> + #include <linux/math64.h> + #include "tmem.h" ++#include "zcache.h" ++#include "ramster.h" + + #include "../zram/xvmalloc.h" /* if built in drivers/staging */ + + #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) +-#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" ++#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" + #endif + #ifdef CONFIG_CLEANCACHE + #include <linux/cleancache.h> +@@ -43,6 +47,61 @@ + #include <linux/frontswap.h> + #endif + ++enum ramster_remotify_op { ++ RAMSTER_REMOTIFY_EPH_PUT, ++ RAMSTER_REMOTIFY_PERS_PUT, ++ RAMSTER_REMOTIFY_FLUSH_PAGE, ++ RAMSTER_REMOTIFY_FLUSH_OBJ, ++ RAMSTER_INTRANSIT_PERS ++}; ++ ++struct ramster_remotify_hdr { ++ enum ramster_remotify_op op; ++ struct list_head list; ++}; ++ ++#define ZBH_SENTINEL 0x43214321 ++#define ZBPG_SENTINEL 0xdeadbeef ++ ++#define ZBUD_MAX_BUDS 2 ++ ++struct zbud_hdr { ++ struct ramster_remotify_hdr rem_op; ++ uint16_t client_id; ++ uint16_t pool_id; ++ struct tmem_oid oid; ++ uint32_t index; ++ uint16_t size; /* compressed size in bytes, zero means unused */ ++ DECL_SENTINEL ++}; ++ ++#define ZVH_SENTINEL 0x43214321 ++static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; ++ ++struct zv_hdr { ++ struct ramster_remotify_hdr rem_op; ++ uint16_t client_id; ++ uint16_t pool_id; ++ struct tmem_oid oid; ++ uint32_t index; ++ DECL_SENTINEL ++}; ++ ++struct 
flushlist_node { ++ struct ramster_remotify_hdr rem_op; ++ struct tmem_xhandle xh; ++}; ++ ++union { ++ struct ramster_remotify_hdr rem_op; ++ struct zv_hdr zv; ++ struct zbud_hdr zbud; ++ struct flushlist_node flist; ++} remotify_list_node; ++ ++static LIST_HEAD(zcache_rem_op_list); ++static DEFINE_SPINLOCK(zcache_rem_op_list_lock); ++ + #if 0 + /* this is more aggressive but may cause other problems? */ + #define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) +@@ -98,20 +157,6 @@ static inline bool is_local_client(struct zcache_client *cli) + * read or written unless the zbpg's lock is held. + */ + +-#define ZBH_SENTINEL 0x43214321 +-#define ZBPG_SENTINEL 0xdeadbeef +- +-#define ZBUD_MAX_BUDS 2 +- +-struct zbud_hdr { +- uint16_t client_id; +- uint16_t pool_id; +- struct tmem_oid oid; +- uint32_t index; +- uint16_t size; /* compressed size in bytes, zero means unused */ +- DECL_SENTINEL +-}; +- + struct zbud_page { + struct list_head bud_list; + spinlock_t lock; +@@ -153,8 +198,37 @@ static unsigned long zcache_zbud_curr_zbytes; + static unsigned long zcache_zbud_cumul_zpages; + static unsigned long zcache_zbud_cumul_zbytes; + static unsigned long zcache_compress_poor; ++static unsigned long zcache_policy_percent_exceeded; + static unsigned long zcache_mean_compress_poor; + ++/* ++ * RAMster counters ++ * - Remote pages are pages with a local pampd but the data is remote ++ * - Foreign pages are pages stored locally but belonging to another node ++ */ ++static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0); ++static unsigned long ramster_pers_remotify_enable; ++static unsigned long ramster_eph_remotify_enable; ++static unsigned long ramster_eph_pages_remoted; ++static unsigned long ramster_eph_pages_remote_failed; ++static unsigned long ramster_pers_pages_remoted; ++static unsigned long ramster_pers_pages_remote_failed; ++static unsigned long ramster_pers_pages_remote_nomem; ++static unsigned long ramster_remote_objects_flushed; ++static unsigned 
long ramster_remote_object_flushes_failed; ++static unsigned long ramster_remote_pages_flushed; ++static unsigned long ramster_remote_page_flushes_failed; ++static unsigned long ramster_remote_eph_pages_succ_get; ++static unsigned long ramster_remote_pers_pages_succ_get; ++static unsigned long ramster_remote_eph_pages_unsucc_get; ++static unsigned long ramster_remote_pers_pages_unsucc_get; ++static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0); ++static unsigned long ramster_curr_flnode_count_max; ++static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0); ++static unsigned long ramster_foreign_eph_pampd_count_max; ++static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0); ++static unsigned long ramster_foreign_pers_pampd_count_max; ++ + /* forward references */ + static void *zcache_get_free_page(void); + static void zcache_free_page(void *p); +@@ -210,6 +284,29 @@ static char *zbud_data(struct zbud_hdr *zh, unsigned size) + return p; + } + ++static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh) ++{ ++ struct zbud_page *zbpg; ++ char *p; ++ unsigned budnum; ++ ++ ASSERT_SENTINEL(zh, ZBH); ++ budnum = zbud_budnum(zh); ++ zbpg = container_of(zh, struct zbud_page, buddy[budnum]); ++ spin_lock(&zbpg->lock); ++ BUG_ON(zh->size > *size); ++ p = (char *)zbpg; ++ if (budnum == 0) ++ p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & ++ CHUNK_MASK); ++ else if (budnum == 1) ++ p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK); ++ /* client should be filled in by caller */ ++ memcpy(data, p, zh->size); ++ *size = zh->size; ++ spin_unlock(&zbpg->lock); ++} ++ + /* + * zbud raw page management + */ +@@ -299,6 +396,7 @@ static void zbud_free_and_delist(struct zbud_hdr *zh) + struct zbud_page *zbpg = + container_of(zh, struct zbud_page, buddy[budnum]); + ++ BUG_ON(!irqs_disabled()); + spin_lock(&zbpg->lock); + if (list_empty(&zbpg->bud_list)) { + /* ignore zombie page... 
see zbud_evict_pages() */ +@@ -358,8 +456,13 @@ static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, + if (unlikely(zbpg == NULL)) + goto out; + /* ok, have a page, now compress the data before taking locks */ ++#if 1 /* 110721 FIX LOCK ORDERING TO ELIMINATE DEADLOCK */ ++ spin_lock(&zbud_budlists_spinlock); ++ spin_lock(&zbpg->lock); ++#else + spin_lock(&zbpg->lock); + spin_lock(&zbud_budlists_spinlock); ++#endif + list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); + zbud_unbuddied[nchunks].count++; + zh = &zbpg->buddy[0]; +@@ -389,12 +492,19 @@ init_zh: + zh->oid = *oid; + zh->pool_id = pool_id; + zh->client_id = client_id; ++#if 1 /* 110721 FIX LOCK ORDERING TO ELIMINATE DEADLOCK */ ++ to = zbud_data(zh, size); ++ memcpy(to, cdata, size); ++ spin_unlock(&zbpg->lock); ++ spin_unlock(&zbud_budlists_spinlock); ++#else + /* can wait to copy the data until the list locks are dropped */ + spin_unlock(&zbud_budlists_spinlock); + + to = zbud_data(zh, size); + memcpy(to, cdata, size); + spin_unlock(&zbpg->lock); ++#endif + zbud_cumul_chunk_counts[nchunks]++; + atomic_inc(&zcache_zbud_curr_zpages); + zcache_zbud_cumul_zpages++; +@@ -458,6 +568,7 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg) + uint32_t index[ZBUD_MAX_BUDS]; + struct tmem_oid oid[ZBUD_MAX_BUDS]; + struct tmem_pool *pool; ++ unsigned long flags; + + ASSERT_SPINLOCK(&zbpg->lock); + BUG_ON(!list_empty(&zbpg->bud_list)); +@@ -474,9 +585,12 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg) + } + spin_unlock(&zbpg->lock); + for (i = 0; i < j; i++) { ++ /* FIXME FIXME this just evicts local ephemeral pages!!! 
*/ + pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); + if (pool != NULL) { ++ local_irq_save(flags); + tmem_flush_page(pool, &oid[i], index[i]); ++ local_irq_restore(flags); + zcache_put_pool(pool); + } + } +@@ -496,7 +610,7 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg) + static void zbud_evict_pages(int nr) + { + struct zbud_page *zbpg; +- int i; ++ int i, newly_unused_pages = 0; + + /* first try freeing any pages on unused list */ + retry_unused_list: +@@ -512,7 +626,7 @@ retry_unused_list: + zcache_free_page(zbpg); + zcache_evicted_raw_pages++; + if (--nr <= 0) +- goto out; ++ goto done; + goto retry_unused_list; + } + spin_unlock_bh(&zbpg_unused_list_spinlock); +@@ -534,9 +648,10 @@ retry_unbud_list_i: + zcache_evicted_unbuddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); ++ newly_unused_pages++; + local_bh_enable(); + if (--nr <= 0) +- goto out; ++ goto evict_unused; + goto retry_unbud_list_i; + } + spin_unlock_bh(&zbud_budlists_spinlock); +@@ -547,7 +662,7 @@ retry_bud_list: + spin_lock_bh(&zbud_budlists_spinlock); + if (list_empty(&zbud_buddied_list)) { + spin_unlock_bh(&zbud_budlists_spinlock); +- goto out; ++ goto evict_unused; + } + list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { + if (unlikely(!spin_trylock(&zbpg->lock))) +@@ -558,16 +673,362 @@ retry_bud_list: + zcache_evicted_buddied_pages++; + /* want budlists unlocked when doing zbpg eviction */ + zbud_evict_zbpg(zbpg); ++ newly_unused_pages++; + local_bh_enable(); + if (--nr <= 0) +- goto out; ++ goto evict_unused; + goto retry_bud_list; + } + spin_unlock_bh(&zbud_budlists_spinlock); ++ ++evict_unused: ++ /* ++ * zbud_evict_zbpg just moves pages on the unused list, it doesn't ++ * free them so we need to actually free them here. 
++ */ ++ spin_lock_bh(&zbpg_unused_list_spinlock); ++ if (!list_empty(&zbpg_unused_list) && newly_unused_pages--) { ++ /* can't walk list here, since it may change when unlocked */ ++ zbpg = list_first_entry(&zbpg_unused_list, ++ struct zbud_page, bud_list); ++ list_del_init(&zbpg->bud_list); ++ zcache_zbpg_unused_list_count--; ++ atomic_dec(&zcache_zbud_curr_raw_pages); ++ spin_unlock_bh(&zbpg_unused_list_spinlock); ++ zcache_free_page(zbpg); ++ goto evict_unused; ++ } ++ spin_unlock_bh(&zbpg_unused_list_spinlock); ++done: ++ return; ++} ++ ++static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem); ++ ++/* only doing unbuddied for now, so only one remote put per zbpg */ ++static int zbud_remotify_zbpg(struct zbud_page *zbpg) ++{ ++ struct zbud_hdr *zh; ++ struct tmem_xhandle xh; ++ struct tmem_pool *pool; ++ bool ephemeral; ++ char *data; ++ size_t size; ++ int remotenode, ret = -1; ++ unsigned long flags; ++ unsigned char cksum; ++ char *p; ++ int i; ++ unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); ++ ++ ASSERT_SPINLOCK(&zbpg->lock); ++ BUG_ON(!list_empty(&zbpg->bud_list)); ++ ++ if (zbpg->buddy[0].size == 0) ++ zh = &zbpg->buddy[1]; ++ else if (zbpg->buddy[1].size == 0) ++ zh = &zbpg->buddy[0]; ++ else ++ BUG(); /* apparently NOT unbuddied ?!? 
*/ ++ ++ /* don't remotify pages that are already remotified */ ++ if (zh->client_id != LOCAL_CLIENT) { ++ spin_unlock(&zbpg->lock); ++ ret = 0; ++ preempt_enable(); ++ goto out; ++ } ++ xh.client_id = zh->client_id; ++ xh.pool_id = zh->pool_id; ++ xh.oid = zh->oid; ++ xh.index = zh->index; ++ size = zh->size; ++ data = zbud_data(zh, size); ++ for (p = data, cksum = 0, i = 0; i < size; i++) ++ cksum += *p; ++ memcpy(tmpmem, data, size); ++ data = tmpmem; ++ spin_unlock(&zbpg->lock); ++ preempt_enable(); /* no locks held anymore */ ++ pool = zcache_get_pool_by_id(zh->client_id, zh->pool_id); ++ BUG_ON(pool == NULL); ++ ephemeral = !pool->persistent; ++ zcache_put_pool(pool); ++ ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode); ++ if (ret == 0) { ++ /* data was successfully remoted so change the local version ++ * to point to the remote node where it landed */ ++ struct tmem_pool *pool; ++ pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id); ++ BUG_ON(pool == NULL); ++ local_irq_save(flags); ++ (void)tmem_replace(pool, &xh.oid, xh.index, ++ pampd_make_remote(remotenode, size, cksum)); ++ local_irq_restore(flags); ++ zcache_put_pool(pool); ++ ramster_eph_pages_remoted++; ++ ret = 1; ++ } else ++ ramster_eph_pages_remote_failed++; ++ ++out: ++ return ret; ++} ++ ++void zbud_remotify_pages(int nr) ++{ ++ struct zbud_page *zbpg; ++ int i, ret; ++ ++ /* ++ * for now just try remotifying unbuddied pages, starting with ++ * least space avail ++ */ ++ for (i = 0; i < MAX_CHUNK; i++) { ++retry_unbud_list_i: ++ preempt_disable(); /* enable in zbud_remotify_zbpg */ ++ spin_lock_bh(&zbud_budlists_spinlock); ++ if (list_empty(&zbud_unbuddied[i].list)) { ++ spin_unlock_bh(&zbud_budlists_spinlock); ++ continue; ++ } ++ list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { ++ if (unlikely(!spin_trylock(&zbpg->lock))) { ++ spin_unlock_bh(&zbud_budlists_spinlock); ++ preempt_enable(); ++ continue; ++ } ++ list_del_init(&zbpg->bud_list); ++ 
zbud_unbuddied[i].count--; ++ spin_unlock(&zbud_budlists_spinlock); ++ /* want budlists unlocked when doing zbpg remotify */ ++ local_bh_enable(); ++ ret = zbud_remotify_zbpg(zbpg); ++ /* preemption is now re-enabled */ ++ if (ret == 0) ++ BUG(); ++ else if (ret == 1) ++ --nr; ++ else { ++ /* if fail to remotify any page, quit */ ++pr_err("TESTING zbud_remotify_pages failed on page, trying to re-add\n"); ++ spin_lock_bh(&zbud_budlists_spinlock); ++ spin_lock(&zbpg->lock); ++ list_add_tail(&zbpg->bud_list, ++ &zbud_unbuddied[i].list); ++ zbud_unbuddied[i].count++; ++ spin_unlock(&zbpg->lock); ++ spin_unlock_bh(&zbud_budlists_spinlock); ++pr_err("TESTING zbud_remotify_pages failed on page, finished re-add\n"); ++ goto out; ++ } ++ if (nr <= 0) ++ goto out; ++ goto retry_unbud_list_i; ++ } ++ spin_unlock_bh(&zbud_budlists_spinlock); ++ } + out: + return; + } + ++/* the "flush list" asynchronously collects pages to remotely flush */ ++#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1) ++static void ramster_flnode_free(struct flushlist_node *, ++ struct tmem_pool *); ++ ++static void zcache_remote_flush_page(struct flushlist_node *flnode) ++{ ++ struct tmem_xhandle *xh; ++ int remotenode, ret; ++ ++ preempt_disable(); ++ xh = &flnode->xh; ++ remotenode = flnode->xh.client_id; ++ ret = ramster_remote_flush(xh, remotenode); ++ if (ret >= 0) ++ ramster_remote_pages_flushed++; ++ else ++ ramster_remote_page_flushes_failed++; ++ preempt_enable_no_resched(); ++ ramster_flnode_free(flnode, NULL); ++} ++ ++static void zcache_remote_flush_object(struct flushlist_node *flnode) ++{ ++ struct tmem_xhandle *xh; ++ int remotenode, ret; ++ ++ preempt_disable(); ++ xh = &flnode->xh; ++ remotenode = flnode->xh.client_id; ++ ret = ramster_remote_flush_object(xh, remotenode); ++ if (ret >= 0) ++ ramster_remote_objects_flushed++; ++ else ++ ramster_remote_object_flushes_failed++; ++ preempt_enable_no_resched(); ++ ramster_flnode_free(flnode, NULL); ++} ++ ++static void zcache_remote_eph_put(struct 
zbud_hdr *zbud) ++{ ++ /* FIXME */ ++} ++ ++static void zcache_remote_pers_put(struct zv_hdr *zv) ++{ ++ struct tmem_xhandle xh; ++ uint16_t size; ++ bool ephemeral; ++ int remotenode, ret = -1; ++ char *data; ++ struct tmem_pool *pool; ++ unsigned long flags; ++ unsigned char cksum; ++ char *p; ++ int i; ++ unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); ++ ++ ASSERT_SENTINEL(zv, ZVH); ++ BUG_ON(zv->client_id != LOCAL_CLIENT); ++ local_bh_disable(); ++ xh.client_id = zv->client_id; ++ xh.pool_id = zv->pool_id; ++ xh.oid = zv->oid; ++ xh.index = zv->index; ++ size = xv_get_object_size(zv) - sizeof(*zv); ++ BUG_ON(size == 0 || size > zv_max_page_size); ++ data = (char *)zv + sizeof(*zv); ++ for (p = data, cksum = 0, i = 0; i < size; i++) ++ cksum += *p; ++ memcpy(tmpmem, data, size); ++ data = tmpmem; ++ pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id); ++ ephemeral = is_ephemeral(pool); ++ zcache_put_pool(pool); ++ /* now OK to release lock set in caller */ ++ spin_unlock(&zcache_rem_op_list_lock); ++ local_bh_enable(); ++ preempt_disable(); ++ ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode); ++ preempt_enable_no_resched(); ++ if (ret != 0) { ++ /* ++ * This is some form of a memory leak... if the remote put ++ * fails, there will never be another attempt to remotify ++ * this page. But since we've dropped the zv pointer, ++ * the page may have been freed or the data replaced ++ * so we can't just "put it back" in the remote op list. ++ * Even if we could, not sure where to put it in the list ++ * because there may be flushes that must be strictly ++ * ordered vs the put. So leave this as a FIXME for now. ++ * But count them so we know if it becomes a problem. 
++ */ ++ ramster_pers_pages_remote_failed++; ++ goto out; ++ } else ++ atomic_inc(&ramster_remote_pers_pages); ++ ramster_pers_pages_remoted++; ++ /* ++ * data was successfully remoted so change the local version to ++ * point to the remote node where it landed ++ */ ++ local_bh_disable(); ++ pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id); ++ local_irq_save(flags); ++ (void)tmem_replace(pool, &xh.oid, xh.index, ++ pampd_make_remote(remotenode, size, cksum)); ++ local_irq_restore(flags); ++ zcache_put_pool(pool); ++ local_bh_enable(); ++out: ++ return; ++} ++ ++static void zcache_do_remotify_ops(int nr) ++{ ++ struct ramster_remotify_hdr *rem_op; ++ union remotify_list_node *u; ++ ++ while (1) { ++ if (!nr) ++ goto out; ++ spin_lock(&zcache_rem_op_list_lock); ++ if (list_empty(&zcache_rem_op_list)) { ++ spin_unlock(&zcache_rem_op_list_lock); ++ goto out; ++ } ++ rem_op = list_first_entry(&zcache_rem_op_list, ++ struct ramster_remotify_hdr, list); ++ list_del_init(&rem_op->list); ++ if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT) ++ spin_unlock(&zcache_rem_op_list_lock); ++ u = (union remotify_list_node *)rem_op; ++ switch (rem_op->op) { ++ case RAMSTER_REMOTIFY_EPH_PUT: ++BUG(); ++ zcache_remote_eph_put((struct zbud_hdr *)rem_op); ++ break; ++ case RAMSTER_REMOTIFY_PERS_PUT: ++ zcache_remote_pers_put((struct zv_hdr *)rem_op); ++ break; ++ case RAMSTER_REMOTIFY_FLUSH_PAGE: ++ zcache_remote_flush_page((struct flushlist_node *)u); ++ break; ++ case RAMSTER_REMOTIFY_FLUSH_OBJ: ++ zcache_remote_flush_object((struct flushlist_node *)u); ++ break; ++ default: ++ BUG(); ++ } ++ } ++out: ++ return; ++} ++ ++/* ++ * For now, just push over a few pages every few seconds to ++ * ensure that it basically works ++ */ ++static struct workqueue_struct *ramster_remotify_workqueue; ++static void ramster_remotify_process(struct work_struct *work); ++static DECLARE_DELAYED_WORK(ramster_remotify_worker, ++ ramster_remotify_process); ++ ++static void 
ramster_remotify_queue_delayed_work(unsigned long delay) ++{ ++ if (!queue_delayed_work(ramster_remotify_workqueue, ++ &ramster_remotify_worker, delay)) ++ pr_err("ramster_remotify: bad workqueue\n"); ++} ++ ++ ++static int use_frontswap; ++static int use_cleancache; ++static void ramster_remotify_process(struct work_struct *work) ++{ ++ BUG_ON(irqs_disabled()); ++ ramster_remotify_queue_delayed_work(10 * HZ); ++#ifdef CONFIG_FRONTSWAP ++ if (use_frontswap && ramster_pers_remotify_enable) ++ zcache_do_remotify_ops(500); /* FIXME is this a good number? */ ++#endif ++#ifdef CONFIG_CLEANCACHE ++ if (use_cleancache && ramster_eph_remotify_enable) ++ zbud_remotify_pages(100); ++#endif ++} ++ ++static void ramster_remotify_init(void) ++{ ++ unsigned long n = 60UL; ++ ramster_remotify_workqueue = ++ create_singlethread_workqueue("ramster_remotify"); ++ ramster_remotify_queue_delayed_work(n * HZ); ++} ++ ++ + static void zbud_init(void) + { + int i; +@@ -631,15 +1092,6 @@ static int zbud_show_cumul_chunk_counts(char *buf) + * necessary for decompression) immediately preceding the compressed data. 
+ */ + +-#define ZVH_SENTINEL 0x43214321 +- +-struct zv_hdr { +- uint32_t pool_id; +- struct tmem_oid oid; +- uint32_t index; +- DECL_SENTINEL +-}; +- + /* rudimentary policy limits */ + /* total number of persistent pages may not exceed this percentage */ + static unsigned int zv_page_count_policy_percent = 75; +@@ -658,7 +1110,7 @@ static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; + static unsigned long zv_curr_dist_counts[NCHUNKS]; + static unsigned long zv_cumul_dist_counts[NCHUNKS]; + +-static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, ++static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id, + struct tmem_oid *oid, uint32_t index, + void *cdata, unsigned clen) + { +@@ -671,7 +1123,7 @@ static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, + + BUG_ON(!irqs_disabled()); + BUG_ON(chunks >= NCHUNKS); +- ret = xv_malloc(xvpool, alloc_size, ++ ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), + &page, &offset, ZCACHE_GFP_MASK); + if (unlikely(ret)) + goto out; +@@ -682,12 +1134,50 @@ static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id, + zv->oid = *oid; + zv->pool_id = pool_id; + SET_SENTINEL(zv, ZVH); ++ INIT_LIST_HEAD(&zv->rem_op.list); ++ zv->client_id = get_client_id_from_client(cli); ++ zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT; ++ if (zv->client_id == LOCAL_CLIENT) { ++ spin_lock(&zcache_rem_op_list_lock); ++ list_add_tail(&zv->rem_op.list, &zcache_rem_op_list); ++ spin_unlock(&zcache_rem_op_list_lock); ++ } + memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); + kunmap_atomic(zv, KM_USER0); + out: + return zv; + } + ++/* similar to zv_create, but just reserve space, no data yet */ ++static struct zv_hdr *zv_alloc(struct tmem_pool *pool, ++ struct tmem_oid *oid, uint32_t index, ++ unsigned clen) ++{ ++ struct zcache_client *cli = pool->client; ++ struct page *page; ++ struct zv_hdr *zv = NULL; ++ uint32_t offset; ++ int ret; ++ ++ 
BUG_ON(!irqs_disabled()); ++ BUG_ON(!is_local_client(pool->client)); ++ ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), ++ &page, &offset, ZCACHE_GFP_MASK); ++ if (unlikely(ret)) ++ goto out; ++ zv = kmap_atomic(page, KM_USER0) + offset; ++ SET_SENTINEL(zv, ZVH); ++ INIT_LIST_HEAD(&zv->rem_op.list); ++ zv->client_id = LOCAL_CLIENT; ++ zv->rem_op.op = RAMSTER_INTRANSIT_PERS; ++ zv->index = index; ++ zv->oid = *oid; ++ zv->pool_id = pool->pool_id; ++ kunmap_atomic(zv, KM_USER0); ++out: ++ return zv; ++} ++ + static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) + { + unsigned long flags; +@@ -700,8 +1190,13 @@ static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) + BUG_ON(chunks >= NCHUNKS); + zv_curr_dist_counts[chunks]--; + size -= sizeof(*zv); ++ spin_lock(&zcache_rem_op_list_lock); ++ size = xv_get_object_size(zv) - sizeof(*zv); + BUG_ON(size == 0); + INVERT_SENTINEL(zv, ZVH); ++ if (!list_empty(&zv->rem_op.list)) ++ list_del_init(&zv->rem_op.list); ++ spin_unlock(&zcache_rem_op_list_lock); + page = virt_to_page(zv); + offset = (unsigned long)zv & ~PAGE_MASK; + local_irq_save(flags); +@@ -727,6 +1222,29 @@ static void zv_decompress(struct page *page, struct zv_hdr *zv) + BUG_ON(clen != PAGE_SIZE); + } + ++static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv) ++{ ++ unsigned size; ++ ++ ASSERT_SENTINEL(zv, ZVH); ++ size = xv_get_object_size(zv) - sizeof(*zv); ++ BUG_ON(size == 0 || size > zv_max_page_size); ++ BUG_ON(size > *bufsize); ++ memcpy(data, (char *)zv + sizeof(*zv), size); ++ *bufsize = size; ++} ++ ++static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size) ++{ ++ unsigned zv_size; ++ ++ ASSERT_SENTINEL(zv, ZVH); ++ zv_size = xv_get_object_size(zv) - sizeof(*zv); ++ BUG_ON(zv_size != size); ++ BUG_ON(zv_size == 0 || zv_size > zv_max_page_size); ++ memcpy((char *)zv + sizeof(*zv), data, size); ++} ++ + #ifdef CONFIG_SYSFS + /* + * show a distribution of compression stats for zv pages. 
+@@ -979,6 +1497,7 @@ static DEFINE_SPINLOCK(zcache_direct_reclaim_lock); + */ + static struct kmem_cache *zcache_objnode_cache; + static struct kmem_cache *zcache_obj_cache; ++static struct kmem_cache *ramster_flnode_cache; + static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); + static unsigned long zcache_curr_obj_count_max; + static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); +@@ -994,6 +1513,7 @@ struct zcache_preload { + struct tmem_obj *obj; + int nr; + struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; ++ struct flushlist_node *flnode; + }; + static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; + +@@ -1002,6 +1522,7 @@ static int zcache_do_preload(struct tmem_pool *pool) + struct zcache_preload *kp; + struct tmem_objnode *objnode; + struct tmem_obj *obj; ++ struct flushlist_node *flnode; + void *page; + int ret = -ENOMEM; + +@@ -1009,10 +1530,6 @@ static int zcache_do_preload(struct tmem_pool *pool) + goto out; + if (unlikely(zcache_obj_cache == NULL)) + goto out; +- if (!spin_trylock(&zcache_direct_reclaim_lock)) { +- zcache_aborted_preload++; +- goto out; +- } + preempt_disable(); + kp = &__get_cpu_var(zcache_preloads); + while (kp->nr < ARRAY_SIZE(kp->objnodes)) { +@@ -1036,6 +1553,11 @@ static int zcache_do_preload(struct tmem_pool *pool) + zcache_failed_alloc++; + goto unlock_out; + } ++ flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK); ++ if (unlikely(flnode == NULL)) { ++ zcache_failed_alloc++; ++ goto unlock_out; ++ } + page = (void *)__get_free_page(ZCACHE_GFP_MASK); + if (unlikely(page == NULL)) { + zcache_failed_get_free_pages++; +@@ -1048,17 +1570,40 @@ static int zcache_do_preload(struct tmem_pool *pool) + kp->obj = obj; + else + kmem_cache_free(zcache_obj_cache, obj); ++ if (kp->flnode == NULL) ++ kp->flnode = flnode; ++ else ++ kmem_cache_free(ramster_flnode_cache, flnode); + if (kp->page == NULL) + kp->page = page; + else + free_page((unsigned long)page); + ret = 0; + unlock_out: +- 
spin_unlock(&zcache_direct_reclaim_lock); + out: + return ret; + } + ++static int ramster_do_preload_flnode_only(struct tmem_pool *pool) ++{ ++ struct zcache_preload *kp; ++ struct flushlist_node *flnode; ++ int ret = -ENOMEM; ++ ++ BUG_ON(!irqs_disabled()); ++ if (unlikely(ramster_flnode_cache == NULL)) ++ BUG(); ++ kp = &__get_cpu_var(zcache_preloads); ++ flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC); ++ if (unlikely(flnode == NULL) && kp->flnode == NULL) ++ BUG(); /* FIXME handle more gracefully, but how??? */ ++ else if (kp->flnode == NULL) ++ kp->flnode = flnode; ++ else ++ kmem_cache_free(ramster_flnode_cache, flnode); ++ return ret; ++} ++ + static void *zcache_get_free_page(void) + { + struct zcache_preload *kp; +@@ -1131,6 +1676,30 @@ static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) + kmem_cache_free(zcache_obj_cache, obj); + } + ++static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool) ++{ ++ struct flushlist_node *flnode = NULL; ++ struct zcache_preload *kp; ++ int count; ++ ++ kp = &__get_cpu_var(zcache_preloads); ++ flnode = kp->flnode; ++ BUG_ON(flnode == NULL); ++ kp->flnode = NULL; ++ count = atomic_inc_return(&ramster_curr_flnode_count); ++ if (count > ramster_curr_flnode_count_max) ++ ramster_curr_flnode_count_max = count; ++ return flnode; ++} ++ ++static void ramster_flnode_free(struct flushlist_node *flnode, ++ struct tmem_pool *pool) ++{ ++ atomic_dec(&ramster_curr_flnode_count); ++ BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0); ++ kmem_cache_free(ramster_flnode_cache, flnode); ++} ++ + static struct tmem_hostops zcache_hostops = { + .obj_alloc = zcache_obj_alloc, + .obj_free = zcache_obj_free, +@@ -1150,22 +1719,20 @@ static unsigned long zcache_curr_pers_pampd_count_max; + /* forward reference */ + static int zcache_compress(struct page *from, void **out_va, size_t *out_len); + +-static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, ++static int 
zcache_pampd_eph_create(char *data, size_t size, bool raw, + struct tmem_pool *pool, struct tmem_oid *oid, +- uint32_t index) ++ uint32_t index, void **pampd) + { +- void *pampd = NULL, *cdata; +- size_t clen; +- int ret; +- unsigned long count; +- struct page *page = (struct page *)(data); ++ int ret = -1; ++ void *cdata = data; ++ size_t clen = size; + struct zcache_client *cli = pool->client; + uint16_t client_id = get_client_id_from_client(cli); +- unsigned long zv_mean_zsize; +- unsigned long curr_pers_pampd_count; +- u64 total_zsize; ++ struct page *page = NULL; ++ unsigned long count; + +- if (eph) { ++ if (!raw) { ++ page = virt_to_page(data); + ret = zcache_compress(page, &cdata, &clen); + if (ret == 0) + goto out; +@@ -1173,46 +1740,137 @@ static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, + zcache_compress_poor++; + goto out; + } +- pampd = (void *)zbud_create(client_id, pool->pool_id, oid, +- index, page, cdata, clen); +- if (pampd != NULL) { +- count = atomic_inc_return(&zcache_curr_eph_pampd_count); +- if (count > zcache_curr_eph_pampd_count_max) +- zcache_curr_eph_pampd_count_max = count; +- } +- } else { +- curr_pers_pampd_count = +- atomic_read(&zcache_curr_pers_pampd_count); +- if (curr_pers_pampd_count > +- (zv_page_count_policy_percent * totalram_pages) / 100) +- goto out; +- ret = zcache_compress(page, &cdata, &clen); +- if (ret == 0) +- goto out; +- /* reject if compression is too poor */ +- if (clen > zv_max_zsize) { +- zcache_compress_poor++; ++ } ++ *pampd = (void *)zbud_create(client_id, pool->pool_id, oid, ++ index, page, cdata, clen); ++ if (*pampd == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ret = 0; ++ count = atomic_inc_return(&zcache_curr_eph_pampd_count); ++ if (count > zcache_curr_eph_pampd_count_max) ++ zcache_curr_eph_pampd_count_max = count; ++ if (client_id != LOCAL_CLIENT) { ++ count = atomic_inc_return(&ramster_foreign_eph_pampd_count); ++ if (count > ramster_foreign_eph_pampd_count_max) ++ 
ramster_foreign_eph_pampd_count_max = count; ++ } ++out: ++ return ret; ++} ++ ++static int zcache_pampd_pers_create(char *data, size_t size, bool raw, ++ struct tmem_pool *pool, struct tmem_oid *oid, ++ uint32_t index, void **pampd) ++{ ++ int ret = -1; ++ void *cdata = data; ++ size_t clen = size; ++ struct zcache_client *cli = pool->client; ++ struct page *page; ++ unsigned long count; ++ unsigned long zv_mean_zsize; ++ struct zv_hdr *zv; ++ long curr_pers_pampd_count; ++ u64 total_zsize; ++ ++ curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) - ++ atomic_read(&ramster_remote_pers_pages); ++ /* should always be positive, but warn if accounting is off */ ++ WARN_ON_ONCE(curr_pers_pampd_count < 0); ++ if (curr_pers_pampd_count > ++ (zv_page_count_policy_percent * totalram_pages) / 100) { ++ zcache_policy_percent_exceeded++; ++#if 0 ++{ ++static unsigned long cnt; ++cnt++; ++if (!(cnt&(cnt-1))) ++pr_err("TESTING zppc policy cnt=%lu, curr=%lu, limit=%lu, totalram=%lu\n", ++cnt, curr_pers_pampd_count, ++((zv_page_count_policy_percent * totalram_pages) / 100), totalram_pages); ++} ++#endif ++ goto out; ++ } ++ if (raw) ++ goto ok_to_create; ++ page = virt_to_page(data); ++ if (zcache_compress(page, &cdata, &clen) == 0) ++ goto out; ++ /* reject if compression is too poor */ ++ if (clen > zv_max_zsize) { ++ zcache_compress_poor++; ++ goto out; ++ } ++ /* reject if mean compression is too poor */ ++ if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { ++ total_zsize = xv_get_total_size_bytes(cli->xvpool); ++ zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count); ++ if (zv_mean_zsize > zv_max_mean_zsize) { ++ zcache_mean_compress_poor++; + goto out; + } +- /* reject if mean compression is too poor */ +- if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { +- total_zsize = xv_get_total_size_bytes(cli->xvpool); +- zv_mean_zsize = div_u64(total_zsize, +- curr_pers_pampd_count); +- if (zv_mean_zsize > zv_max_mean_zsize) { +- 
zcache_mean_compress_poor++; +- goto out; +- } +- } +- pampd = (void *)zv_create(cli->xvpool, pool->pool_id, +- oid, index, cdata, clen); +- if (pampd == NULL) +- goto out; +- count = atomic_inc_return(&zcache_curr_pers_pampd_count); +- if (count > zcache_curr_pers_pampd_count_max) +- zcache_curr_pers_pampd_count_max = count; + } ++ok_to_create: ++ *pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen); ++ if (*pampd == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ret = 0; ++ count = atomic_inc_return(&zcache_curr_pers_pampd_count); ++ if (count > zcache_curr_pers_pampd_count_max) ++ zcache_curr_pers_pampd_count_max = count; ++ if (is_local_client(cli)) ++ goto out; ++ zv = *(struct zv_hdr **)pampd; ++ count = atomic_inc_return(&ramster_foreign_pers_pampd_count); ++ if (count > ramster_foreign_pers_pampd_count_max) ++ ramster_foreign_pers_pampd_count_max = count; + out: ++#if 0 ++if (ret == -ENOMEM) { ++static unsigned long cnt, lclcnt, fgncnt; ++cnt++; ++if (is_local_client(pool->client)) ++ lclcnt++; ++else ++ fgncnt++; ++if (!(cnt&(cnt-1))) ++pr_err("TESTING zcache_pampd_create_PERS ENOMEM cnt=%lu, local=%lu, foreign=%lu, tot pampd_count=%lu, remote=%lu\n", cnt, lclcnt, fgncnt, (long)atomic_read(&zcache_curr_pers_pampd_count), (long)atomic_read(&ramster_remote_pers_pages)); ++} else if (ret < 0) { ++static unsigned long cnt, lclcnt, fgncnt; ++cnt++; ++if (is_local_client(pool->client)) ++ lclcnt++; ++else ++ fgncnt++; ++if (!(cnt&(cnt-1))) ++pr_err("TESTING zcache_pampd_create_PERS POLICYFAIL cnt=%lu, local=%lu, foreign=%lu, tot pampd_count=%lu, remote=%lu\n", cnt, lclcnt, fgncnt, (long)atomic_read(&zcache_curr_pers_pampd_count), (long)atomic_read(&ramster_remote_pers_pages)); ++} ++#endif ++ return ret; ++} ++ ++static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, ++ struct tmem_pool *pool, struct tmem_oid *oid, ++ uint32_t index) ++{ ++ void *pampd = NULL; ++ int ret; ++ bool ephemeral; ++ ++ BUG_ON(preemptible()); ++ 
ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool)); ++ if (ephemeral) ++ ret = zcache_pampd_eph_create(data, size, raw, pool, ++ oid, index, &pampd); ++ else ++ ret = zcache_pampd_pers_create(data, size, raw, pool, ++ oid, index, &pampd); ++ /* FIXME add some counters here for failed creates? */ + return pampd; + } + +@@ -1226,75 +1884,368 @@ static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, + { + int ret = 0; + +- BUG_ON(is_ephemeral(pool)); +- zv_decompress((struct page *)(data), pampd); ++ BUG_ON(preemptible()); ++ BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */ ++ BUG_ON(pampd_is_remote(pampd)); ++ if (raw) ++ zv_copy_from_pampd(data, bufsize, pampd); ++ else ++ zv_decompress(virt_to_page(data), pampd); + return ret; + } + +-/* +- * fill the pageframe corresponding to the struct page with the data +- * from the passed pampd +- */ + static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, + void *pampd, struct tmem_pool *pool, + struct tmem_oid *oid, uint32_t index) + { + int ret = 0; ++ unsigned long flags; ++ struct zcache_client *cli = pool->client; + +- BUG_ON(!is_ephemeral(pool)); +- zbud_decompress((struct page *)(data), pampd); +- zbud_free_and_delist((struct zbud_hdr *)pampd); +- atomic_dec(&zcache_curr_eph_pampd_count); ++ BUG_ON(preemptible()); ++ BUG_ON(pampd_is_remote(pampd)); ++ if (is_ephemeral(pool)) { ++ local_irq_save(flags); ++ if (raw) ++ zbud_copy_from_pampd(data, bufsize, pampd); ++ else ++ ret = zbud_decompress(virt_to_page(data), pampd); ++ zbud_free_and_delist((struct zbud_hdr *)pampd); ++ local_irq_restore(flags); ++ if (!is_local_client(cli)) { ++ atomic_dec(&ramster_foreign_eph_pampd_count); ++ WARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0); ++ } ++ atomic_dec(&zcache_curr_eph_pampd_count); ++ WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0); ++ } else { ++ if (is_local_client(cli)) ++ BUG(); ++ if (raw) ++ zv_copy_from_pampd(data, 
bufsize, pampd); ++ else ++ zv_decompress(virt_to_page(data), pampd); ++ zv_free(cli->xvpool, pampd); ++ if (!is_local_client(cli)) { ++ atomic_dec(&ramster_foreign_pers_pampd_count); ++ WARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0); ++ } ++ atomic_dec(&zcache_curr_pers_pampd_count); ++ WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0); ++ ret = 0; ++ } + return ret; + } + ++static bool zcache_pampd_is_remote(void *pampd) ++{ ++ return pampd_is_remote(pampd); ++} ++ + /* + * free the pampd and remove it from any zcache lists + * pampd must no longer be pointed to from any tmem data structures! + */ + static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, +- struct tmem_oid *oid, uint32_t index) ++ struct tmem_oid *oid, uint32_t index, bool acct) + { + struct zcache_client *cli = pool->client; +- +- if (is_ephemeral(pool)) { ++ bool eph = is_ephemeral(pool); ++ struct zv_hdr *zv; ++ ++ BUG_ON(preemptible()); ++ if (pampd_is_remote(pampd)) { ++ WARN_ON(acct == false); ++ if (oid == NULL) { ++ /* ++ * a NULL oid means to ignore this pampd free ++ * as the remote freeing will be handled elsewhere ++ */ ++ } else if (eph) { ++ /* FIXME remote flush optional but probably good idea */ ++ /* FIXME get these working properly again */ ++ atomic_dec(&zcache_curr_eph_pampd_count); ++ WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0); ++ } else if (pampd_is_intransit(pampd)) { ++ /* did a pers remote get_and_free, so just free local */ ++ pampd = pampd_mask_intransit_and_remote(pampd); ++ goto local_pers; ++ } else { ++ struct flushlist_node *flnode = ++ ramster_flnode_alloc(pool); ++ ++ flnode->xh.client_id = pampd_remote_node(pampd); ++ flnode->xh.pool_id = pool->pool_id; ++ flnode->xh.oid = *oid; ++ flnode->xh.index = index; ++ flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE; ++ spin_lock(&zcache_rem_op_list_lock); ++ list_add(&flnode->rem_op.list, &zcache_rem_op_list); ++ spin_unlock(&zcache_rem_op_list_lock); ++ 
atomic_dec(&zcache_curr_pers_pampd_count); ++ WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0); ++ atomic_dec(&ramster_remote_pers_pages); ++ WARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0); ++ } ++ } else if (eph) { + zbud_free_and_delist((struct zbud_hdr *)pampd); +- atomic_dec(&zcache_curr_eph_pampd_count); +- BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); ++ if (!is_local_client(pool->client)) { ++ atomic_dec(&ramster_foreign_eph_pampd_count); ++ WARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0); ++ } ++ if (acct) ++ atomic_dec(&zcache_curr_eph_pampd_count); ++ /* FIXME get these working properly again */ ++ WARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0); + } else { +- zv_free(cli->xvpool, (struct zv_hdr *)pampd); +- atomic_dec(&zcache_curr_pers_pampd_count); +- BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); ++local_pers: ++ zv = (struct zv_hdr *)pampd; ++ if (!is_local_client(pool->client)) { ++ atomic_dec(&ramster_foreign_pers_pampd_count); ++ WARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0); ++ } ++ zv_free(cli->xvpool, zv); ++ if (acct) ++ atomic_dec(&zcache_curr_pers_pampd_count); ++ /* FIXME get these working properly again */ ++ WARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0); + } + } + +-static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj) ++static void zcache_pampd_free_obj(struct tmem_pool *pool, ++ struct tmem_obj *obj) + { ++ struct flushlist_node *flnode; ++ ++ BUG_ON(preemptible()); ++ if (obj->extra == NULL) ++ return; ++ BUG_ON(!pampd_is_remote(obj->extra)); ++ flnode = ramster_flnode_alloc(pool); ++ flnode->xh.client_id = pampd_remote_node(obj->extra); ++ flnode->xh.pool_id = pool->pool_id; ++ flnode->xh.oid = obj->oid; ++ flnode->xh.index = FLUSH_ENTIRE_OBJECT; ++ flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ; ++ spin_lock(&zcache_rem_op_list_lock); ++ list_add(&flnode->rem_op.list, &zcache_rem_op_list); ++ 
spin_unlock(&zcache_rem_op_list_lock); + } + +-static void zcache_pampd_new_obj(struct tmem_obj *obj) ++void zcache_pampd_new_obj(struct tmem_obj *obj) + { ++ obj->extra = NULL; + } + +-static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj) ++int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj) + { +- return -1; ++ int ret = -1; ++ ++ if (new_pampd != NULL) { ++ if (obj->extra == NULL) ++ obj->extra = new_pampd; ++ /* enforce that all remote pages in an object reside ++ * in the same node! */ ++ else if (pampd_remote_node(new_pampd) != ++ pampd_remote_node((void *)(obj->extra))) ++ BUG(); ++ ret = 0; ++ } ++ return ret; + } + +-static bool zcache_pampd_is_remote(void *pampd) ++/* ++ * Called by the message handler after a (still compressed) page has been ++ * fetched from the remote machine in response to an "is_remote" tmem_get ++ * or persistent tmem_localify. For a tmem_get, "extra" is the address of ++ * the page that is to be filled to succesfully resolve the tmem_get; for ++ * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only ++ * in the local zcache). "data" points to "size" bytes of (compressed) data ++ * passed in the message. In the case of a persistent remote get, if ++ * pre-allocation was successful (see zcache_repatriate_preload), the page ++ * is placed into both local zcache and at "extra". ++ */ ++int zcache_localify(int pool_id, struct tmem_oid *oidp, ++ uint32_t index, char *data, size_t size, ++ void *extra) + { +- return 0; ++ int ret = -ENOENT; ++ unsigned long flags; ++ struct tmem_pool *pool; ++ bool ephemeral, delete = false; ++ size_t clen = PAGE_SIZE; ++ void *pampd, *saved_hb; ++ struct tmem_obj *obj; ++ ++ pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id); ++ if (unlikely(pool == NULL)) ++ /* pool doesn't exist anymore */ ++ goto out; ++ ephemeral = is_ephemeral(pool); ++ local_irq_save(flags); /* FIXME: maybe only disable softirqs? 
*/ ++ pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb); ++ if (pampd == NULL) { ++ /* hmmm... must have been a flush while waiting */ ++#if 1 ++ pr_err("UNTESTED pampd==NULL in zcache_localify\n"); ++#endif ++ if (ephemeral) ++ ramster_remote_eph_pages_unsucc_get++; ++ else ++ ramster_remote_pers_pages_unsucc_get++; ++ obj = NULL; ++ goto finish; ++ } else if (unlikely(!pampd_is_remote(pampd))) { ++ /* hmmm... must have been a dup put while waiting */ ++#if 1 ++ pr_err("UNTESTED dup while waiting in zcache_localify\n"); ++#endif ++ if (ephemeral) ++ ramster_remote_eph_pages_unsucc_get++; ++ else ++ ramster_remote_pers_pages_unsucc_get++; ++ obj = NULL; ++ pampd = NULL; ++ ret = -EEXIST; ++ goto finish; ++ } else if (size == 0) { ++ /* no remote data, delete the local is_remote pampd */ ++ pampd = NULL; ++ if (ephemeral) ++ ramster_remote_eph_pages_unsucc_get++; ++ else ++ BUG(); ++ delete = true; ++ goto finish; ++ } ++ if (!ephemeral && pampd_is_intransit(pampd)) { ++ /* localify to zcache */ ++ pampd = pampd_mask_intransit_and_remote(pampd); ++ zv_copy_to_pampd(pampd, data, size); ++ } else { ++ pampd = NULL; ++ obj = NULL; ++ } ++ if (extra != NULL) { ++ /* decompress direct-to-memory to complete remotify */ ++ ret = lzo1x_decompress_safe((char *)data, size, ++ (char *)extra, &clen); ++ BUG_ON(ret != LZO_E_OK); ++ BUG_ON(clen != PAGE_SIZE); ++ } ++ if (ephemeral) ++ ramster_remote_eph_pages_succ_get++; ++ else ++ ramster_remote_pers_pages_succ_get++; ++ ret = 0; ++finish: ++ tmem_localify_finish(obj, index, pampd, saved_hb, delete); ++ zcache_put_pool(pool); ++ local_irq_restore(flags); ++out: ++ return ret; ++} ++ ++/* ++ * Called on a remote persistent tmem_get to attempt to preallocate ++ * local storage for the data contained in the remote persistent page. ++ * If succesfully preallocated, returns the pampd, marked as remote and ++ * in_transit. Else returns NULL. Note that the appropriate tmem data ++ * structure must be locked. 
++ */ ++static void *zcache_pampd_repatriate_preload(void *pampd, ++ struct tmem_pool *pool, ++ struct tmem_oid *oid, ++ uint32_t index, ++ bool *intransit) ++{ ++ int clen = pampd_remote_size(pampd); ++ void *ret_pampd = NULL; ++ unsigned long flags; ++ ++ if (!pampd_is_remote(pampd)) ++ BUG(); ++ if (is_ephemeral(pool)) ++ BUG(); ++ if (pampd_is_intransit(pampd)) { ++ /* ++ * to avoid multiple allocations (and maybe a memory leak) ++ * don't preallocate if already in the process of being ++ * repatriated ++ */ ++ *intransit = true; ++ goto out; ++ } ++#if 0 ++{ ++static unsigned long cnt; ++cnt++; ++if (!(cnt&(cnt-1))) ++pr_err("TESTING zcache_pampd_repat_preload, size=%d, cksum=??, cnt=%lu\n", ++clen, cnt); ++} ++#endif ++ *intransit = false; ++ local_irq_save(flags); ++ ret_pampd = (void *)zv_alloc(pool, oid, index, clen); ++ if (ret_pampd != NULL) { ++ /* ++ * a pampd is marked intransit if it is remote and space has ++ * been allocated for it locally (note, only happens for ++ * persistent pages, in which case the remote copy is freed) ++ */ ++ ret_pampd = pampd_mark_intransit(ret_pampd); ++ atomic_dec(&ramster_remote_pers_pages); ++ WARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0); ++ } else ++ ramster_pers_pages_remote_nomem++; ++ local_irq_restore(flags); ++out: ++ return ret_pampd; ++} ++ ++/* ++ * Called on a remote tmem_get to invoke a message to fetch the page. ++ * Might sleep so no tmem locks can be held. "extra" is passed ++ * all the way through the round-trip messaging to zcache_localify. 
++ */ ++static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd, ++ struct tmem_pool *pool, ++ struct tmem_oid *oid, uint32_t index, ++ bool free, void *extra) ++{ ++ struct tmem_xhandle xh; ++ int ret; ++ ++ if (pampd_is_intransit(real_pampd)) ++ /* have local space pre-reserved, so free remote copy */ ++ free = true; ++ xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index); ++ /* unreliable request/response for now */ ++ ret = ramster_remote_async_get(&xh, free, ++ pampd_remote_node(fake_pampd), ++ pampd_remote_size(fake_pampd), ++ pampd_remote_cksum(fake_pampd), ++ extra); ++#if 1 ++ if (ret != 0 && ret != -ENOENT) ++ pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n", ++ ret); ++#endif ++ return ret; + } + + static struct tmem_pamops zcache_pamops = { + .create = zcache_pampd_create, + .get_data = zcache_pampd_get_data, +- .get_data_and_free = zcache_pampd_get_data_and_free, + .free = zcache_pampd_free, ++ .get_data_and_free = zcache_pampd_get_data_and_free, + .free_obj = zcache_pampd_free_obj, ++ .is_remote = zcache_pampd_is_remote, ++ .repatriate_preload = zcache_pampd_repatriate_preload, ++ .repatriate = zcache_pampd_repatriate, + .new_obj = zcache_pampd_new_obj, + .replace_in_obj = zcache_pampd_replace_in_obj, +- .is_remote = zcache_pampd_is_remote, + }; + + /* +@@ -1342,9 +2293,13 @@ static int zcache_cpu_notifier(struct notifier_block *nb, + per_cpu(zcache_workmem, cpu) = + kzalloc(LZO1X_MEM_COMPRESS, + GFP_KERNEL | __GFP_REPEAT); ++ per_cpu(zcache_remoteputmem, cpu) = ++ kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT); + break; + case CPU_DEAD: + case CPU_UP_CANCELED: ++ kfree(per_cpu(zcache_remoteputmem, cpu)); ++ per_cpu(zcache_remoteputmem, cpu) = NULL; + free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), + LZO_DSTMEM_PAGE_ORDER); + per_cpu(zcache_dstmem, cpu) = NULL; +@@ -1427,6 +2382,7 @@ ZCACHE_SYSFS_RO(aborted_preload); + ZCACHE_SYSFS_RO(aborted_shrink); + ZCACHE_SYSFS_RO(compress_poor); + 
ZCACHE_SYSFS_RO(mean_compress_poor); ++ZCACHE_SYSFS_RO(policy_percent_exceeded); + ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); + ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); + ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); +@@ -1451,6 +2407,7 @@ static struct attribute *zcache_attrs[] = { + &zcache_flobj_found_attr.attr, + &zcache_failed_eph_puts_attr.attr, + &zcache_failed_pers_puts_attr.attr, ++ &zcache_policy_percent_exceeded_attr.attr, + &zcache_compress_poor_attr.attr, + &zcache_mean_compress_poor_attr.attr, + &zcache_zbud_curr_raw_pages_attr.attr, +@@ -1483,6 +2440,151 @@ static struct attribute_group zcache_attr_group = { + .name = "zcache", + }; + ++#define RAMSTER_SYSFS_RO(_name) \ ++ static ssize_t ramster_##_name##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++ { \ ++ return sprintf(buf, "%lu\n", ramster_##_name); \ ++ } \ ++ static struct kobj_attribute ramster_##_name##_attr = { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = ramster_##_name##_show, \ ++ } ++ ++#define RAMSTER_SYSFS_RW(_name) \ ++ static ssize_t ramster_##_name##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++ { \ ++ return sprintf(buf, "%lu\n", ramster_##_name); \ ++ } \ ++ static ssize_t ramster_##_name##_store(struct kobject *kobj, \ ++ struct kobj_attribute *attr, const char *buf, size_t count) \ ++ { \ ++ int err; \ ++ unsigned long enable; \ ++ err = strict_strtoul(buf, 10, &enable); \ ++ if (err) \ ++ return -EINVAL; \ ++ ramster_##_name = enable; \ ++ return count; \ ++ } \ ++ static struct kobj_attribute ramster_##_name##_attr = { \ ++ .attr = { .name = __stringify(_name), .mode = 0644 }, \ ++ .show = ramster_##_name##_show, \ ++ .store = ramster_##_name##_store, \ ++ } ++ ++#define RAMSTER_SYSFS_RO_ATOMIC(_name) \ ++ static ssize_t ramster_##_name##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++ { \ ++ return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \ ++ } \ ++ 
static struct kobj_attribute ramster_##_name##_attr = { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = ramster_##_name##_show, \ ++ } ++ ++RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages); ++RAMSTER_SYSFS_RW(pers_remotify_enable); ++RAMSTER_SYSFS_RW(eph_remotify_enable); ++RAMSTER_SYSFS_RO(eph_pages_remoted); ++RAMSTER_SYSFS_RO(eph_pages_remote_failed); ++RAMSTER_SYSFS_RO(pers_pages_remoted); ++RAMSTER_SYSFS_RO(pers_pages_remote_failed); ++RAMSTER_SYSFS_RO(pers_pages_remote_nomem); ++RAMSTER_SYSFS_RO(remote_pages_flushed); ++RAMSTER_SYSFS_RO(remote_page_flushes_failed); ++RAMSTER_SYSFS_RO(remote_objects_flushed); ++RAMSTER_SYSFS_RO(remote_object_flushes_failed); ++RAMSTER_SYSFS_RO(remote_eph_pages_succ_get); ++RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get); ++RAMSTER_SYSFS_RO(remote_pers_pages_succ_get); ++RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get); ++RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count); ++RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max); ++RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count); ++RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max); ++RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count); ++RAMSTER_SYSFS_RO(curr_flnode_count_max); ++ ++#define MANUAL_NODES 8 ++static bool ramster_nodes_manual_up[MANUAL_NODES]; ++static ssize_t ramster_manual_node_up_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i; ++ char *p = buf; ++ for (i = 0; i < MANUAL_NODES; i++) ++ if (ramster_nodes_manual_up[i]) ++ p += sprintf(p, "%d ", i); ++ p += sprintf(p, "\n"); ++ return p - buf; ++} ++ ++static ssize_t ramster_manual_node_up_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, size_t count) ++{ ++ int err; ++ unsigned long node_num; ++ extern void o2net_hb_node_up_manual(int); ++ ++ err = strict_strtoul(buf, 10, &node_num); ++ if (err) { ++ pr_err("bad strtoul?\n"); ++ return -EINVAL; ++ } ++ if (node_num >= MANUAL_NODES) { ++ pr_err("bad node_num=%lu?\n", node_num); ++ return -EINVAL; ++ } ++ if 
(ramster_nodes_manual_up[node_num]) { ++ pr_err("node %d already up, ignoring\n", (int)node_num); ++ } else { ++ ramster_nodes_manual_up[node_num] = true; ++ o2net_hb_node_up_manual((int)node_num); ++ } ++ return count; ++} ++ ++static struct kobj_attribute ramster_manual_node_up_attr = { ++ .attr = { .name = "manual_node_up", .mode = 0644 }, ++ .show = ramster_manual_node_up_show, ++ .store = ramster_manual_node_up_store, ++}; ++ ++static struct attribute *ramster_attrs[] = { ++ &ramster_pers_remotify_enable_attr.attr, ++ &ramster_eph_remotify_enable_attr.attr, ++ &ramster_remote_pers_pages_attr.attr, ++ &ramster_eph_pages_remoted_attr.attr, ++ &ramster_eph_pages_remote_failed_attr.attr, ++ &ramster_pers_pages_remoted_attr.attr, ++ &ramster_pers_pages_remote_failed_attr.attr, ++ &ramster_pers_pages_remote_nomem_attr.attr, ++ &ramster_remote_pages_flushed_attr.attr, ++ &ramster_remote_page_flushes_failed_attr.attr, ++ &ramster_remote_objects_flushed_attr.attr, ++ &ramster_remote_object_flushes_failed_attr.attr, ++ &ramster_remote_eph_pages_succ_get_attr.attr, ++ &ramster_remote_eph_pages_unsucc_get_attr.attr, ++ &ramster_remote_pers_pages_succ_get_attr.attr, ++ &ramster_remote_pers_pages_unsucc_get_attr.attr, ++ &ramster_foreign_eph_pampd_count_attr.attr, ++ &ramster_foreign_eph_pampd_count_max_attr.attr, ++ &ramster_foreign_pers_pampd_count_attr.attr, ++ &ramster_foreign_pers_pampd_count_max_attr.attr, ++ &ramster_curr_flnode_count_attr.attr, ++ &ramster_curr_flnode_count_max_attr.attr, ++ &ramster_manual_node_up_attr.attr, ++ NULL, ++}; ++ ++static struct attribute_group ramster_attr_group = { ++ .attrs = ramster_attrs, ++ .name = "ramster", ++}; ++ + #endif /* CONFIG_SYSFS */ + /* + * When zcache is disabled ("frozen"), pools can be created and destroyed, +@@ -1527,8 +2629,9 @@ static struct shrinker zcache_shrinker = { + * zcache shims between cleancache/frontswap ops and tmem + */ + +-static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, 
+- uint32_t index, struct page *page) ++int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp, ++ uint32_t index, char *data, size_t size, ++ bool raw, int ephemeral) + { + struct tmem_pool *pool; + int ret = -1; +@@ -1539,8 +2642,7 @@ static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, + goto out; + if (!zcache_freeze && zcache_do_preload(pool) == 0) { + /* preload does preempt_disable on success */ +- ret = tmem_put(pool, oidp, index, (char *)(page), +- PAGE_SIZE, 0, is_ephemeral(pool)); ++ ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral); + if (ret < 0) { + if (is_ephemeral(pool)) + zcache_failed_eph_puts++; +@@ -1560,27 +2662,40 @@ out: + return ret; + } + +-static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, +- uint32_t index, struct page *page) ++int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp, ++ uint32_t index, char *data, size_t *sizep, ++ bool raw, int get_and_free) + { + struct tmem_pool *pool; + int ret = -1; +- unsigned long flags; +- size_t size = PAGE_SIZE; ++ bool eph; + +- local_irq_save(flags); ++ if (!raw) { ++ BUG_ON(irqs_disabled()); ++ BUG_ON(in_softirq()); ++ } + pool = zcache_get_pool_by_id(cli_id, pool_id); ++ eph = is_ephemeral(pool); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) +- ret = tmem_get(pool, oidp, index, (char *)(page), +- &size, 0, is_ephemeral(pool)); ++ ret = tmem_get(pool, oidp, index, data, sizep, ++ raw, get_and_free); + zcache_put_pool(pool); + } +- local_irq_restore(flags); ++ WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, " ++ "bad things are very likely to happen soon\n"); ++#if 1 ++{ ++if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool))) ++pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret); ++} ++#endif ++ if (ret == -EAGAIN) ++ BUG(); /* FIXME... don't need this anymore??? 
let's ensure */ + return ret; + } + +-static int zcache_flush_page(int cli_id, int pool_id, ++int zcache_flush(int cli_id, int pool_id, + struct tmem_oid *oidp, uint32_t index) + { + struct tmem_pool *pool; +@@ -1590,6 +2705,7 @@ static int zcache_flush_page(int cli_id, int pool_id, + local_irq_save(flags); + zcache_flush_total++; + pool = zcache_get_pool_by_id(cli_id, pool_id); ++ ramster_do_preload_flnode_only(pool); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_page(pool, oidp, index); +@@ -1601,8 +2717,7 @@ static int zcache_flush_page(int cli_id, int pool_id, + return ret; + } + +-static int zcache_flush_object(int cli_id, int pool_id, +- struct tmem_oid *oidp) ++int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp) + { + struct tmem_pool *pool; + int ret = -1; +@@ -1611,6 +2726,7 @@ static int zcache_flush_object(int cli_id, int pool_id, + local_irq_save(flags); + zcache_flobj_total++; + pool = zcache_get_pool_by_id(cli_id, pool_id); ++ ramster_do_preload_flnode_only(pool); + if (likely(pool != NULL)) { + if (atomic_read(&pool->obj_count) > 0) + ret = tmem_flush_object(pool, oidp); +@@ -1622,7 +2738,7 @@ static int zcache_flush_object(int cli_id, int pool_id, + return ret; + } + +-static int zcache_destroy_pool(int cli_id, int pool_id) ++int zcache_client_destroy_pool(int cli_id, int pool_id) + { + struct tmem_pool *pool = NULL; + struct zcache_client *cli = NULL; +@@ -1649,13 +2765,17 @@ static int zcache_destroy_pool(int cli_id, int pool_id) + ret = tmem_destroy_pool(pool); + local_bh_enable(); + kfree(pool); +- pr_info("zcache: destroyed pool id=%d, cli_id=%d\n", +- pool_id, cli_id); ++ pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id); + out: + return ret; + } + +-static int zcache_new_pool(uint16_t cli_id, uint32_t flags) ++static int zcache_destroy_pool(int pool_id) ++{ ++ return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id); ++} ++ ++int zcache_new_pool(uint16_t 
cli_id, uint32_t flags) + { + int poolid = -1; + struct tmem_pool *pool; +@@ -1670,7 +2790,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags) + atomic_inc(&cli->refcount); + pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); + if (pool == NULL) { +- pr_info("zcache: pool creation failed: out of memory\n"); ++ pr_info("ramster: pool creation failed: out of memory\n"); + goto out; + } + +@@ -1678,7 +2798,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags) + if (cli->tmem_pools[poolid] == NULL) + break; + if (poolid >= MAX_POOLS_PER_CLIENT) { +- pr_info("zcache: pool creation failed: max exceeded\n"); ++ pr_info("ramster: pool creation failed: max exceeded\n"); + kfree(pool); + poolid = -1; + goto out; +@@ -1688,7 +2808,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags) + pool->pool_id = poolid; + tmem_new_pool(pool, flags); + cli->tmem_pools[poolid] = pool; +- pr_info("zcache: created %s tmem pool, id=%d, client=%d\n", ++ pr_info("ramster: created %s tmem pool, id=%d, client=%d\n", + flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", + poolid, cli_id); + out: +@@ -1697,6 +2817,64 @@ out: + return poolid; + } + ++static int zcache_local_new_pool(uint32_t flags) ++{ ++ return zcache_new_pool(LOCAL_CLIENT, flags); ++} ++ ++int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral) ++{ ++ struct tmem_pool *pool; ++ struct zcache_client *cli = NULL; ++ uint32_t flags = ephemeral ? 
0 : TMEM_POOL_PERSIST; ++ int ret = -1; ++ ++ if (cli_id == LOCAL_CLIENT) ++ goto out; ++ if (pool_id >= MAX_POOLS_PER_CLIENT) ++ goto out; ++ else if ((unsigned int)cli_id < MAX_CLIENTS) ++ cli = &zcache_clients[cli_id]; ++ if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap)) ++ BUG(); /* FIXME, handle more gracefully later */ ++ if (!cli->allocated) { ++ if (zcache_new_client(cli_id)) ++ BUG(); /* FIXME, handle more gracefully later */ ++ cli = &zcache_clients[cli_id]; ++ } ++ atomic_inc(&cli->refcount); ++ pool = cli->tmem_pools[pool_id]; ++ if (pool != NULL) { ++ if (pool->persistent && ephemeral) { ++ pr_err("zcache_autocreate_pool: type mismatch\n"); ++ goto out; ++ } ++ ret = 0; ++ goto out; ++ } ++ pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); ++ if (pool == NULL) { ++ pr_info("ramster: pool creation failed: out of memory\n"); ++ goto out; ++ } ++ atomic_set(&pool->refcount, 0); ++ pool->client = cli; ++ pool->pool_id = pool_id; ++ tmem_new_pool(pool, flags); ++ cli->tmem_pools[pool_id] = pool; ++ pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", ++ flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", ++ pool_id, cli_id); ++ ret = 0; ++out: ++ if (cli == NULL) ++ BUG(); /* FIXME, handle more gracefully later */ ++ /* pr_err("zcache_autocreate_pool: failed\n"); */ ++ if (cli != NULL) ++ atomic_dec(&cli->refcount); ++ return ret; ++} ++ + /********** + * Two kernel functionalities currently can be layered on top of tmem. 
+ * These are "cleancache" which is used as a second-chance cache for clean +@@ -1713,8 +2891,12 @@ static void zcache_cleancache_put_page(int pool_id, + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + +- if (likely(ind == index)) +- (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page); ++ if (likely(ind == index)) { ++ char *kva = page_address(page); ++ ++ (void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index, ++ kva, PAGE_SIZE, 0, 1); ++ } + } + + static int zcache_cleancache_get_page(int pool_id, +@@ -1725,8 +2907,15 @@ static int zcache_cleancache_get_page(int pool_id, + struct tmem_oid oid = *(struct tmem_oid *)&key; + int ret = -1; + +- if (likely(ind == index)) +- ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page); ++ preempt_disable(); ++ if (likely(ind == index)) { ++ char *kva = page_address(page); ++ size_t size = PAGE_SIZE; ++ ++ ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index, ++ kva, &size, 0, 0); ++ } ++ preempt_enable(); + return ret; + } + +@@ -1738,7 +2927,7 @@ static void zcache_cleancache_flush_page(int pool_id, + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (likely(ind == index)) +- (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind); ++ (void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind); + } + + static void zcache_cleancache_flush_inode(int pool_id, +@@ -1752,7 +2941,7 @@ static void zcache_cleancache_flush_inode(int pool_id, + static void zcache_cleancache_flush_fs(int pool_id) + { + if (pool_id >= 0) +- (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id); ++ (void)zcache_destroy_pool(pool_id); + } + + static int zcache_cleancache_init_fs(size_t pagesize) +@@ -1760,7 +2949,7 @@ static int zcache_cleancache_init_fs(size_t pagesize) + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); +- return zcache_new_pool(LOCAL_CLIENT, 0); ++ return zcache_local_new_pool(0); + } + + static int zcache_cleancache_init_shared_fs(char 
*uuid, size_t pagesize) +@@ -1769,7 +2958,7 @@ static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) + BUG_ON(sizeof(struct cleancache_filekey) != + sizeof(struct tmem_oid)); + BUG_ON(pagesize != PAGE_SIZE); +- return zcache_new_pool(LOCAL_CLIENT, 0); ++ return zcache_local_new_pool(0); + } + + static struct cleancache_ops zcache_cleancache_ops = { +@@ -1799,7 +2988,7 @@ static int zcache_frontswap_poolid = -1; + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS + */ +-#define SWIZ_BITS 4 ++#define SWIZ_BITS 8 + #define SWIZ_MASK ((1 << SWIZ_BITS) - 1) + #define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) + #define iswiz(_ind) (_ind >> SWIZ_BITS) +@@ -1819,12 +3008,14 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + unsigned long flags; ++ char *kva; + + BUG_ON(!PageLocked(page)); + if (likely(ind64 == ind)) { + local_irq_save(flags); +- ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, +- &oid, iswiz(ind), page); ++ kva = page_address(page); ++ ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid, ++ &oid, iswiz(ind), kva, PAGE_SIZE, 0, 0); + local_irq_restore(flags); + } + return ret; +@@ -1840,10 +3031,16 @@ static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, + struct tmem_oid oid = oswiz(type, ind); + int ret = -1; + ++ preempt_disable(); /* FIXME, remove this? */ + BUG_ON(!PageLocked(page)); +- if (likely(ind64 == ind)) +- ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, +- &oid, iswiz(ind), page); ++ if (likely(ind64 == ind)) { ++ char *kva = page_address(page); ++ size_t size = PAGE_SIZE; ++ ++ ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid, ++ &oid, iswiz(ind), kva, &size, 0, -1); ++ } ++ preempt_enable(); /* FIXME, remove this? 
*/ + return ret; + } + +@@ -1855,7 +3052,7 @@ static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) + struct tmem_oid oid = oswiz(type, ind); + + if (likely(ind64 == ind)) +- (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid, ++ (void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid, + &oid, iswiz(ind)); + } + +@@ -1877,7 +3074,7 @@ static void zcache_frontswap_init(unsigned ignored) + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (zcache_frontswap_poolid < 0) + zcache_frontswap_poolid = +- zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST); ++ zcache_local_new_pool(TMEM_POOL_PERSIST); + } + + static struct frontswap_ops zcache_frontswap_ops = { +@@ -1898,19 +3095,125 @@ struct frontswap_ops zcache_frontswap_register_ops(void) + #endif + + /* ++ * frontswap selfshrinking ++ */ ++ ++#ifdef CONFIG_FRONTSWAP ++/* In HZ, controls frequency of worker invocation. */ ++static unsigned int selfshrink_interval __read_mostly = 5; ++ ++static void selfshrink_process(struct work_struct *work); ++static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process); ++ ++/* Enable/disable with sysfs. */ ++static bool frontswap_selfshrinking __read_mostly; ++ ++/* Enable/disable with kernel boot option. */ ++static bool use_frontswap_selfshrink __initdata = true; ++ ++/* ++ * The default values for the following parameters were deemed reasonable ++ * by experimentation, may be workload-dependent, and can all be ++ * adjusted via sysfs. ++ */ ++ ++/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ ++static unsigned int frontswap_hysteresis __read_mostly = 20; ++ ++/* ++ * Number of selfshrink worker invocations to wait before observing that ++ * frontswap selfshrinking should commence. Note that selfshrinking does ++ * not use a separate worker thread. 
++ */ ++static unsigned int frontswap_inertia __read_mostly = 3; ++ ++/* Countdown to next invocation of frontswap_shrink() */ ++static unsigned long frontswap_inertia_counter; ++ ++/* ++ * Invoked by the selfshrink worker thread, uses current number of pages ++ * in frontswap (frontswap_curr_pages()), previous status, and control ++ * values (hysteresis and inertia) to determine if frontswap should be ++ * shrunk and what the new frontswap size should be. Note that ++ * frontswap_shrink is essentially a partial swapoff that immediately ++ * transfers pages from the "swap device" (frontswap) back into kernel ++ * RAM; despite the name, frontswap "shrinking" is very different from ++ * the "shrinker" interface used by the kernel MM subsystem to reclaim ++ * memory. ++ */ ++static void frontswap_selfshrink(void) ++{ ++ static unsigned long cur_frontswap_pages; ++ static unsigned long last_frontswap_pages; ++ static unsigned long tgt_frontswap_pages; ++ ++ last_frontswap_pages = cur_frontswap_pages; ++ cur_frontswap_pages = frontswap_curr_pages(); ++ if (!cur_frontswap_pages || ++ (cur_frontswap_pages > last_frontswap_pages)) { ++ frontswap_inertia_counter = frontswap_inertia; ++ return; ++ } ++ if (frontswap_inertia_counter && --frontswap_inertia_counter) ++ return; ++ if (cur_frontswap_pages <= frontswap_hysteresis) ++ tgt_frontswap_pages = 0; ++ else ++ tgt_frontswap_pages = cur_frontswap_pages - ++ (cur_frontswap_pages / frontswap_hysteresis); ++ frontswap_shrink(tgt_frontswap_pages); ++} ++ ++static int __init ramster_nofrontswap_selfshrink_setup(char *s) ++{ ++ use_frontswap_selfshrink = false; ++ return 1; ++} ++ ++__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup); ++ ++static void selfshrink_process(struct work_struct *work) ++{ ++ if (frontswap_selfshrinking && frontswap_enabled) { ++ frontswap_selfshrink(); ++ schedule_delayed_work(&selfshrink_worker, ++ selfshrink_interval * HZ); ++ } ++} ++ ++static int ramster_enabled; ++ ++static int __init 
ramster_selfshrink_init(void) ++{ ++ frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink; ++ if (frontswap_selfshrinking) ++ pr_info("ramster: Initializing frontswap " ++ "selfshrinking driver.\n"); ++ else ++ return -ENODEV; ++ ++ schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ); ++ ++ return 0; ++} ++ ++subsys_initcall(ramster_selfshrink_init); ++#endif ++ ++/* + * zcache initialization +- * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR ++ * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR + * NOTHING HAPPENS! + */ + +-static int zcache_enabled; ++static int ramster_enabled; + +-static int __init enable_zcache(char *s) ++static int __init enable_ramster(char *s) + { +- zcache_enabled = 1; ++ ramster_enabled = 1; + return 1; + } +-__setup("zcache", enable_zcache); ++__setup("ramster", enable_ramster); + + /* allow independent dynamic disabling of cleancache and frontswap */ + +@@ -1918,16 +3221,22 @@ static int use_cleancache = 1; + + static int __init no_cleancache(char *s) + { ++ pr_info("INIT no_cleancache called\n"); + use_cleancache = 0; + return 1; + } + +-__setup("nocleancache", no_cleancache); ++/* ++ * FIXME: need to guarantee this gets checked before zcache_init is called ++ * What is the correct way to achieve this? 
++ */ ++early_param("nocleancache", no_cleancache); + + static int use_frontswap = 1; + + static int __init no_frontswap(char *s) + { ++ pr_info("INIT no_frontswap called\n"); + use_frontswap = 0; + return 1; + } +@@ -1940,20 +3249,22 @@ static int __init zcache_init(void) + + #ifdef CONFIG_SYSFS + ret = sysfs_create_group(mm_kobj, &zcache_attr_group); ++ ret = sysfs_create_group(mm_kobj, &ramster_attr_group); + if (ret) { +- pr_err("zcache: can't create sysfs\n"); ++ pr_err("ramster: can't create sysfs\n"); + goto out; + } + #endif /* CONFIG_SYSFS */ + #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) +- if (zcache_enabled) { ++ if (ramster_enabled) { + unsigned int cpu; + ++ (void)ramster_o2net_register_handlers(); + tmem_register_hostops(&zcache_hostops); + tmem_register_pamops(&zcache_pamops); + ret = register_cpu_notifier(&zcache_cpu_notifier_block); + if (ret) { +- pr_err("zcache: can't register cpu notifier\n"); ++ pr_err("ramster: can't register cpu notifier\n"); + goto out; + } + for_each_online_cpu(cpu) { +@@ -1966,35 +3277,39 @@ static int __init zcache_init(void) + sizeof(struct tmem_objnode), 0, 0, NULL); + zcache_obj_cache = kmem_cache_create("zcache_obj", + sizeof(struct tmem_obj), 0, 0, NULL); +- ret = zcache_new_client(LOCAL_CLIENT); +- if (ret) { +- pr_err("zcache: can't create client\n"); +- goto out; +- } ++ ramster_flnode_cache = kmem_cache_create("ramster_flnode", ++ sizeof(struct flushlist_node), 0, 0, NULL); + #endif + #ifdef CONFIG_CLEANCACHE +- if (zcache_enabled && use_cleancache) { ++ pr_info("INIT ramster_enabled=%d use_cleancache=%d\n", ++ ramster_enabled, use_cleancache); ++ if (ramster_enabled && use_cleancache) { + struct cleancache_ops old_ops; + + zbud_init(); + register_shrinker(&zcache_shrinker); + old_ops = zcache_cleancache_register_ops(); +- pr_info("zcache: cleancache enabled using kernel " ++ pr_info("ramster: cleancache enabled using kernel " + "transcendent memory and compression buddies\n"); + if 
(old_ops.init_fs != NULL) +- pr_warning("zcache: cleancache_ops overridden"); ++ pr_warning("ramster: cleancache_ops overridden"); + } + #endif + #ifdef CONFIG_FRONTSWAP +- if (zcache_enabled && use_frontswap) { ++ pr_info("INIT ramster_enabled=%d use_frontswap=%d\n", ++ ramster_enabled, use_frontswap); ++ if (ramster_enabled && use_frontswap) { + struct frontswap_ops old_ops; + ++ zcache_new_client(LOCAL_CLIENT); + old_ops = zcache_frontswap_register_ops(); +- pr_info("zcache: frontswap enabled using kernel " ++ pr_info("ramster: frontswap enabled using kernel " + "transcendent memory and xvmalloc\n"); + if (old_ops.init != NULL) + pr_warning("ktmem: frontswap_ops overridden"); + } ++ if (ramster_enabled && (use_frontswap || use_cleancache)) ++ ramster_remotify_init(); + #endif + out: + return ret; +-- +1.7.1 diff --git a/a/content_digest b/N1/content_digest index 2f50d01..2eba9f6 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -13,5 +13,2671 @@ " dan.magenheimer@oracle.com\0" "\00:1\0" "b\0" + ">From d23d411ef33f094c14855d48962d44aec854c500 Mon Sep 17 00:00:00 2001\n" + "From: Dan Magenheimer <dan.magenheimer@oracle.com>\n" + "Date: Wed, 21 Dec 2011 14:01:59 -0700\n" + "Subject: [PATCH v2 4/6] drivers/staging/ramster: ramster-specific changes to zcache/tmem\n" + "\n" + "In tmem.[ch], new \"repatriate\" (provoke async get) and \"localify\" (handle\n" + "incoming data resulting from an async get) routines combine with a handful\n" + "of changes to existing pamops interfaces allow the generic tmem code\n" + "to support asynchronous operations. Also, a new tmem_xhandle struct\n" + "groups together key information that must be passed to remote tmem stores.\n" + "\n" + "Zcache-main.c is augmented with a large amount of ramster-specific code\n" + "to handle remote operations and \"foreign\" pages on both ends of the\n" + "\"remotify\" protocol. 
New \"foreign\" pools are auto-created on demand.\n" + "A \"selfshrinker\" thread periodically repatriates remote persistent pages\n" + "when local memory conditions allow. For certain operations, a queue is\n" + "necessary to guarantee strict ordering as out-of-order puts/flushes can\n" + "cause strange race conditions. Pampd pointers now either point to local\n" + "memory OR describe a remote page; to allow the same 64-bits to describe\n" + "either, the LSB is used to differentiate. Some acrobatics must be performed\n" + "to ensure local memory is available to handle a remote persistent get,\n" + "or deal with the data directly anyway if the malloc failed. Lots\n" + "of ramster-specific statistics are available via sysfs.\n" + "\n" + "Note: Some debug ifdefs left in for now.\n" + "\n" + "Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>\n" + "\n" + "---\n" + "\n" + " drivers/staging/ramster/Kconfig | 17 +-\n" + " drivers/staging/ramster/Makefile | 5 +-\n" + " drivers/staging/ramster/tmem.c | 117 ++-\n" + " drivers/staging/ramster/tmem.h | 46 +-\n" + " drivers/staging/ramster/zcache-main.c | 1651 +++++++++++++++++++++++++++++----\n" + " 5 files changed, 1636 insertions(+), 200 deletions(-)\n" + "\n" + "diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig\n" + "index 7fabcb2..5154693 100644\n" + "--- a/drivers/staging/ramster/Kconfig\n" + "+++ b/drivers/staging/ramster/Kconfig\n" + "@@ -1,13 +1,14 @@\n" + "-config ZCACHE\n" + "-\ttristate \"Dynamic compression of swap pages and clean pagecache pages\"\n" + "-\tdepends on CLEANCACHE || FRONTSWAP\n" + "+config RAMSTER\n" + "+\ttristate \"Cross-machine RAM capacity sharing, aka peer-to-peer tmem\"\n" + "+\tdepends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS && !OCFS2_FS && !ZCACHE && !PREEMPT_VOLUNTARY && !HIGHMEM\n" + " \tselect XVMALLOC\n" + " \tselect LZO_COMPRESS\n" + " \tselect LZO_DECOMPRESS\n" + " \tdefault n\n" + " \thelp\n" + "-\t Zcache doubles RAM efficiency while 
providing a significant\n" + "-\t performance boosts on many workloads. Zcache uses lzo1x\n" + "-\t compression and an in-kernel implementation of transcendent\n" + "-\t memory to store clean page cache pages and swap in RAM,\n" + "-\t providing a noticeable reduction in disk I/O.\n" + "+\t RAMster allows RAM on other machines in a cluster to be utilized\n" + "+\t dynamically and symmetrically instead of swapping to a local swap\n" + "+\t disk, thus improving performance on memory-constrained workloads\n" + "+\t while minimizing total RAM across the cluster. RAMster, like\n" + "+\t zcache, compresses swap pages into local RAM, but then remotifies\n" + "+\t the compressed pages to another node in the RAMster cluster.\n" + "diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile\n" + "index 60daa27..e6c4a2e 100644\n" + "--- a/drivers/staging/ramster/Makefile\n" + "+++ b/drivers/staging/ramster/Makefile\n" + "@@ -1,3 +1,2 @@\n" + "-zcache-y\t:=\tzcache-main.o tmem.o\n" + "-\n" + "-obj-$(CONFIG_ZCACHE)\t+=\tzcache.o\n" + "+obj-$(CONFIG_RAMSTER)\t+=\tzcache-main.o tmem.o\n" + "+obj-$(CONFIG_RAMSTER)\t+=\tramster_o2net.o cluster/\n" + "diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c\n" + "index 1ca66ea..ed7d07b 100644\n" + "--- a/drivers/staging/ramster/tmem.c\n" + "+++ b/drivers/staging/ramster/tmem.c\n" + "@@ -27,6 +27,7 @@\n" + " #include <linux/list.h>\n" + " #include <linux/spinlock.h>\n" + " #include <linux/atomic.h>\n" + "+#include <linux/delay.h>\n" + " \n" + " #include \"tmem.h\"\n" + " \n" + "@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)\n" + " }\n" + " \n" + " static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,\n" + "-\t\t\t\t\tvoid *new_pampd)\n" + "+\t\t\t\t\tvoid *new_pampd, bool no_free)\n" + " {\n" + " \tstruct tmem_objnode **slot;\n" + " \tvoid *ret = NULL;\n" + "@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct 
tmem_obj *obj, uint32_t index,\n" + " \tif ((slot != NULL) && (*slot != NULL)) {\n" + " \t\tvoid *old_pampd = *(void **)slot;\n" + " \t\t*(void **)slot = new_pampd;\n" + "-\t\t(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);\n" + "+\t\tif (!no_free)\n" + "+\t\t\t(*tmem_pamops.free)(old_pampd, obj->pool,\n" + "+\t\t\t\t\t\tNULL, 0, false);\n" + " \t\tret = new_pampd;\n" + " \t}\n" + " \treturn ret;\n" + "@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,\n" + " \t\t\tif (ht == 1) {\n" + " \t\t\t\tobj->pampd_count--;\n" + " \t\t\t\t(*tmem_pamops.free)(objnode->slots[i],\n" + "-\t\t\t\t\t\tobj->pool, NULL, 0);\n" + "+\t\t\t\t\t\tobj->pool, NULL, 0, true);\n" + " \t\t\t\tobjnode->slots[i] = NULL;\n" + " \t\t\t\tcontinue;\n" + " \t\t\t}\n" + "@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)\n" + " \t\treturn;\n" + " \tif (obj->objnode_tree_height == 0) {\n" + " \t\tobj->pampd_count--;\n" + "-\t\t(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);\n" + "+\t\t(*tmem_pamops.free)(obj->objnode_tree_root,\n" + "+\t\t\t\t\tobj->pool, NULL, 0, true);\n" + " \t} else {\n" + " \t\ttmem_objnode_node_destroy(obj, obj->objnode_tree_root,\n" + " \t\t\t\t\tobj->objnode_tree_height);\n" + "@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)\n" + " * always flushes for simplicity.\n" + " */\n" + " int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,\n" + "-\t\tchar *data, size_t size, bool raw, bool ephemeral)\n" + "+\t\tchar *data, size_t size, bool raw, int ephemeral)\n" + " {\n" + " \tstruct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;\n" + " \tvoid *pampd = NULL, *pampd_del = NULL;\n" + "@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,\n" + " \t\t\t/* if found, is a dup put, flush the old one */\n" + " \t\t\tpampd_del = tmem_pampd_delete_from_obj(obj, index);\n" + " \t\t\tBUG_ON(pampd_del != 
pampd);\n" + "-\t\t\t(*tmem_pamops.free)(pampd, pool, oidp, index);\n" + "+\t\t\t(*tmem_pamops.free)(pampd, pool, oidp, index, true);\n" + " \t\t\tif (obj->pampd_count == 0) {\n" + " \t\t\t\tobjnew = obj;\n" + " \t\t\t\tobjfound = NULL;\n" + "@@ -576,7 +580,7 @@ delete_and_free:\n" + " \t(void)tmem_pampd_delete_from_obj(obj, index);\n" + " free:\n" + " \tif (pampd)\n" + "-\t\t(*tmem_pamops.free)(pampd, pool, NULL, 0);\n" + "+\t\t(*tmem_pamops.free)(pampd, pool, NULL, 0, true);\n" + " \tif (objnew) {\n" + " \t\ttmem_obj_free(objnew, hb);\n" + " \t\t(*tmem_hostops.obj_free)(objnew, pool);\n" + "@@ -586,6 +590,65 @@ out:\n" + " \treturn ret;\n" + " }\n" + " \n" + "+void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,\n" + "+\t\t\t\tuint32_t index, struct tmem_obj **ret_obj,\n" + "+\t\t\t\tvoid **saved_hb)\n" + "+{\n" + "+\tstruct tmem_hashbucket *hb;\n" + "+\tstruct tmem_obj *obj = NULL;\n" + "+\tvoid *pampd = NULL;\n" + "+\n" + "+\thb = &pool->hashbucket[tmem_oid_hash(oidp)];\n" + "+\tspin_lock(&hb->lock);\n" + "+\tobj = tmem_obj_find(hb, oidp);\n" + "+\tif (likely(obj != NULL))\n" + "+\t\tpampd = tmem_pampd_lookup_in_obj(obj, index);\n" + "+\t*ret_obj = obj;\n" + "+\t*saved_hb = (void *)hb;\n" + "+\t/* note, hashbucket remains locked */\n" + "+\treturn pampd;\n" + "+}\n" + "+\n" + "+void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,\n" + "+\t\t\t void *pampd, void *saved_hb, bool delete)\n" + "+{\n" + "+\tstruct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;\n" + "+\n" + "+\tBUG_ON(!spin_is_locked(&hb->lock));\n" + "+\tif (pampd != NULL) {\n" + "+\t\tBUG_ON(obj == NULL);\n" + "+\t\t(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);\n" + "+\t} else if (delete) {\n" + "+\t\tBUG_ON(obj == NULL);\n" + "+\t\t(void)tmem_pampd_delete_from_obj(obj, index);\n" + "+\t}\n" + "+\tspin_unlock(&hb->lock);\n" + "+}\n" + "+\n" + "+static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,\n" + "+\t\t\t\tstruct tmem_pool 
*pool, struct tmem_oid *oidp,\n" + "+\t\t\t\tuint32_t index, bool free, char *data)\n" + "+{\n" + "+\tvoid *old_pampd = *ppampd, *new_pampd = NULL;\n" + "+\tbool intransit = false;\n" + "+\tint ret = 0;\n" + "+\n" + "+\n" + "+\tif (!is_ephemeral(pool))\n" + "+\t\tnew_pampd = (*tmem_pamops.repatriate_preload)(\n" + "+\t\t\t\told_pampd, pool, oidp, index, &intransit);\n" + "+\tif (intransit)\n" + "+\t\tret = -EAGAIN;\n" + "+\telse if (new_pampd != NULL)\n" + "+\t\t*ppampd = new_pampd;\n" + "+\t/* must release the hb->lock else repatriate can't sleep */\n" + "+\tspin_unlock(&hb->lock);\n" + "+\tif (!intransit)\n" + "+\t\tret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,\n" + "+\t\t\t\t\t\toidp, index, free, data);\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + " /*\n" + " * \"Get\" a page, e.g. if one can be found, copy the tmem page with the\n" + " * matching handle from PAM space to the kernel. By tmem definition,\n" + "@@ -607,14 +670,38 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,\n" + " \tint ret = -1;\n" + " \tstruct tmem_hashbucket *hb;\n" + " \tbool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);\n" + "-\tbool lock_held = false;\n" + "+\tbool lock_held = 0;\n" + "+\tvoid **ppampd;\n" + " \n" + "+again:\n" + " \thb = &pool->hashbucket[tmem_oid_hash(oidp)];\n" + " \tspin_lock(&hb->lock);\n" + "-\tlock_held = true;\n" + "+\tlock_held = 1;\n" + " \tobj = tmem_obj_find(hb, oidp);\n" + " \tif (obj == NULL)\n" + " \t\tgoto out;\n" + "+\tppampd = __tmem_pampd_lookup_in_obj(obj, index);\n" + "+\tif (ppampd == NULL)\n" + "+\t\tgoto out;\n" + "+\tif (tmem_pamops.is_remote(*ppampd)) {\n" + "+\t\tret = tmem_repatriate(ppampd, hb, pool, oidp,\n" + "+\t\t\t\t\tindex, free, data);\n" + "+\t\tlock_held = 0; /* note hb->lock has been unlocked */\n" + "+\t\tif (ret == -EAGAIN) {\n" + "+\t\t\t/* rare I think, but should cond_resched()??? 
*/\n" + "+\t\t\tusleep_range(10, 1000);\n" + "+\t\t\tgoto again;\n" + "+\t\t} else if (ret != 0) {\n" + "+#if 1\n" + "+\t\t\tif (ret != -ENOENT)\n" + "+\t\t\t\tpr_err(\"UNTESTED case in tmem_get, ret=%d\\n\",\n" + "+\t\t\t\t\t\tret);\n" + "+#endif\n" + "+\t\t\tret = -1;\n" + "+\t\t\tgoto out;\n" + "+\t\t}\n" + "+\t\tgoto out;\n" + "+\t}\n" + " \tif (free)\n" + " \t\tpampd = tmem_pampd_delete_from_obj(obj, index);\n" + " \telse\n" + "@@ -628,10 +715,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,\n" + " \t\t\tobj = NULL;\n" + " \t\t}\n" + " \t}\n" + "-\tif (tmem_pamops.is_remote(pampd)) {\n" + "-\t\tlock_held = false;\n" + "-\t\tspin_unlock(&hb->lock);\n" + "-\t}\n" + " \tif (free)\n" + " \t\tret = (*tmem_pamops.get_data_and_free)(\n" + " \t\t\t\tdata, size, raw, pampd, pool, oidp, index);\n" + "@@ -668,7 +751,7 @@ int tmem_flush_page(struct tmem_pool *pool,\n" + " \tpampd = tmem_pampd_delete_from_obj(obj, index);\n" + " \tif (pampd == NULL)\n" + " \t\tgoto out;\n" + "-\t(*tmem_pamops.free)(pampd, pool, oidp, index);\n" + "+\t(*tmem_pamops.free)(pampd, pool, oidp, index, true);\n" + " \tif (obj->pampd_count == 0) {\n" + " \t\ttmem_obj_free(obj, hb);\n" + " \t\t(*tmem_hostops.obj_free)(obj, pool);\n" + "@@ -682,8 +765,8 @@ out:\n" + " \n" + " /*\n" + " * If a page in tmem matches the handle, replace the page so that any\n" + "- * subsequent \"get\" gets the new page. Returns 0 if\n" + "- * there was a page to replace, else returns -1.\n" + "+ * subsequent \"get\" gets the new page. 
Returns the new page if\n" + "+ * there was a page to replace, else returns NULL.\n" + " */\n" + " int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,\n" + " \t\t\tuint32_t index, void *new_pampd)\n" + "@@ -697,7 +780,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,\n" + " \tobj = tmem_obj_find(hb, oidp);\n" + " \tif (obj == NULL)\n" + " \t\tgoto out;\n" + "-\tnew_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);\n" + "+\tnew_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);\n" + " \tret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);\n" + " out:\n" + " \tspin_unlock(&hb->lock);\n" + "diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h\n" + "index ed147c4..47f1918 100644\n" + "--- a/drivers/staging/ramster/tmem.h\n" + "+++ b/drivers/staging/ramster/tmem.h\n" + "@@ -9,7 +9,6 @@\n" + " #ifndef _TMEM_H_\n" + " #define _TMEM_H_\n" + " \n" + "-#include <linux/types.h>\n" + " #include <linux/highmem.h>\n" + " #include <linux/hash.h>\n" + " #include <linux/atomic.h>\n" + "@@ -89,6 +88,31 @@ struct tmem_oid {\n" + " \tuint64_t oid[3];\n" + " };\n" + " \n" + "+struct tmem_xhandle {\n" + "+\tuint8_t client_id;\n" + "+\tuint8_t xh_data_cksum;\n" + "+\tuint16_t xh_data_size;\n" + "+\tuint16_t pool_id;\n" + "+\tstruct tmem_oid oid;\n" + "+\tuint32_t index;\n" + "+\tvoid *extra;\n" + "+};\n" + "+\n" + "+static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id,\n" + "+\t\t\t\t\tstruct tmem_pool *pool,\n" + "+\t\t\t\t\tstruct tmem_oid *oidp,\n" + "+\t\t\t\t\tuint32_t index)\n" + "+{\n" + "+\tstruct tmem_xhandle xh;\n" + "+\txh.client_id = client_id;\n" + "+\txh.xh_data_cksum = (uint8_t)-1;\n" + "+\txh.xh_data_size = (uint16_t)-1;\n" + "+\txh.pool_id = pool->pool_id;\n" + "+\txh.oid = *oidp;\n" + "+\txh.index = index;\n" + "+\treturn xh;\n" + "+}\n" + "+\n" + " static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)\n" + " {\n" + " \toidp->oid[0] = oidp->oid[1] = oidp->oid[2] = 
-1UL;\n" + "@@ -147,7 +171,11 @@ struct tmem_obj {\n" + " \tunsigned int objnode_tree_height;\n" + " \tunsigned long objnode_count;\n" + " \tlong pampd_count;\n" + "-\tvoid *extra; /* for private use by pampd implementation */\n" + "+\t/* for current design of ramster, all pages belonging to\n" + "+\t * an object reside on the same remotenode and extra is\n" + "+\t * used to record the number of the remotenode so a\n" + "+\t * flush-object operation can specify it */\n" + "+\tvoid *extra; /* for use by pampd implementation */\n" + " \tDECL_SENTINEL\n" + " };\n" + " \n" + "@@ -174,9 +202,14 @@ struct tmem_pamops {\n" + " \tint (*get_data_and_free)(char *, size_t *, bool, void *,\n" + " \t\t\t\tstruct tmem_pool *, struct tmem_oid *,\n" + " \t\t\t\tuint32_t);\n" + "-\tvoid (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);\n" + "+\tvoid (*free)(void *, struct tmem_pool *,\n" + "+\t\t\t\tstruct tmem_oid *, uint32_t, bool);\n" + " \tvoid (*free_obj)(struct tmem_pool *, struct tmem_obj *);\n" + " \tbool (*is_remote)(void *);\n" + "+\tvoid *(*repatriate_preload)(void *, struct tmem_pool *,\n" + "+\t\t\t\t\tstruct tmem_oid *, uint32_t, bool *);\n" + "+\tint (*repatriate)(void *, void *, struct tmem_pool *,\n" + "+\t\t\t\tstruct tmem_oid *, uint32_t, bool, void *);\n" + " \tvoid (*new_obj)(struct tmem_obj *);\n" + " \tint (*replace_in_obj)(void *, struct tmem_obj *);\n" + " };\n" + "@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m);\n" + " \n" + " /* core tmem accessor functions */\n" + " extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,\n" + "-\t\t\tchar *, size_t, bool, bool);\n" + "+\t\t\tchar *, size_t, bool, int);\n" + " extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,\n" + " \t\t\tchar *, size_t *, bool, int);\n" + " extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,\n" + " \t\t\tvoid *);\n" + "+extern void *tmem_localify_get_pampd(struct 
tmem_pool *, struct tmem_oid *,\n" + "+\t\t\t\t uint32_t index, struct tmem_obj **,\n" + "+\t\t\t\t void **);\n" + "+extern void tmem_localify_finish(struct tmem_obj *, uint32_t index,\n" + "+\t\t\t\t void *, void *, bool);\n" + " extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,\n" + " \t\t\tuint32_t index);\n" + " extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);\n" + "diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c\n" + "index cd0ed84..92fc7b2 100644\n" + "--- a/drivers/staging/ramster/zcache-main.c\n" + "+++ b/drivers/staging/ramster/zcache-main.c\n" + "@@ -1,7 +1,7 @@\n" + " /*\n" + " * zcache.c\n" + " *\n" + "- * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.\n" + "+ * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.\n" + " * Copyright (c) 2010,2011, Nitin Gupta\n" + " *\n" + " * Zcache provides an in-kernel \"host implementation\" for transcendent memory\n" + "@@ -17,9 +17,11 @@\n" + " *\n" + " * [1] For a definition of page-accessible memory (aka PAM), see:\n" + " * http://marc.info/?l=linux-mm&m=127811271605009\n" + "+ * RAMSTER TODO:\n" + "+ * - handle remotifying of buddied pages (see zbud_remotify_zbpg)\n" + "+ * - kernel boot params: nocleancache/nofrontswap don't always work?!?\n" + " */\n" + " \n" + "-#include <linux/module.h>\n" + " #include <linux/cpu.h>\n" + " #include <linux/highmem.h>\n" + " #include <linux/list.h>\n" + "@@ -30,11 +32,13 @@\n" + " #include <linux/atomic.h>\n" + " #include <linux/math64.h>\n" + " #include \"tmem.h\"\n" + "+#include \"zcache.h\"\n" + "+#include \"ramster.h\"\n" + " \n" + " #include \"../zram/xvmalloc.h\" /* if built in drivers/staging */\n" + " \n" + " #if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))\n" + "-#error \"zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP\"\n" + "+#error \"ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP\"\n" + " #endif\n" + " #ifdef 
CONFIG_CLEANCACHE\n" + " #include <linux/cleancache.h>\n" + "@@ -43,6 +47,61 @@\n" + " #include <linux/frontswap.h>\n" + " #endif\n" + " \n" + "+enum ramster_remotify_op {\n" + "+\tRAMSTER_REMOTIFY_EPH_PUT,\n" + "+\tRAMSTER_REMOTIFY_PERS_PUT,\n" + "+\tRAMSTER_REMOTIFY_FLUSH_PAGE,\n" + "+\tRAMSTER_REMOTIFY_FLUSH_OBJ,\n" + "+\tRAMSTER_INTRANSIT_PERS\n" + "+};\n" + "+\n" + "+struct ramster_remotify_hdr {\n" + "+\tenum ramster_remotify_op op;\n" + "+\tstruct list_head list;\n" + "+};\n" + "+\n" + "+#define ZBH_SENTINEL 0x43214321\n" + "+#define ZBPG_SENTINEL 0xdeadbeef\n" + "+\n" + "+#define ZBUD_MAX_BUDS 2\n" + "+\n" + "+struct zbud_hdr {\n" + "+\tstruct ramster_remotify_hdr rem_op;\n" + "+\tuint16_t client_id;\n" + "+\tuint16_t pool_id;\n" + "+\tstruct tmem_oid oid;\n" + "+\tuint32_t index;\n" + "+\tuint16_t size; /* compressed size in bytes, zero means unused */\n" + "+\tDECL_SENTINEL\n" + "+};\n" + "+\n" + "+#define ZVH_SENTINEL 0x43214321\n" + "+static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;\n" + "+\n" + "+struct zv_hdr {\n" + "+\tstruct ramster_remotify_hdr rem_op;\n" + "+\tuint16_t client_id;\n" + "+\tuint16_t pool_id;\n" + "+\tstruct tmem_oid oid;\n" + "+\tuint32_t index;\n" + "+\tDECL_SENTINEL\n" + "+};\n" + "+\n" + "+struct flushlist_node {\n" + "+\tstruct ramster_remotify_hdr rem_op;\n" + "+\tstruct tmem_xhandle xh;\n" + "+};\n" + "+\n" + "+union {\n" + "+\tstruct ramster_remotify_hdr rem_op;\n" + "+\tstruct zv_hdr zv;\n" + "+\tstruct zbud_hdr zbud;\n" + "+\tstruct flushlist_node flist;\n" + "+} remotify_list_node;\n" + "+\n" + "+static LIST_HEAD(zcache_rem_op_list);\n" + "+static DEFINE_SPINLOCK(zcache_rem_op_list_lock);\n" + "+\n" + " #if 0\n" + " /* this is more aggressive but may cause other problems? 
*/\n" + " #define ZCACHE_GFP_MASK\t(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)\n" + "@@ -98,20 +157,6 @@ static inline bool is_local_client(struct zcache_client *cli)\n" + " * read or written unless the zbpg's lock is held.\n" + " */\n" + " \n" + "-#define ZBH_SENTINEL 0x43214321\n" + "-#define ZBPG_SENTINEL 0xdeadbeef\n" + "-\n" + "-#define ZBUD_MAX_BUDS 2\n" + "-\n" + "-struct zbud_hdr {\n" + "-\tuint16_t client_id;\n" + "-\tuint16_t pool_id;\n" + "-\tstruct tmem_oid oid;\n" + "-\tuint32_t index;\n" + "-\tuint16_t size; /* compressed size in bytes, zero means unused */\n" + "-\tDECL_SENTINEL\n" + "-};\n" + "-\n" + " struct zbud_page {\n" + " \tstruct list_head bud_list;\n" + " \tspinlock_t lock;\n" + "@@ -153,8 +198,37 @@ static unsigned long zcache_zbud_curr_zbytes;\n" + " static unsigned long zcache_zbud_cumul_zpages;\n" + " static unsigned long zcache_zbud_cumul_zbytes;\n" + " static unsigned long zcache_compress_poor;\n" + "+static unsigned long zcache_policy_percent_exceeded;\n" + " static unsigned long zcache_mean_compress_poor;\n" + " \n" + "+/*\n" + "+ * RAMster counters\n" + "+ * - Remote pages are pages with a local pampd but the data is remote\n" + "+ * - Foreign pages are pages stored locally but belonging to another node\n" + "+ */\n" + "+static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);\n" + "+static unsigned long ramster_pers_remotify_enable;\n" + "+static unsigned long ramster_eph_remotify_enable;\n" + "+static unsigned long ramster_eph_pages_remoted;\n" + "+static unsigned long ramster_eph_pages_remote_failed;\n" + "+static unsigned long ramster_pers_pages_remoted;\n" + "+static unsigned long ramster_pers_pages_remote_failed;\n" + "+static unsigned long ramster_pers_pages_remote_nomem;\n" + "+static unsigned long ramster_remote_objects_flushed;\n" + "+static unsigned long ramster_remote_object_flushes_failed;\n" + "+static unsigned long ramster_remote_pages_flushed;\n" + "+static unsigned long ramster_remote_page_flushes_failed;\n" + 
"+static unsigned long ramster_remote_eph_pages_succ_get;\n" + "+static unsigned long ramster_remote_pers_pages_succ_get;\n" + "+static unsigned long ramster_remote_eph_pages_unsucc_get;\n" + "+static unsigned long ramster_remote_pers_pages_unsucc_get;\n" + "+static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0);\n" + "+static unsigned long ramster_curr_flnode_count_max;\n" + "+static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0);\n" + "+static unsigned long ramster_foreign_eph_pampd_count_max;\n" + "+static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0);\n" + "+static unsigned long ramster_foreign_pers_pampd_count_max;\n" + "+\n" + " /* forward references */\n" + " static void *zcache_get_free_page(void);\n" + " static void zcache_free_page(void *p);\n" + "@@ -210,6 +284,29 @@ static char *zbud_data(struct zbud_hdr *zh, unsigned size)\n" + " \treturn p;\n" + " }\n" + " \n" + "+static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh)\n" + "+{\n" + "+\tstruct zbud_page *zbpg;\n" + "+\tchar *p;\n" + "+\tunsigned budnum;\n" + "+\n" + "+\tASSERT_SENTINEL(zh, ZBH);\n" + "+\tbudnum = zbud_budnum(zh);\n" + "+\tzbpg = container_of(zh, struct zbud_page, buddy[budnum]);\n" + "+\tspin_lock(&zbpg->lock);\n" + "+\tBUG_ON(zh->size > *size);\n" + "+\tp = (char *)zbpg;\n" + "+\tif (budnum == 0)\n" + "+\t\tp += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &\n" + "+\t\t\t\t\t\t\tCHUNK_MASK);\n" + "+\telse if (budnum == 1)\n" + "+\t\tp += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK);\n" + "+\t/* client should be filled in by caller */\n" + "+\tmemcpy(data, p, zh->size);\n" + "+\t*size = zh->size;\n" + "+\tspin_unlock(&zbpg->lock);\n" + "+}\n" + "+\n" + " /*\n" + " * zbud raw page management\n" + " */\n" + "@@ -299,6 +396,7 @@ static void zbud_free_and_delist(struct zbud_hdr *zh)\n" + " \tstruct zbud_page *zbpg =\n" + " \t\tcontainer_of(zh, struct zbud_page, buddy[budnum]);\n" + " \n" + "+\tBUG_ON(!irqs_disabled());\n" + " 
\tspin_lock(&zbpg->lock);\n" + " \tif (list_empty(&zbpg->bud_list)) {\n" + " \t\t/* ignore zombie page... see zbud_evict_pages() */\n" + "@@ -358,8 +456,13 @@ static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,\n" + " \tif (unlikely(zbpg == NULL))\n" + " \t\tgoto out;\n" + " \t/* ok, have a page, now compress the data before taking locks */\n" + "+#if 1 /* 110721 FIX LOCK ORDERING TO ELIMINATE DEADLOCK */\n" + "+\tspin_lock(&zbud_budlists_spinlock);\n" + "+\tspin_lock(&zbpg->lock);\n" + "+#else\n" + " \tspin_lock(&zbpg->lock);\n" + " \tspin_lock(&zbud_budlists_spinlock);\n" + "+#endif\n" + " \tlist_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);\n" + " \tzbud_unbuddied[nchunks].count++;\n" + " \tzh = &zbpg->buddy[0];\n" + "@@ -389,12 +492,19 @@ init_zh:\n" + " \tzh->oid = *oid;\n" + " \tzh->pool_id = pool_id;\n" + " \tzh->client_id = client_id;\n" + "+#if 1 /* 110721 FIX LOCK ORDERING TO ELIMINATE DEADLOCK */\n" + "+\tto = zbud_data(zh, size);\n" + "+\tmemcpy(to, cdata, size);\n" + "+\tspin_unlock(&zbpg->lock);\n" + "+\tspin_unlock(&zbud_budlists_spinlock);\n" + "+#else\n" + " \t/* can wait to copy the data until the list locks are dropped */\n" + " \tspin_unlock(&zbud_budlists_spinlock);\n" + " \n" + " \tto = zbud_data(zh, size);\n" + " \tmemcpy(to, cdata, size);\n" + " \tspin_unlock(&zbpg->lock);\n" + "+#endif\n" + " \tzbud_cumul_chunk_counts[nchunks]++;\n" + " \tatomic_inc(&zcache_zbud_curr_zpages);\n" + " \tzcache_zbud_cumul_zpages++;\n" + "@@ -458,6 +568,7 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg)\n" + " \tuint32_t index[ZBUD_MAX_BUDS];\n" + " \tstruct tmem_oid oid[ZBUD_MAX_BUDS];\n" + " \tstruct tmem_pool *pool;\n" + "+\tunsigned long flags;\n" + " \n" + " \tASSERT_SPINLOCK(&zbpg->lock);\n" + " \tBUG_ON(!list_empty(&zbpg->bud_list));\n" + "@@ -474,9 +585,12 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg)\n" + " \t}\n" + " \tspin_unlock(&zbpg->lock);\n" + " \tfor (i = 0; i < j; i++) {\n" + "+\t\t/* FIXME 
FIXME this just evicts local ephemeral pages!!! */\n" + " \t\tpool = zcache_get_pool_by_id(client_id[i], pool_id[i]);\n" + " \t\tif (pool != NULL) {\n" + "+\t\t\tlocal_irq_save(flags);\n" + " \t\t\ttmem_flush_page(pool, &oid[i], index[i]);\n" + "+\t\t\tlocal_irq_restore(flags);\n" + " \t\t\tzcache_put_pool(pool);\n" + " \t\t}\n" + " \t}\n" + "@@ -496,7 +610,7 @@ static void zbud_evict_zbpg(struct zbud_page *zbpg)\n" + " static void zbud_evict_pages(int nr)\n" + " {\n" + " \tstruct zbud_page *zbpg;\n" + "-\tint i;\n" + "+\tint i, newly_unused_pages = 0;\n" + " \n" + " \t/* first try freeing any pages on unused list */\n" + " retry_unused_list:\n" + "@@ -512,7 +626,7 @@ retry_unused_list:\n" + " \t\tzcache_free_page(zbpg);\n" + " \t\tzcache_evicted_raw_pages++;\n" + " \t\tif (--nr <= 0)\n" + "-\t\t\tgoto out;\n" + "+\t\t\tgoto done;\n" + " \t\tgoto retry_unused_list;\n" + " \t}\n" + " \tspin_unlock_bh(&zbpg_unused_list_spinlock);\n" + "@@ -534,9 +648,10 @@ retry_unbud_list_i:\n" + " \t\t\tzcache_evicted_unbuddied_pages++;\n" + " \t\t\t/* want budlists unlocked when doing zbpg eviction */\n" + " \t\t\tzbud_evict_zbpg(zbpg);\n" + "+\t\t\tnewly_unused_pages++;\n" + " \t\t\tlocal_bh_enable();\n" + " \t\t\tif (--nr <= 0)\n" + "-\t\t\t\tgoto out;\n" + "+\t\t\t\tgoto evict_unused;\n" + " \t\t\tgoto retry_unbud_list_i;\n" + " \t\t}\n" + " \t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "@@ -547,7 +662,7 @@ retry_bud_list:\n" + " \tspin_lock_bh(&zbud_budlists_spinlock);\n" + " \tif (list_empty(&zbud_buddied_list)) {\n" + " \t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "-\t\tgoto out;\n" + "+\t\tgoto evict_unused;\n" + " \t}\n" + " \tlist_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {\n" + " \t\tif (unlikely(!spin_trylock(&zbpg->lock)))\n" + "@@ -558,16 +673,362 @@ retry_bud_list:\n" + " \t\tzcache_evicted_buddied_pages++;\n" + " \t\t/* want budlists unlocked when doing zbpg eviction */\n" + " \t\tzbud_evict_zbpg(zbpg);\n" + "+\t\tnewly_unused_pages++;\n" + " 
\t\tlocal_bh_enable();\n" + " \t\tif (--nr <= 0)\n" + "-\t\t\tgoto out;\n" + "+\t\t\tgoto evict_unused;\n" + " \t\tgoto retry_bud_list;\n" + " \t}\n" + " \tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "+\n" + "+evict_unused:\n" + "+\t/*\n" + "+\t * zbud_evict_zbpg just moves pages on the unused list, it doesn't\n" + "+\t * free them so we need to actually free them here.\n" + "+\t */\n" + "+\tspin_lock_bh(&zbpg_unused_list_spinlock);\n" + "+\tif (!list_empty(&zbpg_unused_list) && newly_unused_pages--) {\n" + "+\t\t/* can't walk list here, since it may change when unlocked */\n" + "+\t\tzbpg = list_first_entry(&zbpg_unused_list,\n" + "+\t\t\t\tstruct zbud_page, bud_list);\n" + "+\t\tlist_del_init(&zbpg->bud_list);\n" + "+\t\tzcache_zbpg_unused_list_count--;\n" + "+\t\tatomic_dec(&zcache_zbud_curr_raw_pages);\n" + "+\t\tspin_unlock_bh(&zbpg_unused_list_spinlock);\n" + "+\t\tzcache_free_page(zbpg);\n" + "+\t\tgoto evict_unused;\n" + "+\t}\n" + "+\tspin_unlock_bh(&zbpg_unused_list_spinlock);\n" + "+done:\n" + "+\treturn;\n" + "+}\n" + "+\n" + "+static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem);\n" + "+\n" + "+/* only doing unbuddied for now, so only one remote put per zbpg */\n" + "+static int zbud_remotify_zbpg(struct zbud_page *zbpg)\n" + "+{\n" + "+\tstruct zbud_hdr *zh;\n" + "+\tstruct tmem_xhandle xh;\n" + "+\tstruct tmem_pool *pool;\n" + "+\tbool ephemeral;\n" + "+\tchar *data;\n" + "+\tsize_t size;\n" + "+\tint remotenode, ret = -1;\n" + "+\tunsigned long flags;\n" + "+\tunsigned char cksum;\n" + "+\tchar *p;\n" + "+\tint i;\n" + "+\tunsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);\n" + "+\n" + "+\tASSERT_SPINLOCK(&zbpg->lock);\n" + "+\tBUG_ON(!list_empty(&zbpg->bud_list));\n" + "+\n" + "+\tif (zbpg->buddy[0].size == 0)\n" + "+\t\tzh = &zbpg->buddy[1];\n" + "+\telse if (zbpg->buddy[1].size == 0)\n" + "+\t\tzh = &zbpg->buddy[0];\n" + "+\telse\n" + "+\t\tBUG(); /* apparently NOT unbuddied ?!? 
*/\n" + "+\n" + "+\t/* don't remotify pages that are already remotified */\n" + "+\tif (zh->client_id != LOCAL_CLIENT) {\n" + "+\t\tspin_unlock(&zbpg->lock);\n" + "+\t\tret = 0;\n" + "+\t\tpreempt_enable();\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\txh.client_id = zh->client_id;\n" + "+\txh.pool_id = zh->pool_id;\n" + "+\txh.oid = zh->oid;\n" + "+\txh.index = zh->index;\n" + "+\tsize = zh->size;\n" + "+\tdata = zbud_data(zh, size);\n" + "+\tfor (p = data, cksum = 0, i = 0; i < size; i++)\n" + "+\t\tcksum += *p;\n" + "+\tmemcpy(tmpmem, data, size);\n" + "+\tdata = tmpmem;\n" + "+\tspin_unlock(&zbpg->lock);\n" + "+\tpreempt_enable();\t/* no locks held anymore */\n" + "+\tpool = zcache_get_pool_by_id(zh->client_id, zh->pool_id);\n" + "+\tBUG_ON(pool == NULL);\n" + "+\tephemeral = !pool->persistent;\n" + "+\tzcache_put_pool(pool);\n" + "+\tret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);\n" + "+\tif (ret == 0) {\n" + "+\t\t/* data was successfully remoted so change the local version\n" + "+\t\t * to point to the remote node where it landed */\n" + "+\t\tstruct tmem_pool *pool;\n" + "+\t\tpool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);\n" + "+\t\tBUG_ON(pool == NULL);\n" + "+\t\tlocal_irq_save(flags);\n" + "+\t\t(void)tmem_replace(pool, &xh.oid, xh.index,\n" + "+\t\t\tpampd_make_remote(remotenode, size, cksum));\n" + "+\t\tlocal_irq_restore(flags);\n" + "+\t\tzcache_put_pool(pool);\n" + "+\t\tramster_eph_pages_remoted++;\n" + "+\t\tret = 1;\n" + "+\t} else\n" + "+\t\tramster_eph_pages_remote_failed++;\n" + "+\n" + "+out:\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+void zbud_remotify_pages(int nr)\n" + "+{\n" + "+\tstruct zbud_page *zbpg;\n" + "+\tint i, ret;\n" + "+\n" + "+\t/*\n" + "+\t * for now just try remotifying unbuddied pages, starting with\n" + "+\t * least space avail\n" + "+\t */\n" + "+\tfor (i = 0; i < MAX_CHUNK; i++) {\n" + "+retry_unbud_list_i:\n" + "+\t\tpreempt_disable(); /* enable in zbud_remotify_zbpg */\n" + 
"+\t\tspin_lock_bh(&zbud_budlists_spinlock);\n" + "+\t\tif (list_empty(&zbud_unbuddied[i].list)) {\n" + "+\t\t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "+\t\t\tcontinue;\n" + "+\t\t}\n" + "+\t\tlist_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {\n" + "+\t\t\tif (unlikely(!spin_trylock(&zbpg->lock))) {\n" + "+\t\t\t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "+\t\t\t\tpreempt_enable();\n" + "+\t\t\t\tcontinue;\n" + "+\t\t\t}\n" + "+\t\t\tlist_del_init(&zbpg->bud_list);\n" + "+\t\t\tzbud_unbuddied[i].count--;\n" + "+\t\t\tspin_unlock(&zbud_budlists_spinlock);\n" + "+\t\t\t/* want budlists unlocked when doing zbpg remotify */\n" + "+\t\t\tlocal_bh_enable();\n" + "+\t\t\tret = zbud_remotify_zbpg(zbpg);\n" + "+\t\t\t/* preemption is now re-enabled */\n" + "+\t\t\tif (ret == 0)\n" + "+\t\t\t\tBUG();\n" + "+\t\t\telse if (ret == 1)\n" + "+\t\t\t\t--nr;\n" + "+\t\t\telse {\n" + "+\t\t\t\t/* if fail to remotify any page, quit */\n" + "+pr_err(\"TESTING zbud_remotify_pages failed on page, trying to re-add\\n\");\n" + "+\t\t\t\tspin_lock_bh(&zbud_budlists_spinlock);\n" + "+\t\t\t\tspin_lock(&zbpg->lock);\n" + "+\t\t\t\tlist_add_tail(&zbpg->bud_list,\n" + "+\t\t\t\t\t&zbud_unbuddied[i].list);\n" + "+\t\t\t\tzbud_unbuddied[i].count++;\n" + "+\t\t\t\tspin_unlock(&zbpg->lock);\n" + "+\t\t\t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "+pr_err(\"TESTING zbud_remotify_pages failed on page, finished re-add\\n\");\n" + "+\t\t\t\tgoto out;\n" + "+\t\t\t}\n" + "+\t\t\tif (nr <= 0)\n" + "+\t\t\t\tgoto out;\n" + "+\t\t\tgoto retry_unbud_list_i;\n" + "+\t\t}\n" + "+\t\tspin_unlock_bh(&zbud_budlists_spinlock);\n" + "+\t}\n" + " out:\n" + " \treturn;\n" + " }\n" + " \n" + "+/* the \"flush list\" asynchronously collects pages to remotely flush */\n" + "+#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)\n" + "+static void ramster_flnode_free(struct flushlist_node *,\n" + "+\t\t\t\tstruct tmem_pool *);\n" + "+\n" + "+static void zcache_remote_flush_page(struct flushlist_node 
*flnode)\n" + "+{\n" + "+\tstruct tmem_xhandle *xh;\n" + "+\tint remotenode, ret;\n" + "+\n" + "+\tpreempt_disable();\n" + "+\txh = &flnode->xh;\n" + "+\tremotenode = flnode->xh.client_id;\n" + "+\tret = ramster_remote_flush(xh, remotenode);\n" + "+\tif (ret >= 0)\n" + "+\t\tramster_remote_pages_flushed++;\n" + "+\telse\n" + "+\t\tramster_remote_page_flushes_failed++;\n" + "+\tpreempt_enable_no_resched();\n" + "+\tramster_flnode_free(flnode, NULL);\n" + "+}\n" + "+\n" + "+static void zcache_remote_flush_object(struct flushlist_node *flnode)\n" + "+{\n" + "+\tstruct tmem_xhandle *xh;\n" + "+\tint remotenode, ret;\n" + "+\n" + "+\tpreempt_disable();\n" + "+\txh = &flnode->xh;\n" + "+\tremotenode = flnode->xh.client_id;\n" + "+\tret = ramster_remote_flush_object(xh, remotenode);\n" + "+\tif (ret >= 0)\n" + "+\t\tramster_remote_objects_flushed++;\n" + "+\telse\n" + "+\t\tramster_remote_object_flushes_failed++;\n" + "+\tpreempt_enable_no_resched();\n" + "+\tramster_flnode_free(flnode, NULL);\n" + "+}\n" + "+\n" + "+static void zcache_remote_eph_put(struct zbud_hdr *zbud)\n" + "+{\n" + "+\t/* FIXME */\n" + "+}\n" + "+\n" + "+static void zcache_remote_pers_put(struct zv_hdr *zv)\n" + "+{\n" + "+\tstruct tmem_xhandle xh;\n" + "+\tuint16_t size;\n" + "+\tbool ephemeral;\n" + "+\tint remotenode, ret = -1;\n" + "+\tchar *data;\n" + "+\tstruct tmem_pool *pool;\n" + "+\tunsigned long flags;\n" + "+\tunsigned char cksum;\n" + "+\tchar *p;\n" + "+\tint i;\n" + "+\tunsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);\n" + "+\n" + "+\tASSERT_SENTINEL(zv, ZVH);\n" + "+\tBUG_ON(zv->client_id != LOCAL_CLIENT);\n" + "+\tlocal_bh_disable();\n" + "+\txh.client_id = zv->client_id;\n" + "+\txh.pool_id = zv->pool_id;\n" + "+\txh.oid = zv->oid;\n" + "+\txh.index = zv->index;\n" + "+\tsize = xv_get_object_size(zv) - sizeof(*zv);\n" + "+\tBUG_ON(size == 0 || size > zv_max_page_size);\n" + "+\tdata = (char *)zv + sizeof(*zv);\n" + "+\tfor (p = data, cksum = 0, i = 0; i < size; i++)\n" + 
"+\t\tcksum += *p;\n" + "+\tmemcpy(tmpmem, data, size);\n" + "+\tdata = tmpmem;\n" + "+\tpool = zcache_get_pool_by_id(zv->client_id, zv->pool_id);\n" + "+\tephemeral = is_ephemeral(pool);\n" + "+\tzcache_put_pool(pool);\n" + "+\t/* now OK to release lock set in caller */\n" + "+\tspin_unlock(&zcache_rem_op_list_lock);\n" + "+\tlocal_bh_enable();\n" + "+\tpreempt_disable();\n" + "+\tret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);\n" + "+\tpreempt_enable_no_resched();\n" + "+\tif (ret != 0) {\n" + "+\t\t/*\n" + "+\t\t * This is some form of a memory leak... if the remote put\n" + "+\t\t * fails, there will never be another attempt to remotify\n" + "+\t\t * this page. But since we've dropped the zv pointer,\n" + "+\t\t * the page may have been freed or the data replaced\n" + "+\t\t * so we can't just \"put it back\" in the remote op list.\n" + "+\t\t * Even if we could, not sure where to put it in the list\n" + "+\t\t * because there may be flushes that must be strictly\n" + "+\t\t * ordered vs the put. 
So leave this as a FIXME for now.\n" + "+\t\t * But count them so we know if it becomes a problem.\n" + "+\t\t */\n" + "+\t\tramster_pers_pages_remote_failed++;\n" + "+\t\tgoto out;\n" + "+\t} else\n" + "+\t\tatomic_inc(&ramster_remote_pers_pages);\n" + "+\tramster_pers_pages_remoted++;\n" + "+\t/*\n" + "+\t * data was successfully remoted so change the local version to\n" + "+\t * point to the remote node where it landed\n" + "+\t */\n" + "+\tlocal_bh_disable();\n" + "+\tpool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);\n" + "+\tlocal_irq_save(flags);\n" + "+\t(void)tmem_replace(pool, &xh.oid, xh.index,\n" + "+\t\t\tpampd_make_remote(remotenode, size, cksum));\n" + "+\tlocal_irq_restore(flags);\n" + "+\tzcache_put_pool(pool);\n" + "+\tlocal_bh_enable();\n" + "+out:\n" + "+\treturn;\n" + "+}\n" + "+\n" + "+static void zcache_do_remotify_ops(int nr)\n" + "+{\n" + "+\tstruct ramster_remotify_hdr *rem_op;\n" + "+\tunion remotify_list_node *u;\n" + "+\n" + "+\twhile (1) {\n" + "+\t\tif (!nr)\n" + "+\t\t\tgoto out;\n" + "+\t\tspin_lock(&zcache_rem_op_list_lock);\n" + "+\t\tif (list_empty(&zcache_rem_op_list)) {\n" + "+\t\t\tspin_unlock(&zcache_rem_op_list_lock);\n" + "+\t\t\tgoto out;\n" + "+\t\t}\n" + "+\t\trem_op = list_first_entry(&zcache_rem_op_list,\n" + "+\t\t\t\tstruct ramster_remotify_hdr, list);\n" + "+\t\tlist_del_init(&rem_op->list);\n" + "+\t\tif (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT)\n" + "+\t\t\tspin_unlock(&zcache_rem_op_list_lock);\n" + "+\t\tu = (union remotify_list_node *)rem_op;\n" + "+\t\tswitch (rem_op->op) {\n" + "+\t\tcase RAMSTER_REMOTIFY_EPH_PUT:\n" + "+BUG();\n" + "+\t\t\tzcache_remote_eph_put((struct zbud_hdr *)rem_op);\n" + "+\t\t\tbreak;\n" + "+\t\tcase RAMSTER_REMOTIFY_PERS_PUT:\n" + "+\t\t\tzcache_remote_pers_put((struct zv_hdr *)rem_op);\n" + "+\t\t\tbreak;\n" + "+\t\tcase RAMSTER_REMOTIFY_FLUSH_PAGE:\n" + "+\t\t\tzcache_remote_flush_page((struct flushlist_node *)u);\n" + "+\t\t\tbreak;\n" + "+\t\tcase 
RAMSTER_REMOTIFY_FLUSH_OBJ:\n" + "+\t\t\tzcache_remote_flush_object((struct flushlist_node *)u);\n" + "+\t\t\tbreak;\n" + "+\t\tdefault:\n" + "+\t\t\tBUG();\n" + "+\t\t}\n" + "+\t}\n" + "+out:\n" + "+\treturn;\n" + "+}\n" + "+\n" + "+/*\n" + "+ * For now, just push over a few pages every few seconds to\n" + "+ * ensure that it basically works\n" + "+ */\n" + "+static struct workqueue_struct *ramster_remotify_workqueue;\n" + "+static void ramster_remotify_process(struct work_struct *work);\n" + "+static DECLARE_DELAYED_WORK(ramster_remotify_worker,\n" + "+\t\tramster_remotify_process);\n" + "+\n" + "+static void ramster_remotify_queue_delayed_work(unsigned long delay)\n" + "+{\n" + "+\tif (!queue_delayed_work(ramster_remotify_workqueue,\n" + "+\t\t\t\t&ramster_remotify_worker, delay))\n" + "+\t\tpr_err(\"ramster_remotify: bad workqueue\\n\");\n" + "+}\n" + "+\n" + "+\n" + "+static int use_frontswap;\n" + "+static int use_cleancache;\n" + "+static void ramster_remotify_process(struct work_struct *work)\n" + "+{\n" + "+\tBUG_ON(irqs_disabled());\n" + "+\tramster_remotify_queue_delayed_work(10 * HZ);\n" + "+#ifdef CONFIG_FRONTSWAP\n" + "+\tif (use_frontswap && ramster_pers_remotify_enable)\n" + "+\t\tzcache_do_remotify_ops(500); /* FIXME is this a good number? 
*/\n" + "+#endif\n" + "+#ifdef CONFIG_CLEANCACHE\n" + "+\tif (use_cleancache && ramster_eph_remotify_enable)\n" + "+\t\tzbud_remotify_pages(100);\n" + "+#endif\n" + "+}\n" + "+\n" + "+static void ramster_remotify_init(void)\n" + "+{\n" + "+\tunsigned long n = 60UL;\n" + "+\tramster_remotify_workqueue =\n" + "+\t\tcreate_singlethread_workqueue(\"ramster_remotify\");\n" + "+\tramster_remotify_queue_delayed_work(n * HZ);\n" + "+}\n" + "+\n" + "+\n" + " static void zbud_init(void)\n" + " {\n" + " \tint i;\n" + "@@ -631,15 +1092,6 @@ static int zbud_show_cumul_chunk_counts(char *buf)\n" + " * necessary for decompression) immediately preceding the compressed data.\n" + " */\n" + " \n" + "-#define ZVH_SENTINEL 0x43214321\n" + "-\n" + "-struct zv_hdr {\n" + "-\tuint32_t pool_id;\n" + "-\tstruct tmem_oid oid;\n" + "-\tuint32_t index;\n" + "-\tDECL_SENTINEL\n" + "-};\n" + "-\n" + " /* rudimentary policy limits */\n" + " /* total number of persistent pages may not exceed this percentage */\n" + " static unsigned int zv_page_count_policy_percent = 75;\n" + "@@ -658,7 +1110,7 @@ static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;\n" + " static unsigned long zv_curr_dist_counts[NCHUNKS];\n" + " static unsigned long zv_cumul_dist_counts[NCHUNKS];\n" + " \n" + "-static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,\n" + "+static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id,\n" + " \t\t\t\tstruct tmem_oid *oid, uint32_t index,\n" + " \t\t\t\tvoid *cdata, unsigned clen)\n" + " {\n" + "@@ -671,7 +1123,7 @@ static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,\n" + " \n" + " \tBUG_ON(!irqs_disabled());\n" + " \tBUG_ON(chunks >= NCHUNKS);\n" + "-\tret = xv_malloc(xvpool, alloc_size,\n" + "+\tret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),\n" + " \t\t\t&page, &offset, ZCACHE_GFP_MASK);\n" + " \tif (unlikely(ret))\n" + " \t\tgoto out;\n" + "@@ -682,12 +1134,50 @@ static struct zv_hdr *zv_create(struct 
xv_pool *xvpool, uint32_t pool_id,\n" + " \tzv->oid = *oid;\n" + " \tzv->pool_id = pool_id;\n" + " \tSET_SENTINEL(zv, ZVH);\n" + "+\tINIT_LIST_HEAD(&zv->rem_op.list);\n" + "+\tzv->client_id = get_client_id_from_client(cli);\n" + "+\tzv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT;\n" + "+\tif (zv->client_id == LOCAL_CLIENT) {\n" + "+\t\tspin_lock(&zcache_rem_op_list_lock);\n" + "+\t\tlist_add_tail(&zv->rem_op.list, &zcache_rem_op_list);\n" + "+\t\tspin_unlock(&zcache_rem_op_list_lock);\n" + "+\t}\n" + " \tmemcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);\n" + " \tkunmap_atomic(zv, KM_USER0);\n" + " out:\n" + " \treturn zv;\n" + " }\n" + " \n" + "+/* similar to zv_create, but just reserve space, no data yet */\n" + "+static struct zv_hdr *zv_alloc(struct tmem_pool *pool,\n" + "+\t\t\t\tstruct tmem_oid *oid, uint32_t index,\n" + "+\t\t\t\tunsigned clen)\n" + "+{\n" + "+\tstruct zcache_client *cli = pool->client;\n" + "+\tstruct page *page;\n" + "+\tstruct zv_hdr *zv = NULL;\n" + "+\tuint32_t offset;\n" + "+\tint ret;\n" + "+\n" + "+\tBUG_ON(!irqs_disabled());\n" + "+\tBUG_ON(!is_local_client(pool->client));\n" + "+\tret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),\n" + "+\t\t\t&page, &offset, ZCACHE_GFP_MASK);\n" + "+\tif (unlikely(ret))\n" + "+\t\tgoto out;\n" + "+\tzv = kmap_atomic(page, KM_USER0) + offset;\n" + "+\tSET_SENTINEL(zv, ZVH);\n" + "+\tINIT_LIST_HEAD(&zv->rem_op.list);\n" + "+\tzv->client_id = LOCAL_CLIENT;\n" + "+\tzv->rem_op.op = RAMSTER_INTRANSIT_PERS;\n" + "+\tzv->index = index;\n" + "+\tzv->oid = *oid;\n" + "+\tzv->pool_id = pool->pool_id;\n" + "+\tkunmap_atomic(zv, KM_USER0);\n" + "+out:\n" + "+\treturn zv;\n" + "+}\n" + "+\n" + " static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)\n" + " {\n" + " \tunsigned long flags;\n" + "@@ -700,8 +1190,13 @@ static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)\n" + " \tBUG_ON(chunks >= NCHUNKS);\n" + " \tzv_curr_dist_counts[chunks]--;\n" + " \tsize -= sizeof(*zv);\n" + 
"+\tspin_lock(&zcache_rem_op_list_lock);\n" + "+\tsize = xv_get_object_size(zv) - sizeof(*zv);\n" + " \tBUG_ON(size == 0);\n" + " \tINVERT_SENTINEL(zv, ZVH);\n" + "+\tif (!list_empty(&zv->rem_op.list))\n" + "+\t\tlist_del_init(&zv->rem_op.list);\n" + "+\tspin_unlock(&zcache_rem_op_list_lock);\n" + " \tpage = virt_to_page(zv);\n" + " \toffset = (unsigned long)zv & ~PAGE_MASK;\n" + " \tlocal_irq_save(flags);\n" + "@@ -727,6 +1222,29 @@ static void zv_decompress(struct page *page, struct zv_hdr *zv)\n" + " \tBUG_ON(clen != PAGE_SIZE);\n" + " }\n" + " \n" + "+static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv)\n" + "+{\n" + "+\tunsigned size;\n" + "+\n" + "+\tASSERT_SENTINEL(zv, ZVH);\n" + "+\tsize = xv_get_object_size(zv) - sizeof(*zv);\n" + "+\tBUG_ON(size == 0 || size > zv_max_page_size);\n" + "+\tBUG_ON(size > *bufsize);\n" + "+\tmemcpy(data, (char *)zv + sizeof(*zv), size);\n" + "+\t*bufsize = size;\n" + "+}\n" + "+\n" + "+static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size)\n" + "+{\n" + "+\tunsigned zv_size;\n" + "+\n" + "+\tASSERT_SENTINEL(zv, ZVH);\n" + "+\tzv_size = xv_get_object_size(zv) - sizeof(*zv);\n" + "+\tBUG_ON(zv_size != size);\n" + "+\tBUG_ON(zv_size == 0 || zv_size > zv_max_page_size);\n" + "+\tmemcpy((char *)zv + sizeof(*zv), data, size);\n" + "+}\n" + "+\n" + " #ifdef CONFIG_SYSFS\n" + " /*\n" + " * show a distribution of compression stats for zv pages.\n" + "@@ -979,6 +1497,7 @@ static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);\n" + " */\n" + " static struct kmem_cache *zcache_objnode_cache;\n" + " static struct kmem_cache *zcache_obj_cache;\n" + "+static struct kmem_cache *ramster_flnode_cache;\n" + " static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);\n" + " static unsigned long zcache_curr_obj_count_max;\n" + " static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);\n" + "@@ -994,6 +1513,7 @@ struct zcache_preload {\n" + " \tstruct tmem_obj *obj;\n" + " \tint nr;\n" + " \tstruct 
tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];\n" + "+\tstruct flushlist_node *flnode;\n" + " };\n" + " static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };\n" + " \n" + "@@ -1002,6 +1522,7 @@ static int zcache_do_preload(struct tmem_pool *pool)\n" + " \tstruct zcache_preload *kp;\n" + " \tstruct tmem_objnode *objnode;\n" + " \tstruct tmem_obj *obj;\n" + "+\tstruct flushlist_node *flnode;\n" + " \tvoid *page;\n" + " \tint ret = -ENOMEM;\n" + " \n" + "@@ -1009,10 +1530,6 @@ static int zcache_do_preload(struct tmem_pool *pool)\n" + " \t\tgoto out;\n" + " \tif (unlikely(zcache_obj_cache == NULL))\n" + " \t\tgoto out;\n" + "-\tif (!spin_trylock(&zcache_direct_reclaim_lock)) {\n" + "-\t\tzcache_aborted_preload++;\n" + "-\t\tgoto out;\n" + "-\t}\n" + " \tpreempt_disable();\n" + " \tkp = &__get_cpu_var(zcache_preloads);\n" + " \twhile (kp->nr < ARRAY_SIZE(kp->objnodes)) {\n" + "@@ -1036,6 +1553,11 @@ static int zcache_do_preload(struct tmem_pool *pool)\n" + " \t\tzcache_failed_alloc++;\n" + " \t\tgoto unlock_out;\n" + " \t}\n" + "+\tflnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK);\n" + "+\tif (unlikely(flnode == NULL)) {\n" + "+\t\tzcache_failed_alloc++;\n" + "+\t\tgoto unlock_out;\n" + "+\t}\n" + " \tpage = (void *)__get_free_page(ZCACHE_GFP_MASK);\n" + " \tif (unlikely(page == NULL)) {\n" + " \t\tzcache_failed_get_free_pages++;\n" + "@@ -1048,17 +1570,40 @@ static int zcache_do_preload(struct tmem_pool *pool)\n" + " \t\tkp->obj = obj;\n" + " \telse\n" + " \t\tkmem_cache_free(zcache_obj_cache, obj);\n" + "+\tif (kp->flnode == NULL)\n" + "+\t\tkp->flnode = flnode;\n" + "+\telse\n" + "+\t\tkmem_cache_free(ramster_flnode_cache, flnode);\n" + " \tif (kp->page == NULL)\n" + " \t\tkp->page = page;\n" + " \telse\n" + " \t\tfree_page((unsigned long)page);\n" + " \tret = 0;\n" + " unlock_out:\n" + "-\tspin_unlock(&zcache_direct_reclaim_lock);\n" + " out:\n" + " \treturn ret;\n" + " }\n" + " \n" + "+static int 
ramster_do_preload_flnode_only(struct tmem_pool *pool)\n" + "+{\n" + "+\tstruct zcache_preload *kp;\n" + "+\tstruct flushlist_node *flnode;\n" + "+\tint ret = -ENOMEM;\n" + "+\n" + "+\tBUG_ON(!irqs_disabled());\n" + "+\tif (unlikely(ramster_flnode_cache == NULL))\n" + "+\t\tBUG();\n" + "+\tkp = &__get_cpu_var(zcache_preloads);\n" + "+\tflnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);\n" + "+\tif (unlikely(flnode == NULL) && kp->flnode == NULL)\n" + "+\t\tBUG(); /* FIXME handle more gracefully, but how??? */\n" + "+\telse if (kp->flnode == NULL)\n" + "+\t\tkp->flnode = flnode;\n" + "+\telse\n" + "+\t\tkmem_cache_free(ramster_flnode_cache, flnode);\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + " static void *zcache_get_free_page(void)\n" + " {\n" + " \tstruct zcache_preload *kp;\n" + "@@ -1131,6 +1676,30 @@ static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)\n" + " \tkmem_cache_free(zcache_obj_cache, obj);\n" + " }\n" + " \n" + "+static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)\n" + "+{\n" + "+\tstruct flushlist_node *flnode = NULL;\n" + "+\tstruct zcache_preload *kp;\n" + "+\tint count;\n" + "+\n" + "+\tkp = &__get_cpu_var(zcache_preloads);\n" + "+\tflnode = kp->flnode;\n" + "+\tBUG_ON(flnode == NULL);\n" + "+\tkp->flnode = NULL;\n" + "+\tcount = atomic_inc_return(&ramster_curr_flnode_count);\n" + "+\tif (count > ramster_curr_flnode_count_max)\n" + "+\t\tramster_curr_flnode_count_max = count;\n" + "+\treturn flnode;\n" + "+}\n" + "+\n" + "+static void ramster_flnode_free(struct flushlist_node *flnode,\n" + "+\t\t\t\tstruct tmem_pool *pool)\n" + "+{\n" + "+\tatomic_dec(&ramster_curr_flnode_count);\n" + "+\tBUG_ON(atomic_read(&ramster_curr_flnode_count) < 0);\n" + "+\tkmem_cache_free(ramster_flnode_cache, flnode);\n" + "+}\n" + "+\n" + " static struct tmem_hostops zcache_hostops = {\n" + " \t.obj_alloc = zcache_obj_alloc,\n" + " \t.obj_free = zcache_obj_free,\n" + "@@ -1150,22 +1719,20 @@ static unsigned long 
zcache_curr_pers_pampd_count_max;\n" + " /* forward reference */\n" + " static int zcache_compress(struct page *from, void **out_va, size_t *out_len);\n" + " \n" + "-static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,\n" + "+static int zcache_pampd_eph_create(char *data, size_t size, bool raw,\n" + " \t\t\t\tstruct tmem_pool *pool, struct tmem_oid *oid,\n" + "-\t\t\t\t uint32_t index)\n" + "+\t\t\t\tuint32_t index, void **pampd)\n" + " {\n" + "-\tvoid *pampd = NULL, *cdata;\n" + "-\tsize_t clen;\n" + "-\tint ret;\n" + "-\tunsigned long count;\n" + "-\tstruct page *page = (struct page *)(data);\n" + "+\tint ret = -1;\n" + "+\tvoid *cdata = data;\n" + "+\tsize_t clen = size;\n" + " \tstruct zcache_client *cli = pool->client;\n" + " \tuint16_t client_id = get_client_id_from_client(cli);\n" + "-\tunsigned long zv_mean_zsize;\n" + "-\tunsigned long curr_pers_pampd_count;\n" + "-\tu64 total_zsize;\n" + "+\tstruct page *page = NULL;\n" + "+\tunsigned long count;\n" + " \n" + "-\tif (eph) {\n" + "+\tif (!raw) {\n" + "+\t\tpage = virt_to_page(data);\n" + " \t\tret = zcache_compress(page, &cdata, &clen);\n" + " \t\tif (ret == 0)\n" + " \t\t\tgoto out;\n" + "@@ -1173,46 +1740,137 @@ static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,\n" + " \t\t\tzcache_compress_poor++;\n" + " \t\t\tgoto out;\n" + " \t\t}\n" + "-\t\tpampd = (void *)zbud_create(client_id, pool->pool_id, oid,\n" + "-\t\t\t\t\t\tindex, page, cdata, clen);\n" + "-\t\tif (pampd != NULL) {\n" + "-\t\t\tcount = atomic_inc_return(&zcache_curr_eph_pampd_count);\n" + "-\t\t\tif (count > zcache_curr_eph_pampd_count_max)\n" + "-\t\t\t\tzcache_curr_eph_pampd_count_max = count;\n" + "-\t\t}\n" + "-\t} else {\n" + "-\t\tcurr_pers_pampd_count =\n" + "-\t\t\tatomic_read(&zcache_curr_pers_pampd_count);\n" + "-\t\tif (curr_pers_pampd_count >\n" + "-\t\t (zv_page_count_policy_percent * totalram_pages) / 100)\n" + "-\t\t\tgoto out;\n" + "-\t\tret = zcache_compress(page, &cdata, 
&clen);\n" + "-\t\tif (ret == 0)\n" + "-\t\t\tgoto out;\n" + "-\t\t/* reject if compression is too poor */\n" + "-\t\tif (clen > zv_max_zsize) {\n" + "-\t\t\tzcache_compress_poor++;\n" + "+\t}\n" + "+\t*pampd = (void *)zbud_create(client_id, pool->pool_id, oid,\n" + "+\t\t\t\t\tindex, page, cdata, clen);\n" + "+\tif (*pampd == NULL) {\n" + "+\t\tret = -ENOMEM;\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\tret = 0;\n" + "+\tcount = atomic_inc_return(&zcache_curr_eph_pampd_count);\n" + "+\tif (count > zcache_curr_eph_pampd_count_max)\n" + "+\t\tzcache_curr_eph_pampd_count_max = count;\n" + "+\tif (client_id != LOCAL_CLIENT) {\n" + "+\t\tcount = atomic_inc_return(&ramster_foreign_eph_pampd_count);\n" + "+\t\tif (count > ramster_foreign_eph_pampd_count_max)\n" + "+\t\t\tramster_foreign_eph_pampd_count_max = count;\n" + "+\t}\n" + "+out:\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+static int zcache_pampd_pers_create(char *data, size_t size, bool raw,\n" + "+\t\t\t\tstruct tmem_pool *pool, struct tmem_oid *oid,\n" + "+\t\t\t\tuint32_t index, void **pampd)\n" + "+{\n" + "+\tint ret = -1;\n" + "+\tvoid *cdata = data;\n" + "+\tsize_t clen = size;\n" + "+\tstruct zcache_client *cli = pool->client;\n" + "+\tstruct page *page;\n" + "+\tunsigned long count;\n" + "+\tunsigned long zv_mean_zsize;\n" + "+\tstruct zv_hdr *zv;\n" + "+\tlong curr_pers_pampd_count;\n" + "+\tu64 total_zsize;\n" + "+\n" + "+\tcurr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) -\n" + "+\t\t\tatomic_read(&ramster_remote_pers_pages);\n" + "+\t/* should always be positive, but warn if accounting is off */\n" + "+\tWARN_ON_ONCE(curr_pers_pampd_count < 0);\n" + "+\tif (curr_pers_pampd_count >\n" + "+\t\t (zv_page_count_policy_percent * totalram_pages) / 100) {\n" + "+\t\tzcache_policy_percent_exceeded++;\n" + "+#if 0\n" + "+{\n" + "+static unsigned long cnt;\n" + "+cnt++;\n" + "+if (!(cnt&(cnt-1)))\n" + "+pr_err(\"TESTING zppc policy cnt=%lu, curr=%lu, limit=%lu, totalram=%lu\\n\",\n" + "+cnt, 
curr_pers_pampd_count,\n" + "+((zv_page_count_policy_percent * totalram_pages) / 100), totalram_pages);\n" + "+}\n" + "+#endif\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\tif (raw)\n" + "+\t\tgoto ok_to_create;\n" + "+\tpage = virt_to_page(data);\n" + "+\tif (zcache_compress(page, &cdata, &clen) == 0)\n" + "+\t\tgoto out;\n" + "+\t/* reject if compression is too poor */\n" + "+\tif (clen > zv_max_zsize) {\n" + "+\t\tzcache_compress_poor++;\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\t/* reject if mean compression is too poor */\n" + "+\tif ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {\n" + "+\t\ttotal_zsize = xv_get_total_size_bytes(cli->xvpool);\n" + "+\t\tzv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count);\n" + "+\t\tif (zv_mean_zsize > zv_max_mean_zsize) {\n" + "+\t\t\tzcache_mean_compress_poor++;\n" + " \t\t\tgoto out;\n" + " \t\t}\n" + "-\t\t/* reject if mean compression is too poor */\n" + "-\t\tif ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {\n" + "-\t\t\ttotal_zsize = xv_get_total_size_bytes(cli->xvpool);\n" + "-\t\t\tzv_mean_zsize = div_u64(total_zsize,\n" + "-\t\t\t\t\t\tcurr_pers_pampd_count);\n" + "-\t\t\tif (zv_mean_zsize > zv_max_mean_zsize) {\n" + "-\t\t\t\tzcache_mean_compress_poor++;\n" + "-\t\t\t\tgoto out;\n" + "-\t\t\t}\n" + "-\t\t}\n" + "-\t\tpampd = (void *)zv_create(cli->xvpool, pool->pool_id,\n" + "-\t\t\t\t\t\toid, index, cdata, clen);\n" + "-\t\tif (pampd == NULL)\n" + "-\t\t\tgoto out;\n" + "-\t\tcount = atomic_inc_return(&zcache_curr_pers_pampd_count);\n" + "-\t\tif (count > zcache_curr_pers_pampd_count_max)\n" + "-\t\t\tzcache_curr_pers_pampd_count_max = count;\n" + " \t}\n" + "+ok_to_create:\n" + "+\t*pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen);\n" + "+\tif (*pampd == NULL) {\n" + "+\t\tret = -ENOMEM;\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\tret = 0;\n" + "+\tcount = atomic_inc_return(&zcache_curr_pers_pampd_count);\n" + "+\tif (count > zcache_curr_pers_pampd_count_max)\n" + 
"+\t\tzcache_curr_pers_pampd_count_max = count;\n" + "+\tif (is_local_client(cli))\n" + "+\t\tgoto out;\n" + "+\tzv = *(struct zv_hdr **)pampd;\n" + "+\tcount = atomic_inc_return(&ramster_foreign_pers_pampd_count);\n" + "+\tif (count > ramster_foreign_pers_pampd_count_max)\n" + "+\t\tramster_foreign_pers_pampd_count_max = count;\n" + " out:\n" + "+#if 0\n" + "+if (ret == -ENOMEM) {\n" + "+static unsigned long cnt, lclcnt, fgncnt;\n" + "+cnt++;\n" + "+if (is_local_client(pool->client))\n" + "+ lclcnt++;\n" + "+else\n" + "+ fgncnt++;\n" + "+if (!(cnt&(cnt-1)))\n" + "+pr_err(\"TESTING zcache_pampd_create_PERS ENOMEM cnt=%lu, local=%lu, foreign=%lu, tot pampd_count=%lu, remote=%lu\\n\", cnt, lclcnt, fgncnt, (long)atomic_read(&zcache_curr_pers_pampd_count), (long)atomic_read(&ramster_remote_pers_pages));\n" + "+} else if (ret < 0) {\n" + "+static unsigned long cnt, lclcnt, fgncnt;\n" + "+cnt++;\n" + "+if (is_local_client(pool->client))\n" + "+ lclcnt++;\n" + "+else\n" + "+ fgncnt++;\n" + "+if (!(cnt&(cnt-1)))\n" + "+pr_err(\"TESTING zcache_pampd_create_PERS POLICYFAIL cnt=%lu, local=%lu, foreign=%lu, tot pampd_count=%lu, remote=%lu\\n\", cnt, lclcnt, fgncnt, (long)atomic_read(&zcache_curr_pers_pampd_count), (long)atomic_read(&ramster_remote_pers_pages));\n" + "+}\n" + "+#endif\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,\n" + "+\t\t\t\tstruct tmem_pool *pool, struct tmem_oid *oid,\n" + "+\t\t\t\tuint32_t index)\n" + "+{\n" + "+\tvoid *pampd = NULL;\n" + "+\tint ret;\n" + "+\tbool ephemeral;\n" + "+\n" + "+\tBUG_ON(preemptible());\n" + "+\tephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool));\n" + "+\tif (ephemeral)\n" + "+\t\tret = zcache_pampd_eph_create(data, size, raw, pool,\n" + "+\t\t\t\t\t\toid, index, &pampd);\n" + "+\telse\n" + "+\t\tret = zcache_pampd_pers_create(data, size, raw, pool,\n" + "+\t\t\t\t\t\toid, index, &pampd);\n" + "+\t/* FIXME add some counters here for failed 
creates? */\n" + " \treturn pampd;\n" + " }\n" + " \n" + "@@ -1226,75 +1884,368 @@ static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,\n" + " {\n" + " \tint ret = 0;\n" + " \n" + "-\tBUG_ON(is_ephemeral(pool));\n" + "-\tzv_decompress((struct page *)(data), pampd);\n" + "+\tBUG_ON(preemptible());\n" + "+\tBUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */\n" + "+\tBUG_ON(pampd_is_remote(pampd));\n" + "+\tif (raw)\n" + "+\t\tzv_copy_from_pampd(data, bufsize, pampd);\n" + "+\telse\n" + "+\t\tzv_decompress(virt_to_page(data), pampd);\n" + " \treturn ret;\n" + " }\n" + " \n" + "-/*\n" + "- * fill the pageframe corresponding to the struct page with the data\n" + "- * from the passed pampd\n" + "- */\n" + " static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,\n" + " \t\t\t\t\tvoid *pampd, struct tmem_pool *pool,\n" + " \t\t\t\t\tstruct tmem_oid *oid, uint32_t index)\n" + " {\n" + " \tint ret = 0;\n" + "+\tunsigned long flags;\n" + "+\tstruct zcache_client *cli = pool->client;\n" + " \n" + "-\tBUG_ON(!is_ephemeral(pool));\n" + "-\tzbud_decompress((struct page *)(data), pampd);\n" + "-\tzbud_free_and_delist((struct zbud_hdr *)pampd);\n" + "-\tatomic_dec(&zcache_curr_eph_pampd_count);\n" + "+\tBUG_ON(preemptible());\n" + "+\tBUG_ON(pampd_is_remote(pampd));\n" + "+\tif (is_ephemeral(pool)) {\n" + "+\t\tlocal_irq_save(flags);\n" + "+\t\tif (raw)\n" + "+\t\t\tzbud_copy_from_pampd(data, bufsize, pampd);\n" + "+\t\telse\n" + "+\t\t\tret = zbud_decompress(virt_to_page(data), pampd);\n" + "+\t\tzbud_free_and_delist((struct zbud_hdr *)pampd);\n" + "+\t\tlocal_irq_restore(flags);\n" + "+\t\tif (!is_local_client(cli)) {\n" + "+\t\t\tatomic_dec(&ramster_foreign_eph_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0);\n" + "+\t\t}\n" + "+\t\tatomic_dec(&zcache_curr_eph_pampd_count);\n" + "+\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);\n" + "+\t} else {\n" + "+\t\tif 
(is_local_client(cli))\n" + "+\t\t\tBUG();\n" + "+\t\tif (raw)\n" + "+\t\t\tzv_copy_from_pampd(data, bufsize, pampd);\n" + "+\t\telse\n" + "+\t\t\tzv_decompress(virt_to_page(data), pampd);\n" + "+\t\tzv_free(cli->xvpool, pampd);\n" + "+\t\tif (!is_local_client(cli)) {\n" + "+\t\t\tatomic_dec(&ramster_foreign_pers_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0);\n" + "+\t\t}\n" + "+\t\tatomic_dec(&zcache_curr_pers_pampd_count);\n" + "+\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);\n" + "+\t\tret = 0;\n" + "+\t}\n" + " \treturn ret;\n" + " }\n" + " \n" + "+static bool zcache_pampd_is_remote(void *pampd)\n" + "+{\n" + "+\treturn pampd_is_remote(pampd);\n" + "+}\n" + "+\n" + " /*\n" + " * free the pampd and remove it from any zcache lists\n" + " * pampd must no longer be pointed to from any tmem data structures!\n" + " */\n" + " static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,\n" + "-\t\t\t\tstruct tmem_oid *oid, uint32_t index)\n" + "+\t\t\t struct tmem_oid *oid, uint32_t index, bool acct)\n" + " {\n" + " \tstruct zcache_client *cli = pool->client;\n" + "-\n" + "-\tif (is_ephemeral(pool)) {\n" + "+\tbool eph = is_ephemeral(pool);\n" + "+\tstruct zv_hdr *zv;\n" + "+\n" + "+\tBUG_ON(preemptible());\n" + "+\tif (pampd_is_remote(pampd)) {\n" + "+\t\tWARN_ON(acct == false);\n" + "+\t\tif (oid == NULL) {\n" + "+\t\t\t/*\n" + "+\t\t\t * a NULL oid means to ignore this pampd free\n" + "+\t\t\t * as the remote freeing will be handled elsewhere\n" + "+\t\t\t */\n" + "+\t\t} else if (eph) {\n" + "+\t\t\t/* FIXME remote flush optional but probably good idea */\n" + "+\t\t\t/* FIXME get these working properly again */\n" + "+\t\t\tatomic_dec(&zcache_curr_eph_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);\n" + "+\t\t} else if (pampd_is_intransit(pampd)) {\n" + "+\t\t\t/* did a pers remote get_and_free, so just free local */\n" + "+\t\t\tpampd = 
pampd_mask_intransit_and_remote(pampd);\n" + "+\t\t\tgoto local_pers;\n" + "+\t\t} else {\n" + "+\t\t\tstruct flushlist_node *flnode =\n" + "+\t\t\t\tramster_flnode_alloc(pool);\n" + "+\n" + "+\t\t\tflnode->xh.client_id = pampd_remote_node(pampd);\n" + "+\t\t\tflnode->xh.pool_id = pool->pool_id;\n" + "+\t\t\tflnode->xh.oid = *oid;\n" + "+\t\t\tflnode->xh.index = index;\n" + "+\t\t\tflnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;\n" + "+\t\t\tspin_lock(&zcache_rem_op_list_lock);\n" + "+\t\t\tlist_add(&flnode->rem_op.list, &zcache_rem_op_list);\n" + "+\t\t\tspin_unlock(&zcache_rem_op_list_lock);\n" + "+\t\t\tatomic_dec(&zcache_curr_pers_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);\n" + "+\t\t\tatomic_dec(&ramster_remote_pers_pages);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0);\n" + "+\t\t}\n" + "+\t} else if (eph) {\n" + " \t\tzbud_free_and_delist((struct zbud_hdr *)pampd);\n" + "-\t\tatomic_dec(&zcache_curr_eph_pampd_count);\n" + "-\t\tBUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);\n" + "+\t\tif (!is_local_client(pool->client)) {\n" + "+\t\t\tatomic_dec(&ramster_foreign_eph_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&ramster_foreign_eph_pampd_count) < 0);\n" + "+\t\t}\n" + "+\t\tif (acct)\n" + "+\t\t\tatomic_dec(&zcache_curr_eph_pampd_count);\n" + "+\t\t\t/* FIXME get these working properly again */\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_eph_pampd_count) < 0);\n" + " \t} else {\n" + "-\t\tzv_free(cli->xvpool, (struct zv_hdr *)pampd);\n" + "-\t\tatomic_dec(&zcache_curr_pers_pampd_count);\n" + "-\t\tBUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);\n" + "+local_pers:\n" + "+\t\tzv = (struct zv_hdr *)pampd;\n" + "+\t\tif (!is_local_client(pool->client)) {\n" + "+\t\t\tatomic_dec(&ramster_foreign_pers_pampd_count);\n" + "+\t\t\tWARN_ON_ONCE(atomic_read(&ramster_foreign_pers_pampd_count) < 0);\n" + "+\t\t}\n" + "+\t\tzv_free(cli->xvpool, zv);\n" + "+\t\tif (acct)\n" 
+ "+\t\t\tatomic_dec(&zcache_curr_pers_pampd_count);\n" + "+\t\t/* FIXME get these working properly again */\n" + "+\t\tWARN_ON_ONCE(atomic_read(&zcache_curr_pers_pampd_count) < 0);\n" + " \t}\n" + " }\n" + " \n" + "-static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)\n" + "+static void zcache_pampd_free_obj(struct tmem_pool *pool,\n" + "+\t\t\t\t\tstruct tmem_obj *obj)\n" + " {\n" + "+\tstruct flushlist_node *flnode;\n" + "+\n" + "+\tBUG_ON(preemptible());\n" + "+\tif (obj->extra == NULL)\n" + "+\t\treturn;\n" + "+\tBUG_ON(!pampd_is_remote(obj->extra));\n" + "+\tflnode = ramster_flnode_alloc(pool);\n" + "+\tflnode->xh.client_id = pampd_remote_node(obj->extra);\n" + "+\tflnode->xh.pool_id = pool->pool_id;\n" + "+\tflnode->xh.oid = obj->oid;\n" + "+\tflnode->xh.index = FLUSH_ENTIRE_OBJECT;\n" + "+\tflnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;\n" + "+\tspin_lock(&zcache_rem_op_list_lock);\n" + "+\tlist_add(&flnode->rem_op.list, &zcache_rem_op_list);\n" + "+\tspin_unlock(&zcache_rem_op_list_lock);\n" + " }\n" + " \n" + "-static void zcache_pampd_new_obj(struct tmem_obj *obj)\n" + "+void zcache_pampd_new_obj(struct tmem_obj *obj)\n" + " {\n" + "+\tobj->extra = NULL;\n" + " }\n" + " \n" + "-static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)\n" + "+int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)\n" + " {\n" + "-\treturn -1;\n" + "+\tint ret = -1;\n" + "+\n" + "+\tif (new_pampd != NULL) {\n" + "+\t\tif (obj->extra == NULL)\n" + "+\t\t\tobj->extra = new_pampd;\n" + "+\t\t/* enforce that all remote pages in an object reside\n" + "+\t\t * in the same node! 
*/\n" + "+\t\telse if (pampd_remote_node(new_pampd) !=\n" + "+\t\t\t\tpampd_remote_node((void *)(obj->extra)))\n" + "+\t\t\tBUG();\n" + "+\t\tret = 0;\n" + "+\t}\n" + "+\treturn ret;\n" + " }\n" + " \n" + "-static bool zcache_pampd_is_remote(void *pampd)\n" + "+/*\n" + "+ * Called by the message handler after a (still compressed) page has been\n" + "+ * fetched from the remote machine in response to an \"is_remote\" tmem_get\n" + "+ * or persistent tmem_localify. For a tmem_get, \"extra\" is the address of\n" + "+ * the page that is to be filled to succesfully resolve the tmem_get; for\n" + "+ * a (persistent) tmem_localify, \"extra\" is NULL (as the data is placed only\n" + "+ * in the local zcache). \"data\" points to \"size\" bytes of (compressed) data\n" + "+ * passed in the message. In the case of a persistent remote get, if\n" + "+ * pre-allocation was successful (see zcache_repatriate_preload), the page\n" + "+ * is placed into both local zcache and at \"extra\".\n" + "+ */\n" + "+int zcache_localify(int pool_id, struct tmem_oid *oidp,\n" + "+\t\t\tuint32_t index, char *data, size_t size,\n" + "+\t\t\tvoid *extra)\n" + " {\n" + "-\treturn 0;\n" + "+\tint ret = -ENOENT;\n" + "+\tunsigned long flags;\n" + "+\tstruct tmem_pool *pool;\n" + "+\tbool ephemeral, delete = false;\n" + "+\tsize_t clen = PAGE_SIZE;\n" + "+\tvoid *pampd, *saved_hb;\n" + "+\tstruct tmem_obj *obj;\n" + "+\n" + "+\tpool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);\n" + "+\tif (unlikely(pool == NULL))\n" + "+\t\t/* pool doesn't exist anymore */\n" + "+\t\tgoto out;\n" + "+\tephemeral = is_ephemeral(pool);\n" + "+\tlocal_irq_save(flags); /* FIXME: maybe only disable softirqs? */\n" + "+\tpampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);\n" + "+\tif (pampd == NULL) {\n" + "+\t\t/* hmmm... 
must have been a flush while waiting */\n" + "+#if 1\n" + "+\t\tpr_err(\"UNTESTED pampd==NULL in zcache_localify\\n\");\n" + "+#endif\n" + "+\t\tif (ephemeral)\n" + "+\t\t\tramster_remote_eph_pages_unsucc_get++;\n" + "+\t\telse\n" + "+\t\t\tramster_remote_pers_pages_unsucc_get++;\n" + "+\t\tobj = NULL;\n" + "+\t\tgoto finish;\n" + "+\t} else if (unlikely(!pampd_is_remote(pampd))) {\n" + "+\t\t/* hmmm... must have been a dup put while waiting */\n" + "+#if 1\n" + "+\t\tpr_err(\"UNTESTED dup while waiting in zcache_localify\\n\");\n" + "+#endif\n" + "+\t\tif (ephemeral)\n" + "+\t\t\tramster_remote_eph_pages_unsucc_get++;\n" + "+\t\telse\n" + "+\t\t\tramster_remote_pers_pages_unsucc_get++;\n" + "+\t\tobj = NULL;\n" + "+\t\tpampd = NULL;\n" + "+\t\tret = -EEXIST;\n" + "+\t\tgoto finish;\n" + "+\t} else if (size == 0) {\n" + "+\t\t/* no remote data, delete the local is_remote pampd */\n" + "+\t\tpampd = NULL;\n" + "+\t\tif (ephemeral)\n" + "+\t\t\tramster_remote_eph_pages_unsucc_get++;\n" + "+\t\telse\n" + "+\t\t\tBUG();\n" + "+\t\tdelete = true;\n" + "+\t\tgoto finish;\n" + "+\t}\n" + "+\tif (!ephemeral && pampd_is_intransit(pampd)) {\n" + "+\t\t/* localify to zcache */\n" + "+\t\tpampd = pampd_mask_intransit_and_remote(pampd);\n" + "+\t\tzv_copy_to_pampd(pampd, data, size);\n" + "+\t} else {\n" + "+\t\tpampd = NULL;\n" + "+\t\tobj = NULL;\n" + "+\t}\n" + "+\tif (extra != NULL) {\n" + "+\t\t/* decompress direct-to-memory to complete remotify */\n" + "+\t\tret = lzo1x_decompress_safe((char *)data, size,\n" + "+\t\t\t\t\t\t(char *)extra, &clen);\n" + "+\t\tBUG_ON(ret != LZO_E_OK);\n" + "+\t\tBUG_ON(clen != PAGE_SIZE);\n" + "+\t}\n" + "+\tif (ephemeral)\n" + "+\t\tramster_remote_eph_pages_succ_get++;\n" + "+\telse\n" + "+\t\tramster_remote_pers_pages_succ_get++;\n" + "+\tret = 0;\n" + "+finish:\n" + "+\ttmem_localify_finish(obj, index, pampd, saved_hb, delete);\n" + "+\tzcache_put_pool(pool);\n" + "+\tlocal_irq_restore(flags);\n" + "+out:\n" + "+\treturn ret;\n" + "+}\n" 
+ "+\n" + "+/*\n" + "+ * Called on a remote persistent tmem_get to attempt to preallocate\n" + "+ * local storage for the data contained in the remote persistent page.\n" + "+ * If succesfully preallocated, returns the pampd, marked as remote and\n" + "+ * in_transit. Else returns NULL. Note that the appropriate tmem data\n" + "+ * structure must be locked.\n" + "+ */\n" + "+static void *zcache_pampd_repatriate_preload(void *pampd,\n" + "+\t\t\t\t\t\tstruct tmem_pool *pool,\n" + "+\t\t\t\t\t\tstruct tmem_oid *oid,\n" + "+\t\t\t\t\t\tuint32_t index,\n" + "+\t\t\t\t\t\tbool *intransit)\n" + "+{\n" + "+\tint clen = pampd_remote_size(pampd);\n" + "+\tvoid *ret_pampd = NULL;\n" + "+\tunsigned long flags;\n" + "+\n" + "+\tif (!pampd_is_remote(pampd))\n" + "+\t\tBUG();\n" + "+\tif (is_ephemeral(pool))\n" + "+\t\tBUG();\n" + "+\tif (pampd_is_intransit(pampd)) {\n" + "+\t\t/*\n" + "+\t\t * to avoid multiple allocations (and maybe a memory leak)\n" + "+\t\t * don't preallocate if already in the process of being\n" + "+\t\t * repatriated\n" + "+\t\t */\n" + "+\t\t*intransit = true;\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+#if 0\n" + "+{\n" + "+static unsigned long cnt;\n" + "+cnt++;\n" + "+if (!(cnt&(cnt-1)))\n" + "+pr_err(\"TESTING zcache_pampd_repat_preload, size=%d, cksum=??, cnt=%lu\\n\",\n" + "+clen, cnt);\n" + "+}\n" + "+#endif\n" + "+\t*intransit = false;\n" + "+\tlocal_irq_save(flags);\n" + "+\tret_pampd = (void *)zv_alloc(pool, oid, index, clen);\n" + "+\tif (ret_pampd != NULL) {\n" + "+\t\t/*\n" + "+\t\t * a pampd is marked intransit if it is remote and space has\n" + "+\t\t * been allocated for it locally (note, only happens for\n" + "+\t\t * persistent pages, in which case the remote copy is freed)\n" + "+\t\t */\n" + "+\t\tret_pampd = pampd_mark_intransit(ret_pampd);\n" + "+\t\tatomic_dec(&ramster_remote_pers_pages);\n" + "+\t\tWARN_ON_ONCE(atomic_read(&ramster_remote_pers_pages) < 0);\n" + "+\t} else\n" + "+\t\tramster_pers_pages_remote_nomem++;\n" + 
"+\tlocal_irq_restore(flags);\n" + "+out:\n" + "+\treturn ret_pampd;\n" + "+}\n" + "+\n" + "+/*\n" + "+ * Called on a remote tmem_get to invoke a message to fetch the page.\n" + "+ * Might sleep so no tmem locks can be held. \"extra\" is passed\n" + "+ * all the way through the round-trip messaging to zcache_localify.\n" + "+ */\n" + "+static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd,\n" + "+\t\t\t\t struct tmem_pool *pool,\n" + "+\t\t\t\t struct tmem_oid *oid, uint32_t index,\n" + "+\t\t\t\t bool free, void *extra)\n" + "+{\n" + "+\tstruct tmem_xhandle xh;\n" + "+\tint ret;\n" + "+\n" + "+\tif (pampd_is_intransit(real_pampd))\n" + "+\t\t/* have local space pre-reserved, so free remote copy */\n" + "+\t\tfree = true;\n" + "+\txh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);\n" + "+\t/* unreliable request/response for now */\n" + "+\tret = ramster_remote_async_get(&xh, free,\n" + "+\t\t\t\t\tpampd_remote_node(fake_pampd),\n" + "+\t\t\t\t\tpampd_remote_size(fake_pampd),\n" + "+\t\t\t\t\tpampd_remote_cksum(fake_pampd),\n" + "+\t\t\t\t\textra);\n" + "+#if 1\n" + "+\tif (ret != 0 && ret != -ENOENT)\n" + "+\t\tpr_err(\"TESTING zcache_pampd_repatriate returns, ret=%d\\n\",\n" + "+\t\t\tret);\n" + "+#endif\n" + "+\treturn ret;\n" + " }\n" + " \n" + " static struct tmem_pamops zcache_pamops = {\n" + " \t.create = zcache_pampd_create,\n" + " \t.get_data = zcache_pampd_get_data,\n" + "-\t.get_data_and_free = zcache_pampd_get_data_and_free,\n" + " \t.free = zcache_pampd_free,\n" + "+\t.get_data_and_free = zcache_pampd_get_data_and_free,\n" + " \t.free_obj = zcache_pampd_free_obj,\n" + "+\t.is_remote = zcache_pampd_is_remote,\n" + "+\t.repatriate_preload = zcache_pampd_repatriate_preload,\n" + "+\t.repatriate = zcache_pampd_repatriate,\n" + " \t.new_obj = zcache_pampd_new_obj,\n" + " \t.replace_in_obj = zcache_pampd_replace_in_obj,\n" + "-\t.is_remote = zcache_pampd_is_remote,\n" + " };\n" + " \n" + " /*\n" + "@@ -1342,9 +2293,13 @@ static int 
zcache_cpu_notifier(struct notifier_block *nb,\n" + " \t\tper_cpu(zcache_workmem, cpu) =\n" + " \t\t\tkzalloc(LZO1X_MEM_COMPRESS,\n" + " \t\t\t\tGFP_KERNEL | __GFP_REPEAT);\n" + "+\t\tper_cpu(zcache_remoteputmem, cpu) =\n" + "+\t\t\tkzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);\n" + " \t\tbreak;\n" + " \tcase CPU_DEAD:\n" + " \tcase CPU_UP_CANCELED:\n" + "+\t\tkfree(per_cpu(zcache_remoteputmem, cpu));\n" + "+\t\tper_cpu(zcache_remoteputmem, cpu) = NULL;\n" + " \t\tfree_pages((unsigned long)per_cpu(zcache_dstmem, cpu),\n" + " \t\t\t\tLZO_DSTMEM_PAGE_ORDER);\n" + " \t\tper_cpu(zcache_dstmem, cpu) = NULL;\n" + "@@ -1427,6 +2382,7 @@ ZCACHE_SYSFS_RO(aborted_preload);\n" + " ZCACHE_SYSFS_RO(aborted_shrink);\n" + " ZCACHE_SYSFS_RO(compress_poor);\n" + " ZCACHE_SYSFS_RO(mean_compress_poor);\n" + "+ZCACHE_SYSFS_RO(policy_percent_exceeded);\n" + " ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);\n" + " ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);\n" + " ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);\n" + "@@ -1451,6 +2407,7 @@ static struct attribute *zcache_attrs[] = {\n" + " \t&zcache_flobj_found_attr.attr,\n" + " \t&zcache_failed_eph_puts_attr.attr,\n" + " \t&zcache_failed_pers_puts_attr.attr,\n" + "+\t&zcache_policy_percent_exceeded_attr.attr,\n" + " \t&zcache_compress_poor_attr.attr,\n" + " \t&zcache_mean_compress_poor_attr.attr,\n" + " \t&zcache_zbud_curr_raw_pages_attr.attr,\n" + "@@ -1483,6 +2440,151 @@ static struct attribute_group zcache_attr_group = {\n" + " \t.name = \"zcache\",\n" + " };\n" + " \n" + "+#define RAMSTER_SYSFS_RO(_name) \\\n" + "+\tstatic ssize_t ramster_##_name##_show(struct kobject *kobj, \\\n" + "+\t\t\t\tstruct kobj_attribute *attr, char *buf) \\\n" + "+\t{ \\\n" + "+\t\treturn sprintf(buf, \"%lu\\n\", ramster_##_name); \\\n" + "+\t} \\\n" + "+\tstatic struct kobj_attribute ramster_##_name##_attr = { \\\n" + "+\t\t.attr = { .name = __stringify(_name), .mode = 0444 }, \\\n" + "+\t\t.show = ramster_##_name##_show, \\\n" + "+\t}\n" + "+\n" + "+#define 
RAMSTER_SYSFS_RW(_name) \\\n" + "+\tstatic ssize_t ramster_##_name##_show(struct kobject *kobj, \\\n" + "+\t\t\t\tstruct kobj_attribute *attr, char *buf) \\\n" + "+\t{ \\\n" + "+\t\treturn sprintf(buf, \"%lu\\n\", ramster_##_name); \\\n" + "+\t} \\\n" + "+\tstatic ssize_t ramster_##_name##_store(struct kobject *kobj, \\\n" + "+\t\tstruct kobj_attribute *attr, const char *buf, size_t count) \\\n" + "+\t{ \\\n" + "+\t\tint err; \\\n" + "+\t\tunsigned long enable; \\\n" + "+\t\terr = strict_strtoul(buf, 10, &enable); \\\n" + "+\t\tif (err) \\\n" + "+\t\t\treturn -EINVAL; \\\n" + "+\t\tramster_##_name = enable; \\\n" + "+\t\treturn count; \\\n" + "+\t} \\\n" + "+\tstatic struct kobj_attribute ramster_##_name##_attr = { \\\n" + "+\t\t.attr = { .name = __stringify(_name), .mode = 0644 }, \\\n" + "+\t\t.show = ramster_##_name##_show, \\\n" + "+\t\t.store = ramster_##_name##_store, \\\n" + "+\t}\n" + "+\n" + "+#define RAMSTER_SYSFS_RO_ATOMIC(_name) \\\n" + "+\tstatic ssize_t ramster_##_name##_show(struct kobject *kobj, \\\n" + "+\t\t\t\tstruct kobj_attribute *attr, char *buf) \\\n" + "+\t{ \\\n" + "+\t return sprintf(buf, \"%d\\n\", atomic_read(&ramster_##_name)); \\\n" + "+\t} \\\n" + "+\tstatic struct kobj_attribute ramster_##_name##_attr = { \\\n" + "+\t\t.attr = { .name = __stringify(_name), .mode = 0444 }, \\\n" + "+\t\t.show = ramster_##_name##_show, \\\n" + "+\t}\n" + "+\n" + "+RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);\n" + "+RAMSTER_SYSFS_RW(pers_remotify_enable);\n" + "+RAMSTER_SYSFS_RW(eph_remotify_enable);\n" + "+RAMSTER_SYSFS_RO(eph_pages_remoted);\n" + "+RAMSTER_SYSFS_RO(eph_pages_remote_failed);\n" + "+RAMSTER_SYSFS_RO(pers_pages_remoted);\n" + "+RAMSTER_SYSFS_RO(pers_pages_remote_failed);\n" + "+RAMSTER_SYSFS_RO(pers_pages_remote_nomem);\n" + "+RAMSTER_SYSFS_RO(remote_pages_flushed);\n" + "+RAMSTER_SYSFS_RO(remote_page_flushes_failed);\n" + "+RAMSTER_SYSFS_RO(remote_objects_flushed);\n" + "+RAMSTER_SYSFS_RO(remote_object_flushes_failed);\n" + 
"+RAMSTER_SYSFS_RO(remote_eph_pages_succ_get);\n" + "+RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get);\n" + "+RAMSTER_SYSFS_RO(remote_pers_pages_succ_get);\n" + "+RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get);\n" + "+RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count);\n" + "+RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max);\n" + "+RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count);\n" + "+RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max);\n" + "+RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count);\n" + "+RAMSTER_SYSFS_RO(curr_flnode_count_max);\n" + "+\n" + "+#define MANUAL_NODES 8\n" + "+static bool ramster_nodes_manual_up[MANUAL_NODES];\n" + "+static ssize_t ramster_manual_node_up_show(struct kobject *kobj,\n" + "+\t\t\t\tstruct kobj_attribute *attr, char *buf)\n" + "+{\n" + "+\tint i;\n" + "+\tchar *p = buf;\n" + "+\tfor (i = 0; i < MANUAL_NODES; i++)\n" + "+\t\tif (ramster_nodes_manual_up[i])\n" + "+\t\t\tp += sprintf(p, \"%d \", i);\n" + "+\tp += sprintf(p, \"\\n\");\n" + "+\treturn p - buf;\n" + "+}\n" + "+\n" + "+static ssize_t ramster_manual_node_up_store(struct kobject *kobj,\n" + "+\t\tstruct kobj_attribute *attr, const char *buf, size_t count)\n" + "+{\n" + "+\tint err;\n" + "+\tunsigned long node_num;\n" + "+\textern void o2net_hb_node_up_manual(int);\n" + "+\n" + "+\terr = strict_strtoul(buf, 10, &node_num);\n" + "+\tif (err) {\n" + "+\t\tpr_err(\"bad strtoul?\\n\");\n" + "+\t\treturn -EINVAL;\n" + "+\t}\n" + "+\tif (node_num >= MANUAL_NODES) {\n" + "+\t\tpr_err(\"bad node_num=%lu?\\n\", node_num);\n" + "+\t\treturn -EINVAL;\n" + "+\t}\n" + "+\tif (ramster_nodes_manual_up[node_num]) {\n" + "+\t\tpr_err(\"node %d already up, ignoring\\n\", (int)node_num);\n" + "+\t} else {\n" + "+\t\tramster_nodes_manual_up[node_num] = true;\n" + "+\t\to2net_hb_node_up_manual((int)node_num);\n" + "+\t}\n" + "+\treturn count;\n" + "+}\n" + "+\n" + "+static struct kobj_attribute ramster_manual_node_up_attr = {\n" + "+\t.attr = { .name = \"manual_node_up\", .mode = 0644 },\n" + "+\t.show = 
ramster_manual_node_up_show,\n" + "+\t.store = ramster_manual_node_up_store,\n" + "+};\n" + "+\n" + "+static struct attribute *ramster_attrs[] = {\n" + "+\t&ramster_pers_remotify_enable_attr.attr,\n" + "+\t&ramster_eph_remotify_enable_attr.attr,\n" + "+\t&ramster_remote_pers_pages_attr.attr,\n" + "+\t&ramster_eph_pages_remoted_attr.attr,\n" + "+\t&ramster_eph_pages_remote_failed_attr.attr,\n" + "+\t&ramster_pers_pages_remoted_attr.attr,\n" + "+\t&ramster_pers_pages_remote_failed_attr.attr,\n" + "+\t&ramster_pers_pages_remote_nomem_attr.attr,\n" + "+\t&ramster_remote_pages_flushed_attr.attr,\n" + "+\t&ramster_remote_page_flushes_failed_attr.attr,\n" + "+\t&ramster_remote_objects_flushed_attr.attr,\n" + "+\t&ramster_remote_object_flushes_failed_attr.attr,\n" + "+\t&ramster_remote_eph_pages_succ_get_attr.attr,\n" + "+\t&ramster_remote_eph_pages_unsucc_get_attr.attr,\n" + "+\t&ramster_remote_pers_pages_succ_get_attr.attr,\n" + "+\t&ramster_remote_pers_pages_unsucc_get_attr.attr,\n" + "+\t&ramster_foreign_eph_pampd_count_attr.attr,\n" + "+\t&ramster_foreign_eph_pampd_count_max_attr.attr,\n" + "+\t&ramster_foreign_pers_pampd_count_attr.attr,\n" + "+\t&ramster_foreign_pers_pampd_count_max_attr.attr,\n" + "+\t&ramster_curr_flnode_count_attr.attr,\n" + "+\t&ramster_curr_flnode_count_max_attr.attr,\n" + "+\t&ramster_manual_node_up_attr.attr,\n" + "+\tNULL,\n" + "+};\n" + "+\n" + "+static struct attribute_group ramster_attr_group = {\n" + "+\t.attrs = ramster_attrs,\n" + "+\t.name = \"ramster\",\n" + "+};\n" + "+\n" + " #endif /* CONFIG_SYSFS */\n" + " /*\n" + " * When zcache is disabled (\"frozen\"), pools can be created and destroyed,\n" + "@@ -1527,8 +2629,9 @@ static struct shrinker zcache_shrinker = {\n" + " * zcache shims between cleancache/frontswap ops and tmem\n" + " */\n" + " \n" + "-static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,\n" + "-\t\t\t\tuint32_t index, struct page *page)\n" + "+int zcache_put(int cli_id, int pool_id, struct 
tmem_oid *oidp,\n" + "+\t\t\tuint32_t index, char *data, size_t size,\n" + "+\t\t\tbool raw, int ephemeral)\n" + " {\n" + " \tstruct tmem_pool *pool;\n" + " \tint ret = -1;\n" + "@@ -1539,8 +2642,7 @@ static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,\n" + " \t\tgoto out;\n" + " \tif (!zcache_freeze && zcache_do_preload(pool) == 0) {\n" + " \t\t/* preload does preempt_disable on success */\n" + "-\t\tret = tmem_put(pool, oidp, index, (char *)(page),\n" + "-\t\t\t\tPAGE_SIZE, 0, is_ephemeral(pool));\n" + "+\t\tret = tmem_put(pool, oidp, index, data, size, raw, ephemeral);\n" + " \t\tif (ret < 0) {\n" + " \t\t\tif (is_ephemeral(pool))\n" + " \t\t\t\tzcache_failed_eph_puts++;\n" + "@@ -1560,27 +2662,40 @@ out:\n" + " \treturn ret;\n" + " }\n" + " \n" + "-static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,\n" + "-\t\t\t\tuint32_t index, struct page *page)\n" + "+int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp,\n" + "+\t\t\tuint32_t index, char *data, size_t *sizep,\n" + "+\t\t\tbool raw, int get_and_free)\n" + " {\n" + " \tstruct tmem_pool *pool;\n" + " \tint ret = -1;\n" + "-\tunsigned long flags;\n" + "-\tsize_t size = PAGE_SIZE;\n" + "+\tbool eph;\n" + " \n" + "-\tlocal_irq_save(flags);\n" + "+\tif (!raw) {\n" + "+\t\tBUG_ON(irqs_disabled());\n" + "+\t\tBUG_ON(in_softirq());\n" + "+\t}\n" + " \tpool = zcache_get_pool_by_id(cli_id, pool_id);\n" + "+\teph = is_ephemeral(pool);\n" + " \tif (likely(pool != NULL)) {\n" + " \t\tif (atomic_read(&pool->obj_count) > 0)\n" + "-\t\t\tret = tmem_get(pool, oidp, index, (char *)(page),\n" + "-\t\t\t\t\t&size, 0, is_ephemeral(pool));\n" + "+\t\t\tret = tmem_get(pool, oidp, index, data, sizep,\n" + "+\t\t\t\t\traw, get_and_free);\n" + " \t\tzcache_put_pool(pool);\n" + " \t}\n" + "-\tlocal_irq_restore(flags);\n" + "+\tWARN_ONCE((!eph && (ret != 0)), \"zcache_get fails on persistent pool, \"\n" + "+\t\t\t \"bad things are very likely to happen soon\\n\");\n" + "+#if 1\n" + 
"+{\n" + "+if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))\n" + "+pr_err(\"TESTING zcache_get tmem_get returns ret=%d\\n\", ret);\n" + "+}\n" + "+#endif\n" + "+\tif (ret == -EAGAIN)\n" + "+\t\tBUG(); /* FIXME... don't need this anymore??? let's ensure */\n" + " \treturn ret;\n" + " }\n" + " \n" + "-static int zcache_flush_page(int cli_id, int pool_id,\n" + "+int zcache_flush(int cli_id, int pool_id,\n" + " \t\t\t\tstruct tmem_oid *oidp, uint32_t index)\n" + " {\n" + " \tstruct tmem_pool *pool;\n" + "@@ -1590,6 +2705,7 @@ static int zcache_flush_page(int cli_id, int pool_id,\n" + " \tlocal_irq_save(flags);\n" + " \tzcache_flush_total++;\n" + " \tpool = zcache_get_pool_by_id(cli_id, pool_id);\n" + "+\tramster_do_preload_flnode_only(pool);\n" + " \tif (likely(pool != NULL)) {\n" + " \t\tif (atomic_read(&pool->obj_count) > 0)\n" + " \t\t\tret = tmem_flush_page(pool, oidp, index);\n" + "@@ -1601,8 +2717,7 @@ static int zcache_flush_page(int cli_id, int pool_id,\n" + " \treturn ret;\n" + " }\n" + " \n" + "-static int zcache_flush_object(int cli_id, int pool_id,\n" + "-\t\t\t\tstruct tmem_oid *oidp)\n" + "+int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp)\n" + " {\n" + " \tstruct tmem_pool *pool;\n" + " \tint ret = -1;\n" + "@@ -1611,6 +2726,7 @@ static int zcache_flush_object(int cli_id, int pool_id,\n" + " \tlocal_irq_save(flags);\n" + " \tzcache_flobj_total++;\n" + " \tpool = zcache_get_pool_by_id(cli_id, pool_id);\n" + "+\tramster_do_preload_flnode_only(pool);\n" + " \tif (likely(pool != NULL)) {\n" + " \t\tif (atomic_read(&pool->obj_count) > 0)\n" + " \t\t\tret = tmem_flush_object(pool, oidp);\n" + "@@ -1622,7 +2738,7 @@ static int zcache_flush_object(int cli_id, int pool_id,\n" + " \treturn ret;\n" + " }\n" + " \n" + "-static int zcache_destroy_pool(int cli_id, int pool_id)\n" + "+int zcache_client_destroy_pool(int cli_id, int pool_id)\n" + " {\n" + " \tstruct tmem_pool *pool = NULL;\n" + " \tstruct zcache_client *cli = 
NULL;\n" + "@@ -1649,13 +2765,17 @@ static int zcache_destroy_pool(int cli_id, int pool_id)\n" + " \tret = tmem_destroy_pool(pool);\n" + " \tlocal_bh_enable();\n" + " \tkfree(pool);\n" + "-\tpr_info(\"zcache: destroyed pool id=%d, cli_id=%d\\n\",\n" + "-\t\t\tpool_id, cli_id);\n" + "+\tpr_info(\"ramster: destroyed pool id=%d cli_id=%d\\n\", pool_id, cli_id);\n" + " out:\n" + " \treturn ret;\n" + " }\n" + " \n" + "-static int zcache_new_pool(uint16_t cli_id, uint32_t flags)\n" + "+static int zcache_destroy_pool(int pool_id)\n" + "+{\n" + "+\treturn zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);\n" + "+}\n" + "+\n" + "+int zcache_new_pool(uint16_t cli_id, uint32_t flags)\n" + " {\n" + " \tint poolid = -1;\n" + " \tstruct tmem_pool *pool;\n" + "@@ -1670,7 +2790,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags)\n" + " \tatomic_inc(&cli->refcount);\n" + " \tpool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);\n" + " \tif (pool == NULL) {\n" + "-\t\tpr_info(\"zcache: pool creation failed: out of memory\\n\");\n" + "+\t\tpr_info(\"ramster: pool creation failed: out of memory\\n\");\n" + " \t\tgoto out;\n" + " \t}\n" + " \n" + "@@ -1678,7 +2798,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags)\n" + " \t\tif (cli->tmem_pools[poolid] == NULL)\n" + " \t\t\tbreak;\n" + " \tif (poolid >= MAX_POOLS_PER_CLIENT) {\n" + "-\t\tpr_info(\"zcache: pool creation failed: max exceeded\\n\");\n" + "+\t\tpr_info(\"ramster: pool creation failed: max exceeded\\n\");\n" + " \t\tkfree(pool);\n" + " \t\tpoolid = -1;\n" + " \t\tgoto out;\n" + "@@ -1688,7 +2808,7 @@ static int zcache_new_pool(uint16_t cli_id, uint32_t flags)\n" + " \tpool->pool_id = poolid;\n" + " \ttmem_new_pool(pool, flags);\n" + " \tcli->tmem_pools[poolid] = pool;\n" + "-\tpr_info(\"zcache: created %s tmem pool, id=%d, client=%d\\n\",\n" + "+\tpr_info(\"ramster: created %s tmem pool, id=%d, client=%d\\n\",\n" + " \t\tflags & TMEM_POOL_PERSIST ? 
\"persistent\" : \"ephemeral\",\n" + " \t\tpoolid, cli_id);\n" + " out:\n" + "@@ -1697,6 +2817,64 @@ out:\n" + " \treturn poolid;\n" + " }\n" + " \n" + "+static int zcache_local_new_pool(uint32_t flags)\n" + "+{\n" + "+\treturn zcache_new_pool(LOCAL_CLIENT, flags);\n" + "+}\n" + "+\n" + "+int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral)\n" + "+{\n" + "+\tstruct tmem_pool *pool;\n" + "+\tstruct zcache_client *cli = NULL;\n" + "+\tuint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST;\n" + "+\tint ret = -1;\n" + "+\n" + "+\tif (cli_id == LOCAL_CLIENT)\n" + "+\t\tgoto out;\n" + "+\tif (pool_id >= MAX_POOLS_PER_CLIENT)\n" + "+\t\tgoto out;\n" + "+\telse if ((unsigned int)cli_id < MAX_CLIENTS)\n" + "+\t\tcli = &zcache_clients[cli_id];\n" + "+\tif ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap))\n" + "+\t\tBUG(); /* FIXME, handle more gracefully later */\n" + "+\tif (!cli->allocated) {\n" + "+\t\tif (zcache_new_client(cli_id))\n" + "+\t\t\tBUG(); /* FIXME, handle more gracefully later */\n" + "+\t\tcli = &zcache_clients[cli_id];\n" + "+\t}\n" + "+\tatomic_inc(&cli->refcount);\n" + "+\tpool = cli->tmem_pools[pool_id];\n" + "+\tif (pool != NULL) {\n" + "+\t\tif (pool->persistent && ephemeral) {\n" + "+\t\t\tpr_err(\"zcache_autocreate_pool: type mismatch\\n\");\n" + "+\t\t\tgoto out;\n" + "+\t\t}\n" + "+\t\tret = 0;\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\tpool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);\n" + "+\tif (pool == NULL) {\n" + "+\t\tpr_info(\"ramster: pool creation failed: out of memory\\n\");\n" + "+\t\tgoto out;\n" + "+\t}\n" + "+\tatomic_set(&pool->refcount, 0);\n" + "+\tpool->client = cli;\n" + "+\tpool->pool_id = pool_id;\n" + "+\ttmem_new_pool(pool, flags);\n" + "+\tcli->tmem_pools[pool_id] = pool;\n" + "+\tpr_info(\"ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\\n\",\n" + "+\t\tflags & TMEM_POOL_PERSIST ? 
\"persistent\" : \"ephemeral\",\n" + "+\t\tpool_id, cli_id);\n" + "+\tret = 0;\n" + "+out:\n" + "+\tif (cli == NULL)\n" + "+\t\tBUG(); /* FIXME, handle more gracefully later */\n" + "+\t\t/* pr_err(\"zcache_autocreate_pool: failed\\n\"); */\n" + "+\tif (cli != NULL)\n" + "+\t\tatomic_dec(&cli->refcount);\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + " /**********\n" + " * Two kernel functionalities currently can be layered on top of tmem.\n" + " * These are \"cleancache\" which is used as a second-chance cache for clean\n" + "@@ -1713,8 +2891,12 @@ static void zcache_cleancache_put_page(int pool_id,\n" + " \tu32 ind = (u32) index;\n" + " \tstruct tmem_oid oid = *(struct tmem_oid *)&key;\n" + " \n" + "-\tif (likely(ind == index))\n" + "-\t\t(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);\n" + "+\tif (likely(ind == index)) {\n" + "+\t\tchar *kva = page_address(page);\n" + "+\n" + "+\t\t(void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,\n" + "+\t\t\tkva, PAGE_SIZE, 0, 1);\n" + "+\t}\n" + " }\n" + " \n" + " static int zcache_cleancache_get_page(int pool_id,\n" + "@@ -1725,8 +2907,15 @@ static int zcache_cleancache_get_page(int pool_id,\n" + " \tstruct tmem_oid oid = *(struct tmem_oid *)&key;\n" + " \tint ret = -1;\n" + " \n" + "-\tif (likely(ind == index))\n" + "-\t\tret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);\n" + "+\tpreempt_disable();\n" + "+\tif (likely(ind == index)) {\n" + "+\t\tchar *kva = page_address(page);\n" + "+\t\tsize_t size = PAGE_SIZE;\n" + "+\n" + "+\t\tret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index,\n" + "+\t\t\tkva, &size, 0, 0);\n" + "+\t}\n" + "+\tpreempt_enable();\n" + " \treturn ret;\n" + " }\n" + " \n" + "@@ -1738,7 +2927,7 @@ static void zcache_cleancache_flush_page(int pool_id,\n" + " \tstruct tmem_oid oid = *(struct tmem_oid *)&key;\n" + " \n" + " \tif (likely(ind == index))\n" + "-\t\t(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);\n" + "+\t\t(void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, 
ind);\n" + " }\n" + " \n" + " static void zcache_cleancache_flush_inode(int pool_id,\n" + "@@ -1752,7 +2941,7 @@ static void zcache_cleancache_flush_inode(int pool_id,\n" + " static void zcache_cleancache_flush_fs(int pool_id)\n" + " {\n" + " \tif (pool_id >= 0)\n" + "-\t\t(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);\n" + "+\t\t(void)zcache_destroy_pool(pool_id);\n" + " }\n" + " \n" + " static int zcache_cleancache_init_fs(size_t pagesize)\n" + "@@ -1760,7 +2949,7 @@ static int zcache_cleancache_init_fs(size_t pagesize)\n" + " \tBUG_ON(sizeof(struct cleancache_filekey) !=\n" + " \t\t\t\tsizeof(struct tmem_oid));\n" + " \tBUG_ON(pagesize != PAGE_SIZE);\n" + "-\treturn zcache_new_pool(LOCAL_CLIENT, 0);\n" + "+\treturn zcache_local_new_pool(0);\n" + " }\n" + " \n" + " static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)\n" + "@@ -1769,7 +2958,7 @@ static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)\n" + " \tBUG_ON(sizeof(struct cleancache_filekey) !=\n" + " \t\t\t\tsizeof(struct tmem_oid));\n" + " \tBUG_ON(pagesize != PAGE_SIZE);\n" + "-\treturn zcache_new_pool(LOCAL_CLIENT, 0);\n" + "+\treturn zcache_local_new_pool(0);\n" + " }\n" + " \n" + " static struct cleancache_ops zcache_cleancache_ops = {\n" + "@@ -1799,7 +2988,7 @@ static int zcache_frontswap_poolid = -1;\n" + " * Swizzling increases objects per swaptype, increasing tmem concurrency\n" + " * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS\n" + " */\n" + "-#define SWIZ_BITS\t\t4\n" + "+#define SWIZ_BITS\t\t8\n" + " #define SWIZ_MASK\t\t((1 << SWIZ_BITS) - 1)\n" + " #define _oswiz(_type, _ind)\t((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))\n" + " #define iswiz(_ind)\t\t(_ind >> SWIZ_BITS)\n" + "@@ -1819,12 +3008,14 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,\n" + " \tstruct tmem_oid oid = oswiz(type, ind);\n" + " \tint ret = -1;\n" + " \tunsigned long flags;\n" + "+\tchar *kva;\n" + " \n" + " \tBUG_ON(!PageLocked(page));\n" + " \tif (likely(ind64 == ind)) {\n" + " \t\tlocal_irq_save(flags);\n" + "-\t\tret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + "-\t\t\t\t\t&oid, iswiz(ind), page);\n" + "+\t\tkva = page_address(page);\n" + "+\t\tret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + "+\t\t\t\t&oid, iswiz(ind), kva, PAGE_SIZE, 0, 0);\n" + " \t\tlocal_irq_restore(flags);\n" + " \t}\n" + " \treturn ret;\n" + "@@ -1840,10 +3031,16 @@ static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,\n" + " \tstruct tmem_oid oid = oswiz(type, ind);\n" + " \tint ret = -1;\n" + " \n" + "+\tpreempt_disable(); /* FIXME, remove this? */\n" + " \tBUG_ON(!PageLocked(page));\n" + "-\tif (likely(ind64 == ind))\n" + "-\t\tret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + "-\t\t\t\t\t&oid, iswiz(ind), page);\n" + "+\tif (likely(ind64 == ind)) {\n" + "+\t\tchar *kva = page_address(page);\n" + "+\t\tsize_t size = PAGE_SIZE;\n" + "+\n" + "+\t\tret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + "+\t\t\t\t\t&oid, iswiz(ind), kva, &size, 0, -1);\n" + "+\t}\n" + "+\tpreempt_enable(); /* FIXME, remove this? 
*/\n" + " \treturn ret;\n" + " }\n" + " \n" + "@@ -1855,7 +3052,7 @@ static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)\n" + " \tstruct tmem_oid oid = oswiz(type, ind);\n" + " \n" + " \tif (likely(ind64 == ind))\n" + "-\t\t(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + "+\t\t(void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid,\n" + " \t\t\t\t\t&oid, iswiz(ind));\n" + " }\n" + " \n" + "@@ -1877,7 +3074,7 @@ static void zcache_frontswap_init(unsigned ignored)\n" + " \t/* a single tmem poolid is used for all frontswap \"types\" (swapfiles) */\n" + " \tif (zcache_frontswap_poolid < 0)\n" + " \t\tzcache_frontswap_poolid =\n" + "-\t\t\tzcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);\n" + "+\t\t\t\tzcache_local_new_pool(TMEM_POOL_PERSIST);\n" + " }\n" + " \n" + " static struct frontswap_ops zcache_frontswap_ops = {\n" + "@@ -1898,19 +3095,125 @@ struct frontswap_ops zcache_frontswap_register_ops(void)\n" + " #endif\n" + " \n" + " /*\n" + "+ * frontswap selfshrinking\n" + "+ */\n" + "+\n" + "+#ifdef CONFIG_FRONTSWAP\n" + "+/* In HZ, controls frequency of worker invocation. */\n" + "+static unsigned int selfshrink_interval __read_mostly = 5;\n" + "+\n" + "+static void selfshrink_process(struct work_struct *work);\n" + "+static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);\n" + "+\n" + "+/* Enable/disable with sysfs. */\n" + "+static bool frontswap_selfshrinking __read_mostly;\n" + "+\n" + "+/* Enable/disable with kernel boot option. */\n" + "+static bool use_frontswap_selfshrink __initdata = true;\n" + "+\n" + "+/*\n" + "+ * The default values for the following parameters were deemed reasonable\n" + "+ * by experimentation, may be workload-dependent, and can all be\n" + "+ * adjusted via sysfs.\n" + "+ */\n" + "+\n" + "+/* Control rate for frontswap shrinking. Higher hysteresis is slower. 
*/\n" + "+static unsigned int frontswap_hysteresis __read_mostly = 20;\n" + "+\n" + "+/*\n" + "+ * Number of selfshrink worker invocations to wait before observing that\n" + "+ * frontswap selfshrinking should commence. Note that selfshrinking does\n" + "+ * not use a separate worker thread.\n" + "+ */\n" + "+static unsigned int frontswap_inertia __read_mostly = 3;\n" + "+\n" + "+/* Countdown to next invocation of frontswap_shrink() */\n" + "+static unsigned long frontswap_inertia_counter;\n" + "+\n" + "+/*\n" + "+ * Invoked by the selfshrink worker thread, uses current number of pages\n" + "+ * in frontswap (frontswap_curr_pages()), previous status, and control\n" + "+ * values (hysteresis and inertia) to determine if frontswap should be\n" + "+ * shrunk and what the new frontswap size should be. Note that\n" + "+ * frontswap_shrink is essentially a partial swapoff that immediately\n" + "+ * transfers pages from the \"swap device\" (frontswap) back into kernel\n" + "+ * RAM; despite the name, frontswap \"shrinking\" is very different from\n" + "+ * the \"shrinker\" interface used by the kernel MM subsystem to reclaim\n" + "+ * memory.\n" + "+ */\n" + "+static void frontswap_selfshrink(void)\n" + "+{\n" + "+\tstatic unsigned long cur_frontswap_pages;\n" + "+\tstatic unsigned long last_frontswap_pages;\n" + "+\tstatic unsigned long tgt_frontswap_pages;\n" + "+\n" + "+\tlast_frontswap_pages = cur_frontswap_pages;\n" + "+\tcur_frontswap_pages = frontswap_curr_pages();\n" + "+\tif (!cur_frontswap_pages ||\n" + "+\t\t\t(cur_frontswap_pages > last_frontswap_pages)) {\n" + "+\t\tfrontswap_inertia_counter = frontswap_inertia;\n" + "+\t\treturn;\n" + "+\t}\n" + "+\tif (frontswap_inertia_counter && --frontswap_inertia_counter)\n" + "+\t\treturn;\n" + "+\tif (cur_frontswap_pages <= frontswap_hysteresis)\n" + "+\t\ttgt_frontswap_pages = 0;\n" + "+\telse\n" + "+\t\ttgt_frontswap_pages = cur_frontswap_pages -\n" + "+\t\t\t(cur_frontswap_pages / frontswap_hysteresis);\n" + 
"+\tfrontswap_shrink(tgt_frontswap_pages);\n" + "+}\n" + "+\n" + "+static int __init ramster_nofrontswap_selfshrink_setup(char *s)\n" + "+{\n" + "+\tuse_frontswap_selfshrink = false;\n" + "+\treturn 1;\n" + "+}\n" + "+\n" + "+__setup(\"noselfshrink\", ramster_nofrontswap_selfshrink_setup);\n" + "+\n" + "+static void selfshrink_process(struct work_struct *work)\n" + "+{\n" + "+\tif (frontswap_selfshrinking && frontswap_enabled) {\n" + "+\t\tfrontswap_selfshrink();\n" + "+\t\tschedule_delayed_work(&selfshrink_worker,\n" + "+\t\t\tselfshrink_interval * HZ);\n" + "+\t}\n" + "+}\n" + "+\n" + "+static int ramster_enabled;\n" + "+\n" + "+static int __init ramster_selfshrink_init(void)\n" + "+{\n" + "+\tfrontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink;\n" + "+\tif (frontswap_selfshrinking)\n" + "+\t\tpr_info(\"ramster: Initializing frontswap \"\n" + "+\t\t\t\t\t\"selfshrinking driver.\\n\");\n" + "+\telse\n" + "+\t\treturn -ENODEV;\n" + "+\n" + "+\tschedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ);\n" + "+\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + "+subsys_initcall(ramster_selfshrink_init);\n" + "+#endif\n" + "+\n" + "+/*\n" + " * zcache initialization\n" + "- * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR\n" + "+ * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR\n" + " * NOTHING HAPPENS!\n" + " */\n" + " \n" + "-static int zcache_enabled;\n" + "+static int ramster_enabled;\n" + " \n" + "-static int __init enable_zcache(char *s)\n" + "+static int __init enable_ramster(char *s)\n" + " {\n" + "-\tzcache_enabled = 1;\n" + "+\tramster_enabled = 1;\n" + " \treturn 1;\n" + " }\n" + "-__setup(\"zcache\", enable_zcache);\n" + "+__setup(\"ramster\", enable_ramster);\n" + " \n" + " /* allow independent dynamic disabling of cleancache and frontswap */\n" + " \n" + "@@ -1918,16 +3221,22 @@ static int use_cleancache = 1;\n" + " \n" + " static int __init no_cleancache(char *s)\n" + " {\n" + 
"+\tpr_info(\"INIT no_cleancache called\\n\");\n" + " \tuse_cleancache = 0;\n" + " \treturn 1;\n" + " }\n" + " \n" + "-__setup(\"nocleancache\", no_cleancache);\n" + "+/*\n" + "+ * FIXME: need to guarantee this gets checked before zcache_init is called\n" + "+ * What is the correct way to achieve this?\n" + "+ */\n" + "+early_param(\"nocleancache\", no_cleancache);\n" + " \n" + " static int use_frontswap = 1;\n" + " \n" + " static int __init no_frontswap(char *s)\n" + " {\n" + "+\tpr_info(\"INIT no_frontswap called\\n\");\n" + " \tuse_frontswap = 0;\n" + " \treturn 1;\n" + " }\n" + "@@ -1940,20 +3249,22 @@ static int __init zcache_init(void)\n" + " \n" + " #ifdef CONFIG_SYSFS\n" + " \tret = sysfs_create_group(mm_kobj, &zcache_attr_group);\n" + "+\tret = sysfs_create_group(mm_kobj, &ramster_attr_group);\n" + " \tif (ret) {\n" + "-\t\tpr_err(\"zcache: can't create sysfs\\n\");\n" + "+\t\tpr_err(\"ramster: can't create sysfs\\n\");\n" + " \t\tgoto out;\n" + " \t}\n" + " #endif /* CONFIG_SYSFS */\n" + " #if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)\n" + "-\tif (zcache_enabled) {\n" + "+\tif (ramster_enabled) {\n" + " \t\tunsigned int cpu;\n" + " \n" + "+\t\t(void)ramster_o2net_register_handlers();\n" + " \t\ttmem_register_hostops(&zcache_hostops);\n" + " \t\ttmem_register_pamops(&zcache_pamops);\n" + " \t\tret = register_cpu_notifier(&zcache_cpu_notifier_block);\n" + " \t\tif (ret) {\n" + "-\t\t\tpr_err(\"zcache: can't register cpu notifier\\n\");\n" + "+\t\t\tpr_err(\"ramster: can't register cpu notifier\\n\");\n" + " \t\t\tgoto out;\n" + " \t\t}\n" + " \t\tfor_each_online_cpu(cpu) {\n" + "@@ -1966,35 +3277,39 @@ static int __init zcache_init(void)\n" + " \t\t\t\tsizeof(struct tmem_objnode), 0, 0, NULL);\n" + " \tzcache_obj_cache = kmem_cache_create(\"zcache_obj\",\n" + " \t\t\t\tsizeof(struct tmem_obj), 0, 0, NULL);\n" + "-\tret = zcache_new_client(LOCAL_CLIENT);\n" + "-\tif (ret) {\n" + "-\t\tpr_err(\"zcache: can't create client\\n\");\n" + "-\t\tgoto 
out;\n" + "-\t}\n" + "+\tramster_flnode_cache = kmem_cache_create(\"ramster_flnode\",\n" + "+\t\t\t\tsizeof(struct flushlist_node), 0, 0, NULL);\n" + " #endif\n" + " #ifdef CONFIG_CLEANCACHE\n" + "-\tif (zcache_enabled && use_cleancache) {\n" + "+\tpr_info(\"INIT ramster_enabled=%d use_cleancache=%d\\n\",\n" + "+\t\t\t\t\tramster_enabled, use_cleancache);\n" + "+\tif (ramster_enabled && use_cleancache) {\n" + " \t\tstruct cleancache_ops old_ops;\n" + " \n" + " \t\tzbud_init();\n" + " \t\tregister_shrinker(&zcache_shrinker);\n" + " \t\told_ops = zcache_cleancache_register_ops();\n" + "-\t\tpr_info(\"zcache: cleancache enabled using kernel \"\n" + "+\t\tpr_info(\"ramster: cleancache enabled using kernel \"\n" + " \t\t\t\"transcendent memory and compression buddies\\n\");\n" + " \t\tif (old_ops.init_fs != NULL)\n" + "-\t\t\tpr_warning(\"zcache: cleancache_ops overridden\");\n" + "+\t\t\tpr_warning(\"ramster: cleancache_ops overridden\");\n" + " \t}\n" + " #endif\n" + " #ifdef CONFIG_FRONTSWAP\n" + "-\tif (zcache_enabled && use_frontswap) {\n" + "+\tpr_info(\"INIT ramster_enabled=%d use_frontswap=%d\\n\",\n" + "+\t\t\t\t\tramster_enabled, use_frontswap);\n" + "+\tif (ramster_enabled && use_frontswap) {\n" + " \t\tstruct frontswap_ops old_ops;\n" + " \n" + "+\t\tzcache_new_client(LOCAL_CLIENT);\n" + " \t\told_ops = zcache_frontswap_register_ops();\n" + "-\t\tpr_info(\"zcache: frontswap enabled using kernel \"\n" + "+\t\tpr_info(\"ramster: frontswap enabled using kernel \"\n" + " \t\t\t\"transcendent memory and xvmalloc\\n\");\n" + " \t\tif (old_ops.init != NULL)\n" + " \t\t\tpr_warning(\"ktmem: frontswap_ops overridden\");\n" + " \t}\n" + "+\tif (ramster_enabled && (use_frontswap || use_cleancache))\n" + "+\t\tramster_remotify_init();\n" + " #endif\n" + " out:\n" + " \treturn ret;\n" + "-- \n" + 1.7.1 -90870adf1d69b7b5967a97261c86ddc9cb6e0c11facb5f4fcd63533ec035ecc8 +5d403d027cb9861f53b2e7192022d04eac01505a7df43e2e8b1ac8ebec574da4
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.