* RFC: dm-ssdcache
@ 2012-03-16 15:38 Hannes Reinecke
2012-03-16 18:06 ` Mike Snitzer
2012-03-17 19:47 ` RFC: dm-ssdcache Mark Hills
0 siblings, 2 replies; 4+ messages in thread
From: Hannes Reinecke @ 2012-03-16 15:38 UTC (permalink / raw)
To: device-mapper development
[-- Attachment #1: Type: text/plain, Size: 700 bytes --]
Hi all,
to throw some more confusion into an already heated debate, here is
the 'dm-ssdcache' module I recently wrote.
And held a talk about at LinuxCon Europe 2011 :-)
Upon request I'll be generating a proper patch ...
Not saying it's perfect, and surely doesn't have all the nifty
features bcache has, but it should serve as a 'minimal'
implementation. And probably will start yet another discussion,
if nothing else.
So, comments etc are welcome.
Cheers,
Hannes
--
Dr. Hannes Reinecke zSeries & Storage
hare@suse.de +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
[-- Attachment #2: dm-ssdcache.c --]
[-- Type: text/x-csrc, Size: 53407 bytes --]
/*
* dm-ssdcache.c
*
* Copyright (c) 2011 Hannes Reinecke, SUSE Linux Products GmbH
*
* This file is released under the GPL.
*/
#include "dm.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
/* Prefix placed in front of every message printed through DPRINTK/WPRINTK. */
#define DM_MSG_PREFIX "ssdcache: "
/* Uncomment to compile in the extra tracing wrapped in #ifdef SSD_DEBUG. */
// #define SSD_DEBUG
/* When defined, DPRINTK/WPRINTK expand to printk(); otherwise to nothing. */
#define SSD_LOG
/* Index metadirectory entries with a radix tree instead of the md_table array. */
#define SSDCACHE_USE_RADIX_TREE
#ifdef SSD_LOG
/* DPRINTK(fmt, ...): printk with the module prefix and a trailing newline. */
#define DPRINTK( s, arg... ) printk(DM_MSG_PREFIX s "\n", ##arg)
/*
 * WPRINTK(w, fmt, ...): like DPRINTK, but first prints the request number
 * (w->nr), the calling function, the hash of w->cmd and w->cte_idx.
 * The macro dereferences w->cmd unconditionally, so w->cmd must not be NULL.
 */
#define WPRINTK( w, s, arg... ) printk(DM_MSG_PREFIX "%lu: %s (cte %lx:%02lx): "\
s "\n", (w)->nr, __FUNCTION__, \
(w)->cmd->hash, \
(w)->cte_idx, ##arg)
#else
/* Logging disabled: both macros compile away. */
#define DPRINTK( s, arg... )
#define WPRINTK( w, s, arg... )
#endif
/* NOTE(review): not referenced in the visible part of the file - confirm its use. */
#define SSDCACHE_COPY_PAGES 1024
/* Minimum number of reserved elements for the sio, cte and cmd mempools. */
#define MIN_SIO_ITEMS 1024
#define MIN_CTE_NUM 512
#define MIN_CMD_NUM 64
/* NOTE(review): not referenced in the visible part of the file - confirm its use. */
#define DEFAULT_CTE_NUM 4096
/* Number of bits in each per-entry and per-request sector bitmap. */
#define DEFAULT_BLOCKSIZE 256
/* Number of table-entry slots in one metadirectory entry (ssdcache_md.te[]). */
#define DEFAULT_ALIASING 16
/* NOTE(review): presumably the default for options.assoc - confirm in the constructor. */
#define DEFAULT_ASSOCIATIVITY 4
/*
 * Caching modes.
 *
 * The visible code distinguishes them as follows: in WRITEBACK mode a write
 * bio is redirected to the cache device, in the other two modes it is sent
 * to the target device; in READCACHE mode no secondary write is scheduled
 * on a write miss and no target_busy bits are set.
 */
enum ssdcache_mode_t {
CACHE_MODE_WRITETHROUGH,
CACHE_MODE_WRITEBACK,
CACHE_MODE_READCACHE,
};
/*
 * Caching strategies, used by cte_match() to pick an eviction candidate:
 * CACHE_LRU selects the entry with the oldest atime, any other value
 * (CACHE_LFU) selects the entry with the lowest access count.
 */
enum ssdcache_strategy_t {
CACHE_LRU,
CACHE_LFU,
};
struct ssdcache_md;
struct ssdcache_io;
/*
 * Cache table entry (cte): state of one cached block.
 *
 * Entries are published through the RCU-managed ssdcache_md.te[] slots and
 * are replaced by a modified copy rather than updated in place; the old
 * copy is released through the embedded rcu head (see cte_reset()).
 * Each bitmap carries DEFAULT_BLOCKSIZE bits; sio_bio_mask() sets one bit
 * per sector of a bio, so a bit describes one sector within the block.
 */
struct ssdcache_te {
unsigned long index; /* Slot number passed to cte_new() (index into md->te[]) */
unsigned long atime; /* Timestamp (jiffies) of the block's last access */
unsigned long count; /* Number of accesses */
DECLARE_BITMAP(clean, DEFAULT_BLOCKSIZE); /* Sectors whose cache write completed without error */
DECLARE_BITMAP(target_busy, DEFAULT_BLOCKSIZE); /* Sectors with a target write outstanding */
DECLARE_BITMAP(cache_busy, DEFAULT_BLOCKSIZE); /* Sectors with a cache write outstanding */
sector_t sector; /* Sector number on target device */
struct ssdcache_md *md; /* Backlink to metadirectory */
struct rcu_head rcu; /* Used by call_rcu() to free a replaced entry */
};
/*
 * Cache metadirectory entry (cmd): the set of table-entry slots that share
 * one hash number. Writers replace te[] slots under @lock; readers use
 * rcu_dereference().
 */
struct ssdcache_md {
spinlock_t lock; /* Serialises updates of the te[] slots */
unsigned long hash; /* Hash number */
unsigned int num_cte; /* Number of table entries */
unsigned long atime; /* Creation time in jiffies (set by cmd_insert()) */
struct ssdcache_ctx *sc; /* Owning cache context */
struct ssdcache_te *te[DEFAULT_ALIASING]; /* RCU Table entries */
};
/*
 * Per-target tunables. Only the uses visible in this part of the file are
 * described; the remaining flags are presumably evaluated elsewhere.
 */
struct ssdcache_options {
unsigned int assoc; /* Maximum number of rehash attempts in cte_match() */
enum ssdcache_mode_t mode; /* Caching mode, see CACHE_IS_* macros */
enum ssdcache_strategy_t strategy; /* Eviction strategy, see CACHE_USE_LRU() */
unsigned async_lookup:1; /* When clear, write misses schedule the secondary write directly */
unsigned disable_writeback:1; /* When set, sio_start_prefetch() creates no writeback clone */
unsigned queue_busy:1; /* When set, writes hitting a busy cte are queued instead of cancelled */
unsigned skip_write_insert:1; /* When set, a WRITE does not instantiate a missing cmd */
unsigned evict_on_write:1; /* When set, a WRITE may evict a clean entry */
unsigned cmd_preload:1; /* NOTE(review): not used in the visible part of the file */
};
/*
 * Per-target cache context: the two devices, the metadirectory index,
 * geometry and statistics counters.
 *
 * NOTE(review): the counters are plain unsigned longs updated without
 * atomics from several contexts, so they are presumably approximate.
 */
struct ssdcache_ctx {
struct dm_dev *target_dev; /* Backing (target) device */
struct dm_dev *cache_dev; /* Cache device */
struct dm_io_client *iocp; /* dm-io client used for cache/target writes */
#ifdef SSDCACHE_USE_RADIX_TREE
struct radix_tree_root md_tree; /* hash number -> struct ssdcache_md */
#else
struct ssdcache_md **md_table; /* hash number -> struct ssdcache_md */
#endif
spinlock_t cmd_lock; /* Serialises insertions into the metadirectory index */
unsigned long hash_bits; /* Number of bits in a hash number */
unsigned long block_size; /* Cache block size in sectors */
unsigned long block_mask; /* Mask selecting the sector offset within a block */
sector_t data_offset; /* First data sector on the cache device */
unsigned long nr_sio; /* Running request number handed to new sios */
unsigned long sio_active; /* Number of sios currently allocated */
unsigned long cte_active; /* Number of ctes currently allocated */
struct ssdcache_options options;
/* Statistics: one counter per lookup outcome (see enum cte_match_t) */
unsigned long read_clean;
unsigned long read_busy;
unsigned long read_invalid;
unsigned long read_miss;
unsigned long write_clean;
unsigned long write_busy;
unsigned long write_cancel;
unsigned long write_invalid;
unsigned long write_miss;
unsigned long write_done;
unsigned long write_skip;
unsigned long lookup_failed;
unsigned long lookup_busy;
/* Further statistics */
unsigned long cache_bypassed;
unsigned long cache_overruns; /* Completions that found the cte replaced or not yet looked up */
unsigned long cache_evictions; /* Entries evicted by cte_match() */
unsigned long writeback_cancelled;
unsigned long bio_cancelled;
};
/*
 * Per-request state (sio). Reference counted through @kref; the final put
 * runs ssdcache_destroy_sio().
 */
struct ssdcache_io {
struct list_head list; /* Link on the _io_work queue */
spinlock_t lock;
struct kref kref; /* Reference count */
unsigned long nr; /* Request number, used in log messages */
struct ssdcache_ctx *sc; /* Owning cache context */
struct ssdcache_md *cmd; /* Metadirectory entry found by the lookup, or NULL */
long cte_idx; /* Slot in cmd->te[], or -1 when none is assigned */
struct bio *bio; /* Clone used for the secondary write */
struct bio *writeback_bio; /* Clone used to write read data back to the cache */
unsigned long bio_sector; /* Compared against cte->sector to detect a replaced entry */
DECLARE_BITMAP(bio_mask, DEFAULT_BLOCKSIZE); /* Sectors of the block covered by the bio */
int error; /* 0 or a negative errno (-EIO, -EUCLEAN, -ESTALE, -EBUSY, -ENOENT) */
};
/* Defaults for options.mode and options.strategy. */
static enum ssdcache_mode_t default_cache_mode = CACHE_MODE_WRITETHROUGH;
static enum ssdcache_strategy_t default_cache_strategy = CACHE_LFU;
/* Predicates on the caching mode of a context. */
#define CACHE_IS_WRITETHROUGH(sc) \
((sc)->options.mode == CACHE_MODE_WRITETHROUGH)
#define CACHE_IS_WRITEBACK(sc) \
((sc)->options.mode == CACHE_MODE_WRITEBACK)
#define CACHE_IS_READCACHE(sc) \
((sc)->options.mode == CACHE_MODE_READCACHE)
/* True when eviction should pick the least recently used entry. */
#define CACHE_USE_LRU(sc) ((sc)->options.strategy == CACHE_LRU)
/* Protects the _io_work list (see push_sio() and process_sio()). */
static DEFINE_SPINLOCK(_work_lock);
/* Workqueue and work item that drain _io_work. */
static struct workqueue_struct *_ssdcached_wq;
static struct work_struct _ssdcached_work;
/* NOTE(review): _cte_work is not used in the visible part of the file. */
static LIST_HEAD(_cte_work);
/* Queue of sios waiting to be processed by the worker. */
static LIST_HEAD(_io_work);
/* Slab caches and the mempools layered on top of them (see pool_init()). */
static struct kmem_cache *_sio_cache;
static struct kmem_cache *_cmd_cache;
static struct kmem_cache *_cte_cache;
static mempool_t *_sio_pool;
static mempool_t *_cmd_pool;
static mempool_t *_cte_pool;
/*
 * Cache metadirectory states.
 * NOTE(review): none of these values is referenced in the visible part of
 * the file; confirm whether the enum is still in use.
 */
enum cmd_state {
CMD_STATE_UNMAPPED,
CMD_STATE_MAPPED,
CMD_STATE_RESERVED,
};
/*
 * Result of a cache lookup (cte_match()). Each value except
 * CTE_LOOKUP_FAILED has a same-named statistics counter in
 * struct ssdcache_ctx.
 */
enum cte_match_t {
CTE_READ_CLEAN, /* READ hit, all requested sectors clean */
CTE_READ_BUSY, /* READ hit, but a write to the sectors is in flight */
CTE_READ_INVALID, /* READ hit on the block, sectors neither clean nor busy */
CTE_READ_MISS, /* READ miss, a new entry was instantiated */
CTE_WRITE_CLEAN, /* Not returned by the visible code (mapped to WRITE_INVALID) */
CTE_WRITE_BUSY, /* WRITE hit a busy entry and was queued behind it */
CTE_WRITE_CANCEL, /* WRITE hit a busy entry; the entry's state was cancelled */
CTE_WRITE_INVALID, /* WRITE hit on an existing, non-busy entry */
CTE_WRITE_MISS, /* WRITE miss, a new entry was instantiated */
CTE_WRITE_DONE, /* Lookup ran after sio->error was already set; nothing to do */
CTE_WRITE_SKIP, /* No entry was instantiated for this request */
CTE_LOOKUP_FAILED, /* Allocation failure or no usable slot */
};
/*
* Slab pools
*/
/*
 * pool_init - create the slab caches and mempools used by the module.
 *
 * Creates one kmem cache each for sios, cmds and ctes and a mempool on top
 * of each cache. On any failure everything created so far is torn down
 * again in reverse order.
 *
 * Returns 0 on success or -ENOMEM if any cache or pool could not be
 * created.
 */
static int pool_init(void)
{
	_sio_cache = kmem_cache_create("ssdcache-sio",
				       sizeof(struct ssdcache_io),
				       __alignof__(struct ssdcache_io),
				       0, NULL);
	if (!_sio_cache)
		goto err;

	_cmd_cache = kmem_cache_create("ssdcache-cmd",
				       sizeof(struct ssdcache_md),
				       __alignof__(struct ssdcache_md),
				       0, NULL);
	if (!_cmd_cache)
		goto err_sio_cache;

	_cte_cache = kmem_cache_create("ssdcache-cte",
				       sizeof(struct ssdcache_te),
				       __alignof__(struct ssdcache_te),
				       0, NULL);
	if (!_cte_cache)
		goto err_cmd_cache;

	_sio_pool = mempool_create(MIN_SIO_ITEMS, mempool_alloc_slab,
				   mempool_free_slab, _sio_cache);
	if (!_sio_pool)
		goto err_cte_cache;

	_cmd_pool = mempool_create(MIN_CMD_NUM, mempool_alloc_slab,
				   mempool_free_slab, _cmd_cache);
	if (!_cmd_pool)
		goto err_sio_pool;

	_cte_pool = mempool_create(MIN_CTE_NUM, mempool_alloc_slab,
				   mempool_free_slab, _cte_cache);
	if (!_cte_pool)
		goto err_cmd_pool;

	return 0;

	/*
	 * Unwind in reverse order of creation. Previously the last two
	 * failure paths fell through and reported success with the caches
	 * already destroyed.
	 */
err_cmd_pool:
	mempool_destroy(_cmd_pool);
err_sio_pool:
	mempool_destroy(_sio_pool);
err_cte_cache:
	kmem_cache_destroy(_cte_cache);
err_cmd_cache:
	kmem_cache_destroy(_cmd_cache);
err_sio_cache:
	kmem_cache_destroy(_sio_cache);
err:
	return -ENOMEM;
}
/*
 * pool_exit - release everything pool_init() created.
 *
 * The mempools go first because each one sits on top of a slab cache.
 */
static void pool_exit(void)
{
	/* mempools */
	mempool_destroy(_sio_pool);
	mempool_destroy(_cmd_pool);
	mempool_destroy(_cte_pool);

	/* backing slab caches */
	kmem_cache_destroy(_sio_cache);
	kmem_cache_destroy(_cmd_cache);
	kmem_cache_destroy(_cte_cache);
}
/*
* cache metadirectory handling
*/
/*
 * cmd_lookup - find the metadirectory entry for a hash number.
 * @sc: cache context
 * @hash_number: hash number to look up
 *
 * The lookup runs inside an RCU read-side critical section.
 * Returns the entry, or NULL when none is registered.
 */
static inline struct ssdcache_md *cmd_lookup(struct ssdcache_ctx *sc,
					     unsigned long hash_number)
{
	struct ssdcache_md *found;

	rcu_read_lock();
#ifdef SSDCACHE_USE_RADIX_TREE
	found = radix_tree_lookup(&sc->md_tree, hash_number);
#else
	found = rcu_dereference(sc->md_table[hash_number]);
#endif
	rcu_read_unlock();

	return found;
}
static inline struct ssdcache_md *cmd_insert(struct ssdcache_ctx *sc,
unsigned long hash_number)
{
struct ssdcache_md *cmd;
cmd = mempool_alloc(_cmd_pool, GFP_NOIO | __GFP_ZERO);
if (!cmd)
return NULL;
cmd->hash = hash_number;
cmd->num_cte = DEFAULT_ALIASING;
spin_lock_init(&cmd->lock);
cmd->atime = jiffies;
cmd->sc = sc;
#ifdef SSDCACHE_USE_RADIX_TREE
if (radix_tree_preload(GFP_NOIO)) {
mempool_free(cmd, _cmd_pool);
return NULL;
}
#endif
spin_lock(&sc->cmd_lock);
#ifdef SSDCACHE_USE_RADIX_TREE
if (radix_tree_insert(&sc->md_tree, hash_number, cmd)) {
mempool_free(cmd, _cmd_pool);
cmd = radix_tree_lookup(&sc->md_tree, hash_number);
BUG_ON(!cmd);
BUG_ON(cmd->hash != hash_number);
}
#else
rcu_assign_pointer(sc->md_table[hash_number], cmd);
#endif
spin_unlock(&sc->cmd_lock);
#ifdef SSDCACHE_USE_RADIX_TREE
radix_tree_preload_end();
#endif
return cmd;
}
/*
 * cmd_remove - free a metadirectory entry together with all its ctes.
 * @cmd: entry to free; NULL is accepted and ignored
 *
 * Every populated slot is cleared under the cmd lock; the cte is freed
 * after an RCU grace period has elapsed.
 */
static void cmd_remove(struct ssdcache_md *cmd)
{
	struct ssdcache_te *entry;
	int slot;

	if (!cmd)
		return;

	for (slot = 0; slot < cmd->num_cte; slot++) {
		spin_lock_irq(&cmd->lock);
		entry = cmd->te[slot];
		if (entry)
			rcu_assign_pointer(cmd->te[slot], NULL);
		spin_unlock_irq(&cmd->lock);

		if (!entry)
			continue;

		/* Wait for readers before releasing the entry */
		synchronize_rcu();
		mempool_free(entry, _cte_pool);
	}

	mempool_free(cmd, _cmd_pool);
}
/* First sector of the cache block containing bio @b (context @s). */
#define cte_bio_align(s,b) ((b)->bi_sector & ~(s)->block_mask)
/* Offset, in sectors, of bio @b within its cache block (context @s). */
#define cte_bio_offset(s,b) ((b)->bi_sector & (s)->block_mask)
/*
 * sio_bio_mask - mark the sectors covered by a bio in sio->bio_mask.
 * @sio: request whose mask is updated
 * @bio: bio describing the sector range
 *
 * One bit is set per sector, starting at the bio's offset within its
 * cache block.
 * NOTE(review): there is no check that offset + length stays below
 * DEFAULT_BLOCKSIZE; presumably callers never pass a bio that crosses a
 * block boundary - confirm.
 */
void sio_bio_mask(struct ssdcache_io *sio, struct bio *bio)
{
	unsigned long first = cte_bio_offset(sio->sc, bio);
	int nr;

	for (nr = 0; nr < bio_sectors(bio); nr++)
		set_bit(first + nr, sio->bio_mask);
}
struct ssdcache_te * cte_new(struct ssdcache_ctx *sc, struct ssdcache_md *cmd,
unsigned int index)
{
struct ssdcache_te *newcte;
newcte = mempool_alloc(_cte_pool, GFP_NOWAIT | __GFP_ZERO);
if (!newcte)
return NULL;
newcte->index = index;
newcte->atime = jiffies;
newcte->count = 1;
newcte->md = cmd;
sc->cte_active++;
return newcte;
}
static void cte_reset(struct rcu_head *rp)
{
struct ssdcache_te *cte = container_of(rp, struct ssdcache_te, rcu);
cte->md->sc->cte_active--;
mempool_free(cte, _cte_pool);
}
/*
 * cte_is_clean - check that every sector selected by @mask is clean.
 * @cte: cache table entry
 * @mask: bitmap of DEFAULT_BLOCKSIZE bits selecting sectors
 *
 * Returns true when @mask is a subset of the entry's clean bitmap.
 */
static bool cte_is_clean(struct ssdcache_te *cte, unsigned long *mask)
{
	return bitmap_subset(mask, cte->clean, DEFAULT_BLOCKSIZE);
}
/*
 * cte_cache_is_busy - check for an outstanding cache write.
 * @cte: cache table entry
 * @mask: bitmap of DEFAULT_BLOCKSIZE bits selecting sectors
 *
 * Returns true when at least one sector in @mask is marked cache_busy.
 */
static bool cte_cache_is_busy(struct ssdcache_te *cte, unsigned long *mask)
{
	return bitmap_intersects(cte->cache_busy, mask, DEFAULT_BLOCKSIZE);
}
/*
 * cte_target_is_busy - check for an outstanding target write.
 * @cte: cache table entry
 * @mask: bitmap of DEFAULT_BLOCKSIZE bits selecting sectors
 *
 * Returns true when at least one sector in @mask is marked target_busy.
 */
static bool cte_target_is_busy(struct ssdcache_te *cte, unsigned long *mask)
{
	return bitmap_intersects(cte->target_busy, mask, DEFAULT_BLOCKSIZE);
}
/* Set, in bitmap member @m of cte @c, every bit that is set in @s->bio_mask. */
#define cte_set_bitmap(c,s,m) bitmap_or((c)->m , (c)->m , \
(s)->bio_mask, DEFAULT_BLOCKSIZE)
/* Clear, in bitmap member @m of cte @c, every bit that is set in @s->bio_mask. */
#define cte_unset_bitmap(c,s,m) bitmap_andnot((c)->m , (c)->m , \
(s)->bio_mask, DEFAULT_BLOCKSIZE)
/*
 * sio_cache_is_busy - is a cache write pending on the request's sectors?
 * @sio: request; a lookup that set sio->cmd must also have set cte_idx
 *
 * Returns false when no cmd is attached or the slot is empty, otherwise
 * whether any sector in sio->bio_mask is marked cache_busy.
 */
static bool sio_cache_is_busy(struct ssdcache_io *sio)
{
	struct ssdcache_te *entry;
	bool busy;

	if (!sio->cmd)
		return false;
	BUG_ON(sio->cte_idx == -1);

	rcu_read_lock();
	entry = rcu_dereference(sio->cmd->te[sio->cte_idx]);
	busy = entry && cte_cache_is_busy(entry, sio->bio_mask);
	rcu_read_unlock();

	return busy;
}
/*
 * sio_target_is_busy - is a target write pending on the request's sectors?
 * @sio: request
 *
 * Returns false when no cmd or slot is attached or the slot is empty,
 * otherwise whether any sector in sio->bio_mask is marked target_busy.
 */
static bool sio_target_is_busy(struct ssdcache_io *sio)
{
	struct ssdcache_te *entry;
	bool busy;

	if (!sio->cmd)
		return false;
	if (sio->cte_idx == -1)
		return false;

	rcu_read_lock();
	entry = rcu_dereference(sio->cmd->te[sio->cte_idx]);
	busy = entry && cte_target_is_busy(entry, sio->bio_mask);
	rcu_read_unlock();

	return busy;
}
/*
 * sio_match_sector - does the request's slot still describe its block?
 * @sio: request, may be NULL
 *
 * Returns true only when a cmd and slot are attached, the slot is
 * populated and the entry's sector equals sio->bio_sector. A false result
 * means the entry was dropped or replaced ("overrun").
 */
static bool sio_match_sector(struct ssdcache_io *sio)
{
	struct ssdcache_te *entry;
	bool same;

	if (!sio)
		return false;
	if (!sio->cmd || sio->cte_idx == -1)
		return false;

	rcu_read_lock();
	entry = rcu_dereference(sio->cmd->te[sio->cte_idx]);
	same = entry && (sio->bio_sector == entry->sector);
	rcu_read_unlock();

	return same;
}
/* True when none of the entry's clean/target_busy/cache_busy bits is set. */
static bool cte_bitmaps_empty(struct ssdcache_te *cte)
{
	return bitmap_empty(cte->clean, DEFAULT_BLOCKSIZE) &&
	       bitmap_empty(cte->target_busy, DEFAULT_BLOCKSIZE) &&
	       bitmap_empty(cte->cache_busy, DEFAULT_BLOCKSIZE);
}

/*
 * sio_cleanup_cte - settle the request's cte when the request goes away.
 * @sio: request with a cmd and slot attached
 *
 * An entry without any clean or busy sector carries no information and is
 * dropped. Any other entry is replaced by a copy with a refreshed atime
 * and an incremented access count; if no copy can be allocated the entry
 * is dropped as well.
 *
 * The slot is re-read and re-validated under the cmd lock: the pointer
 * sampled in the RCU section beforehand is only a hint and may already
 * have been replaced or released by the time the lock is held.
 */
static void sio_cleanup_cte(struct ssdcache_io *sio)
{
	struct ssdcache_te *newcte = NULL, *oldcte;
	bool is_invalid = false;

	BUG_ON(!sio);
	BUG_ON(!sio->cmd);
	BUG_ON(sio->cte_idx == -1);

	rcu_read_lock();
	oldcte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
	if (oldcte && cte_bitmaps_empty(oldcte))
		is_invalid = true;
	rcu_read_unlock();
	if (!oldcte)
		return;

	if (!is_invalid) {
		newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
		/* Failure is okay; we'll drop the cte then */
	} else {
#ifdef SSD_DEBUG
		WPRINTK(sio, "drop invalid cte");
#endif
	}

	spin_lock_irq(&sio->cmd->lock);
	oldcte = sio->cmd->te[sio->cte_idx];
	if (!oldcte || (is_invalid && !cte_bitmaps_empty(oldcte))) {
		/*
		 * The slot was emptied, or refilled with a live entry, while
		 * we were not holding the lock: leave it alone.
		 */
		spin_unlock_irq(&sio->cmd->lock);
		if (newcte) {
			/* Undo cte_new(); the entry was never published */
			sio->sc->cte_active--;
			mempool_free(newcte, _cte_pool);
		}
		return;
	}
	if (newcte) {
		*newcte = *oldcte;
		newcte->atime = jiffies;
		newcte->count++;
	}
	rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
	spin_unlock_irq(&sio->cmd->lock);

	call_rcu(&oldcte->rcu, cte_reset);
}
static void sio_finish_cache_write(struct ssdcache_io *sio)
{
struct ssdcache_te *newcte, *oldcte;
BUG_ON(!sio);
BUG_ON(!sio->cmd);
BUG_ON(sio->cte_idx == -1);
/* Check if we should drop the old cte */
newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
spin_lock_irq(&sio->cmd->lock);
oldcte = sio->cmd->te[sio->cte_idx];
if (newcte) {
if (oldcte)
*newcte = *oldcte;
newcte->atime = jiffies;
newcte->count++;
/* Reset busy bitmaps */
cte_unset_bitmap(newcte, sio, cache_busy);
/* Update the clean bitmap */
if (sio->error)
cte_unset_bitmap(newcte, sio, clean);
else
cte_set_bitmap(newcte, sio, clean);
}
rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
spin_unlock_irq(&sio->cmd->lock);
if (oldcte)
call_rcu(&oldcte->rcu, cte_reset);
}
static void sio_finish_target_write(struct ssdcache_io *sio)
{
struct ssdcache_te *newcte, *oldcte;
BUG_ON(!sio);
BUG_ON(!sio->cmd);
BUG_ON(sio->cte_idx == -1);
/* Check if we should drop the old cte */
newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
spin_lock_irq(&sio->cmd->lock);
oldcte = sio->cmd->te[sio->cte_idx];
if (newcte) {
if (oldcte)
*newcte = *oldcte;
newcte->atime = jiffies;
newcte->count++;
/* Reset busy bitmaps */
if (CACHE_IS_READCACHE(sio->sc)) {
cte_unset_bitmap(newcte, sio, cache_busy);
} else {
cte_unset_bitmap(newcte, sio, target_busy);
/* Upon error reset the clean bitmap */
if (sio->error)
cte_unset_bitmap(newcte, sio, clean);
}
}
rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
spin_unlock_irq(&sio->cmd->lock);
if (oldcte)
call_rcu(&oldcte->rcu, cte_reset);
}
/*
 * sio_new_cache_write - install a fresh cte in the request's slot.
 * @sio: request; sio->cmd and sio->cte_idx select the slot
 * @rw: WRITE or READ
 *
 * For a WRITE the new entry starts with the request's sectors marked
 * cache_busy (and target_busy unless in READCACHE mode). Whatever entry
 * occupied the slot before is released after an RCU grace period.
 *
 * If the allocation fails the slot is emptied, sio->cte_idx is reset to
 * -1 and CTE_LOOKUP_FAILED is returned. Otherwise returns CTE_WRITE_MISS
 * or CTE_READ_MISS.
 */
static enum cte_match_t sio_new_cache_write(struct ssdcache_io *sio, int rw)
{
	struct ssdcache_te *newcte, *oldcte;
	enum cte_match_t retval;
	/*
	 * Remember the slot: sio->cte_idx is reset to -1 on allocation
	 * failure and must not be used as an array index afterwards.
	 */
	long idx = sio->cte_idx;

	newcte = cte_new(sio->sc, sio->cmd, idx);
	if (!newcte) {
		/* Ouch */
		WPRINTK(sio, "oom, drop old cte");
		sio->cte_idx = -1;
		sio->sc->lookup_failed++;
		retval = CTE_LOOKUP_FAILED;
	}
	spin_lock_irq(&sio->cmd->lock);
	if (newcte) {
		if (rw == WRITE) {
			cte_set_bitmap(newcte, sio, cache_busy);
			if (!CACHE_IS_READCACHE(sio->sc))
				cte_set_bitmap(newcte, sio, target_busy);
			sio->sc->write_miss++;
			retval = CTE_WRITE_MISS;
		} else {
			sio->sc->read_miss++;
			retval = CTE_READ_MISS;
		}
		newcte->sector = sio->bio_sector;
	}
	oldcte = sio->cmd->te[idx];
	rcu_assign_pointer(sio->cmd->te[idx], newcte);
	spin_unlock_irq(&sio->cmd->lock);
	if (oldcte)
		call_rcu(&oldcte->rcu, cte_reset);
	return retval;
}
static void cte_start_write(struct ssdcache_io *sio)
{
struct ssdcache_te *newcte, *oldcte;
/* Check if we should drop the old cte */
newcte = cte_new(sio->sc, sio->cmd, sio->cte_idx);
if (!newcte)
return;
spin_lock_irq(&sio->cmd->lock);
oldcte = sio->cmd->te[sio->cte_idx];
BUG_ON(!oldcte);
*newcte = *oldcte;
newcte->count++;
cte_unset_bitmap(newcte, sio, clean);
cte_set_bitmap(newcte, sio, cache_busy);
if (!CACHE_IS_READCACHE(sio->sc)) {
if (sio->sc->options.async_lookup) {
if (!sio->error) {
cte_set_bitmap(newcte, sio, target_busy);
}
} else {
cte_set_bitmap(newcte, sio, target_busy);
}
}
newcte->atime = jiffies;
rcu_assign_pointer(sio->cmd->te[sio->cte_idx], newcte);
spin_unlock_irq(&sio->cmd->lock);
call_rcu(&oldcte->rcu, cte_reset);
}
/*
 * cte_start_cache_write - mark the request's sectors as cache-write busy.
 * @sio: request with a cmd and a populated slot attached
 *
 * Publishes a copy of the entry with the sectors no longer clean and
 * marked cache_busy. If no copy can be allocated the entry is left
 * unchanged.
 */
static void cte_start_cache_write(struct ssdcache_io *sio)
{
	struct ssdcache_te *copy, *prev;

	copy = cte_new(sio->sc, sio->cmd, sio->cte_idx);
	if (!copy)
		return;

	spin_lock_irq(&sio->cmd->lock);
	prev = sio->cmd->te[sio->cte_idx];
	BUG_ON(!prev);
	*copy = *prev;
	copy->count++;
	cte_unset_bitmap(copy, sio, clean);
	cte_set_bitmap(copy, sio, cache_busy);
	copy->atime = jiffies;
	rcu_assign_pointer(sio->cmd->te[sio->cte_idx], copy);
	spin_unlock_irq(&sio->cmd->lock);

	call_rcu(&prev->rcu, cte_reset);
}
/*
 * cte_start_target_write - mark the request's sectors as target-write busy.
 * @sio: request with a cmd and a populated slot attached
 *
 * Publishes a copy of the entry with the sectors marked target_busy; the
 * clean and cache_busy bits are carried over unchanged. If no copy can be
 * allocated the entry is left unchanged.
 */
static void cte_start_target_write(struct ssdcache_io *sio)
{
	struct ssdcache_te *copy, *prev;

	copy = cte_new(sio->sc, sio->cmd, sio->cte_idx);
	if (!copy)
		return;

	spin_lock_irq(&sio->cmd->lock);
	prev = sio->cmd->te[sio->cte_idx];
	BUG_ON(!prev);
	*copy = *prev;
	copy->count++;
	cte_set_bitmap(copy, sio, target_busy);
	copy->atime = jiffies;
	rcu_assign_pointer(sio->cmd->te[sio->cte_idx], copy);
	spin_unlock_irq(&sio->cmd->lock);

	call_rcu(&prev->rcu, cte_reset);
}
/*
 * cte_cancel_target_write - wipe all state for the request's sectors.
 * @sio: request with a cmd and a populated slot attached
 *
 * Publishes a copy of the entry with the sectors' clean, cache_busy and
 * target_busy bits cleared. If no copy can be allocated the entry is left
 * unchanged.
 */
static void cte_cancel_target_write(struct ssdcache_io *sio)
{
	struct ssdcache_te *copy, *prev;

	copy = cte_new(sio->sc, sio->cmd, sio->cte_idx);
	if (!copy)
		return;

	spin_lock_irq(&sio->cmd->lock);
	prev = sio->cmd->te[sio->cte_idx];
	BUG_ON(!prev);
	*copy = *prev;
	copy->count++;
	cte_unset_bitmap(copy, sio, clean);
	cte_unset_bitmap(copy, sio, cache_busy);
	cte_unset_bitmap(copy, sio, target_busy);
	copy->atime = jiffies;
	rcu_assign_pointer(sio->cmd->te[sio->cte_idx], copy);
	spin_unlock_irq(&sio->cmd->lock);

	call_rcu(&prev->rcu, cte_reset);
}
/*
 * cte_cancel_cache_write - drop cache-write state for the request's sectors.
 * @sio: request with a cmd and a populated slot attached
 *
 * Publishes a copy of the entry with the sectors' clean and cache_busy
 * bits cleared; target_busy is carried over. If no copy can be allocated
 * the entry is left unchanged.
 */
static void cte_cancel_cache_write(struct ssdcache_io *sio)
{
	struct ssdcache_te *copy, *prev;

	copy = cte_new(sio->sc, sio->cmd, sio->cte_idx);
	if (!copy)
		return;

	spin_lock_irq(&sio->cmd->lock);
	prev = sio->cmd->te[sio->cte_idx];
	BUG_ON(!prev);
	*copy = *prev;
	copy->count++;
	cte_unset_bitmap(copy, sio, clean);
	cte_unset_bitmap(copy, sio, cache_busy);
	copy->atime = jiffies;
	rcu_assign_pointer(sio->cmd->te[sio->cte_idx], copy);
	spin_unlock_irq(&sio->cmd->lock);

	call_rcu(&prev->rcu, cte_reset);
}
static void sio_cte_invalidate(struct ssdcache_io *sio)
{
struct ssdcache_te *oldcte;
spin_lock_irq(&sio->cmd->lock);
oldcte = sio->cmd->te[sio->cte_idx];
BUG_ON(!oldcte);
rcu_assign_pointer(sio->cmd->te[sio->cte_idx], NULL);
spin_unlock_irq(&sio->cmd->lock);
call_rcu(&oldcte->rcu, cte_reset);
}
/*
* Workqueue handling
*/
/*
 * ssdcache_create_sio - allocate and initialise a request descriptor.
 * @sc: cache context the request belongs to
 *
 * The sio starts zeroed, with no slot attached (cte_idx == -1), a fresh
 * request number and a reference count of one.
 *
 * Returns the new sio or NULL if the allocation failed.
 */
static struct ssdcache_io *ssdcache_create_sio(struct ssdcache_ctx *sc)
{
	struct ssdcache_io *req = mempool_alloc(_sio_pool, GFP_NOIO);

	if (!req)
		return NULL;

	memset(req, 0, sizeof(struct ssdcache_io));
	req->sc = sc;
	req->cte_idx = -1;

	/* Accounting */
	sc->nr_sio++;
	req->nr = sc->nr_sio;
	sc->sio_active++;

	kref_init(&req->kref);
	INIT_LIST_HEAD(&req->list);
	spin_lock_init(&req->lock);

	return req;
}
static void ssdcache_destroy_sio(struct kref *kref)
{
struct ssdcache_io *sio = container_of(kref, struct ssdcache_io, kref);
BUG_ON(!list_empty(&sio->list));
if (sio->bio) {
struct bio_vec *bvec;
int i;
bio_for_each_segment(bvec, sio->bio, i)
put_page(bvec->bv_page);
bio_put(sio->bio);
sio->bio = NULL;
}
if (sio->error != -ESTALE &&
sio_match_sector(sio) &&
!sio_cache_is_busy(sio) &&
!sio_target_is_busy(sio))
sio_cleanup_cte(sio);
sio->sc->sio_active--;
mempool_free(sio, _sio_pool);
}
/* Take an additional reference on @sio. */
static void ssdcache_get_sio(struct ssdcache_io *sio)
{
kref_get(&sio->kref);
}
/*
 * ssdcache_put_sio - drop a reference on a request descriptor.
 * @sio: request
 *
 * The last reference releases the sio through ssdcache_destroy_sio().
 * Returns the value of kref_put().
 */
static int ssdcache_put_sio(struct ssdcache_io *sio)
{
	int released;

	released = kref_put(&sio->kref, ssdcache_destroy_sio);
	return released;
}
/*
 * push_sio - append a request to a work list.
 * @q: list head, protected by _work_lock
 * @sio: request to queue
 */
static inline void push_sio(struct list_head *q, struct ssdcache_io *sio)
{
	unsigned long irqflags;

	spin_lock_irqsave(&_work_lock, irqflags);
	list_add_tail(&sio->list, q);
	spin_unlock_irqrestore(&_work_lock, irqflags);
}
/*
 * ssdcache_schedule_sio - hand a request to the worker.
 * @sio: request to process
 *
 * Takes a reference on behalf of the work list, queues the sio on
 * _io_work and kicks the worker.
 */
static void ssdcache_schedule_sio(struct ssdcache_io *sio)
{
/* The reference must be taken before the sio becomes visible on the list */
ssdcache_get_sio(sio);
push_sio(&_io_work, sio);
queue_work(_ssdcached_wq, &_ssdcached_work);
}
/*
 * map_secondary_bio - clone a bio for the secondary write.
 * @sio: request that will own the clone (stored in sio->bio)
 * @bio: original bio
 *
 * A page reference is taken for every segment of the clone; the matching
 * put_page() calls are in ssdcache_destroy_sio(). A failed clone is
 * treated as fatal.
 */
static void map_secondary_bio(struct ssdcache_io *sio, struct bio *bio)
{
	struct bio *clone = bio_clone(bio, GFP_NOIO);
	struct bio_vec *bvec;
	int seg;

	/* Kick off secondary writes */
	sio->bio = clone;
	BUG_ON(!sio->bio);

	bio_for_each_segment(bvec, sio->bio, seg)
		get_page(bvec->bv_page);
}
/*
 * map_writeback_bio - clone a bio for writing its data to the cache.
 * @sio: request that will own the clone (stored in sio->writeback_bio)
 * @bio: original bio
 *
 * The clone is flagged as a WRITE and a page reference is taken for every
 * segment. A failed clone is logged and otherwise ignored, leaving
 * sio->writeback_bio NULL. The sio must not already hold a writeback bio.
 */
static void map_writeback_bio(struct ssdcache_io *sio, struct bio *bio)
{
	struct bio_vec *bvec;
	int seg;

	BUG_ON(sio->writeback_bio);

	sio->writeback_bio = bio_clone(bio, GFP_NOIO);
	if (sio->writeback_bio) {
		sio->writeback_bio->bi_rw |= WRITE;
		bio_for_each_segment(bvec, sio->writeback_bio, seg)
			get_page(bvec->bv_page);
		return;
	}

	WPRINTK(sio, "bio_clone failed");
}
/*
 * unmap_writeback_bio - release the request's writeback clone, if any.
 * @sio: request
 *
 * Drops the page references taken by map_writeback_bio() and puts the bio.
 */
static void unmap_writeback_bio(struct ssdcache_io *sio)
{
	struct bio_vec *bvec;
	int seg;

	if (!sio->writeback_bio)
		return;

	bio_for_each_segment(bvec, sio->writeback_bio, seg)
		put_page(bvec->bv_page);
	bio_put(sio->writeback_bio);
	sio->writeback_bio = NULL;
}
/*
 * cache_io_callback - dm-io completion handler for cache writes.
 * @error: non-zero when the write failed
 * @context: the struct ssdcache_io passed to dm_io()
 *
 * Updates the cte only if it still describes the request's block and still
 * has the request's sectors marked cache_busy. Afterwards the writeback
 * clone is released, the reference held for the I/O is dropped and the
 * worker is kicked.
 */
static void cache_io_callback(unsigned long error, void *context)
{
struct ssdcache_io *sio = context;
if (!sio_match_sector(sio)) {
/* The slot was emptied or now holds another block */
WPRINTK(sio, "cte overrun, not updating state");
sio->sc->cache_overruns++;
} else if (!sio_cache_is_busy(sio)) {
/* Somebody else already cleared the busy state (e.g. a cancel) */
if (!CACHE_IS_READCACHE(sio->sc))
WPRINTK(sio, "cte not busy, not updating state");
} else {
if (error) {
WPRINTK(sio, "finished with %lu", error);
sio->error = -EIO;
}
/* -EUCLEAN is set by target_io_callback() when the lookup had not finished */
if (sio->error == -EUCLEAN) {
#ifdef SSD_DEBUG
WPRINTK(sio, "reset EUCLEAN");
#endif
sio->error = 0;
}
sio_finish_cache_write(sio);
}
unmap_writeback_bio(sio);
/* This may free the sio; it must not be touched afterwards */
ssdcache_put_sio(sio);
queue_work(_ssdcached_wq, &_ssdcached_work);
}
/*
 * target_io_callback - dm-io completion handler for target writes.
 * @error: non-zero when the write failed
 * @context: the struct ssdcache_io passed to dm_io()
 *
 * If no cmd is attached yet the request is flagged -EUCLEAN (unless an
 * error is already recorded). Otherwise the cte is updated only if it
 * still describes the request's block and still has the request's sectors
 * marked target_busy. Finally the reference held for the I/O is dropped
 * and the worker is kicked.
 */
static void target_io_callback(unsigned long error, void *context)
{
struct ssdcache_io *sio = context;
if (!sio->cmd) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s: cte lookup not finished",
sio->nr, __FUNCTION__);
#endif
if (!sio->error)
sio->error = -EUCLEAN;
sio->sc->cache_overruns++;
} else if (!sio_match_sector(sio)) {
/* The slot was emptied or now holds another block */
WPRINTK(sio, "cte overrun, not updating state");
sio->sc->cache_overruns++;
} else if (!sio_target_is_busy(sio)) {
WPRINTK(sio, "cte not busy, not updating state");
} else {
if (error) {
WPRINTK(sio, "finished with %lu", error);
sio->error = -EIO;
}
sio_finish_target_write(sio);
}
/* This may free the sio; it must not be touched afterwards */
ssdcache_put_sio(sio);
queue_work(_ssdcached_wq, &_ssdcached_work);
}
/*
 * to_cache_sector - translate a target sector into a cache-device sector.
 * @sio: request with a cmd and slot attached
 * @data_sector: sector on the target device
 *
 * The cache location is
 *   data_offset + (hash * num_cte + cte_idx) * block_size + offset in block
 *
 * Returns the sector on the cache device. Computing a sector at or beyond
 * the end of the cache device is treated as fatal.
 */
static inline sector_t to_cache_sector(struct ssdcache_io *sio,
				       sector_t data_sector)
{
	sector_t sector_offset, cte_offset, cmd_offset, cache_sector;
	sector_t cache_sectors;

	BUG_ON(!sio->cmd);
	BUG_ON(sio->cte_idx < 0);

	sector_offset = data_sector & sio->sc->block_mask;
	/* Widen before multiplying so the products cannot wrap on 32 bit */
	cte_offset = (sector_t)sio->cte_idx * sio->sc->block_size;
	cmd_offset = (sector_t)sio->cmd->hash * sio->cmd->num_cte *
		     sio->sc->block_size;
	cache_sector = sio->sc->data_offset + cmd_offset + cte_offset +
		       sector_offset;

	/* i_size_read() reports bytes; compare in units of sectors */
	cache_sectors = i_size_read(sio->sc->cache_dev->bdev->bd_inode) >>
			SECTOR_SHIFT;
	if (cache_sector >= cache_sectors) {
		WPRINTK(sio, "access beyond end of device");
		BUG();
	}
	return cache_sector;
}
static void write_to_cache(struct ssdcache_io *sio, struct bio *bio)
{
struct dm_io_region cache;
struct dm_io_request iorq;
cache.bdev = sio->sc->cache_dev->bdev;
cache.sector = to_cache_sector(sio, bio->bi_sector);
cache.count = bio_sectors(bio);
iorq.bi_rw = WRITE;
iorq.mem.type = DM_IO_BVEC;
iorq.mem.ptr.bvec = bio_iovec(bio);
iorq.notify.fn = cache_io_callback;
iorq.notify.context = sio;
iorq.client = sio->sc->iocp;
dm_io(&iorq, 1, &cache, NULL);
}
static void write_to_target(struct ssdcache_io *sio, struct bio *bio)
{
struct dm_io_region target;
struct dm_io_request iorq;
target.bdev = sio->sc->target_dev->bdev;
target.sector = bio->bi_sector;
target.count = bio_sectors(bio);
iorq.bi_rw = WRITE;
iorq.mem.type = DM_IO_BVEC;
iorq.mem.ptr.bvec = bio_iovec(bio);
iorq.notify.fn = target_io_callback;
iorq.notify.context = sio;
iorq.client = sio->sc->iocp;
dm_io(&iorq, 1, &target, NULL);
}
static void sio_start_prefetch(struct ssdcache_io *sio, struct bio *bio)
{
if (!sio->sc->options.disable_writeback) {
/* Setup clone for writing to cache device */
map_writeback_bio(sio, bio);
}
bio->bi_bdev = sio->sc->target_dev->bdev;
}
/*
* sio_start_write_busy
*
* That one's a little tricky.
* We hit this when a cache write is still in flight.
* However, for writethrough it actually only matters
* that the _target_ write is completed.
* So we can start the target write and defer the cache
* write for until after the original cache write completed.
*/
static void sio_start_write_busy(struct ssdcache_io *sio, struct bio *bio)
{
if (!CACHE_IS_READCACHE(sio->sc))
/* Setup clone for writing to cache device */
map_writeback_bio(sio, bio);
if (!CACHE_IS_WRITEBACK(sio->sc)) {
bio->bi_bdev = sio->sc->target_dev->bdev;
} else {
bio->bi_bdev = sio->sc->cache_dev->bdev;
bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
}
}
static void sio_start_write_miss(struct ssdcache_io *sio, struct bio *bio)
{
if (!sio->sc->options.async_lookup &&
!CACHE_IS_READCACHE(sio->sc)) {
map_secondary_bio(sio, bio);
ssdcache_schedule_sio(sio);
}
if (!CACHE_IS_WRITEBACK(sio->sc)) {
bio->bi_bdev = sio->sc->target_dev->bdev;
} else {
bio->bi_bdev = sio->sc->cache_dev->bdev;
bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
}
}
/*
 * sio_check_writeback
 *
 * Check if the pending writeback bio is safe for
 * submission.
 * At this point the target read has completed
 * and we try to writeback the original bio to
 * the cache to increase the likelihood of a
 * cache hit.
 * However, this is an optimisation. So whenever
 * the cte has been evicted or a write is already
 * outstanding on the same cte we can safely
 * cancel the writeback.
 *
 * Returns true if the writeback may be submitted, false if it should be
 * cancelled.
 */
static bool sio_check_writeback(struct ssdcache_io *sio)
{
	struct ssdcache_te *cte;
	bool safe = false;

	BUG_ON(!sio);
	BUG_ON(!sio->cmd);
	BUG_ON(sio->cte_idx == -1);

	/*
	 * The entry may be replaced and freed via call_rcu() at any time,
	 * so every access to it has to stay inside the read-side critical
	 * section.
	 */
	rcu_read_lock();
	cte = rcu_dereference(sio->cmd->te[sio->cte_idx]);
	if (!cte) {
#ifdef SSD_DEBUG
		WPRINTK(sio, "invalid cte");
#endif
	} else if (cte->sector != sio->bio_sector) {
#ifdef SSD_DEBUG
		WPRINTK(sio, "wrong sector %llx %llx",
			(unsigned long long)cte->sector,
			(unsigned long long)sio->bio_sector);
#endif
	} else if (cte_cache_is_busy(cte, sio->bio_mask)) {
		/* There is an outstanding cache write */
#ifdef SSD_DEBUG
		WPRINTK(sio, "cache sector busy");
#endif
	} else {
		safe = true;
	}
	rcu_read_unlock();

	return safe;
}
/*
* Hashing
*
* We implement double hashing to avoid hash collisions.
*/
/*
 * ssdcache_hash_64 - hash the block number of a sector.
 * @sc: cache context supplying block_size and hash_bits
 * @sector: sector on the target device
 *
 * The sector is shifted right by fls(block_size) - 1 and the result is
 * fed through hash_64() to obtain a value of hash_bits bits.
 */
static unsigned long ssdcache_hash_64(struct ssdcache_ctx *sc, sector_t sector)
{
	unsigned long shift = fls(sc->block_size) - 1;
	unsigned long block_nr = sector >> shift;

	return (unsigned long)hash_64(block_nr, sc->hash_bits);
}
/*
 * ssdcache_hash_wrap - map a sector to a hash number by wrapping.
 * @sc: cache context supplying block_size and hash_bits
 * @sector: sector on the target device
 *
 * Returns the low hash_bits bits of the sector shifted right by
 * fls(block_size) - 1.
 */
static unsigned long ssdcache_hash_wrap(struct ssdcache_ctx *sc, sector_t sector)
{
	unsigned long shift = fls(sc->block_size) - 1;
	unsigned long block_nr = sector >> shift;
	unsigned long mask = (1UL << sc->hash_bits) - 1;

	return block_nr & mask;
}
/*
 * rehash_block - compute the next hash number to probe.
 * @sc: cache context
 * @sector: sector on the target device
 * @hash_number: hash number that was just probed
 *
 * Adds ssdcache_hash_64() of the sector to @hash_number. If the masked
 * sum equals @hash_number, the sum is bumped by one so that a different
 * number is probed. The result is masked to hash_bits bits.
 */
static unsigned long rehash_block(struct ssdcache_ctx *sc, sector_t sector,
				  unsigned long hash_number)
{
	unsigned long mask = (1UL << sc->hash_bits) - 1;
	unsigned long next = hash_number + ssdcache_hash_64(sc, sector);

	if ((next & mask) == hash_number)
		next++;

	return next & mask;
}
/*
 * cte_match - look up (and if necessary instantiate) the cte for a request.
 * @sio: request; bio_sector and bio_mask must be set. On return sio->cmd
 *       and sio->cte_idx describe the selected slot (cte_idx is -1 when
 *       no slot was selected).
 * @rw: WRITE or READ
 *
 * Starting with the wrapped hash of the sector, the cmd for the hash
 * number is looked up and its slots are scanned for an entry holding the
 * request's sector. On a hit the entry's state decides the result and,
 * for writes, the entry is updated accordingly. Without a hit and without
 * a free slot the lookup is retried with a rehashed number, at most
 * options.assoc times. Finally a free slot or an eviction candidate is
 * filled with a new entry through sio_new_cache_write().
 *
 * Returns the lookup outcome; the matching statistics counter in the
 * context is incremented.
 */
static enum cte_match_t cte_match(struct ssdcache_io *sio, int rw)
{
unsigned long hash_number;
unsigned long cte_atime, oldest_atime;
unsigned long cte_count, oldest_count;
int skip_cmd_instantiation = 0;
int evict_cte_on_write = 0;
int invalid, oldest, i, index, busy = 0, assoc = 0;
enum cte_match_t retval = CTE_LOOKUP_FAILED;
hash_number = ssdcache_hash_wrap(sio->sc, sio->bio_sector);
if (rw == WRITE && sio->sc->options.skip_write_insert)
skip_cmd_instantiation = 1;
/*
 * NOTE(review): this flag stays 0 for every READ, so the eviction
 * candidate selection further down is skipped for reads as well and a
 * READ without a free slot ends up in the CTE_WRITE_SKIP branch at the
 * end. Confirm that this is intended.
 */
if (rw == WRITE && sio->sc->options.evict_on_write)
evict_cte_on_write = 1;
retry:
/* Reset the per-cmd scan state; the hash number may have changed */
oldest_atime = jiffies;
oldest_count = -1;
oldest = -1;
invalid = -1;
index = -1;
/* Lookup cmd */
sio->cmd = cmd_lookup(sio->sc, hash_number);
if (!sio->cmd) {
if (skip_cmd_instantiation) {
/* Skip cte instantiation on WRITE */
sio->cte_idx = -1;
sio->sc->write_skip++;
return CTE_WRITE_SKIP;
}
if (sio->error) {
/* Target write already completed */
sio->cte_idx = -1;
sio->sc->write_done++;
return CTE_WRITE_DONE;
}
sio->cmd = cmd_insert(sio->sc, hash_number);
if (!sio->cmd) {
DPRINTK("%lu: %s: cmd insertion failure",
sio->nr, __FUNCTION__);
sio->sc->lookup_failed++;
sio->cte_idx = -1;
retval = CTE_LOOKUP_FAILED;
goto out;
} else {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:0): use first clean entry",
sio->nr, __FUNCTION__, sio->cmd->hash);
#endif
/* Clean cmd, first entry is useable */
invalid = 0;
/* Skip cte lookup */
goto found;
}
}
/* Scan the slots of the cmd for the request's sector */
for (i = 0; i < sio->cmd->num_cte; i++) {
struct ssdcache_te *cte;
/*
 * NOTE(review): the entry is dereferenced below after the read-side
 * critical section has ended; a concurrent replacement could free it
 * while it is still being examined.
 */
rcu_read_lock();
cte = rcu_dereference(sio->cmd->te[i]);
rcu_read_unlock();
if (!cte) {
/* Remember the first empty slot */
if (invalid == -1)
invalid = i;
continue;
}
if (cte->sector == sio->bio_sector) {
/* Hit: the entry's state decides what happens next */
sio->cte_idx = i;
if (cte_is_clean(cte, sio->bio_mask) && rw == READ) {
sio->sc->read_clean++;
retval = CTE_READ_CLEAN;
/* WRITE_CLEAN is mapped to WRITE_INVALID */
} else if (cte_target_is_busy(cte, sio->bio_mask)) {
/* Target busy */
if (rw == WRITE) {
if (CACHE_IS_READCACHE(sio->sc)) {
sio_cte_invalidate(sio);
sio->sc->write_cancel++;
retval = CTE_WRITE_CANCEL;
} else if ((CACHE_IS_WRITEBACK(sio->sc)) &&
sio->sc->options.queue_busy) {
cte_start_cache_write(sio);
sio->sc->write_busy++;
retval = CTE_WRITE_BUSY;
} else {
cte_cancel_target_write(sio);
sio->sc->write_cancel++;
retval = CTE_WRITE_CANCEL;
}
} else {
sio->sc->read_busy++;
retval = CTE_READ_BUSY;
}
} else if (cte_cache_is_busy(cte, sio->bio_mask)) {
/* Cache busy */
if (rw == WRITE) {
if (CACHE_IS_READCACHE(sio->sc)) {
sio_cte_invalidate(sio);
sio->sc->write_cancel++;
retval = CTE_WRITE_CANCEL;
} else if (CACHE_IS_WRITETHROUGH(sio->sc) &&
sio->sc->options.queue_busy) {
cte_start_target_write(sio);
sio->sc->write_busy++;
retval = CTE_WRITE_BUSY;
} else {
cte_cancel_cache_write(sio);
sio->sc->write_cancel++;
retval = CTE_WRITE_CANCEL;
}
} else {
sio->sc->read_busy++;
retval = CTE_READ_BUSY;
}
} else {
/* Invalid cte sector */
if (rw == WRITE) {
cte_start_write(sio);
sio->sc->write_invalid++;
retval = CTE_WRITE_INVALID;
} else {
sio->sc->read_invalid++;
retval = CTE_READ_INVALID;
}
}
goto out;
}
/* From here on the entry holds a different block */
/*
* Do not attempt to evict entries when
* target writes have already completed.
*/
if (sio->error) {
DPRINTK("%lu: %s (cte %lx:%x): error %d",
sio->nr, __FUNCTION__, sio->cmd->hash, i,
sio->error);
continue;
}
/*
 * NOTE(review): this stops the scan at the first foreign entry that
 * follows an empty slot, so a matching entry in a later slot is not
 * found. Confirm that matching entries can never sit behind a hole.
 */
/* Break out if we have found an invalid entry */
if (invalid != -1)
break;
/* Skip cache eviction on WRITE */
if (!evict_cte_on_write)
continue;
/* Can only eject non-busy entries */
if (cte_target_is_busy(cte, sio->bio_mask) ||
cte_cache_is_busy(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:%x): skip busy cte",
sio->nr, __FUNCTION__, sio->cmd->hash, i);
#endif
busy++;
continue;
}
/* Can only eject CLEAN entries */
if (!cte_is_clean(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:%x): skip not-clean cte",
sio->nr, __FUNCTION__, sio->cmd->hash, i);
#endif
busy++;
continue;
}
if (CACHE_USE_LRU(sio->sc)) {
/* Select the oldest clean entry */
rcu_read_lock();
cte_atime = rcu_dereference(cte)->atime;
rcu_read_unlock();
if (time_before_eq(cte_atime, oldest_atime)) {
oldest_atime = cte_atime;
oldest = i;
}
} else {
/* Select the lowest access count */
rcu_read_lock();
cte_count = rcu_dereference(cte)->count;
rcu_read_unlock();
if (cte_count <= oldest_count) {
oldest_count = cte_count;
oldest = i;
}
}
}
/* No hit and no free slot: probe the next hash number, if allowed */
if (invalid == -1 && assoc < sio->sc->options.assoc && !sio->error) {
hash_number = rehash_block(sio->sc, sio->bio_sector,
hash_number);
assoc++;
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:%x): retry with assoc %d",
sio->nr, __FUNCTION__, sio->cmd->hash, i, assoc);
#endif
goto retry;
}
found:
/* Prefer an empty slot over evicting an existing entry */
if (invalid != -1) {
index = invalid;
} else if (oldest != -1) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:%x): drop oldest cte", sio->nr,
__FUNCTION__, sio->cmd->hash, oldest);
#endif
sio->sc->cache_evictions++;
index = oldest;
} else {
index = -1;
}
if (index != -1) {
sio->cte_idx = index;
retval = sio_new_cache_write(sio, rw);
} else if (sio->error) {
sio->cte_idx = -1;
sio->sc->write_done++;
retval = CTE_WRITE_DONE;
} else if (!evict_cte_on_write) {
sio->cte_idx = -1;
sio->sc->write_skip++;
retval = CTE_WRITE_SKIP;
} else {
/* Every candidate was busy or not clean; retval stays CTE_LOOKUP_FAILED */
DPRINTK("%lu: %s (cte %lx:ff): %d/%d ctes busy", sio->nr,
__FUNCTION__, sio->cmd->hash, busy, assoc);
sio->cte_idx = -1;
sio->sc->lookup_busy++;
}
out:
return retval;
}
static void sio_lookup_async(struct ssdcache_io *sio)
{
enum cte_match_t ret = CTE_LOOKUP_FAILED;
if (!sio->cmd || sio->cte_idx == -1)
ret = cte_match(sio, WRITE);
switch (ret) {
case CTE_WRITE_INVALID:
case CTE_WRITE_CLEAN:
case CTE_WRITE_MISS:
ssdcache_get_sio(sio);
write_to_cache(sio, sio->bio);
break;
case CTE_WRITE_BUSY:
WPRINTK(sio, "cte busy for write");
sio->error = -EBUSY;
break;
case CTE_WRITE_DONE:
#ifdef SSD_DEBUG
DPRINTK("%lu: %s: cte already done",
sio->nr, __FUNCTION__);
#endif
break;
default:
DPRINTK("%lu: %s (cte %lx:%lx): cte lookup failed %d",
sio->nr, __FUNCTION__,
sio->cmd ? sio->cmd->hash : 0xffff,
sio->cte_idx == -1 ? 0xfff : sio->cte_idx, ret);
sio->error = -ENOENT;
}
}
#ifdef SSD_DEBUG
/*
 * sio_in_flight - log how many sios are currently queued on _io_work
 *
 * Debug helper; it is only called from code that is itself compiled under
 * SSD_DEBUG, so the definition is guarded the same way to avoid an unused
 * static function in non-debug builds.
 */
static void sio_in_flight(void)
{
	struct ssdcache_io *sio;
	unsigned long flags;
	int in_flight = 0;

	/* The list is walked under _work_lock, the same lock the worker uses */
	spin_lock_irqsave(&_work_lock, flags);
	list_for_each_entry(sio, &_io_work, list)
		in_flight++;
	spin_unlock_irqrestore(&_work_lock, flags);

	DPRINTK("%d sios in flight", in_flight);
}
#endif /* SSD_DEBUG */
/*
 * process_sio - workqueue handler draining the global _io_work list
 *
 * Entries are removed from _io_work one at a time under _work_lock instead
 * of being moved to a private list with list_splice(). That keeps every
 * still-pending sio on _io_work, where it can be traversed (and counted)
 * from another context/function.
 *
 * Each dequeued sio is dispatched according to which bio it carries and
 * the reference taken for the queue entry is dropped afterwards.
 */
static void process_sio(struct work_struct *ignored)
{
	struct ssdcache_io *sio;
	unsigned long flags;

	for (;;) {
		/* Pop the head of the work list, if any */
		sio = NULL;
		spin_lock_irqsave(&_work_lock, flags);
		if (!list_empty(&_io_work)) {
			sio = list_first_entry(&_io_work,
					       struct ssdcache_io, list);
			list_del_init(&sio->list);
		}
		spin_unlock_irqrestore(&_work_lock, flags);

		if (!sio)
			return;

		if (!sio->bio) {
			if (!sio->writeback_bio) {
				WPRINTK(sio, "invalid workqueue state");
			} else if (sio_check_writeback(sio)) {
				/* Start writing to cache device */
				ssdcache_get_sio(sio);
				cte_start_cache_write(sio);
				write_to_cache(sio, sio->writeback_bio);
			} else {
				/* Cancel writeback */
				unmap_writeback_bio(sio);
				sio->error = -ESTALE;
				sio->sc->writeback_cancelled++;
			}
		} else if (sio->bio->bi_rw & REQ_FLUSH) {
			/* secondary write */
			/* DM only sends empty flushes */
			DPRINTK("%lu: %s: flush done",
				sio->nr, __FUNCTION__);
			bio_endio(sio->bio, 0);
			bio_put(sio->bio);
			sio->bio = NULL;
		} else if (!sio_match_sector(sio)) {
			WPRINTK(sio, "target cte overrun");
			sio->error = -ESTALE;
			sio->sc->cache_overruns++;
		} else if (CACHE_IS_WRITETHROUGH(sio->sc)) {
			if (!sio_cache_is_busy(sio)) {
				WPRINTK(sio, "cache cte not busy");
				sio->error = -ESTALE;
				sio->sc->bio_cancelled++;
			} else if (sio->sc->options.async_lookup) {
				sio_lookup_async(sio);
			} else {
				ssdcache_get_sio(sio);
				write_to_cache(sio, sio->bio);
			}
		} else if (!sio_target_is_busy(sio)) {
			WPRINTK(sio, "target cte not busy");
			sio->error = -ESTALE;
			sio->sc->bio_cancelled++;
		} else {
			ssdcache_get_sio(sio);
			if (sio->sc->options.async_lookup)
				cte_start_target_write(sio);
			write_to_target(sio, sio->bio);
		}

		ssdcache_put_sio(sio);
	}
}
/*
 * ssdcache_map - device-mapper map callback
 * @ti:          the ssdcache target
 * @bio:         the incoming bio
 * @map_context: per-bio context; ->ptr carries the sio to ssdcache_endio(),
 *               or NULL when endio has nothing to do for this bio
 *
 * Returns -EOPNOTSUPP for discards, DM_MAPIO_SUBMITTED when the bio has
 * been queued or completed here, and DM_MAPIO_REMAPPED when the bio has
 * been pointed at the device DM should submit it to.
 *
 * Bios that cannot be cached (oversized, zero-sized non-flush, or no sio
 * could be allocated) bypass the cache and are remapped to the target
 * device.
 */
static int ssdcache_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct ssdcache_ctx *sc = ti->private;
	struct ssdcache_io *sio;

	/* We don't support DISCARD or SECURE_DISCARD (yet) */
	if (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)) {
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		return -EOPNOTSUPP;
	}

	if (bio_cur_bytes(bio) > to_bytes(sc->block_size)) {
		DPRINTK("bio size %u larger than block size",
			bio_cur_bytes(bio));
		sc->cache_bypassed++;
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		return DM_MAPIO_REMAPPED;
	}

	if (bio_cur_bytes(bio) == 0 &&
	    !(bio->bi_rw & REQ_FLUSH)) {
		DPRINTK("zero-sized bio (bi_rw %lx)", bio->bi_rw);
		sc->cache_bypassed++;
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		return DM_MAPIO_REMAPPED;
	}

	sio = ssdcache_create_sio(sc);
	if (!sio) {
		/*
		 * No sio available: bypass the cache. sio is NULL here, so
		 * it must neither be dereferenced nor released, and endio
		 * must not see a stale context pointer.
		 */
		DPRINTK("sio creation failure");
		sc->cache_bypassed++;
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		return DM_MAPIO_REMAPPED;
	}

	sio->bio_sector = cte_bio_align(sc, bio);
	sio_bio_mask(sio, bio);
	map_context->ptr = sio;

	if (bio->bi_rw & REQ_FLUSH) {
		DPRINTK("%lu: %s: flush start", sio->nr, __FUNCTION__);
#ifdef SSD_DEBUG
		sio_in_flight();
#endif
		/* The flush is completed later from the workqueue */
		sio->bio = bio;
		bio_get(bio);
		map_context->ptr = NULL;
		ssdcache_schedule_sio(sio);
		return DM_MAPIO_SUBMITTED;
	}

	if (sc->options.async_lookup &&
	    (bio_data_dir(bio) == WRITE)) {
		map_secondary_bio(sio, bio);
		ssdcache_schedule_sio(sio);
		if (CACHE_IS_WRITETHROUGH(sc)) {
			bio->bi_bdev = sc->target_dev->bdev;
			return DM_MAPIO_REMAPPED;
		}
	}

	switch (cte_match(sio, bio_data_dir(bio))) {
	case CTE_READ_CLEAN:
		/* Cache hit, cte clean */
#ifdef SSD_DEBUG
		WPRINTK(sio, "read hit clean %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		bio->bi_bdev = sio->sc->cache_dev->bdev;
		bio->bi_sector = to_cache_sector(sio, bio->bi_sector);
		map_context->ptr = NULL;
		ssdcache_put_sio(sio);
		break;
	case CTE_READ_BUSY:
		WPRINTK(sio, "read hit busy %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
		/* Do not start prefetching here, sector is already busy */
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		ssdcache_put_sio(sio);
		break;
	case CTE_READ_INVALID:
#ifdef SSD_DEBUG
		WPRINTK(sio, "read invalid %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_prefetch(sio, bio);
		/* Without a writeback bio endio has nothing left to do */
		if (!sio->writeback_bio) {
			map_context->ptr = NULL;
			ssdcache_put_sio(sio);
		}
		break;
	case CTE_READ_MISS:
#ifdef SSD_DEBUG
		WPRINTK(sio, "read miss %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_prefetch(sio, bio);
		if (!sio->writeback_bio) {
			map_context->ptr = NULL;
			ssdcache_put_sio(sio);
		}
		break;
	case CTE_LOOKUP_FAILED:
		DPRINTK("%lu: %s: lookup failure %llx %u",
			sio->nr, __FUNCTION__,
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
		/* Fallthrough */
	case CTE_WRITE_SKIP:
		bio->bi_bdev = sc->target_dev->bdev;
		map_context->ptr = NULL;
		ssdcache_put_sio(sio);
		break;
	case CTE_WRITE_DONE:
		/* Write to target already completed */
		DPRINTK("%lu: %s: write target done", sio->nr, __FUNCTION__);
		map_context->ptr = NULL;
		bio_endio(bio, sio->error);
		ssdcache_put_sio(sio);
		return DM_MAPIO_SUBMITTED;
	case CTE_WRITE_BUSY:
#ifdef SSD_DEBUG
		WPRINTK(sio, "write hit busy %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_write_busy(sio, bio);
		if (sc->options.queue_busy && !sio->writeback_bio) {
			map_context->ptr = NULL;
			ssdcache_put_sio(sio);
		}
		break;
	case CTE_WRITE_CANCEL:
		WPRINTK(sio, "write hit cancel %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
		map_context->ptr = NULL;
		ssdcache_put_sio(sio);
		break;
	case CTE_WRITE_INVALID:
#ifdef SSD_DEBUG
		WPRINTK(sio, "write hit invalid %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_write_miss(sio, bio);
		break;
	case CTE_WRITE_CLEAN:
#ifdef SSD_DEBUG
		WPRINTK(sio, "write hit clean %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_write_miss(sio, bio);
		break;
	case CTE_WRITE_MISS:
#ifdef SSD_DEBUG
		WPRINTK(sio, "write miss %llx %u",
			(unsigned long long)bio->bi_sector,
			bio_cur_bytes(bio));
#endif
		sio_start_write_miss(sio, bio);
		break;
	}
	return DM_MAPIO_REMAPPED;
}
static int ssdcache_endio(struct dm_target *ti, struct bio *bio,
int error, union map_info *map_context)
{
struct ssdcache_io *sio = map_context->ptr;
if (!sio)
return error;
if (!sio->cmd || sio->cte_idx == -1) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s: cte lookup not finished",
sio->nr, __FUNCTION__);
#endif
sio->sc->cache_overruns++;
sio->error = error ? error : -EUCLEAN;
goto out;
}
if (error) {
WPRINTK(sio, "finished with %u", error);
sio->error = error;
}
if (sio->error) {
WPRINTK(sio, "error %d", sio->error);
unmap_writeback_bio(sio);
}
if (bio_data_dir(bio) == WRITE) {
bool to_cache = bio->bi_bdev == sio->sc->cache_dev->bdev;
if (!sio_match_sector(sio)) {
WPRINTK(sio, "cte overrun, not updating state");
sio->sc->cache_overruns++;
} else if (to_cache) {
if (!sio_cache_is_busy(sio)) {
WPRINTK(sio, "cache not busy, not updating");
sio->sc->cache_overruns++;
} else {
sio_finish_cache_write(sio);
}
} else {
if (CACHE_IS_READCACHE(sio->sc)) {
sio_finish_target_write(sio);
} else if (!sio_target_is_busy(sio)) {
WPRINTK(sio, "target not busy, not updating");
sio->sc->cache_overruns++;
} else {
sio_finish_target_write(sio);
}
}
}
if (sio->writeback_bio) {
/* Kick off writeback */
ssdcache_schedule_sio(sio);
}
out:
ssdcache_put_sio(sio);
return error;
}
static int ssdcache_parse_options(struct dm_target *ti,
struct dm_arg_set *as,
struct ssdcache_ctx *sc)
{
int r;
unsigned int argc;
const char *opt_name;
static struct dm_arg _args[] = {
{0, 5, "invalid number of options"},
};
r = dm_read_arg_group(_args, as, &argc, &ti->error);
if (r)
return -EINVAL;
if (!argc)
return 0;
do {
opt_name = dm_shift_arg(as);
argc--;
if (!strcasecmp(opt_name, "lfu")) {
sc->options.strategy = CACHE_LFU;
continue;
}
if (!strcasecmp(opt_name, "lru")) {
sc->options.strategy = CACHE_LRU;
continue;
}
if (!strcasecmp(opt_name, "async_lookup")) {
sc->options.async_lookup = 1;
continue;
}
if (!strcasecmp(opt_name, "queue_busy")) {
sc->options.queue_busy = 1;
continue;
}
if (!strcasecmp(opt_name, "disable_writeback")) {
sc->options.disable_writeback = 1;
continue;
}
if (!strcasecmp(opt_name, "skip_write_insert")) {
sc->options.skip_write_insert = 1;
continue;
}
if (!strcasecmp(opt_name, "evict_on_write")) {
sc->options.evict_on_write = 1;
continue;
}
if (!strcasecmp(opt_name, "cmd_preload")) {
sc->options.cmd_preload = 1;
continue;
}
} while (argc);
return 0;
}
void ssdcache_format_options(struct ssdcache_ctx *sc, char *optstr)
{
int optnum = 0;
if (sc->options.strategy != default_cache_strategy)
optnum++;
if (sc->options.async_lookup)
optnum++;
if (sc->options.queue_busy)
optnum++;
if (sc->options.disable_writeback)
optnum++;
if (sc->options.skip_write_insert)
optnum++;
if (sc->options.evict_on_write)
optnum++;
if (sc->options.cmd_preload)
optnum++;
if (!optnum) {
optstr[0] = '\0';
return;
}
sprintf(optstr," options %d ", optnum);
if (sc->options.strategy != default_cache_strategy) {
if (sc->options.strategy == CACHE_LFU)
strcat(optstr, "lfu ");
else
strcat(optstr, "lru ");
}
if (sc->options.async_lookup)
strcat(optstr, "async_lookup ");
if (sc->options.queue_busy)
strcat(optstr, "queue_busy ");
if (sc->options.disable_writeback)
strcat(optstr, "disable_writeback ");
if (sc->options.skip_write_insert)
strcat(optstr, "skip_write_insert ");
if (sc->options.evict_on_write)
strcat(optstr, "evict_on_write ");
if (sc->options.cmd_preload)
strcat(optstr, "cmd_preload ");
optstr[strlen(optstr)] = '\0';
}
/*
* Construct a ssdcache mapping: <target_dev_path> <cache_dev_path>
*/
static int ssdcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct ssdcache_ctx *sc;
struct dm_arg_set as;
const char *devname;
const char *argname;
unsigned long num_cmd;
unsigned long cdev_size;
unsigned long long tdev_size;
int r = 0;
as.argc = argc;
as.argv = argv;
sc = kzalloc(sizeof(*sc), GFP_KERNEL);
if (sc == NULL) {
ti->error = "dm-ssdcache: Cannot allocate ssdcache context";
return -ENOMEM;
}
devname = dm_shift_arg(&as);
if (!devname) {
ti->error = "dm-ssdcache: Target device is not specified";
r = -EINVAL;
goto bad;
}
if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
&sc->target_dev)) {
ti->error = "dm-ssdcache: Target device lookup failed";
r = -EINVAL;
goto bad;
}
devname = dm_shift_arg(&as);
if (!devname) {
ti->error = "dm-ssdcache: Cache device is not specified";
r = -EINVAL;
goto bad;
}
if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
&sc->cache_dev)) {
ti->error = "dm-ssdcache: Cache device lookup failed";
dm_put_device(ti, sc->target_dev);
r = -EINVAL;
goto bad;
}
sc->block_size = DEFAULT_BLOCKSIZE;
sc->options.strategy = default_cache_strategy;
sc->options.mode = default_cache_mode;
sc->options.assoc = DEFAULT_ASSOCIATIVITY;
while ((argname = dm_shift_arg(&as)) != NULL) {
if (!strcasecmp(argname, "blocksize")) {
if (sscanf(dm_shift_arg(&as), "%lu", &sc->block_size) != 1) {
ti->error = "Invalid blocksize";
}
if (sc->block_size < 1) {
ti->error = "blocksize too small";
sc->block_size = DEFAULT_BLOCKSIZE;
}
} else if (!strcasecmp(argname, "assoc")) {
if (sscanf(dm_shift_arg(&as), "%d",
&sc->options.assoc) != 1) {
ti->error = "Invalid associativity";
}
if (sc->block_size < 1) {
ti->error = "Associativity must be at least 1";
sc->block_size = DEFAULT_ASSOCIATIVITY;
}
} else if (!strcasecmp(argname, "writeback")) {
sc->options.mode = CACHE_MODE_WRITEBACK;
} else if (!strcasecmp(argname, "writethrough")) {
sc->options.mode = CACHE_MODE_WRITETHROUGH;
} else if (!strcasecmp(argname, "readcache")) {
sc->options.mode = CACHE_MODE_READCACHE;
} else if (!strcasecmp(argname, "options")) {
if (ssdcache_parse_options(ti, &as, sc)) {
r = -EINVAL;
goto bad_io_client;
}
} else {
ti->error = "Invalid argument";
r = -EINVAL;
goto bad_io_client;
}
}
sc->iocp = dm_io_client_create();
if (IS_ERR(sc->iocp)) {
r = PTR_ERR(sc->iocp);
ti->error = "Failed to create io client\n";
goto bad_io_client;
}
spin_lock_init(&sc->cmd_lock);
cdev_size = i_size_read(sc->cache_dev->bdev->bd_inode);
tdev_size = i_size_read(sc->target_dev->bdev->bd_inode);
num_cmd = cdev_size / to_bytes(sc->block_size) / DEFAULT_ALIASING;
/*
* Hash bit calculation might return a lower number
* for the possible number of ctes, so adjust that
* as well.
*/
sc->hash_bits = fls(num_cmd) - 1;
num_cmd = (1UL << sc->hash_bits);
DPRINTK("block size %lu, hash bits %lu, num cmd %lu",
to_bytes(sc->block_size), sc->hash_bits, num_cmd);
#ifdef SSDCACHE_USE_RADIX_TREE
INIT_RADIX_TREE(&sc->md_tree, GFP_ATOMIC);
#else
sc->md_table = vmalloc(num_cmd * sizeof(struct ssdcache_md *));
memset(sc->md_table, 0, num_cmd * sizeof(struct ssdcache_md *));
#endif
sc->data_offset = 0;
sc->block_mask = sc->block_size - 1;
sc->nr_sio = 0;
if (sc->options.cmd_preload) {
int i;
for (i = 0; i < num_cmd; i++)
cmd_insert(sc, i);
}
ti->num_flush_requests = 1;
ti->num_discard_requests = 1;
ti->private = sc;
ti->split_io = sc->block_size;
return 0;
bad_io_client:
dm_put_device(ti, sc->target_dev);
dm_put_device(ti, sc->cache_dev);
bad:
kfree(sc);
return r;
}
/*
 * ssdcache_dtr - destroy an ssdcache target
 *
 * Removes every cmd from the lookup structure (radix tree or flat table,
 * depending on SSDCACHE_USE_RADIX_TREE) and hands it to cmd_remove(), then
 * destroys the dm-io client, drops both device references and frees the
 * context.
 */
static void ssdcache_dtr(struct dm_target *ti)
{
struct ssdcache_ctx *sc = (struct ssdcache_ctx *) ti->private;
#ifdef SSDCACHE_USE_RADIX_TREE
unsigned long pos = 0, nr_cmds;
struct ssdcache_md *cmds[MIN_CMD_NUM];
#endif
struct ssdcache_md *cmd;
int i;
#ifdef SSDCACHE_USE_RADIX_TREE
/* Drain the tree in batches of up to MIN_CMD_NUM entries */
do {
spin_lock(&sc->cmd_lock);
nr_cmds = radix_tree_gang_lookup(&sc->md_tree,
(void **)cmds, pos,
MIN_CMD_NUM);
for (i = 0; i < nr_cmds; i++) {
pos = cmds[i]->hash;
cmd = radix_tree_delete(&sc->md_tree, pos);
/*
 * cmd_lock is released around cmd_remove() and retaken for the next
 * entry -- presumably because cmd_remove() must not run under the
 * spinlock; TODO confirm against cmd_remove().
 */
spin_unlock(&sc->cmd_lock);
cmd_remove(cmd);
spin_lock(&sc->cmd_lock);
}
spin_unlock(&sc->cmd_lock);
/* Continue the next lookup just past the last hash handled */
pos++;
/* A short batch means the lookup ran out of entries */
} while (nr_cmds == MIN_CMD_NUM);
#else
for (i = 0; i < (1UL << sc->hash_bits); i++) {
spin_lock(&sc->cmd_lock);
cmd = sc->md_table[i];
rcu_assign_pointer(sc->md_table[i], NULL);
spin_unlock(&sc->cmd_lock);
/* Wait for RCU readers of the old slot before removing the cmd */
synchronize_rcu();
/* NOTE(review): cmd may be NULL for an unused slot -- confirm cmd_remove() copes */
cmd_remove(cmd);
}
vfree(sc->md_table);
#endif
dm_io_client_destroy(sc->iocp);
dm_put_device(ti, sc->target_dev);
dm_put_device(ti, sc->cache_dev);
kfree(sc);
}
/*
 * ssdcache_status - device-mapper status callback
 * @ti:     the ssdcache target
 * @type:   STATUSTYPE_INFO for runtime counters, STATUSTYPE_TABLE for the
 *          table line
 * @result: output buffer
 * @maxlen: size of @result
 *
 * Walks all cmds under rcu_read_lock() to count cmds, ctes and how many
 * ctes have target-busy, cache-busy or fully-clean bitmaps, then formats
 * either the statistics or the table line. Always returns 0.
 */
static int ssdcache_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	struct ssdcache_ctx *sc = (struct ssdcache_ctx *) ti->private;
#ifdef SSDCACHE_USE_RADIX_TREE
	unsigned long nr_elems, pos = 0;
	struct ssdcache_md *cmds[MIN_CMD_NUM];
#endif
	struct ssdcache_md *cmd;
	struct ssdcache_te *cte;
	char optstr[512];
	const char *modestr;
	unsigned long nr_cmds = 0, nr_ctes = 0;
	unsigned long nr_cache_busy = 0, nr_target_busy = 0, nr_cte_full = 0;
	int i, j;

	rcu_read_lock();
#ifdef SSDCACHE_USE_RADIX_TREE
	/* Visit the tree in batches of up to MIN_CMD_NUM entries */
	do {
		nr_elems = radix_tree_gang_lookup(&sc->md_tree,
						  (void **)cmds, pos,
						  MIN_CMD_NUM);
		for (i = 0; i < nr_elems; i++) {
			cmd = cmds[i];
			pos = cmd->hash;
			nr_cmds++;
			for (j = 0; j < cmd->num_cte; j++) {
				cte = rcu_dereference(cmd->te[j]);
				if (cte) {
					nr_ctes++;
					if (!bitmap_empty(cte->target_busy,
							  DEFAULT_BLOCKSIZE))
						nr_target_busy++;
					if (!bitmap_empty(cte->cache_busy,
							  DEFAULT_BLOCKSIZE))
						nr_cache_busy++;
					if (bitmap_full(cte->clean,
							DEFAULT_BLOCKSIZE))
						nr_cte_full++;
				}
			}
		}
		/* Resume just past the last hash seen */
		pos++;
	} while (nr_elems == MIN_CMD_NUM);
#else
	for (i = 0; i < (1UL << sc->hash_bits); i++) {
		cmd = rcu_dereference(sc->md_table[i]);
		if (cmd) {
			nr_cmds++;
			for (j = 0; j < cmd->num_cte; j++) {
				cte = rcu_dereference(cmd->te[j]);
				if (cte) {
					nr_ctes++;
					if (!bitmap_empty(cte->target_busy,
							  DEFAULT_BLOCKSIZE))
						nr_target_busy++;
					if (!bitmap_empty(cte->cache_busy,
							  DEFAULT_BLOCKSIZE))
						nr_cache_busy++;
					if (bitmap_full(cte->clean,
							DEFAULT_BLOCKSIZE))
						nr_cte_full++;
				}
			}
		}
	}
#endif
	rcu_read_unlock();

	switch (type) {
	case STATUSTYPE_INFO:
		snprintf(result, maxlen, "cmd %lu/%lu cte %lu/%lu/%lu\n"
			 "\tread clean %lu busy %lu invalid %lu miss %lu\n"
			 "\twrite busy %lu cancel %lu invalid %lu "
			 "miss %lu done %lu skip %lu\n"
			 "\tbypassed %lu evicts %lu cancelled writeback %lu bio %lu",
			 nr_cmds, (1UL << sc->hash_bits), nr_cte_full, nr_ctes,
			 (1UL << sc->hash_bits) * DEFAULT_ALIASING,
			 sc->read_clean, sc->read_busy,
			 sc->read_invalid, sc->read_miss,
			 sc->write_busy, sc->write_cancel, sc->write_invalid,
			 sc->write_miss, sc->write_done, sc->write_skip,
			 sc->cache_bypassed, sc->cache_evictions,
			 sc->writeback_cancelled, sc->bio_cancelled);
		break;
	case STATUSTYPE_TABLE:
		/*
		 * Point at a constant string rather than strcat()-ing onto
		 * an uninitialised stack buffer.
		 */
		if (sc->options.mode == CACHE_MODE_WRITEBACK)
			modestr = "writeback";
		else if (sc->options.mode == CACHE_MODE_READCACHE)
			modestr = "readcache";
		else
			modestr = "writethrough";
		ssdcache_format_options(sc, optstr);
		snprintf(result, maxlen, "%s %s blocksize %lu %s assoc %d%s",
			 sc->target_dev->name, sc->cache_dev->name,
			 sc->block_size, modestr, sc->options.assoc, optstr);
		break;
	}
	return 0;
}
/*
 * ssdcache_iterate_devices - report the underlying devices to DM core
 * @ti:   the ssdcache target
 * @fn:   callout to invoke once per underlying device
 * @data: opaque cookie passed through to @fn
 *
 * Both devices sit in the data path (ssdcache_map() remaps bios to either
 * of them), so both are reported. The target device is mapped 1:1 over
 * ti->len; the cache device is reported with its own size, which need not
 * match ti->len.
 *
 * Returns the first non-zero value returned by @fn, or 0.
 */
static int ssdcache_iterate_devices(struct dm_target *ti,
				    iterate_devices_callout_fn fn, void *data)
{
	struct ssdcache_ctx *sc = ti->private;
	sector_t cache_len;
	int r;

	if (!sc)
		return 0;

	r = fn(ti, sc->target_dev, 0, ti->len, data);
	if (r)
		return r;

	cache_len = i_size_read(sc->cache_dev->bdev->bd_inode) >> SECTOR_SHIFT;
	return fn(ti, sc->cache_dev, 0, cache_len, data);
}
/*
 * Target type registered with device-mapper under the name "ssdcache";
 * wires up the constructor, destructor, I/O and status callbacks defined
 * in this file.
 */
static struct target_type ssdcache_target = {
.name = "ssdcache",
.version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = ssdcache_ctr,
.dtr = ssdcache_dtr,
.map = ssdcache_map,
.end_io = ssdcache_endio,
.status = ssdcache_status,
.iterate_devices = ssdcache_iterate_devices,
};
/*
 * dm_ssdcache_init - module entry point
 *
 * Sets up the memory pools, the single-threaded "kssdcached" workqueue and
 * its work item, then registers the target type. Anything set up before a
 * failing step is torn down again.
 *
 * Returns the (non-negative) result of dm_register_target() on success or
 * a negative errno on failure.
 */
int __init dm_ssdcache_init(void)
{
	int err = pool_init();

	if (err < 0) {
		DMERR("kmempool allocation failed: %d", err);
		goto out;
	}

	_ssdcached_wq = create_singlethread_workqueue("kssdcached");
	if (!_ssdcached_wq) {
		DMERR("failed to start kssdcached");
		err = -ENOMEM;
		goto undo_pools;
	}
	INIT_WORK(&_ssdcached_work, process_sio);

	err = dm_register_target(&ssdcache_target);
	if (err >= 0)
		goto out;

	DMERR("register failed %d", err);
	destroy_workqueue(_ssdcached_wq);
undo_pools:
	pool_exit();
out:
	return err;
}
/*
 * dm_ssdcache_exit - module exit point
 *
 * Tears things down in the reverse order of dm_ssdcache_init(): the target
 * type is unregistered first so no new instances can appear, then the
 * workqueue is destroyed so queued work has finished, and only then are
 * the pools released that the queued work may still have been using.
 */
void dm_ssdcache_exit(void)
{
	dm_unregister_target(&ssdcache_target);
	destroy_workqueue(_ssdcached_wq);
	pool_exit();
}
/* Module entry/exit hooks and metadata */
module_init(dm_ssdcache_init);
module_exit(dm_ssdcache_exit);
MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Hannes Reinecke <hare@suse.de>");
MODULE_LICENSE("GPL");
[-- Attachment #3: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: RFC: dm-ssdcache
2012-03-16 15:38 RFC: dm-ssdcache Hannes Reinecke
@ 2012-03-16 18:06 ` Mike Snitzer
2012-03-19 13:57 ` [PATCH] dm ssdcache: fix and/or tweak various low hanging fruit Mike Snitzer
2012-03-17 19:47 ` RFC: dm-ssdcache Mark Hills
1 sibling, 1 reply; 4+ messages in thread
From: Mike Snitzer @ 2012-03-16 18:06 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: device-mapper development
On Fri, Mar 16 2012 at 11:38am -0400,
Hannes Reinecke <hare@suse.de> wrote:
> Hi all,
>
> to through some more confusion in an already heated debate, here is
> the 'dm-ssdcache' module I recently wrote.
> And held a talk about at LinuxCon Europe 2011 :-)
Thanks for posting.
Do you happen to have any slides you can share from your Linuxcon
Europe talk?
https://events.linuxfoundation.org/events/linuxcon-europe/reinecke
> Upon request I'll be generating a proper patch ...
>
> Not saying it's perfect, and surely doesn't have all the nifty
> features bcache has, but it should serve as a 'minimal'
> implementation. And probaly will start yet another discussion,
> if nothing else.
>
> So, comments etc are welcome.
Please expect some from me next week. And obviously we can continue
discussion at LSF.
Thanks again,
Mike
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: RFC: dm-ssdcache
2012-03-16 15:38 RFC: dm-ssdcache Hannes Reinecke
2012-03-16 18:06 ` Mike Snitzer
@ 2012-03-17 19:47 ` Mark Hills
1 sibling, 0 replies; 4+ messages in thread
From: Mark Hills @ 2012-03-17 19:47 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: device-mapper development
On Fri, 16 Mar 2012, Hannes Reinecke wrote:
> Hi all,
>
> to through some more confusion in an already heated debate, here is
> the 'dm-ssdcache' module I recently wrote.
> And held a talk about at LinuxCon Europe 2011 :-)
>
> Upon request I'll be generating a proper patch ...
>
> Not saying it's perfect, and surely doesn't have all the nifty
> features bcache has, but it should serve as a 'minimal'
> implementation. And probaly will start yet another discussion,
> if nothing else.
>
> So, comments etc are welcome.
Thanks for sharing this. I gave it a try with mixed but promising
results. I'll share my observations in case they are useful.
Writethrough mode worked as expected, as did 'readcache'. But I found the
caching would stop after a short time; data already in the cache was read
from there, but no new data written. dmesg no longer prints "flush start",
"flush done", but no other indication; possibly some lock is taken in some
way?
It took me a while to work out how to switch to LRU behaviour, as I had to
give the number of option arguments, for reference here it is:
dmsetup create test --table "0 625137282 ssdcache /dev/sdc1 /dev/sdb1 options 1 lru"
In writeback mode I found I was quickly getting ext4 reports of corruption,
even just between formatting the device and using it. Possibly it
bypasses the cache a little too readily under some conditions (e.g. large
bio size) when data is in the cache?
In all cases I zeroed the front of the cache device before use, but I
couldn't see quickly from the code whether the cache device is intended to
be persistent and if I need a better way to initialise it, I assume not?
"dmsetup remove" seems to take along time (several minutes), not spinning
on CPU or disk. But it does eventually complete. Possibly this is linked
to the non-caching issue and some kind of lock. But then that doesn't
explain why it eventually completes.
Thanks
--
Mark
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH] dm ssdcache: fix and/or tweak various low hanging fruit
2012-03-16 18:06 ` Mike Snitzer
@ 2012-03-19 13:57 ` Mike Snitzer
0 siblings, 0 replies; 4+ messages in thread
From: Mike Snitzer @ 2012-03-19 13:57 UTC (permalink / raw)
To: Hannes Reinecke; +Cc: device-mapper development
Initial review (which hasn't yet touched on design, algorithms,
naming, etc) uncovered some small things:
- remove ': ' from DM_MSG_PREFIX, tweaked {D,W}PRINTK
- eliminate 2 4-byte holes in ssdcache_md structure
- clean up pool_init() error handling, switched to using KMEM_CACHE()
- fix ssd_cache_ctr() error path, dm_put_device for target_dev was
missing if failed to get cache_dev
- fix ssdcache_iterate_devices() to consult cache_dev too because it
is in the data path (resulting ssdcache dev now stacks limits
properly, e.g.: if you mix a 4K ssd with a 512b target dev)
- wrap sio_in_flight with SSD_DEBUG
- document ssdcache_ctr (still have yet to make sense of the options)
- small s/eject/evict/ comment tweak
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
drivers/md/dm-ssdcache.c | 130 ++++++++++++++++++++++-----------------------
1 files changed, 64 insertions(+), 66 deletions(-)
diff --git a/drivers/md/dm-ssdcache.c b/drivers/md/dm-ssdcache.c
index f05e40f..1ab6cb3 100644
--- a/drivers/md/dm-ssdcache.c
+++ b/drivers/md/dm-ssdcache.c
@@ -1,6 +1,4 @@
/*
- * dm-ssdcache.c
- *
* Copyright (c) 2011 Hannes Reinecke, SUSE Linux Products GmbH
*
* This file is released under the GPL.
@@ -17,21 +15,21 @@
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
-#define DM_MSG_PREFIX "ssdcache: "
+#define DM_MSG_PREFIX "ssdcache"
// #define SSD_DEBUG
#define SSD_LOG
#define SSDCACHE_USE_RADIX_TREE
#ifdef SSD_LOG
-#define DPRINTK( s, arg... ) printk(DM_MSG_PREFIX s "\n", ##arg)
-#define WPRINTK( w, s, arg... ) printk(DM_MSG_PREFIX "%lu: %s (cte %lx:%02lx): "\
- s "\n", (w)->nr, __FUNCTION__, \
- (w)->cmd->hash, \
- (w)->cte_idx, ##arg)
+#define DPRINTK(s, arg...) printk(DM_MSG_PREFIX ": " s "\n", ## arg)
+#define WPRINTK(w, s, arg...) printk(DM_MSG_PREFIX ": %lu: %s (cte %lx:%02lx): " \
+ s "\n", (w)->nr, __FUNCTION__, \
+ (w)->cmd->hash, \
+ (w)->cte_idx, ## arg)
#else
-#define DPRINTK( s, arg... )
-#define WPRINTK( w, s, arg... )
+#define DPRINTK(s, arg...)
+#define WPRINTK(w, s, arg...)
#endif
#define SSDCACHE_COPY_PAGES 1024
@@ -57,6 +55,8 @@ enum ssdcache_strategy_t {
CACHE_LFU,
};
+/* FIXME: add 'dm_' prefix to ssdcache_{md,io,te} structures */
+
struct ssdcache_md;
struct ssdcache_io;
@@ -74,8 +74,8 @@ struct ssdcache_te {
struct ssdcache_md {
spinlock_t lock; /* Lock to protect operations on the bio list */
- unsigned long hash; /* Hash number */
unsigned int num_cte; /* Number of table entries */
+ unsigned long hash; /* Hash number */
unsigned long atime;
struct ssdcache_ctx *sc;
struct ssdcache_te *te[DEFAULT_ALIASING]; /* RCU Table entries */
@@ -198,63 +198,47 @@ enum cte_match_t {
static int pool_init(void)
{
- _sio_cache = kmem_cache_create("ssdcache-sio",
- sizeof(struct ssdcache_io),
- __alignof__(struct ssdcache_io),
- 0, NULL);
+ _sio_cache = KMEM_CACHE(ssdcache_io, 0);
if (!_sio_cache)
return -ENOMEM;
- _cmd_cache = kmem_cache_create("ssdcache-cmd",
- sizeof(struct ssdcache_md),
- __alignof__(struct ssdcache_md),
- 0, NULL);
-
- if (!_cmd_cache) {
- kmem_cache_destroy(_sio_cache);
- return -ENOMEM;
- }
-
- _cte_cache = kmem_cache_create("ssdcache-cte",
- sizeof(struct ssdcache_te),
- __alignof__(struct ssdcache_te),
- 0, NULL);
+ _cmd_cache = KMEM_CACHE(ssdcache_md, 0);
+ if (!_cmd_cache)
+ goto bad_cmd_cache;
- if (!_cte_cache) {
- kmem_cache_destroy(_cmd_cache);
- kmem_cache_destroy(_sio_cache);
- return -ENOMEM;
- }
+ _cte_cache = KMEM_CACHE(ssdcache_te, 0);
+ if (!_cte_cache)
+ goto bad_cte_cache;
_sio_pool = mempool_create(MIN_SIO_ITEMS, mempool_alloc_slab,
- mempool_free_slab, _sio_cache);
- if (!_sio_pool) {
- kmem_cache_destroy(_cte_cache);
- kmem_cache_destroy(_cmd_cache);
- kmem_cache_destroy(_sio_cache);
- return -ENOMEM;
- }
+ mempool_free_slab, _sio_cache);
+ if (!_sio_pool)
+ goto bad_sio_pool;
_cmd_pool = mempool_create(MIN_CMD_NUM, mempool_alloc_slab,
mempool_free_slab, _cmd_cache);
- if (!_cmd_pool) {
- mempool_destroy(_sio_pool);
- kmem_cache_destroy(_cte_cache);
- kmem_cache_destroy(_cmd_cache);
- kmem_cache_destroy(_sio_cache);
- }
+ if (!_cmd_pool)
+ goto bad_cmd_pool;
_cte_pool = mempool_create(MIN_CTE_NUM, mempool_alloc_slab,
mempool_free_slab, _cte_cache);
- if (!_cte_pool) {
- mempool_destroy(_cmd_pool);
- mempool_destroy(_sio_pool);
- kmem_cache_destroy(_cte_cache);
- kmem_cache_destroy(_cmd_cache);
- kmem_cache_destroy(_sio_cache);
- }
+ if (!_cte_pool)
+ goto bad_cte_pool;
return 0;
+
+bad_cte_pool:
+ mempool_destroy(_cmd_pool);
+bad_cmd_pool:
+ mempool_destroy(_sio_pool);
+bad_sio_pool:
+ kmem_cache_destroy(_cte_cache);
+bad_cte_cache:
+ kmem_cache_destroy(_cmd_cache);
+bad_cmd_cache:
+ kmem_cache_destroy(_sio_cache);
+
+ return -ENOMEM;
}
static void pool_exit(void)
@@ -1280,7 +1264,7 @@ retry:
busy++;
continue;
}
- /* Can only eject CLEAN entries */
+ /* Can only evict CLEAN entries */
if (!cte_is_clean(cte, sio->bio_mask)) {
#ifdef SSD_DEBUG
DPRINTK("%lu: %s (cte %lx:%x): skip not-clean cte",
@@ -1387,6 +1371,7 @@ static void sio_lookup_async(struct ssdcache_io *sio)
}
}
+#ifdef SSD_DEBUG
static void sio_in_flight(void)
{
struct ssdcache_io *sio;
@@ -1400,6 +1385,7 @@ static void sio_in_flight(void)
spin_unlock_irqrestore(&_work_lock, flags);
DPRINTK("%d sios in flight", in_flight);
}
+#endif
/*
* process_sio
@@ -1726,7 +1712,7 @@ static int ssdcache_parse_options(struct dm_target *ti,
unsigned int argc;
const char *opt_name;
static struct dm_arg _args[] = {
- {0, 5, "invalid number of options"},
+ {0, 7, "invalid number of options"},
};
r = dm_read_arg_group(_args, as, &argc, &ti->error);
@@ -1831,7 +1817,12 @@ void ssdcache_format_options(struct ssdcache_ctx *sc, char *optstr)
}
/*
- * Construct a ssdcache mapping: <target_dev_path> <cache_dev_path>
+ * Construct an ssdcache mapping:
+ *
+ * ssdcache <target_dev_path> <cache_dev_path>
+ * [blocksize <value>] [assoc <value>] [writeback|writethrough|readcache]
+ * [options <#option args> [lfu|lru] [async_lookup] [queue_busy]
+ * [disable_writeback] [skip_write_insert] [evict_on_write] [cmd_preload] ]
*/
static int ssdcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
@@ -1849,35 +1840,35 @@ static int ssdcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
sc = kzalloc(sizeof(*sc), GFP_KERNEL);
if (sc == NULL) {
- ti->error = "dm-ssdcache: Cannot allocate ssdcache context";
+ ti->error = "Cannot allocate ssdcache context";
return -ENOMEM;
}
devname = dm_shift_arg(&as);
if (!devname) {
- ti->error = "dm-ssdcache: Target device is not specified";
+ ti->error = "Target device is not specified";
r = -EINVAL;
goto bad;
}
if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
&sc->target_dev)) {
- ti->error = "dm-ssdcache: Target device lookup failed";
+ ti->error = "Target device lookup failed";
r = -EINVAL;
goto bad;
}
devname = dm_shift_arg(&as);
if (!devname) {
- ti->error = "dm-ssdcache: Cache device is not specified";
+ ti->error = "Cache device is not specified";
r = -EINVAL;
- goto bad;
+ goto bad_cache_dev;
}
if (dm_get_device(ti, devname, dm_table_get_mode(ti->table),
&sc->cache_dev)) {
- ti->error = "dm-ssdcache: Cache device lookup failed";
+ ti->error = "Cache device lookup failed";
dm_put_device(ti, sc->target_dev);
r = -EINVAL;
- goto bad;
+ goto bad_cache_dev;
}
sc->block_size = DEFAULT_BLOCKSIZE;
@@ -1968,8 +1959,9 @@ static int ssdcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return 0;
bad_io_client:
- dm_put_device(ti, sc->target_dev);
dm_put_device(ti, sc->cache_dev);
+bad_cache_dev:
+ dm_put_device(ti, sc->target_dev);
bad:
kfree(sc);
return r;
@@ -2121,11 +2113,17 @@ static int ssdcache_status(struct dm_target *ti, status_type_t type,
static int ssdcache_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
+ int r = 0;
struct ssdcache_ctx *sc = ti->private;
if (!sc)
return 0;
- return fn(ti, sc->target_dev, 0, ti->len, data);
+
+ r = fn(ti, sc->cache_dev, 0, ti->len, data);
+ if (!r)
+ r = fn(ti, sc->target_dev, 0, ti->len, data);
+
+ return r;
}
static struct target_type ssdcache_target = {
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2012-03-19 13:57 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-03-16 15:38 RFC: dm-ssdcache Hannes Reinecke
2012-03-16 18:06 ` Mike Snitzer
2012-03-19 13:57 ` [PATCH] dm ssdcache: fix and/or tweak various low hanging fruit Mike Snitzer
2012-03-17 19:47 ` RFC: dm-ssdcache Mark Hills
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.