From mboxrd@z Thu Jan 1 00:00:00 1970 From: Dan Magenheimer Subject: [PATCH] [xen-unstable] tmem: add page deduplication with optional compression or trailing-zero-elimination Date: Mon, 5 Apr 2010 15:23:28 -0700 (PDT) Message-ID: <3450dd6f-8c1f-4b56-bbef-a506d3a3c968@default> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: quoted-printable Return-path: List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xensource.com Errors-To: xen-devel-bounces@lists.xensource.com To: Keir Fraser , xen-devel@lists.xensource.com List-Id: xen-devel@lists.xenproject.org Add "page deduplication" capability (with optional compression and trailing-zero elimination) to Xen's tmem. (Transparent to tmem-enabled guests.) Ephemeral pages that have the exact same content are "combined" so that only one page frame is needed. Since ephemeral pages are essentially read-only, no C-O-W (and thus no equivalent of swapping) is necessary. Deduplication can be combined with compression or "trailing zero elimination" for even more space savings. No non-tmem code is affected. Signed-off-by: Dan Magenheimer Points of interest: - Modifications to LRU eviction algorithm to accommodate dedup'ed pages - New data structures to allow lookup of matching pages and track references. (Algorithm used is similar to that used by KSM in KVM/Linux: No hashing required.) - Lock (and rbtree) chosen by first byte of data to allow reasonably high concurrency without greatly complicating lock management. - Statistics added so "dedup ratio" can be monitored. - Dedup is disabled/enabled by Xen command line option and must be combined with the tmem Xen option. Trailing zero elimination ("tze") saves significant tmem RAM when many data files are less than 1-3 pages and remaining (unused) space at the end of a page is zero-filled on disk or in memory, as may be the case for example when VMs are serving a large number of (deduplicate'able) web pages. Compression already was a tmem option; v2 of this patch combines compression with deduplication to further improve tmem RAM utilization. Either option (tmem_tze or tmem_compress) can be enabled at xen boot time in addition to deduplication (tmem_dedup) but compression overrides/disables tze. Both have a significant CPU cost so are useful primarily when memory is more constrained than CPU cycles, for example on a many-core machine with many low CPU-utilization RAM-needy VMs. tools/misc/xen-tmem-list-parse.c | 30 ++ xen/common/tmem.c | 474 ++++++++++++++++++++++++++++++++--= ----- xen/common/tmem_xen.c | 33 ++ xen/include/xen/tmem_xen.h | 122 +++++++++- 4 files changed, 576 insertions(+), 83 deletions(-) diff -r b8d2a4134a68 tools/misc/xen-tmem-list-parse.c --- a/tools/misc/xen-tmem-list-parse.c=09Wed Mar 03 17:41:58 2010 +0000 +++ b/tools/misc/xen-tmem-list-parse.c=09Mon Apr 05 16:09:58 2010 -0600 @@ -110,13 +110,39 @@ void parse_global(char *s) unsigned long long rtree_node_max =3D parse(s,"Nm"); unsigned long long pgp_count =3D parse(s,"Pc"); unsigned long long pgp_max =3D parse(s,"Pm"); + unsigned long long page_count =3D parse(s,"Fc"); + unsigned long long max_page_count =3D parse(s,"Fm"); + unsigned long long pcd_count =3D parse(s,"Sc"); + unsigned long long max_pcd_count =3D parse(s,"Sm"); + unsigned long long pcd_tot_tze_size =3D parse(s,"Zt"); + unsigned long long pcd_tot_csize =3D parse(s,"Gz"); + unsigned long long deduped_puts =3D parse(s,"Gd"); + unsigned long long tot_good_eph_puts =3D parse(s,"Ep"); =20 printf("total tmem ops=3D%llu (errors=3D%llu) -- tmem pages avail=3D%l= lu\n", total_ops, errored_ops, avail_pages); printf("datastructs: objs=3D%llu (max=3D%llu) pgps=3D%llu (max=3D%llu)= " - "nodes=3D%llu (max=3D%llu)\n", + "nodes=3D%llu (max=3D%llu) pages=3D%llu (max=3D%llu) ", obj_count, obj_max, pgp_count, pgp_max, - rtree_node_count, rtree_node_max); + rtree_node_count, rtree_node_max, + page_count,max_page_count); + if (max_pcd_count !=3D 0 && global_eph_count !=3D 0 && tot_good_eph_pu= ts !=3D 0) { + printf("pcds=3D%llu (max=3D%llu) ", + pcd_count,max_pcd_count); + printf("deduped: avg=3D%4.2f%% (curr=3D%4.2f%%) ", + ((deduped_puts*1.0)/tot_good_eph_puts)*100, + (1.0-(pcd_count*1.0)/global_eph_count)*100); + } + if (pcd_count !=3D 0) + { + if (pcd_tot_tze_size && (pcd_tot_tze_size < pcd_count*PAGE_SIZE= )) + printf("tze savings=3D%4.2f%% ", + (1.0-(pcd_tot_tze_size*1.0)/(pcd_count*PAGE_SIZE))*100)= ; + if (pcd_tot_csize && (pcd_tot_csize < pcd_count*PAGE_SIZE)) + printf("compression savings=3D%4.2f%% ", + (1.0-(pcd_tot_csize*1.0)/(pcd_count*PAGE_SIZE))*100); + } + printf("\n"); printf("misc: failed_copies=3D%llu alloc_failed=3D%llu alloc_page_fail= ed=3D%llu " "low_mem=3D%llu evicted=3D%llu/%llu relinq=3D%llu/%llu, " "max_evicts_per_relinq=3D%llu, flush_pools=3D%llu, " diff -r b8d2a4134a68 xen/common/tmem.c --- a/xen/common/tmem.c=09Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/common/tmem.c=09Mon Apr 05 16:09:58 2010 -0600 @@ -6,11 +6,10 @@ * Copyright (c) 2009, Dan Magenheimer, Oracle Corp. */ =20 -/* TODO list: 090129 - - improve on reclamation policy +/* TODO list: 090129 (updated 100318) + - any better reclamation policy? - use different tlsf pools for each client (maybe each pool) - - implement page accounting and minimal QoS limits - - test shared access more completely (need pv cluster fs) + - test shared access more completely (ocfs2) - add feedback-driven compression (not for persistent pools though!) - add data-structure total bytes overhead stats */ @@ -77,12 +76,17 @@ static unsigned long relinq_pgs =3D 0, rel static unsigned long relinq_pgs =3D 0, relinq_attempts =3D 0; static unsigned long max_evicts_per_relinq =3D 0; static unsigned long low_on_memory =3D 0; +static unsigned long deduped_puts =3D 0; +static unsigned long tot_good_eph_puts =3D 0; static int global_obj_count_max =3D 0; static int global_pgp_count_max =3D 0; +static int global_pcd_count_max =3D 0; static int global_page_count_max =3D 0; static int global_rtree_node_count_max =3D 0; static long global_eph_count_max =3D 0; static unsigned long failed_copies; +static unsigned long pcd_tot_tze_size =3D 0; +static unsigned long pcd_tot_csize =3D 0; =20 DECL_CYC_COUNTER(succ_get); DECL_CYC_COUNTER(succ_put); @@ -108,6 +112,7 @@ DECL_CYC_COUNTER(decompress); =20 struct tm_pool; struct tmem_page_descriptor; +struct tmem_page_content_descriptor; struct client { struct list_head client_list; struct tm_pool *pools[MAX_POOLS_PER_DOMAIN]; @@ -219,12 +224,17 @@ struct tmem_page_descriptor { obj_t *obj; uint64_t inv_oid; /* used for invalid list only */ }; + pagesize_t size; /* 0 =3D=3D PAGE_SIZE (pfp), -1 =3D=3D data invalid, + else compressed data (cdata) */ uint32_t index; - size_t size; /* 0 =3D=3D PAGE_SIZE (pfp), -1 =3D=3D data invalid, - else compressed data (cdata) */ + /* must hold pcd_tree_rwlocks[firstbyte] to use pcd pointer/siblings *= / + uint16_t firstbyte; /* NON_SHAREABLE->pfp otherwise->pcd */ + bool_t eviction_attempted; /* CHANGE TO lifetimes? (settable) */ + struct list_head pcd_siblings; union { pfp_t *pfp; /* page frame pointer */ char *cdata; /* compressed data */ + struct tmem_page_content_descriptor *pcd; /* page dedup */ }; union { uint64_t timestamp; @@ -233,6 +243,25 @@ struct tmem_page_descriptor { DECL_SENTINEL }; typedef struct tmem_page_descriptor pgp_t; + +#define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64)) + +struct tmem_page_content_descriptor { + union { + pfp_t *pfp; /* page frame pointer */ + char *cdata; /* if compression_enabled */ + char *tze; /* if !compression_enabled, trailing zeroes eliminated = */ + }; + struct list_head pgp_list; + struct rb_node pcd_rb_tree_node; + uint32_t pgp_ref_count; + pagesize_t size; /* if compression_enabled -> 0 *pfp */ +}; +typedef struct tmem_page_content_descriptor pcd_t; +struct rb_root pcd_tree_roots[256]; /* choose based on first byte of page = */ +rwlock_t pcd_tree_rwlocks[256]; /* poor man's concurrency for now */ =20 static LIST_HEAD(global_ephemeral_page_list); /* all pages in ephemeral po= ols */ =20 @@ -267,6 +296,7 @@ static long global_eph_count =3D 0; /* ato static long global_eph_count =3D 0; /* atomicity depends on eph_lists_spin= lock */ static atomic_t global_obj_count =3D ATOMIC_INIT(0); static atomic_t global_pgp_count =3D ATOMIC_INIT(0); +static atomic_t global_pcd_count =3D ATOMIC_INIT(0); static atomic_t global_page_count =3D ATOMIC_INIT(0); static atomic_t global_rtree_node_count =3D ATOMIC_INIT(0); =20 @@ -336,6 +366,229 @@ static NOINLINE void tmem_page_free(pool atomic_dec_and_assert(global_page_count); } =20 +/************ PAGE CONTENT DESCRIPTOR MANIPULATION ROUTINES ***********/ + +#define NOT_SHAREABLE ((uint16_t)-1UL) + +static NOINLINE int pcd_copy_to_client(tmem_cli_mfn_t cmfn, pgp_t *pgp) +{ + uint8_t firstbyte =3D pgp->firstbyte; + pcd_t *pcd; + int ret; + + ASSERT(tmh_dedup_enabled()); + tmem_read_lock(&pcd_tree_rwlocks[firstbyte]); + pcd =3D pgp->pcd; + if ( pgp->size < PAGE_SIZE && pgp->size !=3D 0 && + pcd->size < PAGE_SIZE && pcd->size !=3D 0 ) + ret =3D tmh_decompress_to_client(cmfn, pcd->cdata, pcd->size, NULL= ); + else if ( tmh_tze_enabled() && pcd->size < PAGE_SIZE ) + ret =3D tmh_copy_tze_to_client(cmfn, pcd->tze, pcd->size); + else + ret =3D tmh_copy_to_client(cmfn, pcd->pfp, 0, 0, PAGE_SIZE, NULL); + tmem_read_unlock(&pcd_tree_rwlocks[firstbyte]); + return ret; +} + +/* ensure pgp no longer points to pcd, nor vice-versa */ +/* take pcd rwlock unless have_pcd_rwlock is set, always unlock when done = */ +static NOINLINE void pcd_disassociate(pgp_t *pgp, pool_t *pool, bool_t hav= e_pcd_rwlock) +{ + pcd_t *pcd =3D pgp->pcd; + pfp_t *pfp =3D pgp->pcd->pfp; + uint16_t firstbyte =3D pgp->firstbyte; + char *pcd_tze =3D pgp->pcd->tze; + pagesize_t pcd_size =3D pcd->size; + pagesize_t pgp_size =3D pgp->size; + char *pcd_cdata =3D pgp->pcd->cdata; + pagesize_t pcd_csize =3D pgp->pcd->size; + + ASSERT(tmh_dedup_enabled()); + ASSERT(firstbyte !=3D NOT_SHAREABLE); + ASSERT(firstbyte < 256); + =20 + if ( have_pcd_rwlock ) + ASSERT_WRITELOCK(&pcd_tree_rwlocks[firstbyte]); + else + tmem_write_lock(&pcd_tree_rwlocks[firstbyte]); + list_del_init(&pgp->pcd_siblings); + pgp->pcd =3D NULL; + pgp->firstbyte =3D NOT_SHAREABLE; + pgp->size =3D -1; + if ( --pcd->pgp_ref_count ) + { + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); + return; + } + + /* no more references to this pcd, recycle it and the physical page */ + ASSERT(list_empty(&pcd->pgp_list)); + pcd->pfp =3D NULL; + /* remove pcd from rbtree */ + rb_erase(&pcd->pcd_rb_tree_node,&pcd_tree_roots[firstbyte]); + /* reinit the struct for safety for now */ + RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); + /* now free up the pcd memory */ + tmem_free(pcd,sizeof(pcd_t),NULL); + atomic_dec_and_assert(global_pcd_count); + if ( pgp_size !=3D 0 && pcd_size < PAGE_SIZE ) + { + /* compressed data */ + tmem_free(pcd_cdata,pcd_csize,pool); + pcd_tot_csize -=3D pcd_csize; + } + else if ( pcd_size !=3D PAGE_SIZE ) + { + /* trailing zero data */ + pcd_tot_tze_size -=3D pcd_size; + if ( pcd_size ) + tmem_free(pcd_tze,pcd_size,pool); + } else { + /* real physical page */ + if ( tmh_tze_enabled() ) + pcd_tot_tze_size -=3D PAGE_SIZE; + if ( tmh_compression_enabled() ) + pcd_tot_csize -=3D PAGE_SIZE; + tmem_page_free(pool,pfp); + } + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); +} + + +static NOINLINE int pcd_associate(pgp_t *pgp, char *cdata, pagesize_t csiz= e) +{ + struct rb_node **new, *parent =3D NULL; + struct rb_root *root; + pcd_t *pcd; + int cmp; + pagesize_t pfp_size =3D 0; + uint8_t firstbyte =3D (cdata =3D=3D NULL) ? tmh_get_first_byte(pgp->pf= p) : *cdata; + int ret =3D 0; + + if ( !tmh_dedup_enabled() ) + return 0; + ASSERT(pgp->obj !=3D NULL); + ASSERT(pgp->obj->pool !=3D NULL); + ASSERT(!pgp->obj->pool->persistent); + if ( cdata =3D=3D NULL ) + { + ASSERT(pgp->pfp !=3D NULL); + pfp_size =3D PAGE_SIZE; + if ( tmh_tze_enabled() ) + { + pfp_size =3D tmh_tze_pfp_scan(pgp->pfp); + if ( pfp_size > PCD_TZE_MAX_SIZE ) + pfp_size =3D PAGE_SIZE; + } + ASSERT(pfp_size <=3D PAGE_SIZE); + ASSERT(!(pfp_size & (sizeof(uint64_t)-1))); + } + tmem_write_lock(&pcd_tree_rwlocks[firstbyte]); + + /* look for page match */ + root =3D &pcd_tree_roots[firstbyte]; + new =3D &(root->rb_node); + while ( *new ) + { + pcd =3D container_of(*new, pcd_t, pcd_rb_tree_node); + parent =3D *new; + /* compare new entry and rbtree entry, set cmp accordingly */ + if ( cdata !=3D NULL ) + { + if ( pcd->size < PAGE_SIZE ) + /* both new entry and rbtree entry are compressed */ + cmp =3D tmh_pcd_cmp(cdata,csize,pcd->cdata,pcd->size); + else + /* new entry is compressed, rbtree entry is not */ + cmp =3D -1; + } else if ( pcd->size < PAGE_SIZE ) + /* rbtree entry is compressed, rbtree entry is not */ + cmp =3D 1; + else if ( tmh_tze_enabled() ) { + if ( pcd->size < PAGE_SIZE ) + /* both new entry and rbtree entry are trailing zero */ + cmp =3D tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->tze,pcd->si= ze); + else + /* new entry is trailing zero, rbtree entry is not */ + cmp =3D tmh_tze_pfp_cmp(pgp->pfp,pfp_size,pcd->pfp,PAGE_SI= ZE); + } else { + /* both new entry and rbtree entry are full physical pages */ + ASSERT(pgp->pfp !=3D NULL); + ASSERT(pcd->pfp !=3D NULL); + cmp =3D tmh_page_cmp(pgp->pfp,pcd->pfp); + } + + /* walk tree or match depending on cmp */ + if ( cmp < 0 ) + new =3D &((*new)->rb_left); + else if ( cmp > 0 ) + new =3D &((*new)->rb_right); + else + { + /* match! if not compressed, free the no-longer-needed page */ + /* but if compressed, data is assumed static so don't free! */ + if ( cdata =3D=3D NULL ) + tmem_page_free(pgp->obj->pool,pgp->pfp); + deduped_puts++; + goto match; + } + } + + /* exited while loop with no match, so alloc a pcd and put it in the t= ree */ + if ( (pcd =3D tmem_malloc(pcd_t, NULL)) =3D=3D NULL ) + { + ret =3D -ENOMEM; + goto unlock; + } else if ( cdata !=3D NULL ) { + if ( (pcd->cdata =3D tmem_malloc_bytes(csize,pgp->obj->pool)) =3D= =3D NULL ) + { + tmem_free(pcd,sizeof(pcd_t),NULL); + ret =3D -ENOMEM; + goto unlock; + } + } + atomic_inc_and_max(global_pcd_count); + RB_CLEAR_NODE(&pcd->pcd_rb_tree_node); /* is this necessary */ + INIT_LIST_HEAD(&pcd->pgp_list); /* is this necessary */ + pcd->pgp_ref_count =3D 0; + if ( cdata !=3D NULL ) + { + memcpy(pcd->cdata,cdata,csize); + pcd->size =3D csize; + pcd_tot_csize +=3D csize; + } else if ( pfp_size =3D=3D 0 ) { + ASSERT(tmh_tze_enabled()); + pcd->size =3D 0; + pcd->tze =3D NULL; + } else if ( pfp_size < PAGE_SIZE && + ((pcd->tze =3D tmem_malloc_bytes(pfp_size,pgp->obj->pool)) !=3D N= ULL) ) { + tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size); + pcd->size =3D pfp_size; + pcd_tot_tze_size +=3D pfp_size; + tmem_page_free(pgp->obj->pool,pgp->pfp); + } else { + pcd->pfp =3D pgp->pfp; + pcd->size =3D PAGE_SIZE; + if ( tmh_tze_enabled() ) + pcd_tot_tze_size +=3D PAGE_SIZE; + if ( tmh_compression_enabled() ) + pcd_tot_csize +=3D PAGE_SIZE; + } + rb_link_node(&pcd->pcd_rb_tree_node, parent, new); + rb_insert_color(&pcd->pcd_rb_tree_node, root); + +match: + pcd->pgp_ref_count++; + list_add(&pgp->pcd_siblings,&pcd->pgp_list); + pgp->firstbyte =3D firstbyte; + pgp->eviction_attempted =3D 0; + pgp->pcd =3D pcd; + +unlock: + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); + return ret; +} + /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/ =20 /* allocate a pgp_t and associate it with an object */ @@ -353,6 +606,12 @@ static NOINLINE pgp_t *pgp_alloc(obj_t * INIT_LIST_HEAD(&pgp->global_eph_pages); INIT_LIST_HEAD(&pgp->client_eph_pages); pgp->pfp =3D NULL; + if ( tmh_dedup_enabled() ) + { + pgp->firstbyte =3D NOT_SHAREABLE; + pgp->eviction_attempted =3D 0; + INIT_LIST_HEAD(&pgp->pcd_siblings); + } pgp->size =3D -1; pgp->index =3D -1; pgp->timestamp =3D get_cycles(); @@ -374,18 +633,20 @@ static pgp_t *pgp_lookup_in_obj(obj_t *o =20 static NOINLINE void pgp_free_data(pgp_t *pgp, pool_t *pool) { + pagesize_t pgp_size =3D pgp->size; + if ( pgp->pfp =3D=3D NULL ) return; - if ( !pgp->size ) + if ( tmh_dedup_enabled() && pgp->firstbyte !=3D NOT_SHAREABLE ) + pcd_disassociate(pgp,pool,0); /* pgp->size lost */ + else if ( pgp_size ) + tmem_free(pgp->cdata,pgp_size,pool); + else tmem_page_free(pgp->obj->pool,pgp->pfp); - else + if ( pool !=3D NULL && pgp_size ) { - tmem_free(pgp->cdata,pgp->size,pool); - if ( pool !=3D NULL ) - { - pool->client->compressed_pages--; - pool->client->compressed_sum_size -=3D pgp->size; - } + pool->client->compressed_pages--; + pool->client->compressed_sum_size -=3D pgp_size; } pgp->pfp =3D NULL; pgp->size =3D -1; @@ -987,10 +1248,56 @@ static void client_freeze(client_t *clie =20 /************ MEMORY REVOCATION ROUTINES *******************************/ =20 +static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock) +{ + obj_t *obj =3D pgp->obj; + pool_t *pool =3D obj->pool; + client_t *client =3D pool->client; + uint16_t firstbyte =3D pgp->firstbyte; + + if ( pool->is_dying ) + return 0; + if ( tmh_lock_all && !obj->no_evict ) + return 1;=20 + if ( tmem_spin_trylock(&obj->obj_spinlock) ) + { + if ( tmh_dedup_enabled() ) + { + firstbyte =3D pgp->firstbyte; + if ( firstbyte =3D=3D NOT_SHAREABLE ) + goto obj_unlock; + ASSERT(firstbyte < 256); + if ( !tmem_write_trylock(&pcd_tree_rwlocks[firstbyte]) ) + goto obj_unlock; + if ( pgp->pcd->pgp_ref_count > 1 && !pgp->eviction_attempted ) + { + pgp->eviction_attempted++; + list_del(&pgp->global_eph_pages); + list_add_tail(&pgp->global_eph_pages,&global_ephemeral_pag= e_list); + list_del(&pgp->client_eph_pages); + list_add_tail(&pgp->client_eph_pages,&client->ephemeral_pa= ge_list); + goto pcd_unlock; + } + } + if ( obj->pgp_count > 1 ) + return 1; + if ( tmem_write_trylock(&pool->pool_rwlock) ) + { + *hold_pool_rwlock =3D 1; + return 1; + } +pcd_unlock: + tmem_write_unlock(&pcd_tree_rwlocks[firstbyte]); +obj_unlock: + tmem_spin_unlock(&obj->obj_spinlock); + } + return 0; +} + static int tmem_evict(void) { client_t *client =3D tmh_client_from_current(); - pgp_t *pgp =3D NULL, *pgp_del; + pgp_t *pgp =3D NULL, *pgp2, *pgp_del; obj_t *obj; pool_t *pool; int ret =3D 0; @@ -1001,49 +1308,15 @@ static int tmem_evict(void) if ( (client !=3D NULL) && client_over_quota(client) && !list_empty(&client->ephemeral_page_list) ) { - list_for_each_entry(pgp,&client->ephemeral_page_list,client_eph_pa= ges) - { - obj =3D pgp->obj; - pool =3D obj->pool; - if ( pool->is_dying ) - continue; - if ( tmh_lock_all && !obj->no_evict ) + list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,cli= ent_eph_pages) + if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; - if ( tmem_spin_trylock(&obj->obj_spinlock) ) - { - if ( obj->pgp_count > 1 ) - goto found; - if ( tmem_write_trylock(&pool->pool_rwlock) ) - { - hold_pool_rwlock =3D 1; - goto found; - } - tmem_spin_unlock(&obj->obj_spinlock); - } - } } else if ( list_empty(&global_ephemeral_page_list) ) { goto out; } else { - list_for_each_entry(pgp,&global_ephemeral_page_list,global_eph_pag= es) - { - obj =3D pgp->obj; - pool =3D obj->pool; - if ( pool->is_dying ) - continue; - if ( tmh_lock_all && !obj->no_evict ) + list_for_each_entry_safe(pgp,pgp2,&global_ephemeral_page_list,glob= al_eph_pages) + if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) ) goto found; - if ( tmem_spin_trylock(&obj->obj_spinlock) ) - { - if ( obj->pgp_count > 1 ) - goto found; - if ( tmem_write_trylock(&pool->pool_rwlock) ) - { - hold_pool_rwlock =3D 1; - goto found; - } - tmem_spin_unlock(&obj->obj_spinlock); - } - } } =20 ret =3D 0; @@ -1057,10 +1330,16 @@ found: ASSERT(obj->no_evict =3D=3D 0); ASSERT(obj->pool !=3D NULL); ASSERT_SENTINEL(obj,OBJ); + pool =3D obj->pool; =20 ASSERT_SPINLOCK(&obj->obj_spinlock); pgp_del =3D pgp_delete_from_obj(obj, pgp->index); ASSERT(pgp_del =3D=3D pgp); + if ( tmh_dedup_enabled() && pgp->firstbyte !=3D NOT_SHAREABLE ) + { + ASSERT(pgp->pcd->pgp_ref_count =3D=3D 1 || pgp->eviction_attempted= ); + pcd_disassociate(pgp,pool,1); + } pgp_delete(pgp,1); if ( obj->pgp_count =3D=3D 0 ) { @@ -1129,25 +1408,30 @@ static NOINLINE int do_tmem_put_compress #ifdef __i386__ return -ENOMEM; #endif + if ( pgp->pfp !=3D NULL ) - pgp_free_data(pgp, pgp->obj->pool); /* FIXME... is this right? */ + pgp_free_data(pgp, pgp->obj->pool); START_CYC_COUNTER(compress); ret =3D tmh_compress_from_client(cmfn, &dst, &size, cva); if ( (ret =3D=3D -EFAULT) || (ret =3D=3D 0) ) goto out; - else if ( (size =3D=3D 0) || (size >=3D tmem_subpage_maxsize()) ) + else if ( (size =3D=3D 0) || (size >=3D tmem_subpage_maxsize()) ) { ret =3D 0; - else if ( (p =3D tmem_malloc_bytes(size,pgp->obj->pool)) =3D=3D NULL ) + goto out; + } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) { + if ( (ret =3D pcd_associate(pgp,dst,size)) =3D=3D -ENOMEM ) + goto out; + } else if ( (p =3D tmem_malloc_bytes(size,pgp->obj->pool)) =3D=3D NULL= ) { ret =3D -ENOMEM; - else - { + goto out; + } else { memcpy(p,dst,size); pgp->cdata =3D p; - pgp->size =3D size; - pgp->obj->pool->client->compressed_pages++; - pgp->obj->pool->client->compressed_sum_size +=3D size; - ret =3D 1; } + pgp->size =3D size; + pgp->obj->pool->client->compressed_pages++; + pgp->obj->pool->client->compressed_sum_size +=3D size; + ret =3D 1; =20 out: END_CYC_COUNTER(compress); @@ -1155,7 +1439,7 @@ out: } =20 static NOINLINE int do_tmem_dup_put(pgp_t *pgp, tmem_cli_mfn_t cmfn, - uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void= *cva) { pool_t *pool; obj_t *obj; @@ -1197,6 +1481,11 @@ copy_uncompressed: ret =3D tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,= 0); if ( ret =3D=3D -EFAULT ) goto bad_copy; + if ( tmh_dedup_enabled() && !is_persistent(pool) ) + { + if ( pcd_associate(pgp,NULL,0) =3D=3D -ENOMEM ) + goto failed_dup; + } pgp->size =3D 0; =20 done: @@ -1239,8 +1528,8 @@ failed_dup: =20 static NOINLINE int do_tmem_put(pool_t *pool, uint64_t oid, uint32_t index, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva) { obj_t *obj =3D NULL, *objfound =3D NULL, *objnew =3D NULL; pgp_t *pgp =3D NULL, *pgpdel =3D NULL; @@ -1308,13 +1597,18 @@ copy_uncompressed: copy_uncompressed: if ( ( pgp->pfp =3D tmem_page_alloc(pool) ) =3D=3D NULL ) { - ret =3D=3D -ENOMEM; + ret =3D -ENOMEM; goto delete_and_free; } /* tmh_copy_from_client properly handles len=3D=3D0 (TMEM_NEW_PAGE) */ ret =3D tmh_copy_from_client(pgp->pfp,cmfn,tmem_offset,pfn_offset,len,= cva); if ( ret =3D=3D -EFAULT ) goto bad_copy; + if ( tmh_dedup_enabled() && !is_persistent(pool) ) + { + if ( pcd_associate(pgp,NULL,0) =3D=3D -ENOMEM ) + goto delete_and_free; + } pgp->size =3D 0; =20 insert_page: @@ -1344,6 +1638,8 @@ insert_page: pool->good_puts++; if ( is_persistent(pool) ) client->succ_pers_puts++; + else + tot_good_eph_puts++; return 1; =20 delete_and_free: @@ -1376,8 +1672,8 @@ ASSERT(0); } =20 static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index= , - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva) { obj_t *obj; pgp_t *pgp; @@ -1404,15 +1700,18 @@ static NOINLINE int do_tmem_get(pool_t * return 0; } ASSERT(pgp->size !=3D -1); - if ( pgp->size !=3D 0 ) + if ( tmh_dedup_enabled() && !is_persistent(pool) && + pgp->firstbyte !=3D NOT_SHAREABLE ) { + if ( pcd_copy_to_client(cmfn, pgp) =3D=3D -EFAULT ) + goto bad_copy; + } else if ( pgp->size !=3D 0 ) { START_CYC_COUNTER(decompress); if ( tmh_decompress_to_client(cmfn, pgp->cdata, pgp->size, cva) =3D=3D -EFAULT ) goto bad_copy; END_CYC_COUNTER(decompress); - } - else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, + } else if ( tmh_copy_to_client(cmfn, pgp->pfp, tmem_offset, pfn_offset, len, cva) =3D=3D -EFAULT) goto bad_copy; if ( is_ephemeral(pool) ) @@ -1855,11 +2154,15 @@ static int tmemc_list_global(tmem_cli_va total_flush_pool, use_long ? ',' : '\n'); if (use_long) n +=3D scnprintf(info+n,BSIZE-n, - "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d\n", + "Ec:%ld,Em:%ld,Oc:%d,Om:%d,Nc:%d,Nm:%d,Pc:%d,Pm:%d," + "Fc:%d,Fm:%d,Sc:%d,Sm:%d,Ep:%lu,Gd:%lu,Zt:%lu,Gz:%lu\n", global_eph_count, global_eph_count_max, _atomic_read(global_obj_count), global_obj_count_max, _atomic_read(global_rtree_node_count), global_rtree_node_count_m= ax, - _atomic_read(global_pgp_count), global_pgp_count_max); + _atomic_read(global_pgp_count), global_pgp_count_max, + _atomic_read(global_page_count), global_page_count_max, + _atomic_read(global_pcd_count), global_pcd_count_max, + tot_good_eph_puts,deduped_puts,pcd_tot_tze_size,pcd_tot_csize); if ( sum + n >=3D len ) return sum; tmh_copy_to_client_buf_offset(buf,off+sum,info,n+1); @@ -1912,6 +2215,13 @@ static int tmemc_set_var_one(client_t *c #ifdef __i386__ return -1; #endif + if ( tmh_dedup_enabled() ) + { + printk("tmem: compression %s for all %ss, cannot be changed " + "when tmem_dedup is enabled\n", + tmh_compression_enabled() ? "enabled" : "disabled",client_str)= ; + return -1; + } client->compress =3D arg1 ? 1 : 0; printk("tmem: compression %s for %s=3D%d\n", arg1 ? "enabled" : "disabled",cli_id_str,cli_id); @@ -2569,14 +2879,28 @@ EXPORT void *tmem_relinquish_pages(unsig /* called at hypervisor startup */ EXPORT void init_tmem(void) { + int i; if ( !tmh_enabled() ) return; =20 radix_tree_init(); + if ( tmh_dedup_enabled() ) + for (i =3D 0; i < 256; i++ ) + { + pcd_tree_roots[i] =3D RB_ROOT; + rwlock_init(&pcd_tree_rwlocks[i]); + } + if ( tmh_init() ) { - printk("tmem: initialized comp=3D%d global-lock=3D%d\n", - tmh_compression_enabled(), tmh_lock_all); + printk("tmem: initialized comp=3D%d dedup=3D%d tze=3D%d global-loc= k=3D%d\n", + tmh_compression_enabled(), tmh_dedup_enabled(), tmh_tze_enable= d(), + tmh_lock_all); + if ( tmh_dedup_enabled()&&tmh_compression_enabled()&&tmh_tze_enabl= ed() ) + { + tmh_tze_disable(); + printk("tmem: tze and compression not compatible, disabling tz= e\n"); + } tmem_initialized =3D 1; } else diff -r b8d2a4134a68 xen/common/tmem_xen.c --- a/xen/common/tmem_xen.c=09Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/common/tmem_xen.c=09Mon Apr 05 16:09:58 2010 -0600 @@ -19,6 +19,12 @@ boolean_param("tmem", opt_tmem); =20 EXPORT int opt_tmem_compress =3D 0; boolean_param("tmem_compress", opt_tmem_compress); + +EXPORT int opt_tmem_dedup =3D 0; +boolean_param("tmem_dedup", opt_tmem_dedup); + +EXPORT int opt_tmem_tze =3D 0; +boolean_param("tmem_tze", opt_tmem_tze); =20 EXPORT int opt_tmem_shared_auth =3D 0; boolean_param("tmem_shared_auth", opt_tmem_shared_auth); @@ -103,8 +109,8 @@ static inline void *cli_mfn_to_va(tmem_c #endif =20 EXPORT int tmh_copy_from_client(pfp_t *pfp, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cli_va) + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cli_va) { unsigned long tmem_mfn; void *tmem_va; @@ -148,7 +154,7 @@ EXPORT int tmh_compress_from_client(tmem } =20 EXPORT int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, - uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cli_va) + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *c= li_va) { unsigned long tmem_mfn, cli_mfn =3D 0; int mark_dirty =3D 1; @@ -195,6 +201,27 @@ EXPORT int tmh_decompress_to_client(tmem unmap_domain_page(cli_va); paging_mark_dirty(current->domain,cli_mfn); } + mb(); + return 1; +} + +EXPORT int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, + pagesize_t len) +{ + void *cli_va; + unsigned long cli_mfn; + + ASSERT(!(len & (sizeof(uint64_t)-1))); + ASSERT(len <=3D PAGE_SIZE); + ASSERT(len > 0 || tmem_va =3D=3D NULL); + if ( (cli_va =3D cli_mfn_to_va(cmfn,&cli_mfn)) =3D=3D NULL) + return -EFAULT; + if ( len > 0 ) + memcpy((char *)cli_va,(char *)tmem_va,len); + if ( len < PAGE_SIZE ) + memset((char *)cli_va+len,0,PAGE_SIZE-len); + unmap_domain_page(cli_va); + paging_mark_dirty(current->domain,cli_mfn); mb(); return 1; } diff -r b8d2a4134a68 xen/include/xen/tmem_xen.h --- a/xen/include/xen/tmem_xen.h=09Wed Mar 03 17:41:58 2010 +0000 +++ b/xen/include/xen/tmem_xen.h=09Mon Apr 05 16:09:58 2010 -0600 @@ -26,6 +26,8 @@ struct tmem_host_dependent_client { }; typedef struct tmem_host_dependent_client tmh_client_t; =20 +typedef uint32_t pagesize_t; /* like size_t, must handle largest PAGE_SIZ= E */ + #define IS_PAGE_ALIGNED(addr) \ ((void *)((((unsigned long)addr + (PAGE_SIZE - 1)) & PAGE_MASK)) =3D=3D = addr) #define IS_VALID_PAGE(_pi) ( mfn_valid(page_to_mfn(_pi)) ) @@ -52,6 +54,23 @@ static inline int tmh_compression_enable static inline int tmh_compression_enabled(void) { return opt_tmem_compress; +} + +extern int opt_tmem_dedup; +static inline int tmh_dedup_enabled(void) +{ + return opt_tmem_dedup; +} + +extern int opt_tmem_tze; +static inline int tmh_tze_enabled(void) +{ + return opt_tmem_tze; +} + +static inline void tmh_tze_disable(void) +{ + opt_tmem_tze =3D 0; } =20 extern int opt_tmem_shared_auth; @@ -326,6 +345,101 @@ static inline bool_t tmh_current_is_priv return IS_PRIV(current->domain); } =20 +static inline uint8_t tmh_get_first_byte(pfp_t *pfp) +{ + void *p =3D __map_domain_page(pfp); + + return (uint8_t)(*(char *)p); +} + +static inline int tmh_page_cmp(pfp_t *pfp1, pfp_t *pfp2) +{ + const uint64_t *p1 =3D (uint64_t *)__map_domain_page(pfp1); + const uint64_t *p2 =3D (uint64_t *)__map_domain_page(pfp2); + int i; + + // FIXME: code in assembly? +ASSERT(p1 !=3D NULL); +ASSERT(p2 !=3D NULL); + for ( i =3D PAGE_SIZE/sizeof(uint64_t); i && *p1 =3D=3D *p2; i--, *p1+= +, *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +static inline int tmh_pcd_cmp(void *va1, pagesize_t len1, void *va2, pages= ize_t len2) +{ + const char *p1 =3D (char *)va1; + const char *p2 =3D (char *)va2; + pagesize_t i; + + ASSERT(len1 <=3D PAGE_SIZE); + ASSERT(len2 <=3D PAGE_SIZE); + if ( len1 < len2 ) + return -1; + if ( len1 > len2 ) + return 1; + ASSERT(len1 =3D=3D len2); + for ( i =3D len2; i && *p1 =3D=3D *p2; i--, *p1++, *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +static inline int tmh_tze_pfp_cmp(pfp_t *pfp1, pagesize_t pfp_len, void *t= va, pagesize_t tze_len) +{ + const uint64_t *p1 =3D (uint64_t *)__map_domain_page(pfp1); + const uint64_t *p2; + pagesize_t i; + + if ( tze_len =3D=3D PAGE_SIZE ) + p2 =3D (uint64_t *)__map_domain_page((pfp_t *)tva); + else + p2 =3D (uint64_t *)tva; + ASSERT(pfp_len <=3D PAGE_SIZE); + ASSERT(!(pfp_len & (sizeof(uint64_t)-1))); + ASSERT(tze_len <=3D PAGE_SIZE); + ASSERT(!(tze_len & (sizeof(uint64_t)-1))); + if ( pfp_len < tze_len ) + return -1; + if ( pfp_len > tze_len ) + return 1; + ASSERT(pfp_len =3D=3D tze_len); + for ( i =3D tze_len/sizeof(uint64_t); i && *p1 =3D=3D *p2; i--, *p1++,= *p2++ ); + if ( !i ) + return 0; + if ( *p1 < *p2 ) + return -1; + return 1; +} + +/* return the size of the data in the pfp, ignoring trailing zeroes and + * rounded up to the nearest multiple of 8 */ +static inline pagesize_t tmh_tze_pfp_scan(pfp_t *pfp) +{ + const uint64_t *p =3D (uint64_t *)__map_domain_page(pfp); + pagesize_t bytecount =3D PAGE_SIZE; + pagesize_t len =3D PAGE_SIZE/sizeof(uint64_t); + p +=3D len; + while ( len-- && !*--p ) + bytecount -=3D sizeof(uint64_t); + return bytecount; +} + +static inline void tmh_tze_copy_from_pfp(void *tva, pfp_t *pfp, pagesize_t= len) +{ + uint64_t *p1 =3D (uint64_t *)tva; + const uint64_t *p2 =3D (uint64_t *)__map_domain_page(pfp); + + pagesize_t i; + ASSERT(!(len & (sizeof(uint64_t)-1))); + for ( i =3D len/sizeof(uint64_t); i--; *p1++ =3D *p2++); +} + /* these typedefs are in the public/tmem.h interface typedef XEN_GUEST_HANDLE(void) cli_mfn_t; typedef XEN_GUEST_HANDLE(char) cli_va_t; @@ -378,11 +492,13 @@ extern int tmh_compress_from_client(tmem extern int tmh_compress_from_client(tmem_cli_mfn_t,void**,size_t *,void*); =20 extern int tmh_copy_from_client(pfp_t *pfp, - tmem_cli_mfn_t cmfn, uint32_t tmem_offset, - uint32_t pfn_offset, uint32_t len, void *cva); + tmem_cli_mfn_t cmfn, pagesize_t tmem_offset, + pagesize_t pfn_offset, pagesize_t len, void *cva); =20 extern int tmh_copy_to_client(tmem_cli_mfn_t cmfn, pfp_t *pfp, - uint32_t tmem_offset, uint32_t pfn_offset, uint32_t len, void *cva); + pagesize_t tmem_offset, pagesize_t pfn_offset, pagesize_t len, void *c= va); + +extern int tmh_copy_tze_to_client(tmem_cli_mfn_t cmfn, void *tmem_va, page= size_t len); =20 =20 #define TMEM_PERF