diff for duplicates of <20160526215022.GA2322@bbox> diff --git a/a/1.txt b/N1/1.txt index 87a1ab2..79528a4 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -1 +1,1373 @@ Follow up Sergey's review + +>From 2deede28c91910a9d3493feae30bed507e72f213 Mon Sep 17 00:00:00 2001 +From: Minchan Kim <minchan@kernel.org> +Date: Thu, 5 May 2016 00:01:03 +0900 +Subject: [PATCH v6r2] zsmalloc: page migration support + +This patch introduces a run-time migration feature for zspage. + +For migration, VM uses page.lru field so it would be better to not use +page.next field which is unified with page.lru for own purpose. +For that, firstly, we can get first object offset of the page via +runtime calculation instead of using page.index so we can use +page.index as link for page chaining instead of page.next. + +In case of huge object, it stores handle to page.index instead of +next link of page chaining because huge object doesn't need a next +link for page chaining. So get_next_page needs to identify huge +object to return NULL. For it, this patch uses PG_owner_priv_1 flag +of the page flag. + +For migration, it supports three functions + +* zs_page_isolate + +It isolates a zspage which includes a subpage VM wants to migrate +from class so anyone cannot allocate new object from the zspage. + +We could try to isolate a zspage by the number of subpage so +subsequent isolation trial of other subpage of the zspage shouldn't +fail. For that, we introduce zspage.isolated count. With that, +zs_page_isolate can know whether zspage is already isolated or not +for migration so if it is isolated for migration, subsequent +isolation trial can be successful without trying further isolation. + +* zs_page_migrate + +First of all, it holds write-side zspage->lock to prevent migrating other +subpages in zspage. Then, lock all objects in the page VM wants to migrate. +The reason we should lock all objects in the page is due to race between +zs_map_object and zs_page_migrate. 
+ +zs_map_object zs_page_migrate + +pin_tag(handle) +obj = handle_to_obj(handle) +obj_to_location(obj, &page, &obj_idx); + + write_lock(&zspage->lock) + if (!trypin_tag(handle)) + goto unpin_object + +zspage = get_zspage(page); +read_lock(&zspage->lock); + +If zs_page_migrate doesn't do trypin_tag, zs_map_object's page can +become stale due to migration, so it crashes. + +If it locks all of objects successfully, it copies content from +old page to new one, finally, create new zspage chain with new page. +And if it's last isolated subpage in the zspage, put the zspage back +to class. + +* zs_page_putback + +It returns isolated zspage to right fullness_group list if it fails to +migrate a page. If it finds a zspage is ZS_EMPTY, it queues zspage +freeing to workqueue. See below about async zspage freeing. + +This patch introduces asynchronous zspage free. The reason to need it +is we need page_lock to clear PG_movable but unfortunately, +zs_free path should be atomic so the approach is to try to grab page_lock. +If it got page_lock of all of pages successfully, it can free zspage +immediately. Otherwise, it queues free request and free zspage via +workqueue in process context. + +If zs_free finds the zspage is isolated when it tries to free zspage, +it delays the freeing until zs_page_putback finds it so it will +free the zspage finally. + +In this patch, we expand fullness_list from ZS_EMPTY to ZS_FULL. +First of all, it will use ZS_EMPTY list for delay freeing. +And with adding ZS_FULL list, it makes it possible to identify whether zspage is +isolated or not via list_empty(&zspage->list) test. 
+ +Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Signed-off-by: Minchan Kim <minchan@kernel.org> +--- + include/uapi/linux/magic.h | 1 + + mm/zsmalloc.c | 793 ++++++++++++++++++++++++++++++++++++++------- + 2 files changed, 672 insertions(+), 122 deletions(-) + +diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h +index d829ce63529d..e398beac67b8 100644 +--- a/include/uapi/linux/magic.h ++++ b/include/uapi/linux/magic.h +@@ -81,5 +81,6 @@ + /* Since UDF 2.01 is ISO 13346 based... */ + #define UDF_SUPER_MAGIC 0x15013346 + #define BALLOON_KVM_MAGIC 0x13661366 ++#define ZSMALLOC_MAGIC 0x58295829 + + #endif /* __LINUX_MAGIC_H__ */ +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index c6fb543cfb98..a80100db16d6 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -17,14 +17,14 @@ + * + * Usage of struct page fields: + * page->private: points to zspage +- * page->index: offset of the first object starting in this page. +- * For the first page, this is always 0, so we use this field +- * to store handle for huge object. +- * page->next: links together all component pages of a zspage ++ * page->freelist(index): links together all component pages of a zspage ++ * For the huge page, this is always 0, so we use this field ++ * to store handle. + * + * Usage of struct page flags: + * PG_private: identifies the first component page + * PG_private2: identifies the last component page ++ * PG_owner_priv_1: indentifies the huge component page + * + */ + +@@ -49,6 +49,11 @@ + #include <linux/debugfs.h> + #include <linux/zsmalloc.h> + #include <linux/zpool.h> ++#include <linux/mount.h> ++#include <linux/compaction.h> ++#include <linux/pagemap.h> ++ ++#define ZSPAGE_MAGIC 0x58 + + /* + * This must be power of 2 and greater than of equal to sizeof(link_free). 
+@@ -136,25 +141,23 @@ + * We do not maintain any list for completely empty or full pages + */ + enum fullness_group { +- ZS_ALMOST_FULL, +- ZS_ALMOST_EMPTY, + ZS_EMPTY, +- ZS_FULL ++ ZS_ALMOST_EMPTY, ++ ZS_ALMOST_FULL, ++ ZS_FULL, ++ NR_ZS_FULLNESS, + }; + + enum zs_stat_type { ++ CLASS_EMPTY, ++ CLASS_ALMOST_EMPTY, ++ CLASS_ALMOST_FULL, ++ CLASS_FULL, + OBJ_ALLOCATED, + OBJ_USED, +- CLASS_ALMOST_FULL, +- CLASS_ALMOST_EMPTY, ++ NR_ZS_STAT_TYPE, + }; + +-#ifdef CONFIG_ZSMALLOC_STAT +-#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +-#else +-#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +-#endif +- + struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; + }; +@@ -163,6 +166,10 @@ struct zs_size_stat { + static struct dentry *zs_stat_root; + #endif + ++#ifdef CONFIG_COMPACTION ++static struct vfsmount *zsmalloc_mnt; ++#endif ++ + /* + * number of size_classes + */ +@@ -186,23 +193,36 @@ static const int fullness_threshold_frac = 4; + + struct size_class { + spinlock_t lock; +- struct list_head fullness_list[2]; ++ struct list_head fullness_list[NR_ZS_FULLNESS]; + /* + * Size of objects stored in this class. Must be multiple + * of ZS_ALIGN. + */ + int size; + int objs_per_zspage; +- unsigned int index; +- +- struct zs_size_stat stats; +- + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; +- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +- bool huge; ++ ++ unsigned int index; ++ struct zs_size_stat stats; + }; + ++/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ ++static void SetPageHugeObject(struct page *page) ++{ ++ SetPageOwnerPriv1(page); ++} ++ ++static void ClearPageHugeObject(struct page *page) ++{ ++ ClearPageOwnerPriv1(page); ++} ++ ++static int PageHugeObject(struct page *page) ++{ ++ return PageOwnerPriv1(page); ++} ++ + /* + * Placed within free objects to form a singly linked list. + * For every zspage, zspage->freeobj gives head of this list. 
+@@ -244,6 +264,10 @@ struct zs_pool { + #ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; + #endif ++#ifdef CONFIG_COMPACTION ++ struct inode *inode; ++ struct work_struct free_work; ++#endif + }; + + /* +@@ -252,16 +276,23 @@ struct zs_pool { + */ + #define FULLNESS_BITS 2 + #define CLASS_BITS 8 ++#define ISOLATED_BITS 3 ++#define MAGIC_VAL_BITS 8 + + struct zspage { + struct { + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS; ++ unsigned int isolated:ISOLATED_BITS; ++ unsigned int magic:MAGIC_VAL_BITS; + }; + unsigned int inuse; + unsigned int freeobj; + struct page *first_page; + struct list_head list; /* fullness list */ ++#ifdef CONFIG_COMPACTION ++ rwlock_t lock; ++#endif + }; + + struct mapping_area { +@@ -274,6 +305,28 @@ struct mapping_area { + enum zs_mapmode vm_mm; /* mapping mode */ + }; + ++#ifdef CONFIG_COMPACTION ++static int zs_register_migration(struct zs_pool *pool); ++static void zs_unregister_migration(struct zs_pool *pool); ++static void migrate_lock_init(struct zspage *zspage); ++static void migrate_read_lock(struct zspage *zspage); ++static void migrate_read_unlock(struct zspage *zspage); ++static void kick_deferred_free(struct zs_pool *pool); ++static void init_deferred_free(struct zs_pool *pool); ++static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); ++#else ++static int zsmalloc_mount(void) { return 0; } ++static void zsmalloc_unmount(void) {} ++static int zs_register_migration(struct zs_pool *pool) { return 0; } ++static void zs_unregister_migration(struct zs_pool *pool) {} ++static void migrate_lock_init(struct zspage *zspage) {} ++static void migrate_read_lock(struct zspage *zspage) {} ++static void migrate_read_unlock(struct zspage *zspage) {} ++static void kick_deferred_free(struct zs_pool *pool) {} ++static void init_deferred_free(struct zs_pool *pool) {} ++static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} ++#endif ++ + static int create_cache(struct 
zs_pool *pool) + { + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, +@@ -301,7 +354,7 @@ static void destroy_cache(struct zs_pool *pool) + static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + { + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, +- gfp & ~__GFP_HIGHMEM); ++ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + } + + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) +@@ -311,7 +364,8 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + + static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) + { +- return kmem_cache_alloc(pool->zspage_cachep, flags & ~__GFP_HIGHMEM); ++ return kmem_cache_alloc(pool->zspage_cachep, ++ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); + }; + + static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +@@ -421,11 +475,17 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ + static DEFINE_PER_CPU(struct mapping_area, zs_map_area); + ++static bool is_zspage_isolated(struct zspage *zspage) ++{ ++ return zspage->isolated; ++} ++ + static int is_first_page(struct page *page) + { + return PagePrivate(page); + } + ++/* Protected by class->lock */ + static inline int get_zspage_inuse(struct zspage *zspage) + { + return zspage->inuse; +@@ -441,20 +501,12 @@ static inline void mod_zspage_inuse(struct zspage *zspage, int val) + zspage->inuse += val; + } + +-static inline int get_first_obj_offset(struct page *page) ++static inline struct page *get_first_page(struct zspage *zspage) + { +- if (is_first_page(page)) +- return 0; ++ struct page *first_page = zspage->first_page; + +- return page->index; +-} +- +-static inline void set_first_obj_offset(struct page *page, int offset) +-{ +- if (is_first_page(page)) +- return; +- +- page->index = offset; ++ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); ++ 
return first_page; + } + + static inline unsigned int get_freeobj(struct zspage *zspage) +@@ -471,6 +523,8 @@ static void get_zspage_mapping(struct zspage *zspage, + unsigned int *class_idx, + enum fullness_group *fullness) + { ++ VM_BUG_ON(zspage->magic != ZSPAGE_MAGIC); ++ + *fullness = zspage->fullness; + *class_idx = zspage->class; + } +@@ -504,23 +558,19 @@ static int get_size_class_index(int size) + static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) + { +- if (type < NR_ZS_STAT_TYPE) +- class->stats.objs[type] += cnt; ++ class->stats.objs[type] += cnt; + } + + static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) + { +- if (type < NR_ZS_STAT_TYPE) +- class->stats.objs[type] -= cnt; ++ class->stats.objs[type] -= cnt; + } + + static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) + { +- if (type < NR_ZS_STAT_TYPE) +- return class->stats.objs[type]; +- return 0; ++ return class->stats.objs[type]; + } + + #ifdef CONFIG_ZSMALLOC_STAT +@@ -664,6 +714,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) + } + #endif + ++ + /* + * For each size class, zspages are divided into different groups + * depending on how "full" they are. This was done so that we could +@@ -704,15 +755,9 @@ static void insert_zspage(struct size_class *class, + { + struct zspage *head; + +- if (fullness >= ZS_EMPTY) +- return; +- ++ zs_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); +- +- zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? +- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); +- + /* + * We want to see more ZS_FULL pages and less almost empty/full. + * Put pages with higher ->inuse first. 
+@@ -734,14 +779,11 @@ static void remove_zspage(struct size_class *class, + struct zspage *zspage, + enum fullness_group fullness) + { +- if (fullness >= ZS_EMPTY) +- return; +- + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); ++ VM_BUG_ON(is_zspage_isolated(zspage)); + + list_del_init(&zspage->list); +- zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? +- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); ++ zs_stat_dec(class, fullness, 1); + } + + /* +@@ -764,8 +806,11 @@ static enum fullness_group fix_fullness_group(struct size_class *class, + if (newfg == currfg) + goto out; + +- remove_zspage(class, zspage, currfg); +- insert_zspage(class, zspage, newfg); ++ if (!is_zspage_isolated(zspage)) { ++ remove_zspage(class, zspage, currfg); ++ insert_zspage(class, zspage, newfg); ++ } ++ + set_zspage_mapping(zspage, class_idx, newfg); + + out: +@@ -808,19 +853,45 @@ static int get_pages_per_zspage(int class_size) + return max_usedpc_order; + } + +-static struct page *get_first_page(struct zspage *zspage) ++static struct zspage *get_zspage(struct page *page) + { +- return zspage->first_page; ++ struct zspage *zspage = (struct zspage *)page->private; ++ ++ VM_BUG_ON(zspage->magic != ZSPAGE_MAGIC); ++ return zspage; + } + +-static struct zspage *get_zspage(struct page *page) ++static struct page *get_next_page(struct page *page) + { +- return (struct zspage *)page->private; ++ if (unlikely(PageHugeObject(page))) ++ return NULL; ++ ++ return page->freelist; + } + +-static struct page *get_next_page(struct page *page) ++/* Get byte offset of first object in the @page */ ++static int get_first_obj_offset(struct size_class *class, ++ struct page *first_page, struct page *page) + { +- return page->next; ++ int pos; ++ int page_idx = 0; ++ int ofs = 0; ++ struct page *cursor = first_page; ++ ++ if (first_page == page) ++ goto out; ++ ++ while (page != cursor) { ++ page_idx++; ++ cursor = get_next_page(cursor); ++ } ++ ++ pos = class->objs_per_zspage * class->size * ++ 
page_idx / class->pages_per_zspage; ++ ++ ofs = (pos + class->size) % PAGE_SIZE; ++out: ++ return ofs; + } + + /** +@@ -857,16 +928,20 @@ static unsigned long handle_to_obj(unsigned long handle) + return *(unsigned long *)handle; + } + +-static unsigned long obj_to_head(struct size_class *class, struct page *page, +- void *obj) ++static unsigned long obj_to_head(struct page *page, void *obj) + { +- if (class->huge) { ++ if (unlikely(PageHugeObject(page))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); + return page->index; + } else + return *(unsigned long *)obj; + } + ++static inline int testpin_tag(unsigned long handle) ++{ ++ return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); ++} ++ + static inline int trypin_tag(unsigned long handle) + { + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); +@@ -884,27 +959,93 @@ static void unpin_tag(unsigned long handle) + + static void reset_page(struct page *page) + { ++ __ClearPageMovable(page); + clear_bit(PG_private, &page->flags); + clear_bit(PG_private_2, &page->flags); + set_page_private(page, 0); +- page->index = 0; ++ ClearPageHugeObject(page); ++ page->freelist = NULL; + } + +-static void free_zspage(struct zs_pool *pool, struct zspage *zspage) ++/* ++ * To prevent zspage destroy during migration, zspage freeing should ++ * hold locks of all pages in the zspage. 
++ */ ++void lock_zspage(struct zspage *zspage) ++{ ++ struct page *page = get_first_page(zspage); ++ ++ do { ++ lock_page(page); ++ } while ((page = get_next_page(page)) != NULL); ++} ++ ++int trylock_zspage(struct zspage *zspage) ++{ ++ struct page *cursor, *fail; ++ ++ for (cursor = get_first_page(zspage); cursor != NULL; cursor = ++ get_next_page(cursor)) { ++ if (!trylock_page(cursor)) { ++ fail = cursor; ++ goto unlock; ++ } ++ } ++ ++ return 1; ++unlock: ++ for (cursor = get_first_page(zspage); cursor != fail; cursor = ++ get_next_page(cursor)) ++ unlock_page(cursor); ++ ++ return 0; ++} ++ ++static void __free_zspage(struct zs_pool *pool, struct size_class *class, ++ struct zspage *zspage) + { + struct page *page, *next; ++ enum fullness_group fg; ++ unsigned int class_idx; ++ ++ get_zspage_mapping(zspage, &class_idx, &fg); ++ ++ assert_spin_locked(&class->lock); + + VM_BUG_ON(get_zspage_inuse(zspage)); ++ VM_BUG_ON(fg != ZS_EMPTY); + +- next = page = zspage->first_page; ++ next = page = get_first_page(zspage); + do { +- next = page->next; ++ VM_BUG_ON_PAGE(!PageLocked(page), page); ++ next = get_next_page(page); + reset_page(page); ++ unlock_page(page); + put_page(page); + page = next; + } while (page != NULL); + + cache_free_zspage(pool, zspage); ++ ++ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( ++ class->size, class->pages_per_zspage)); ++ atomic_long_sub(class->pages_per_zspage, ++ &pool->pages_allocated); ++} ++ ++static void free_zspage(struct zs_pool *pool, struct size_class *class, ++ struct zspage *zspage) ++{ ++ VM_BUG_ON(get_zspage_inuse(zspage)); ++ VM_BUG_ON(list_empty(&zspage->list)); ++ ++ if (!trylock_zspage(zspage)) { ++ kick_deferred_free(pool); ++ return; ++ } ++ ++ remove_zspage(class, zspage, ZS_EMPTY); ++ __free_zspage(pool, class, zspage); + } + + /* Initialize a newly allocated zspage */ +@@ -912,15 +1053,13 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) + { + unsigned int freeobj = 1; + 
unsigned long off = 0; +- struct page *page = zspage->first_page; ++ struct page *page = get_first_page(zspage); + + while (page) { + struct page *next_page; + struct link_free *link; + void *vaddr; + +- set_first_obj_offset(page, off); +- + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); + +@@ -952,16 +1091,17 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) + set_freeobj(zspage, 0); + } + +-static void create_page_chain(struct zspage *zspage, struct page *pages[], +- int nr_pages) ++static void create_page_chain(struct size_class *class, struct zspage *zspage, ++ struct page *pages[]) + { + int i; + struct page *page; + struct page *prev_page = NULL; ++ int nr_pages = class->pages_per_zspage; + + /* + * Allocate individual pages and link them together as: +- * 1. all pages are linked together using page->next ++ * 1. all pages are linked together using page->freelist + * 2. each sub-page point to zspage using page->private + * + * we set PG_private to identify the first page (i.e. 
no other sub-page +@@ -970,16 +1110,18 @@ static void create_page_chain(struct zspage *zspage, struct page *pages[], + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + set_page_private(page, (unsigned long)zspage); ++ page->freelist = NULL; + if (i == 0) { + zspage->first_page = page; + SetPagePrivate(page); ++ if (unlikely(class->objs_per_zspage == 1 && ++ class->pages_per_zspage == 1)) ++ SetPageHugeObject(page); + } else { +- prev_page->next = page; ++ prev_page->freelist = page; + } +- if (i == nr_pages - 1) { ++ if (i == nr_pages - 1) + SetPagePrivate2(page); +- page->next = NULL; +- } + prev_page = page; + } + } +@@ -999,6 +1141,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, + return NULL; + + memset(zspage, 0, sizeof(struct zspage)); ++ zspage->magic = ZSPAGE_MAGIC; ++ migrate_lock_init(zspage); + + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; +@@ -1013,7 +1157,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, + pages[i] = page; + } + +- create_page_chain(zspage, pages, class->pages_per_zspage); ++ create_page_chain(class, zspage, pages); + init_zspage(class, zspage); + + return zspage; +@@ -1024,7 +1168,7 @@ static struct zspage *find_get_zspage(struct size_class *class) + int i; + struct zspage *zspage; + +- for (i = ZS_ALMOST_FULL; i <= ZS_ALMOST_EMPTY; i++) { ++ for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + zspage = list_first_entry_or_null(&class->fullness_list[i], + struct zspage, list); + if (zspage) +@@ -1289,6 +1433,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); ++ ++ /* migration cannot move any subpage in this zspage */ ++ migrate_read_lock(zspage); ++ + get_zspage_mapping(zspage, &class_idx, &fg); + class = pool->size_class[class_idx]; + off = (class->size * obj_idx) & ~PAGE_MASK; +@@ -1309,7 +1457,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, 
+ + ret = __zs_map_object(area, pages, off, class->size); + out: +- if (!class->huge) ++ if (likely(!PageHugeObject(page))) + ret += ZS_HANDLE_SIZE; + + return ret; +@@ -1348,6 +1496,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + __zs_unmap_object(area, pages, off, class->size); + } + put_cpu_var(zs_map_area); ++ ++ migrate_read_unlock(zspage); + unpin_tag(handle); + } + EXPORT_SYMBOL_GPL(zs_unmap_object); +@@ -1377,7 +1527,7 @@ static unsigned long obj_malloc(struct size_class *class, + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + set_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG); +- if (!class->huge) ++ if (likely(!PageHugeObject(m_page))) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else +@@ -1407,6 +1557,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) + { + unsigned long handle, obj; + struct size_class *class; ++ enum fullness_group newfg; + struct zspage *zspage; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) +@@ -1422,28 +1573,37 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) + + spin_lock(&class->lock); + zspage = find_get_zspage(class); +- +- if (!zspage) { ++ if (likely(zspage)) { ++ obj = obj_malloc(class, zspage, handle); ++ /* Now move the zspage to another fullness group, if required */ ++ fix_fullness_group(class, zspage); ++ record_obj(handle, obj); + spin_unlock(&class->lock); +- zspage = alloc_zspage(pool, class, gfp); +- if (unlikely(!zspage)) { +- cache_free_handle(pool, handle); +- return 0; +- } + +- set_zspage_mapping(zspage, class->index, ZS_EMPTY); +- atomic_long_add(class->pages_per_zspage, +- &pool->pages_allocated); ++ return handle; ++ } + +- spin_lock(&class->lock); +- zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( +- class->size, class->pages_per_zspage)); ++ spin_unlock(&class->lock); ++ ++ zspage = alloc_zspage(pool, class, gfp); ++ if (!zspage) { ++ 
cache_free_handle(pool, handle); ++ return 0; + } + ++ spin_lock(&class->lock); + obj = obj_malloc(class, zspage, handle); +- /* Now move the zspage to another fullness group, if required */ +- fix_fullness_group(class, zspage); ++ newfg = get_fullness_group(class, zspage); ++ insert_zspage(class, zspage, newfg); ++ set_zspage_mapping(zspage, class->index, newfg); + record_obj(handle, obj); ++ atomic_long_add(class->pages_per_zspage, ++ &pool->pages_allocated); ++ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( ++ class->size, class->pages_per_zspage)); ++ ++ /* We completely set up zspage so mark them as movable */ ++ SetZsPageMovable(pool, zspage); + spin_unlock(&class->lock); + + return handle; +@@ -1484,6 +1644,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) + int class_idx; + struct size_class *class; + enum fullness_group fullness; ++ bool isolated; + + if (unlikely(!handle)) + return; +@@ -1493,22 +1654,28 @@ void zs_free(struct zs_pool *pool, unsigned long handle) + obj_to_location(obj, &f_page, &f_objidx); + zspage = get_zspage(f_page); + ++ migrate_read_lock(zspage); ++ + get_zspage_mapping(zspage, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(class, obj); + fullness = fix_fullness_group(class, zspage); +- if (fullness == ZS_EMPTY) { +- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( +- class->size, class->pages_per_zspage)); +- atomic_long_sub(class->pages_per_zspage, +- &pool->pages_allocated); +- free_zspage(pool, zspage); ++ if (fullness != ZS_EMPTY) { ++ migrate_read_unlock(zspage); ++ goto out; + } ++ ++ isolated = is_zspage_isolated(zspage); ++ migrate_read_unlock(zspage); ++ /* If zspage is isolated, zs_page_putback will free the zspage */ ++ if (likely(!isolated)) ++ free_zspage(pool, class, zspage); ++out: ++ + spin_unlock(&class->lock); + unpin_tag(handle); +- + cache_free_handle(pool, handle); + } + EXPORT_SYMBOL_GPL(zs_free); +@@ -1587,12 +1754,13 @@ 
static unsigned long find_alloced_obj(struct size_class *class, + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); ++ struct zspage *zspage = get_zspage(page); + +- offset = get_first_obj_offset(page); ++ offset = get_first_obj_offset(class, get_first_page(zspage), page); + offset += class->size * index; + + while (offset < PAGE_SIZE) { +- head = obj_to_head(class, page, addr + offset); ++ head = obj_to_head(page, addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) +@@ -1684,6 +1852,7 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { ++ VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; + } +@@ -1704,6 +1873,8 @@ static enum fullness_group putback_zspage(struct size_class *class, + { + enum fullness_group fullness; + ++ VM_BUG_ON(is_zspage_isolated(zspage)); ++ + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); +@@ -1711,6 +1882,377 @@ static enum fullness_group putback_zspage(struct size_class *class, + return fullness; + } + ++#ifdef CONFIG_COMPACTION ++static struct dentry *zs_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ static const struct dentry_operations ops = { ++ .d_dname = simple_dname, ++ }; ++ ++ return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); ++} ++ ++static struct file_system_type zsmalloc_fs = { ++ .name = "zsmalloc", ++ .mount = zs_mount, ++ .kill_sb = kill_anon_super, ++}; ++ ++static int zsmalloc_mount(void) ++{ ++ int ret = 0; ++ ++ zsmalloc_mnt = kern_mount(&zsmalloc_fs); ++ if (IS_ERR(zsmalloc_mnt)) ++ ret = PTR_ERR(zsmalloc_mnt); ++ ++ return ret; ++} ++ ++static void zsmalloc_unmount(void) ++{ ++ 
kern_unmount(zsmalloc_mnt); ++} ++ ++static void migrate_lock_init(struct zspage *zspage) ++{ ++ rwlock_init(&zspage->lock); ++} ++ ++static void migrate_read_lock(struct zspage *zspage) ++{ ++ read_lock(&zspage->lock); ++} ++ ++static void migrate_read_unlock(struct zspage *zspage) ++{ ++ read_unlock(&zspage->lock); ++} ++ ++static void migrate_write_lock(struct zspage *zspage) ++{ ++ write_lock(&zspage->lock); ++} ++ ++static void migrate_write_unlock(struct zspage *zspage) ++{ ++ write_unlock(&zspage->lock); ++} ++ ++/* Number of isolated subpage for *page migration* in this zspage */ ++static void inc_zspage_isolation(struct zspage *zspage) ++{ ++ zspage->isolated++; ++} ++ ++static void dec_zspage_isolation(struct zspage *zspage) ++{ ++ zspage->isolated--; ++} ++ ++static void replace_sub_page(struct size_class *class, struct zspage *zspage, ++ struct page *newpage, struct page *oldpage) ++{ ++ struct page *page; ++ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; ++ int idx = 0; ++ ++ page = get_first_page(zspage); ++ do { ++ if (page == oldpage) ++ pages[idx] = newpage; ++ else ++ pages[idx] = page; ++ idx++; ++ } while ((page = get_next_page(page)) != NULL); ++ ++ create_page_chain(class, zspage, pages); ++ if (unlikely(PageHugeObject(oldpage))) ++ newpage->index = oldpage->index; ++ __SetPageMovable(newpage, page_mapping(oldpage)); ++} ++ ++bool zs_page_isolate(struct page *page, isolate_mode_t mode) ++{ ++ struct zs_pool *pool; ++ struct size_class *class; ++ int class_idx; ++ enum fullness_group fullness; ++ struct zspage *zspage; ++ struct address_space *mapping; ++ ++ /* ++ * Page is locked so zspage couldn't be destroyed. For detail, look at ++ * lock_zspage in free_zspage. 
++ */ ++ VM_BUG_ON_PAGE(!PageMovable(page), page); ++ VM_BUG_ON_PAGE(PageIsolated(page), page); ++ ++ zspage = get_zspage(page); ++ ++ /* ++ * Without class lock, fullness could be stale while class_idx is okay ++ * because class_idx is constant unless page is freed so we should get ++ * fullness again under class lock. ++ */ ++ get_zspage_mapping(zspage, &class_idx, &fullness); ++ mapping = page_mapping(page); ++ pool = mapping->private_data; ++ class = pool->size_class[class_idx]; ++ ++ spin_lock(&class->lock); ++ if (get_zspage_inuse(zspage) == 0) { ++ spin_unlock(&class->lock); ++ return false; ++ } ++ ++ /* zspage is isolated for object migration */ ++ if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { ++ spin_unlock(&class->lock); ++ return false; ++ } ++ ++ /* ++ * If this is first time isolation for the zspage, isolate zspage from ++ * size_class to prevent further object allocation from the zspage. ++ */ ++ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { ++ get_zspage_mapping(zspage, &class_idx, &fullness); ++ remove_zspage(class, zspage, fullness); ++ } ++ ++ inc_zspage_isolation(zspage); ++ spin_unlock(&class->lock); ++ ++ return true; ++} ++ ++int zs_page_migrate(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ struct zs_pool *pool; ++ struct size_class *class; ++ int class_idx; ++ enum fullness_group fullness; ++ struct zspage *zspage; ++ struct page *dummy; ++ void *s_addr, *d_addr, *addr; ++ int offset, pos; ++ unsigned long handle, head; ++ unsigned long old_obj, new_obj; ++ unsigned int obj_idx; ++ int ret = -EAGAIN; ++ ++ VM_BUG_ON_PAGE(!PageMovable(page), page); ++ VM_BUG_ON_PAGE(!PageIsolated(page), page); ++ ++ zspage = get_zspage(page); ++ ++ /* Concurrent compactor cannot migrate any subpage in zspage */ ++ migrate_write_lock(zspage); ++ get_zspage_mapping(zspage, &class_idx, &fullness); ++ pool = mapping->private_data; ++ class = 
pool->size_class[class_idx]; ++ offset = get_first_obj_offset(class, get_first_page(zspage), page); ++ ++ spin_lock(&class->lock); ++ if (!get_zspage_inuse(zspage)) { ++ ret = -EBUSY; ++ goto unlock_class; ++ } ++ ++ pos = offset; ++ s_addr = kmap_atomic(page); ++ while (pos < PAGE_SIZE) { ++ head = obj_to_head(page, s_addr + pos); ++ if (head & OBJ_ALLOCATED_TAG) { ++ handle = head & ~OBJ_ALLOCATED_TAG; ++ if (!trypin_tag(handle)) ++ goto unpin_objects; ++ } ++ pos += class->size; ++ } ++ ++ /* ++ * Here, any user cannot access all objects in the zspage so let's move. ++ */ ++ d_addr = kmap_atomic(newpage); ++ memcpy(d_addr, s_addr, PAGE_SIZE); ++ kunmap_atomic(d_addr); ++ ++ for (addr = s_addr + offset; addr < s_addr + pos; ++ addr += class->size) { ++ head = obj_to_head(page, addr); ++ if (head & OBJ_ALLOCATED_TAG) { ++ handle = head & ~OBJ_ALLOCATED_TAG; ++ if (!testpin_tag(handle)) ++ BUG(); ++ ++ old_obj = handle_to_obj(handle); ++ obj_to_location(old_obj, &dummy, &obj_idx); ++ new_obj = (unsigned long)location_to_obj(newpage, ++ obj_idx); ++ new_obj |= BIT(HANDLE_PIN_BIT); ++ record_obj(handle, new_obj); ++ } ++ } ++ ++ replace_sub_page(class, zspage, newpage, page); ++ get_page(newpage); ++ ++ dec_zspage_isolation(zspage); ++ ++ /* ++ * Page migration is done so let's putback isolated zspage to ++ * the list if @page is final isolated subpage in the zspage. 
++ */ ++ if (!is_zspage_isolated(zspage)) ++ putback_zspage(class, zspage); ++ ++ reset_page(page); ++ put_page(page); ++ page = newpage; ++ ++ ret = 0; ++unpin_objects: ++ for (addr = s_addr + offset; addr < s_addr + pos; ++ addr += class->size) { ++ head = obj_to_head(page, addr); ++ if (head & OBJ_ALLOCATED_TAG) { ++ handle = head & ~OBJ_ALLOCATED_TAG; ++ if (!testpin_tag(handle)) ++ BUG(); ++ unpin_tag(handle); ++ } ++ } ++ kunmap_atomic(s_addr); ++unlock_class: ++ spin_unlock(&class->lock); ++ migrate_write_unlock(zspage); ++ ++ return ret; ++} ++ ++void zs_page_putback(struct page *page) ++{ ++ struct zs_pool *pool; ++ struct size_class *class; ++ int class_idx; ++ enum fullness_group fg; ++ struct address_space *mapping; ++ struct zspage *zspage; ++ ++ VM_BUG_ON_PAGE(!PageMovable(page), page); ++ VM_BUG_ON_PAGE(!PageIsolated(page), page); ++ ++ zspage = get_zspage(page); ++ get_zspage_mapping(zspage, &class_idx, &fg); ++ mapping = page_mapping(page); ++ pool = mapping->private_data; ++ class = pool->size_class[class_idx]; ++ ++ spin_lock(&class->lock); ++ dec_zspage_isolation(zspage); ++ if (!is_zspage_isolated(zspage)) { ++ fg = putback_zspage(class, zspage); ++ /* ++ * Due to page_lock, we cannot free zspage immediately ++ * so let's defer. 
++ */ ++ if (fg == ZS_EMPTY) ++ schedule_work(&pool->free_work); ++ } ++ spin_unlock(&class->lock); ++} ++ ++const struct address_space_operations zsmalloc_aops = { ++ .isolate_page = zs_page_isolate, ++ .migratepage = zs_page_migrate, ++ .putback_page = zs_page_putback, ++}; ++ ++static int zs_register_migration(struct zs_pool *pool) ++{ ++ pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); ++ if (IS_ERR(pool->inode)) { ++ pool->inode = NULL; ++ return 1; ++ } ++ ++ pool->inode->i_mapping->private_data = pool; ++ pool->inode->i_mapping->a_ops = &zsmalloc_aops; ++ return 0; ++} ++ ++static void zs_unregister_migration(struct zs_pool *pool) ++{ ++ flush_work(&pool->free_work); ++ if (pool->inode) ++ iput(pool->inode); ++} ++ ++/* ++ * Caller should hold page_lock of all pages in the zspage ++ * In here, we cannot use zspage meta data. ++ */ ++static void async_free_zspage(struct work_struct *work) ++{ ++ int i; ++ struct size_class *class; ++ unsigned int class_idx; ++ enum fullness_group fullness; ++ struct zspage *zspage, *tmp; ++ LIST_HEAD(free_pages); ++ struct zs_pool *pool = container_of(work, struct zs_pool, ++ free_work); ++ ++ for (i = 0; i < zs_size_classes; i++) { ++ class = pool->size_class[i]; ++ if (class->index != i) ++ continue; ++ ++ spin_lock(&class->lock); ++ list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); ++ spin_unlock(&class->lock); ++ } ++ ++ ++ list_for_each_entry_safe(zspage, tmp, &free_pages, list) { ++ list_del(&zspage->list); ++ lock_zspage(zspage); ++ ++ get_zspage_mapping(zspage, &class_idx, &fullness); ++ VM_BUG_ON(fullness != ZS_EMPTY); ++ class = pool->size_class[class_idx]; ++ spin_lock(&class->lock); ++ __free_zspage(pool, pool->size_class[class_idx], zspage); ++ spin_unlock(&class->lock); ++ } ++}; ++ ++static void kick_deferred_free(struct zs_pool *pool) ++{ ++ schedule_work(&pool->free_work); ++} ++ ++static void init_deferred_free(struct zs_pool *pool) ++{ ++ INIT_WORK(&pool->free_work, async_free_zspage); 
++} ++ ++static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) ++{ ++ struct page *page = get_first_page(zspage); ++ ++ do { ++ WARN_ON(!trylock_page(page)); ++ __SetPageMovable(page, pool->inode->i_mapping); ++ unlock_page(page); ++ } while ((page = get_next_page(page)) != NULL); ++} ++#endif ++ + /* + * + * Based on the number of unused allocated objects calculate +@@ -1745,10 +2287,10 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) + break; + + cc.index = 0; +- cc.s_page = src_zspage->first_page; ++ cc.s_page = get_first_page(src_zspage); + + while ((dst_zspage = isolate_zspage(class, false))) { +- cc.d_page = dst_zspage->first_page; ++ cc.d_page = get_first_page(dst_zspage); + /* + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. +@@ -1765,11 +2307,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) + + putback_zspage(class, dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { +- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( +- class->size, class->pages_per_zspage)); +- atomic_long_sub(class->pages_per_zspage, +- &pool->pages_allocated); +- free_zspage(pool, src_zspage); ++ free_zspage(pool, class, src_zspage); + pool->stats.pages_compacted += class->pages_per_zspage; + } + spin_unlock(&class->lock); +@@ -1885,6 +2423,7 @@ struct zs_pool *zs_create_pool(const char *name) + if (!pool) + return NULL; + ++ init_deferred_free(pool); + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { +@@ -1939,12 +2478,10 @@ struct zs_pool *zs_create_pool(const char *name) + class->pages_per_zspage = pages_per_zspage; + class->objs_per_zspage = class->pages_per_zspage * + PAGE_SIZE / class->size; +- if (pages_per_zspage == 1 && class->objs_per_zspage == 1) +- class->huge = true; + spin_lock_init(&class->lock); + pool->size_class[i] = class; +- for (fullness = 
ZS_ALMOST_FULL; fullness <= ZS_ALMOST_EMPTY; +- fullness++) ++ for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; ++ fullness++) + INIT_LIST_HEAD(&class->fullness_list[fullness]); + + prev_class = class; +@@ -1953,6 +2490,9 @@ struct zs_pool *zs_create_pool(const char *name) + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); + ++ if (zs_register_migration(pool)) ++ goto err; ++ + /* + * Not critical, we still can use the pool + * and user can trigger compaction manually. +@@ -1972,6 +2512,7 @@ void zs_destroy_pool(struct zs_pool *pool) + int i; + + zs_unregister_shrinker(pool); ++ zs_unregister_migration(pool); + zs_pool_stat_destroy(pool); + + for (i = 0; i < zs_size_classes; i++) { +@@ -1984,7 +2525,7 @@ void zs_destroy_pool(struct zs_pool *pool) + if (class->index != i) + continue; + +- for (fg = ZS_ALMOST_FULL; fg <= ZS_ALMOST_EMPTY; fg++) { ++ for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { + if (!list_empty(&class->fullness_list[fg])) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); +@@ -2002,7 +2543,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); + + static int __init zs_init(void) + { +- int ret = zs_register_cpu_notifier(); ++ int ret; ++ ++ ret = zsmalloc_mount(); ++ if (ret) ++ goto out; ++ ++ ret = zs_register_cpu_notifier(); + + if (ret) + goto notifier_fail; +@@ -2019,7 +2566,8 @@ static int __init zs_init(void) + + notifier_fail: + zs_unregister_cpu_notifier(); +- ++ zsmalloc_unmount(); ++out: + return ret; + } + +@@ -2028,6 +2576,7 @@ static void __exit zs_exit(void) + #ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); + #endif ++ zsmalloc_unmount(); + zs_unregister_cpu_notifier(); + + zs_stat_exit(); +-- +1.9.1 diff --git a/a/content_digest b/N1/content_digest index f11822e..9e5d209 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -4,11 +4,1383 @@ "Subject\0[PATCH v6r2 11/12] zsmalloc: page migration support\0" "Date\0Fri, 27 May 2016 06:50:22 +0900\0" 
"To\0Andrew Morton <akpm@linux-foundation.org>\0" - "Cc\0linux-mm@kvack.org" - linux-kernel@vger.kernel.org + "Cc\0<linux-mm@kvack.org>" + <linux-kernel@vger.kernel.org> " Sergey Senozhatsky <sergey.senozhatsky@gmail.com>\0" "\00:1\0" "b\0" - Follow up Sergey's review + "Follow up Sergey's review\n" + "\n" + ">From 2deede28c91910a9d3493feae30bed507e72f213 Mon Sep 17 00:00:00 2001\n" + "From: Minchan Kim <minchan@kernel.org>\n" + "Date: Thu, 5 May 2016 00:01:03 +0900\n" + "Subject: [PATCH v6r2] zsmalloc: page migration support\n" + "\n" + "This patch introduces run-time migration feature for zspage.\n" + "\n" + "For migration, VM uses page.lru field so it would be better to not use\n" + "page.next field which is unified with page.lru for own purpose.\n" + "For that, firstly, we can get first object offset of the page via\n" + "runtime calculation instead of using page.index so we can use\n" + "page.index as link for page chaining instead of page.next.\n" + "\n" + "In case of huge object, it stores handle to page.index instead of\n" + "next link of page chaining because huge object doesn't need to next\n" + "link for page chaining. So get_next_page need to identify huge\n" + "object to return NULL. For it, this patch uses PG_owner_priv_1 flag\n" + "of the page flag.\n" + "\n" + "For migration, it supports three functions\n" + "\n" + "* zs_page_isolate\n" + "\n" + "It isolates a zspage which includes a subpage VM want to migrate\n" + "from class so anyone cannot allocate new object from the zspage.\n" + "\n" + "We could try to isolate a zspage by the number of subpage so\n" + "subsequent isolation trial of other subpage of the zpsage shouldn't\n" + "fail. For that, we introduce zspage.isolated count. 
With that,\n" + "zs_page_isolate can know whether zspage is already isolated or not\n" + "for migration so if it is isolated for migration, subsequent\n" + "isolation trial can be successful without trying further isolation.\n" + "\n" + "* zs_page_migrate\n" + "\n" + "First of all, it holds write-side zspage->lock to prevent migrate other\n" + "subpage in zspage. Then, lock all objects in the page VM want to migrate.\n" + "The reason we should lock all objects in the page is due to race between\n" + "zs_map_object and zs_page_migrate.\n" + "\n" + "zs_map_object\t\t\t\tzs_page_migrate\n" + "\n" + "pin_tag(handle)\n" + "obj = handle_to_obj(handle)\n" + "obj_to_location(obj, &page, &obj_idx);\n" + "\n" + "\t\t\t\t\twrite_lock(&zspage->lock)\n" + "\t\t\t\t\tif (!trypin_tag(handle))\n" + "\t\t\t\t\t\tgoto unpin_object\n" + "\n" + "zspage = get_zspage(page);\n" + "read_lock(&zspage->lock);\n" + "\n" + "If zs_page_migrate doesn't do trypin_tag, zs_map_object's page can\n" + "be stale by migration so it goes crash.\n" + "\n" + "If it locks all of objects successfully, it copies content from\n" + "old page to new one, finally, create new zspage chain with new page.\n" + "And if it's last isolated subpage in the zspage, put the zspage back\n" + "to class.\n" + "\n" + "* zs_page_putback\n" + "\n" + "It returns isolated zspage to right fullness_group list if it fails to\n" + "migrate a page. If it find a zspage is ZS_EMPTY, it queues zspage\n" + "freeing to workqueue. See below about async zspage freeing.\n" + "\n" + "This patch introduces asynchronous zspage free. The reason to need it\n" + "is we need page_lock to clear PG_movable but unfortunately,\n" + "zs_free path should be atomic so the apporach is try to grab page_lock.\n" + "If it got page_lock of all of pages successfully, it can free zspage\n" + "immediately. 
Otherwise, it queues free request and free zspage via\n" + "workqueue in process context.\n" + "\n" + "If zs_free finds the zspage is isolated when it try to free zspage,\n" + "it delays the freeing until zs_page_putback finds it so it will free\n" + "free the zspage finally.\n" + "\n" + "In this patch, we expand fullness_list from ZS_EMPTY to ZS_FULL.\n" + "First of all, it will use ZS_EMPTY list for delay freeing.\n" + "And with adding ZS_FULL list, it makes to identify whether zspage is\n" + "isolated or not via list_empty(&zspage->list) test.\n" + "\n" + "Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>\n" + "Signed-off-by: Minchan Kim <minchan@kernel.org>\n" + "---\n" + " include/uapi/linux/magic.h | 1 +\n" + " mm/zsmalloc.c | 793 ++++++++++++++++++++++++++++++++++++++-------\n" + " 2 files changed, 672 insertions(+), 122 deletions(-)\n" + "\n" + "diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h\n" + "index d829ce63529d..e398beac67b8 100644\n" + "--- a/include/uapi/linux/magic.h\n" + "+++ b/include/uapi/linux/magic.h\n" + "@@ -81,5 +81,6 @@\n" + " /* Since UDF 2.01 is ISO 13346 based... 
*/\n" + " #define UDF_SUPER_MAGIC\t\t0x15013346\n" + " #define BALLOON_KVM_MAGIC\t0x13661366\n" + "+#define ZSMALLOC_MAGIC\t\t0x58295829\n" + " \n" + " #endif /* __LINUX_MAGIC_H__ */\n" + "diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c\n" + "index c6fb543cfb98..a80100db16d6 100644\n" + "--- a/mm/zsmalloc.c\n" + "+++ b/mm/zsmalloc.c\n" + "@@ -17,14 +17,14 @@\n" + " *\n" + " * Usage of struct page fields:\n" + " *\tpage->private: points to zspage\n" + "- *\tpage->index: offset of the first object starting in this page.\n" + "- *\t\tFor the first page, this is always 0, so we use this field\n" + "- *\t\tto store handle for huge object.\n" + "- *\tpage->next: links together all component pages of a zspage\n" + "+ *\tpage->freelist(index): links together all component pages of a zspage\n" + "+ *\t\tFor the huge page, this is always 0, so we use this field\n" + "+ *\t\tto store handle.\n" + " *\n" + " * Usage of struct page flags:\n" + " *\tPG_private: identifies the first component page\n" + " *\tPG_private2: identifies the last component page\n" + "+ *\tPG_owner_priv_1: indentifies the huge component page\n" + " *\n" + " */\n" + " \n" + "@@ -49,6 +49,11 @@\n" + " #include <linux/debugfs.h>\n" + " #include <linux/zsmalloc.h>\n" + " #include <linux/zpool.h>\n" + "+#include <linux/mount.h>\n" + "+#include <linux/compaction.h>\n" + "+#include <linux/pagemap.h>\n" + "+\n" + "+#define ZSPAGE_MAGIC\t0x58\n" + " \n" + " /*\n" + " * This must be power of 2 and greater than of equal to sizeof(link_free).\n" + "@@ -136,25 +141,23 @@\n" + " * We do not maintain any list for completely empty or full pages\n" + " */\n" + " enum fullness_group {\n" + "-\tZS_ALMOST_FULL,\n" + "-\tZS_ALMOST_EMPTY,\n" + " \tZS_EMPTY,\n" + "-\tZS_FULL\n" + "+\tZS_ALMOST_EMPTY,\n" + "+\tZS_ALMOST_FULL,\n" + "+\tZS_FULL,\n" + "+\tNR_ZS_FULLNESS,\n" + " };\n" + " \n" + " enum zs_stat_type {\n" + "+\tCLASS_EMPTY,\n" + "+\tCLASS_ALMOST_EMPTY,\n" + "+\tCLASS_ALMOST_FULL,\n" + "+\tCLASS_FULL,\n" + " 
\tOBJ_ALLOCATED,\n" + " \tOBJ_USED,\n" + "-\tCLASS_ALMOST_FULL,\n" + "-\tCLASS_ALMOST_EMPTY,\n" + "+\tNR_ZS_STAT_TYPE,\n" + " };\n" + " \n" + "-#ifdef CONFIG_ZSMALLOC_STAT\n" + "-#define NR_ZS_STAT_TYPE\t(CLASS_ALMOST_EMPTY + 1)\n" + "-#else\n" + "-#define NR_ZS_STAT_TYPE\t(OBJ_USED + 1)\n" + "-#endif\n" + "-\n" + " struct zs_size_stat {\n" + " \tunsigned long objs[NR_ZS_STAT_TYPE];\n" + " };\n" + "@@ -163,6 +166,10 @@ struct zs_size_stat {\n" + " static struct dentry *zs_stat_root;\n" + " #endif\n" + " \n" + "+#ifdef CONFIG_COMPACTION\n" + "+static struct vfsmount *zsmalloc_mnt;\n" + "+#endif\n" + "+\n" + " /*\n" + " * number of size_classes\n" + " */\n" + "@@ -186,23 +193,36 @@ static const int fullness_threshold_frac = 4;\n" + " \n" + " struct size_class {\n" + " \tspinlock_t lock;\n" + "-\tstruct list_head fullness_list[2];\n" + "+\tstruct list_head fullness_list[NR_ZS_FULLNESS];\n" + " \t/*\n" + " \t * Size of objects stored in this class. Must be multiple\n" + " \t * of ZS_ALIGN.\n" + " \t */\n" + " \tint size;\n" + " \tint objs_per_zspage;\n" + "-\tunsigned int index;\n" + "-\n" + "-\tstruct zs_size_stat stats;\n" + "-\n" + " \t/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */\n" + " \tint pages_per_zspage;\n" + "-\t/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */\n" + "-\tbool huge;\n" + "+\n" + "+\tunsigned int index;\n" + "+\tstruct zs_size_stat stats;\n" + " };\n" + " \n" + "+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */\n" + "+static void SetPageHugeObject(struct page *page)\n" + "+{\n" + "+\tSetPageOwnerPriv1(page);\n" + "+}\n" + "+\n" + "+static void ClearPageHugeObject(struct page *page)\n" + "+{\n" + "+\tClearPageOwnerPriv1(page);\n" + "+}\n" + "+\n" + "+static int PageHugeObject(struct page *page)\n" + "+{\n" + "+\treturn PageOwnerPriv1(page);\n" + "+}\n" + "+\n" + " /*\n" + " * Placed within free objects to form a singly linked list.\n" + " * For every zspage, zspage->freeobj gives head of 
this list.\n" + "@@ -244,6 +264,10 @@ struct zs_pool {\n" + " #ifdef CONFIG_ZSMALLOC_STAT\n" + " \tstruct dentry *stat_dentry;\n" + " #endif\n" + "+#ifdef CONFIG_COMPACTION\n" + "+\tstruct inode *inode;\n" + "+\tstruct work_struct free_work;\n" + "+#endif\n" + " };\n" + " \n" + " /*\n" + "@@ -252,16 +276,23 @@ struct zs_pool {\n" + " */\n" + " #define FULLNESS_BITS\t2\n" + " #define CLASS_BITS\t8\n" + "+#define ISOLATED_BITS\t3\n" + "+#define MAGIC_VAL_BITS\t8\n" + " \n" + " struct zspage {\n" + " \tstruct {\n" + " \t\tunsigned int fullness:FULLNESS_BITS;\n" + " \t\tunsigned int class:CLASS_BITS;\n" + "+\t\tunsigned int isolated:ISOLATED_BITS;\n" + "+\t\tunsigned int magic:MAGIC_VAL_BITS;\n" + " \t};\n" + " \tunsigned int inuse;\n" + " \tunsigned int freeobj;\n" + " \tstruct page *first_page;\n" + " \tstruct list_head list; /* fullness list */\n" + "+#ifdef CONFIG_COMPACTION\n" + "+\trwlock_t lock;\n" + "+#endif\n" + " };\n" + " \n" + " struct mapping_area {\n" + "@@ -274,6 +305,28 @@ struct mapping_area {\n" + " \tenum zs_mapmode vm_mm; /* mapping mode */\n" + " };\n" + " \n" + "+#ifdef CONFIG_COMPACTION\n" + "+static int zs_register_migration(struct zs_pool *pool);\n" + "+static void zs_unregister_migration(struct zs_pool *pool);\n" + "+static void migrate_lock_init(struct zspage *zspage);\n" + "+static void migrate_read_lock(struct zspage *zspage);\n" + "+static void migrate_read_unlock(struct zspage *zspage);\n" + "+static void kick_deferred_free(struct zs_pool *pool);\n" + "+static void init_deferred_free(struct zs_pool *pool);\n" + "+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);\n" + "+#else\n" + "+static int zsmalloc_mount(void) { return 0; }\n" + "+static void zsmalloc_unmount(void) {}\n" + "+static int zs_register_migration(struct zs_pool *pool) { return 0; }\n" + "+static void zs_unregister_migration(struct zs_pool *pool) {}\n" + "+static void migrate_lock_init(struct zspage *zspage) {}\n" + "+static void 
migrate_read_lock(struct zspage *zspage) {}\n" + "+static void migrate_read_unlock(struct zspage *zspage) {}\n" + "+static void kick_deferred_free(struct zs_pool *pool) {}\n" + "+static void init_deferred_free(struct zs_pool *pool) {}\n" + "+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}\n" + "+#endif\n" + "+\n" + " static int create_cache(struct zs_pool *pool)\n" + " {\n" + " \tpool->handle_cachep = kmem_cache_create(\"zs_handle\", ZS_HANDLE_SIZE,\n" + "@@ -301,7 +354,7 @@ static void destroy_cache(struct zs_pool *pool)\n" + " static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)\n" + " {\n" + " \treturn (unsigned long)kmem_cache_alloc(pool->handle_cachep,\n" + "-\t\t\tgfp & ~__GFP_HIGHMEM);\n" + "+\t\t\tgfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));\n" + " }\n" + " \n" + " static void cache_free_handle(struct zs_pool *pool, unsigned long handle)\n" + "@@ -311,7 +364,8 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle)\n" + " \n" + " static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)\n" + " {\n" + "-\treturn kmem_cache_alloc(pool->zspage_cachep, flags & ~__GFP_HIGHMEM);\n" + "+\treturn kmem_cache_alloc(pool->zspage_cachep,\n" + "+\t\t\tflags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));\n" + " };\n" + " \n" + " static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)\n" + "@@ -421,11 +475,17 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)\n" + " /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */\n" + " static DEFINE_PER_CPU(struct mapping_area, zs_map_area);\n" + " \n" + "+static bool is_zspage_isolated(struct zspage *zspage)\n" + "+{\n" + "+\treturn zspage->isolated;\n" + "+}\n" + "+\n" + " static int is_first_page(struct page *page)\n" + " {\n" + " \treturn PagePrivate(page);\n" + " }\n" + " \n" + "+/* Protected by class->lock */\n" + " static inline int get_zspage_inuse(struct zspage *zspage)\n" + " {\n" + " 
\treturn zspage->inuse;\n" + "@@ -441,20 +501,12 @@ static inline void mod_zspage_inuse(struct zspage *zspage, int val)\n" + " \tzspage->inuse += val;\n" + " }\n" + " \n" + "-static inline int get_first_obj_offset(struct page *page)\n" + "+static inline struct page *get_first_page(struct zspage *zspage)\n" + " {\n" + "-\tif (is_first_page(page))\n" + "-\t\treturn 0;\n" + "+\tstruct page *first_page = zspage->first_page;\n" + " \n" + "-\treturn page->index;\n" + "-}\n" + "-\n" + "-static inline void set_first_obj_offset(struct page *page, int offset)\n" + "-{\n" + "-\tif (is_first_page(page))\n" + "-\t\treturn;\n" + "-\n" + "-\tpage->index = offset;\n" + "+\tVM_BUG_ON_PAGE(!is_first_page(first_page), first_page);\n" + "+\treturn first_page;\n" + " }\n" + " \n" + " static inline unsigned int get_freeobj(struct zspage *zspage)\n" + "@@ -471,6 +523,8 @@ static void get_zspage_mapping(struct zspage *zspage,\n" + " \t\t\t\tunsigned int *class_idx,\n" + " \t\t\t\tenum fullness_group *fullness)\n" + " {\n" + "+\tVM_BUG_ON(zspage->magic != ZSPAGE_MAGIC);\n" + "+\n" + " \t*fullness = zspage->fullness;\n" + " \t*class_idx = zspage->class;\n" + " }\n" + "@@ -504,23 +558,19 @@ static int get_size_class_index(int size)\n" + " static inline void zs_stat_inc(struct size_class *class,\n" + " \t\t\t\tenum zs_stat_type type, unsigned long cnt)\n" + " {\n" + "-\tif (type < NR_ZS_STAT_TYPE)\n" + "-\t\tclass->stats.objs[type] += cnt;\n" + "+\tclass->stats.objs[type] += cnt;\n" + " }\n" + " \n" + " static inline void zs_stat_dec(struct size_class *class,\n" + " \t\t\t\tenum zs_stat_type type, unsigned long cnt)\n" + " {\n" + "-\tif (type < NR_ZS_STAT_TYPE)\n" + "-\t\tclass->stats.objs[type] -= cnt;\n" + "+\tclass->stats.objs[type] -= cnt;\n" + " }\n" + " \n" + " static inline unsigned long zs_stat_get(struct size_class *class,\n" + " \t\t\t\tenum zs_stat_type type)\n" + " {\n" + "-\tif (type < NR_ZS_STAT_TYPE)\n" + "-\t\treturn class->stats.objs[type];\n" + "-\treturn 0;\n" + "+\treturn 
class->stats.objs[type];\n" + " }\n" + " \n" + " #ifdef CONFIG_ZSMALLOC_STAT\n" + "@@ -664,6 +714,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)\n" + " }\n" + " #endif\n" + " \n" + "+\n" + " /*\n" + " * For each size class, zspages are divided into different groups\n" + " * depending on how \"full\" they are. This was done so that we could\n" + "@@ -704,15 +755,9 @@ static void insert_zspage(struct size_class *class,\n" + " {\n" + " \tstruct zspage *head;\n" + " \n" + "-\tif (fullness >= ZS_EMPTY)\n" + "-\t\treturn;\n" + "-\n" + "+\tzs_stat_inc(class, fullness, 1);\n" + " \thead = list_first_entry_or_null(&class->fullness_list[fullness],\n" + " \t\t\t\t\tstruct zspage, list);\n" + "-\n" + "-\tzs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?\n" + "-\t\t\tCLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);\n" + "-\n" + " \t/*\n" + " \t * We want to see more ZS_FULL pages and less almost empty/full.\n" + " \t * Put pages with higher ->inuse first.\n" + "@@ -734,14 +779,11 @@ static void remove_zspage(struct size_class *class,\n" + " \t\t\t\tstruct zspage *zspage,\n" + " \t\t\t\tenum fullness_group fullness)\n" + " {\n" + "-\tif (fullness >= ZS_EMPTY)\n" + "-\t\treturn;\n" + "-\n" + " \tVM_BUG_ON(list_empty(&class->fullness_list[fullness]));\n" + "+\tVM_BUG_ON(is_zspage_isolated(zspage));\n" + " \n" + " \tlist_del_init(&zspage->list);\n" + "-\tzs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?\n" + "-\t\t\tCLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);\n" + "+\tzs_stat_dec(class, fullness, 1);\n" + " }\n" + " \n" + " /*\n" + "@@ -764,8 +806,11 @@ static enum fullness_group fix_fullness_group(struct size_class *class,\n" + " \tif (newfg == currfg)\n" + " \t\tgoto out;\n" + " \n" + "-\tremove_zspage(class, zspage, currfg);\n" + "-\tinsert_zspage(class, zspage, newfg);\n" + "+\tif (!is_zspage_isolated(zspage)) {\n" + "+\t\tremove_zspage(class, zspage, currfg);\n" + "+\t\tinsert_zspage(class, zspage, newfg);\n" + "+\t}\n" + "+\n" + " \tset_zspage_mapping(zspage, 
class_idx, newfg);\n" + " \n" + " out:\n" + "@@ -808,19 +853,45 @@ static int get_pages_per_zspage(int class_size)\n" + " \treturn max_usedpc_order;\n" + " }\n" + " \n" + "-static struct page *get_first_page(struct zspage *zspage)\n" + "+static struct zspage *get_zspage(struct page *page)\n" + " {\n" + "-\treturn zspage->first_page;\n" + "+\tstruct zspage *zspage = (struct zspage *)page->private;\n" + "+\n" + "+\tVM_BUG_ON(zspage->magic != ZSPAGE_MAGIC);\n" + "+\treturn zspage;\n" + " }\n" + " \n" + "-static struct zspage *get_zspage(struct page *page)\n" + "+static struct page *get_next_page(struct page *page)\n" + " {\n" + "-\treturn (struct zspage *)page->private;\n" + "+\tif (unlikely(PageHugeObject(page)))\n" + "+\t\treturn NULL;\n" + "+\n" + "+\treturn page->freelist;\n" + " }\n" + " \n" + "-static struct page *get_next_page(struct page *page)\n" + "+/* Get byte offset of first object in the @page */\n" + "+static int get_first_obj_offset(struct size_class *class,\n" + "+\t\t\t\tstruct page *first_page, struct page *page)\n" + " {\n" + "-\treturn page->next;\n" + "+\tint pos;\n" + "+\tint page_idx = 0;\n" + "+\tint ofs = 0;\n" + "+\tstruct page *cursor = first_page;\n" + "+\n" + "+\tif (first_page == page)\n" + "+\t\tgoto out;\n" + "+\n" + "+\twhile (page != cursor) {\n" + "+\t\tpage_idx++;\n" + "+\t\tcursor = get_next_page(cursor);\n" + "+\t}\n" + "+\n" + "+\tpos = class->objs_per_zspage * class->size *\n" + "+\t\tpage_idx / class->pages_per_zspage;\n" + "+\n" + "+\tofs = (pos + class->size) % PAGE_SIZE;\n" + "+out:\n" + "+\treturn ofs;\n" + " }\n" + " \n" + " /**\n" + "@@ -857,16 +928,20 @@ static unsigned long handle_to_obj(unsigned long handle)\n" + " \treturn *(unsigned long *)handle;\n" + " }\n" + " \n" + "-static unsigned long obj_to_head(struct size_class *class, struct page *page,\n" + "-\t\t\tvoid *obj)\n" + "+static unsigned long obj_to_head(struct page *page, void *obj)\n" + " {\n" + "-\tif (class->huge) {\n" + "+\tif 
(unlikely(PageHugeObject(page))) {\n" + " \t\tVM_BUG_ON_PAGE(!is_first_page(page), page);\n" + " \t\treturn page->index;\n" + " \t} else\n" + " \t\treturn *(unsigned long *)obj;\n" + " }\n" + " \n" + "+static inline int testpin_tag(unsigned long handle)\n" + "+{\n" + "+\treturn bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);\n" + "+}\n" + "+\n" + " static inline int trypin_tag(unsigned long handle)\n" + " {\n" + " \treturn bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);\n" + "@@ -884,27 +959,93 @@ static void unpin_tag(unsigned long handle)\n" + " \n" + " static void reset_page(struct page *page)\n" + " {\n" + "+\t__ClearPageMovable(page);\n" + " \tclear_bit(PG_private, &page->flags);\n" + " \tclear_bit(PG_private_2, &page->flags);\n" + " \tset_page_private(page, 0);\n" + "-\tpage->index = 0;\n" + "+\tClearPageHugeObject(page);\n" + "+\tpage->freelist = NULL;\n" + " }\n" + " \n" + "-static void free_zspage(struct zs_pool *pool, struct zspage *zspage)\n" + "+/*\n" + "+ * To prevent zspage destroy during migration, zspage freeing should\n" + "+ * hold locks of all pages in the zspage.\n" + "+ */\n" + "+void lock_zspage(struct zspage *zspage)\n" + "+{\n" + "+\tstruct page *page = get_first_page(zspage);\n" + "+\n" + "+\tdo {\n" + "+\t\tlock_page(page);\n" + "+\t} while ((page = get_next_page(page)) != NULL);\n" + "+}\n" + "+\n" + "+int trylock_zspage(struct zspage *zspage)\n" + "+{\n" + "+\tstruct page *cursor, *fail;\n" + "+\n" + "+\tfor (cursor = get_first_page(zspage); cursor != NULL; cursor =\n" + "+\t\t\t\t\tget_next_page(cursor)) {\n" + "+\t\tif (!trylock_page(cursor)) {\n" + "+\t\t\tfail = cursor;\n" + "+\t\t\tgoto unlock;\n" + "+\t\t}\n" + "+\t}\n" + "+\n" + "+\treturn 1;\n" + "+unlock:\n" + "+\tfor (cursor = get_first_page(zspage); cursor != fail; cursor =\n" + "+\t\t\t\t\tget_next_page(cursor))\n" + "+\t\tunlock_page(cursor);\n" + "+\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + "+static void __free_zspage(struct zs_pool *pool, struct 
size_class *class,\n" + "+\t\t\t\tstruct zspage *zspage)\n" + " {\n" + " \tstruct page *page, *next;\n" + "+\tenum fullness_group fg;\n" + "+\tunsigned int class_idx;\n" + "+\n" + "+\tget_zspage_mapping(zspage, &class_idx, &fg);\n" + "+\n" + "+\tassert_spin_locked(&class->lock);\n" + " \n" + " \tVM_BUG_ON(get_zspage_inuse(zspage));\n" + "+\tVM_BUG_ON(fg != ZS_EMPTY);\n" + " \n" + "-\tnext = page = zspage->first_page;\n" + "+\tnext = page = get_first_page(zspage);\n" + " \tdo {\n" + "-\t\tnext = page->next;\n" + "+\t\tVM_BUG_ON_PAGE(!PageLocked(page), page);\n" + "+\t\tnext = get_next_page(page);\n" + " \t\treset_page(page);\n" + "+\t\tunlock_page(page);\n" + " \t\tput_page(page);\n" + " \t\tpage = next;\n" + " \t} while (page != NULL);\n" + " \n" + " \tcache_free_zspage(pool, zspage);\n" + "+\n" + "+\tzs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(\n" + "+\t\t\tclass->size, class->pages_per_zspage));\n" + "+\tatomic_long_sub(class->pages_per_zspage,\n" + "+\t\t\t\t\t&pool->pages_allocated);\n" + "+}\n" + "+\n" + "+static void free_zspage(struct zs_pool *pool, struct size_class *class,\n" + "+\t\t\t\tstruct zspage *zspage)\n" + "+{\n" + "+\tVM_BUG_ON(get_zspage_inuse(zspage));\n" + "+\tVM_BUG_ON(list_empty(&zspage->list));\n" + "+\n" + "+\tif (!trylock_zspage(zspage)) {\n" + "+\t\tkick_deferred_free(pool);\n" + "+\t\treturn;\n" + "+\t}\n" + "+\n" + "+\tremove_zspage(class, zspage, ZS_EMPTY);\n" + "+\t__free_zspage(pool, class, zspage);\n" + " }\n" + " \n" + " /* Initialize a newly allocated zspage */\n" + "@@ -912,15 +1053,13 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)\n" + " {\n" + " \tunsigned int freeobj = 1;\n" + " \tunsigned long off = 0;\n" + "-\tstruct page *page = zspage->first_page;\n" + "+\tstruct page *page = get_first_page(zspage);\n" + " \n" + " \twhile (page) {\n" + " \t\tstruct page *next_page;\n" + " \t\tstruct link_free *link;\n" + " \t\tvoid *vaddr;\n" + " \n" + "-\t\tset_first_obj_offset(page, off);\n" + 
"-\n" + " \t\tvaddr = kmap_atomic(page);\n" + " \t\tlink = (struct link_free *)vaddr + off / sizeof(*link);\n" + " \n" + "@@ -952,16 +1091,17 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)\n" + " \tset_freeobj(zspage, 0);\n" + " }\n" + " \n" + "-static void create_page_chain(struct zspage *zspage, struct page *pages[],\n" + "-\t\t\t\tint nr_pages)\n" + "+static void create_page_chain(struct size_class *class, struct zspage *zspage,\n" + "+\t\t\t\tstruct page *pages[])\n" + " {\n" + " \tint i;\n" + " \tstruct page *page;\n" + " \tstruct page *prev_page = NULL;\n" + "+\tint nr_pages = class->pages_per_zspage;\n" + " \n" + " \t/*\n" + " \t * Allocate individual pages and link them together as:\n" + "-\t * 1. all pages are linked together using page->next\n" + "+\t * 1. all pages are linked together using page->freelist\n" + " \t * 2. each sub-page point to zspage using page->private\n" + " \t *\n" + " \t * we set PG_private to identify the first page (i.e. no other sub-page\n" + "@@ -970,16 +1110,18 @@ static void create_page_chain(struct zspage *zspage, struct page *pages[],\n" + " \tfor (i = 0; i < nr_pages; i++) {\n" + " \t\tpage = pages[i];\n" + " \t\tset_page_private(page, (unsigned long)zspage);\n" + "+\t\tpage->freelist = NULL;\n" + " \t\tif (i == 0) {\n" + " \t\t\tzspage->first_page = page;\n" + " \t\t\tSetPagePrivate(page);\n" + "+\t\t\tif (unlikely(class->objs_per_zspage == 1 &&\n" + "+\t\t\t\t\tclass->pages_per_zspage == 1))\n" + "+\t\t\t\tSetPageHugeObject(page);\n" + " \t\t} else {\n" + "-\t\t\tprev_page->next = page;\n" + "+\t\t\tprev_page->freelist = page;\n" + " \t\t}\n" + "-\t\tif (i == nr_pages - 1) {\n" + "+\t\tif (i == nr_pages - 1)\n" + " \t\t\tSetPagePrivate2(page);\n" + "-\t\t\tpage->next = NULL;\n" + "-\t\t}\n" + " \t\tprev_page = page;\n" + " \t}\n" + " }\n" + "@@ -999,6 +1141,8 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,\n" + " \t\treturn NULL;\n" + " \n" + " \tmemset(zspage, 0, sizeof(struct 
zspage));\n" + "+\tzspage->magic = ZSPAGE_MAGIC;\n" + "+\tmigrate_lock_init(zspage);\n" + " \n" + " \tfor (i = 0; i < class->pages_per_zspage; i++) {\n" + " \t\tstruct page *page;\n" + "@@ -1013,7 +1157,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,\n" + " \t\tpages[i] = page;\n" + " \t}\n" + " \n" + "-\tcreate_page_chain(zspage, pages, class->pages_per_zspage);\n" + "+\tcreate_page_chain(class, zspage, pages);\n" + " \tinit_zspage(class, zspage);\n" + " \n" + " \treturn zspage;\n" + "@@ -1024,7 +1168,7 @@ static struct zspage *find_get_zspage(struct size_class *class)\n" + " \tint i;\n" + " \tstruct zspage *zspage;\n" + " \n" + "-\tfor (i = ZS_ALMOST_FULL; i <= ZS_ALMOST_EMPTY; i++) {\n" + "+\tfor (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {\n" + " \t\tzspage = list_first_entry_or_null(&class->fullness_list[i],\n" + " \t\t\t\tstruct zspage, list);\n" + " \t\tif (zspage)\n" + "@@ -1289,6 +1433,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,\n" + " \tobj = handle_to_obj(handle);\n" + " \tobj_to_location(obj, &page, &obj_idx);\n" + " \tzspage = get_zspage(page);\n" + "+\n" + "+\t/* migration cannot move any subpage in this zspage */\n" + "+\tmigrate_read_lock(zspage);\n" + "+\n" + " \tget_zspage_mapping(zspage, &class_idx, &fg);\n" + " \tclass = pool->size_class[class_idx];\n" + " \toff = (class->size * obj_idx) & ~PAGE_MASK;\n" + "@@ -1309,7 +1457,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,\n" + " \n" + " \tret = __zs_map_object(area, pages, off, class->size);\n" + " out:\n" + "-\tif (!class->huge)\n" + "+\tif (likely(!PageHugeObject(page)))\n" + " \t\tret += ZS_HANDLE_SIZE;\n" + " \n" + " \treturn ret;\n" + "@@ -1348,6 +1496,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)\n" + " \t\t__zs_unmap_object(area, pages, off, class->size);\n" + " \t}\n" + " \tput_cpu_var(zs_map_area);\n" + "+\n" + "+\tmigrate_read_unlock(zspage);\n" + " \tunpin_tag(handle);\n" + " }\n" + " 
EXPORT_SYMBOL_GPL(zs_unmap_object);\n" + "@@ -1377,7 +1527,7 @@ static unsigned long obj_malloc(struct size_class *class,\n" + " \tvaddr = kmap_atomic(m_page);\n" + " \tlink = (struct link_free *)vaddr + m_offset / sizeof(*link);\n" + " \tset_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG);\n" + "-\tif (!class->huge)\n" + "+\tif (likely(!PageHugeObject(m_page)))\n" + " \t\t/* record handle in the header of allocated chunk */\n" + " \t\tlink->handle = handle;\n" + " \telse\n" + "@@ -1407,6 +1557,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)\n" + " {\n" + " \tunsigned long handle, obj;\n" + " \tstruct size_class *class;\n" + "+\tenum fullness_group newfg;\n" + " \tstruct zspage *zspage;\n" + " \n" + " \tif (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))\n" + "@@ -1422,28 +1573,37 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)\n" + " \n" + " \tspin_lock(&class->lock);\n" + " \tzspage = find_get_zspage(class);\n" + "-\n" + "-\tif (!zspage) {\n" + "+\tif (likely(zspage)) {\n" + "+\t\tobj = obj_malloc(class, zspage, handle);\n" + "+\t\t/* Now move the zspage to another fullness group, if required */\n" + "+\t\tfix_fullness_group(class, zspage);\n" + "+\t\trecord_obj(handle, obj);\n" + " \t\tspin_unlock(&class->lock);\n" + "-\t\tzspage = alloc_zspage(pool, class, gfp);\n" + "-\t\tif (unlikely(!zspage)) {\n" + "-\t\t\tcache_free_handle(pool, handle);\n" + "-\t\t\treturn 0;\n" + "-\t\t}\n" + " \n" + "-\t\tset_zspage_mapping(zspage, class->index, ZS_EMPTY);\n" + "-\t\tatomic_long_add(class->pages_per_zspage,\n" + "-\t\t\t\t\t&pool->pages_allocated);\n" + "+\t\treturn handle;\n" + "+\t}\n" + " \n" + "-\t\tspin_lock(&class->lock);\n" + "-\t\tzs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(\n" + "-\t\t\t\tclass->size, class->pages_per_zspage));\n" + "+\tspin_unlock(&class->lock);\n" + "+\n" + "+\tzspage = alloc_zspage(pool, class, gfp);\n" + "+\tif (!zspage) {\n" + "+\t\tcache_free_handle(pool, handle);\n" + 
"+\t\treturn 0;\n" + " \t}\n" + " \n" + "+\tspin_lock(&class->lock);\n" + " \tobj = obj_malloc(class, zspage, handle);\n" + "-\t/* Now move the zspage to another fullness group, if required */\n" + "-\tfix_fullness_group(class, zspage);\n" + "+\tnewfg = get_fullness_group(class, zspage);\n" + "+\tinsert_zspage(class, zspage, newfg);\n" + "+\tset_zspage_mapping(zspage, class->index, newfg);\n" + " \trecord_obj(handle, obj);\n" + "+\tatomic_long_add(class->pages_per_zspage,\n" + "+\t\t\t\t&pool->pages_allocated);\n" + "+\tzs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(\n" + "+\t\t\tclass->size, class->pages_per_zspage));\n" + "+\n" + "+\t/* We completely set up zspage so mark them as movable */\n" + "+\tSetZsPageMovable(pool, zspage);\n" + " \tspin_unlock(&class->lock);\n" + " \n" + " \treturn handle;\n" + "@@ -1484,6 +1644,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)\n" + " \tint class_idx;\n" + " \tstruct size_class *class;\n" + " \tenum fullness_group fullness;\n" + "+\tbool isolated;\n" + " \n" + " \tif (unlikely(!handle))\n" + " \t\treturn;\n" + "@@ -1493,22 +1654,28 @@ void zs_free(struct zs_pool *pool, unsigned long handle)\n" + " \tobj_to_location(obj, &f_page, &f_objidx);\n" + " \tzspage = get_zspage(f_page);\n" + " \n" + "+\tmigrate_read_lock(zspage);\n" + "+\n" + " \tget_zspage_mapping(zspage, &class_idx, &fullness);\n" + " \tclass = pool->size_class[class_idx];\n" + " \n" + " \tspin_lock(&class->lock);\n" + " \tobj_free(class, obj);\n" + " \tfullness = fix_fullness_group(class, zspage);\n" + "-\tif (fullness == ZS_EMPTY) {\n" + "-\t\tzs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(\n" + "-\t\t\t\tclass->size, class->pages_per_zspage));\n" + "-\t\tatomic_long_sub(class->pages_per_zspage,\n" + "-\t\t\t\t&pool->pages_allocated);\n" + "-\t\tfree_zspage(pool, zspage);\n" + "+\tif (fullness != ZS_EMPTY) {\n" + "+\t\tmigrate_read_unlock(zspage);\n" + "+\t\tgoto out;\n" + " \t}\n" + "+\n" + "+\tisolated = 
is_zspage_isolated(zspage);\n" + "+\tmigrate_read_unlock(zspage);\n" + "+\t/* If zspage is isolated, zs_page_putback will free the zspage */\n" + "+\tif (likely(!isolated))\n" + "+\t\tfree_zspage(pool, class, zspage);\n" + "+out:\n" + "+\n" + " \tspin_unlock(&class->lock);\n" + " \tunpin_tag(handle);\n" + "-\n" + " \tcache_free_handle(pool, handle);\n" + " }\n" + " EXPORT_SYMBOL_GPL(zs_free);\n" + "@@ -1587,12 +1754,13 @@ static unsigned long find_alloced_obj(struct size_class *class,\n" + " \tint offset = 0;\n" + " \tunsigned long handle = 0;\n" + " \tvoid *addr = kmap_atomic(page);\n" + "+\tstruct zspage *zspage = get_zspage(page);\n" + " \n" + "-\toffset = get_first_obj_offset(page);\n" + "+\toffset = get_first_obj_offset(class, get_first_page(zspage), page);\n" + " \toffset += class->size * index;\n" + " \n" + " \twhile (offset < PAGE_SIZE) {\n" + "-\t\thead = obj_to_head(class, page, addr + offset);\n" + "+\t\thead = obj_to_head(page, addr + offset);\n" + " \t\tif (head & OBJ_ALLOCATED_TAG) {\n" + " \t\t\thandle = head & ~OBJ_ALLOCATED_TAG;\n" + " \t\t\tif (trypin_tag(handle))\n" + "@@ -1684,6 +1852,7 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)\n" + " \t\tzspage = list_first_entry_or_null(&class->fullness_list[fg[i]],\n" + " \t\t\t\t\t\t\tstruct zspage, list);\n" + " \t\tif (zspage) {\n" + "+\t\t\tVM_BUG_ON(is_zspage_isolated(zspage));\n" + " \t\t\tremove_zspage(class, zspage, fg[i]);\n" + " \t\t\treturn zspage;\n" + " \t\t}\n" + "@@ -1704,6 +1873,8 @@ static enum fullness_group putback_zspage(struct size_class *class,\n" + " {\n" + " \tenum fullness_group fullness;\n" + " \n" + "+\tVM_BUG_ON(is_zspage_isolated(zspage));\n" + "+\n" + " \tfullness = get_fullness_group(class, zspage);\n" + " \tinsert_zspage(class, zspage, fullness);\n" + " \tset_zspage_mapping(zspage, class->index, fullness);\n" + "@@ -1711,6 +1882,377 @@ static enum fullness_group putback_zspage(struct size_class *class,\n" + " \treturn fullness;\n" + " }\n" + 
" \n" + "+#ifdef CONFIG_COMPACTION\n" + "+static struct dentry *zs_mount(struct file_system_type *fs_type,\n" + "+\t\t\t\tint flags, const char *dev_name, void *data)\n" + "+{\n" + "+\tstatic const struct dentry_operations ops = {\n" + "+\t\t.d_dname = simple_dname,\n" + "+\t};\n" + "+\n" + "+\treturn mount_pseudo(fs_type, \"zsmalloc:\", NULL, &ops, ZSMALLOC_MAGIC);\n" + "+}\n" + "+\n" + "+static struct file_system_type zsmalloc_fs = {\n" + "+\t.name\t\t= \"zsmalloc\",\n" + "+\t.mount\t\t= zs_mount,\n" + "+\t.kill_sb\t= kill_anon_super,\n" + "+};\n" + "+\n" + "+static int zsmalloc_mount(void)\n" + "+{\n" + "+\tint ret = 0;\n" + "+\n" + "+\tzsmalloc_mnt = kern_mount(&zsmalloc_fs);\n" + "+\tif (IS_ERR(zsmalloc_mnt))\n" + "+\t\tret = PTR_ERR(zsmalloc_mnt);\n" + "+\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+static void zsmalloc_unmount(void)\n" + "+{\n" + "+\tkern_unmount(zsmalloc_mnt);\n" + "+}\n" + "+\n" + "+static void migrate_lock_init(struct zspage *zspage)\n" + "+{\n" + "+\trwlock_init(&zspage->lock);\n" + "+}\n" + "+\n" + "+static void migrate_read_lock(struct zspage *zspage)\n" + "+{\n" + "+\tread_lock(&zspage->lock);\n" + "+}\n" + "+\n" + "+static void migrate_read_unlock(struct zspage *zspage)\n" + "+{\n" + "+\tread_unlock(&zspage->lock);\n" + "+}\n" + "+\n" + "+static void migrate_write_lock(struct zspage *zspage)\n" + "+{\n" + "+\twrite_lock(&zspage->lock);\n" + "+}\n" + "+\n" + "+static void migrate_write_unlock(struct zspage *zspage)\n" + "+{\n" + "+\twrite_unlock(&zspage->lock);\n" + "+}\n" + "+\n" + "+/* Number of isolated subpage for *page migration* in this zspage */\n" + "+static void inc_zspage_isolation(struct zspage *zspage)\n" + "+{\n" + "+\tzspage->isolated++;\n" + "+}\n" + "+\n" + "+static void dec_zspage_isolation(struct zspage *zspage)\n" + "+{\n" + "+\tzspage->isolated--;\n" + "+}\n" + "+\n" + "+static void replace_sub_page(struct size_class *class, struct zspage *zspage,\n" + "+\t\t\t\tstruct page *newpage, struct page *oldpage)\n" + 
"+{\n" + "+\tstruct page *page;\n" + "+\tstruct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };\n" + "+\tint idx = 0;\n" + "+\n" + "+\tpage = get_first_page(zspage);\n" + "+\tdo {\n" + "+\t\tif (page == oldpage)\n" + "+\t\t\tpages[idx] = newpage;\n" + "+\t\telse\n" + "+\t\t\tpages[idx] = page;\n" + "+\t\tidx++;\n" + "+\t} while ((page = get_next_page(page)) != NULL);\n" + "+\n" + "+\tcreate_page_chain(class, zspage, pages);\n" + "+\tif (unlikely(PageHugeObject(oldpage)))\n" + "+\t\tnewpage->index = oldpage->index;\n" + "+\t__SetPageMovable(newpage, page_mapping(oldpage));\n" + "+}\n" + "+\n" + "+bool zs_page_isolate(struct page *page, isolate_mode_t mode)\n" + "+{\n" + "+\tstruct zs_pool *pool;\n" + "+\tstruct size_class *class;\n" + "+\tint class_idx;\n" + "+\tenum fullness_group fullness;\n" + "+\tstruct zspage *zspage;\n" + "+\tstruct address_space *mapping;\n" + "+\n" + "+\t/*\n" + "+\t * Page is locked so zspage couldn't be destroyed. For detail, look at\n" + "+\t * lock_zspage in free_zspage.\n" + "+\t */\n" + "+\tVM_BUG_ON_PAGE(!PageMovable(page), page);\n" + "+\tVM_BUG_ON_PAGE(PageIsolated(page), page);\n" + "+\n" + "+\tzspage = get_zspage(page);\n" + "+\n" + "+\t/*\n" + "+\t * Without class lock, fullness could be stale while class_idx is okay\n" + "+\t * because class_idx is constant unless page is freed so we should get\n" + "+\t * fullness again under class lock.\n" + "+\t */\n" + "+\tget_zspage_mapping(zspage, &class_idx, &fullness);\n" + "+\tmapping = page_mapping(page);\n" + "+\tpool = mapping->private_data;\n" + "+\tclass = pool->size_class[class_idx];\n" + "+\n" + "+\tspin_lock(&class->lock);\n" + "+\tif (get_zspage_inuse(zspage) == 0) {\n" + "+\t\tspin_unlock(&class->lock);\n" + "+\t\treturn false;\n" + "+\t}\n" + "+\n" + "+\t/* zspage is isolated for object migration */\n" + "+\tif (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {\n" + "+\t\tspin_unlock(&class->lock);\n" + "+\t\treturn false;\n" + "+\t}\n" + "+\n" + "+\t/*\n" + "+\t * 
If this is first time isolation for the zspage, isolate zspage from\n" + "+\t * size_class to prevent further object allocation from the zspage.\n" + "+\t */\n" + "+\tif (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {\n" + "+\t\tget_zspage_mapping(zspage, &class_idx, &fullness);\n" + "+\t\tremove_zspage(class, zspage, fullness);\n" + "+\t}\n" + "+\n" + "+\tinc_zspage_isolation(zspage);\n" + "+\tspin_unlock(&class->lock);\n" + "+\n" + "+\treturn true;\n" + "+}\n" + "+\n" + "+int zs_page_migrate(struct address_space *mapping, struct page *newpage,\n" + "+\t\tstruct page *page, enum migrate_mode mode)\n" + "+{\n" + "+\tstruct zs_pool *pool;\n" + "+\tstruct size_class *class;\n" + "+\tint class_idx;\n" + "+\tenum fullness_group fullness;\n" + "+\tstruct zspage *zspage;\n" + "+\tstruct page *dummy;\n" + "+\tvoid *s_addr, *d_addr, *addr;\n" + "+\tint offset, pos;\n" + "+\tunsigned long handle, head;\n" + "+\tunsigned long old_obj, new_obj;\n" + "+\tunsigned int obj_idx;\n" + "+\tint ret = -EAGAIN;\n" + "+\n" + "+\tVM_BUG_ON_PAGE(!PageMovable(page), page);\n" + "+\tVM_BUG_ON_PAGE(!PageIsolated(page), page);\n" + "+\n" + "+\tzspage = get_zspage(page);\n" + "+\n" + "+\t/* Concurrent compactor cannot migrate any subpage in zspage */\n" + "+\tmigrate_write_lock(zspage);\n" + "+\tget_zspage_mapping(zspage, &class_idx, &fullness);\n" + "+\tpool = mapping->private_data;\n" + "+\tclass = pool->size_class[class_idx];\n" + "+\toffset = get_first_obj_offset(class, get_first_page(zspage), page);\n" + "+\n" + "+\tspin_lock(&class->lock);\n" + "+\tif (!get_zspage_inuse(zspage)) {\n" + "+\t\tret = -EBUSY;\n" + "+\t\tgoto unlock_class;\n" + "+\t}\n" + "+\n" + "+\tpos = offset;\n" + "+\ts_addr = kmap_atomic(page);\n" + "+\twhile (pos < PAGE_SIZE) {\n" + "+\t\thead = obj_to_head(page, s_addr + pos);\n" + "+\t\tif (head & OBJ_ALLOCATED_TAG) {\n" + "+\t\t\thandle = head & ~OBJ_ALLOCATED_TAG;\n" + "+\t\t\tif (!trypin_tag(handle))\n" + "+\t\t\t\tgoto unpin_objects;\n" + 
"+\t\t}\n" + "+\t\tpos += class->size;\n" + "+\t}\n" + "+\n" + "+\t/*\n" + "+\t * Here, any user cannot access all objects in the zspage so let's move.\n" + "+\t */\n" + "+\td_addr = kmap_atomic(newpage);\n" + "+\tmemcpy(d_addr, s_addr, PAGE_SIZE);\n" + "+\tkunmap_atomic(d_addr);\n" + "+\n" + "+\tfor (addr = s_addr + offset; addr < s_addr + pos;\n" + "+\t\t\t\t\taddr += class->size) {\n" + "+\t\thead = obj_to_head(page, addr);\n" + "+\t\tif (head & OBJ_ALLOCATED_TAG) {\n" + "+\t\t\thandle = head & ~OBJ_ALLOCATED_TAG;\n" + "+\t\t\tif (!testpin_tag(handle))\n" + "+\t\t\t\tBUG();\n" + "+\n" + "+\t\t\told_obj = handle_to_obj(handle);\n" + "+\t\t\tobj_to_location(old_obj, &dummy, &obj_idx);\n" + "+\t\t\tnew_obj = (unsigned long)location_to_obj(newpage,\n" + "+\t\t\t\t\t\t\t\tobj_idx);\n" + "+\t\t\tnew_obj |= BIT(HANDLE_PIN_BIT);\n" + "+\t\t\trecord_obj(handle, new_obj);\n" + "+\t\t}\n" + "+\t}\n" + "+\n" + "+\treplace_sub_page(class, zspage, newpage, page);\n" + "+\tget_page(newpage);\n" + "+\n" + "+\tdec_zspage_isolation(zspage);\n" + "+\n" + "+\t/*\n" + "+\t * Page migration is done so let's putback isolated zspage to\n" + "+\t * the list if @page is final isolated subpage in the zspage.\n" + "+\t */\n" + "+\tif (!is_zspage_isolated(zspage))\n" + "+\t\tputback_zspage(class, zspage);\n" + "+\n" + "+\treset_page(page);\n" + "+\tput_page(page);\n" + "+\tpage = newpage;\n" + "+\n" + "+\tret = 0;\n" + "+unpin_objects:\n" + "+\tfor (addr = s_addr + offset; addr < s_addr + pos;\n" + "+\t\t\t\t\t\taddr += class->size) {\n" + "+\t\thead = obj_to_head(page, addr);\n" + "+\t\tif (head & OBJ_ALLOCATED_TAG) {\n" + "+\t\t\thandle = head & ~OBJ_ALLOCATED_TAG;\n" + "+\t\t\tif (!testpin_tag(handle))\n" + "+\t\t\t\tBUG();\n" + "+\t\t\tunpin_tag(handle);\n" + "+\t\t}\n" + "+\t}\n" + "+\tkunmap_atomic(s_addr);\n" + "+unlock_class:\n" + "+\tspin_unlock(&class->lock);\n" + "+\tmigrate_write_unlock(zspage);\n" + "+\n" + "+\treturn ret;\n" + "+}\n" + "+\n" + "+void zs_page_putback(struct 
page *page)\n" + "+{\n" + "+\tstruct zs_pool *pool;\n" + "+\tstruct size_class *class;\n" + "+\tint class_idx;\n" + "+\tenum fullness_group fg;\n" + "+\tstruct address_space *mapping;\n" + "+\tstruct zspage *zspage;\n" + "+\n" + "+\tVM_BUG_ON_PAGE(!PageMovable(page), page);\n" + "+\tVM_BUG_ON_PAGE(!PageIsolated(page), page);\n" + "+\n" + "+\tzspage = get_zspage(page);\n" + "+\tget_zspage_mapping(zspage, &class_idx, &fg);\n" + "+\tmapping = page_mapping(page);\n" + "+\tpool = mapping->private_data;\n" + "+\tclass = pool->size_class[class_idx];\n" + "+\n" + "+\tspin_lock(&class->lock);\n" + "+\tdec_zspage_isolation(zspage);\n" + "+\tif (!is_zspage_isolated(zspage)) {\n" + "+\t\tfg = putback_zspage(class, zspage);\n" + "+\t\t/*\n" + "+\t\t * Due to page_lock, we cannot free zspage immediately\n" + "+\t\t * so let's defer.\n" + "+\t\t */\n" + "+\t\tif (fg == ZS_EMPTY)\n" + "+\t\t\tschedule_work(&pool->free_work);\n" + "+\t}\n" + "+\tspin_unlock(&class->lock);\n" + "+}\n" + "+\n" + "+const struct address_space_operations zsmalloc_aops = {\n" + "+\t.isolate_page = zs_page_isolate,\n" + "+\t.migratepage = zs_page_migrate,\n" + "+\t.putback_page = zs_page_putback,\n" + "+};\n" + "+\n" + "+static int zs_register_migration(struct zs_pool *pool)\n" + "+{\n" + "+\tpool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);\n" + "+\tif (IS_ERR(pool->inode)) {\n" + "+\t\tpool->inode = NULL;\n" + "+\t\treturn 1;\n" + "+\t}\n" + "+\n" + "+\tpool->inode->i_mapping->private_data = pool;\n" + "+\tpool->inode->i_mapping->a_ops = &zsmalloc_aops;\n" + "+\treturn 0;\n" + "+}\n" + "+\n" + "+static void zs_unregister_migration(struct zs_pool *pool)\n" + "+{\n" + "+\tflush_work(&pool->free_work);\n" + "+\tif (pool->inode)\n" + "+\t\tiput(pool->inode);\n" + "+}\n" + "+\n" + "+/*\n" + "+ * Caller should hold page_lock of all pages in the zspage\n" + "+ * In here, we cannot use zspage meta data.\n" + "+ */\n" + "+static void async_free_zspage(struct work_struct *work)\n" + "+{\n" + "+\tint i;\n" + 
"+\tstruct size_class *class;\n" + "+\tunsigned int class_idx;\n" + "+\tenum fullness_group fullness;\n" + "+\tstruct zspage *zspage, *tmp;\n" + "+\tLIST_HEAD(free_pages);\n" + "+\tstruct zs_pool *pool = container_of(work, struct zs_pool,\n" + "+\t\t\t\t\tfree_work);\n" + "+\n" + "+\tfor (i = 0; i < zs_size_classes; i++) {\n" + "+\t\tclass = pool->size_class[i];\n" + "+\t\tif (class->index != i)\n" + "+\t\t\tcontinue;\n" + "+\n" + "+\t\tspin_lock(&class->lock);\n" + "+\t\tlist_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);\n" + "+\t\tspin_unlock(&class->lock);\n" + "+\t}\n" + "+\n" + "+\n" + "+\tlist_for_each_entry_safe(zspage, tmp, &free_pages, list) {\n" + "+\t\tlist_del(&zspage->list);\n" + "+\t\tlock_zspage(zspage);\n" + "+\n" + "+\t\tget_zspage_mapping(zspage, &class_idx, &fullness);\n" + "+\t\tVM_BUG_ON(fullness != ZS_EMPTY);\n" + "+\t\tclass = pool->size_class[class_idx];\n" + "+\t\tspin_lock(&class->lock);\n" + "+\t\t__free_zspage(pool, pool->size_class[class_idx], zspage);\n" + "+\t\tspin_unlock(&class->lock);\n" + "+\t}\n" + "+};\n" + "+\n" + "+static void kick_deferred_free(struct zs_pool *pool)\n" + "+{\n" + "+\tschedule_work(&pool->free_work);\n" + "+}\n" + "+\n" + "+static void init_deferred_free(struct zs_pool *pool)\n" + "+{\n" + "+\tINIT_WORK(&pool->free_work, async_free_zspage);\n" + "+}\n" + "+\n" + "+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)\n" + "+{\n" + "+\tstruct page *page = get_first_page(zspage);\n" + "+\n" + "+\tdo {\n" + "+\t\tWARN_ON(!trylock_page(page));\n" + "+\t\t__SetPageMovable(page, pool->inode->i_mapping);\n" + "+\t\tunlock_page(page);\n" + "+\t} while ((page = get_next_page(page)) != NULL);\n" + "+}\n" + "+#endif\n" + "+\n" + " /*\n" + " *\n" + " * Based on the number of unused allocated objects calculate\n" + "@@ -1745,10 +2287,10 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)\n" + " \t\t\tbreak;\n" + " \n" + " \t\tcc.index = 0;\n" + "-\t\tcc.s_page = 
src_zspage->first_page;\n" + "+\t\tcc.s_page = get_first_page(src_zspage);\n" + " \n" + " \t\twhile ((dst_zspage = isolate_zspage(class, false))) {\n" + "-\t\t\tcc.d_page = dst_zspage->first_page;\n" + "+\t\t\tcc.d_page = get_first_page(dst_zspage);\n" + " \t\t\t/*\n" + " \t\t\t * If there is no more space in dst_page, resched\n" + " \t\t\t * and see if anyone had allocated another zspage.\n" + "@@ -1765,11 +2307,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)\n" + " \n" + " \t\tputback_zspage(class, dst_zspage);\n" + " \t\tif (putback_zspage(class, src_zspage) == ZS_EMPTY) {\n" + "-\t\t\tzs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(\n" + "-\t\t\t\t\tclass->size, class->pages_per_zspage));\n" + "-\t\t\tatomic_long_sub(class->pages_per_zspage,\n" + "-\t\t\t\t\t&pool->pages_allocated);\n" + "-\t\t\tfree_zspage(pool, src_zspage);\n" + "+\t\t\tfree_zspage(pool, class, src_zspage);\n" + " \t\t\tpool->stats.pages_compacted += class->pages_per_zspage;\n" + " \t\t}\n" + " \t\tspin_unlock(&class->lock);\n" + "@@ -1885,6 +2423,7 @@ struct zs_pool *zs_create_pool(const char *name)\n" + " \tif (!pool)\n" + " \t\treturn NULL;\n" + " \n" + "+\tinit_deferred_free(pool);\n" + " \tpool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),\n" + " \t\t\tGFP_KERNEL);\n" + " \tif (!pool->size_class) {\n" + "@@ -1939,12 +2478,10 @@ struct zs_pool *zs_create_pool(const char *name)\n" + " \t\tclass->pages_per_zspage = pages_per_zspage;\n" + " \t\tclass->objs_per_zspage = class->pages_per_zspage *\n" + " \t\t\t\t\t\tPAGE_SIZE / class->size;\n" + "-\t\tif (pages_per_zspage == 1 && class->objs_per_zspage == 1)\n" + "-\t\t\tclass->huge = true;\n" + " \t\tspin_lock_init(&class->lock);\n" + " \t\tpool->size_class[i] = class;\n" + "-\t\tfor (fullness = ZS_ALMOST_FULL; fullness <= ZS_ALMOST_EMPTY;\n" + "-\t\t\t\t\t\t\t\tfullness++)\n" + "+\t\tfor (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;\n" + "+\t\t\t\t\t\t\tfullness++)\n" + " 
\t\t\tINIT_LIST_HEAD(&class->fullness_list[fullness]);\n" + " \n" + " \t\tprev_class = class;\n" + "@@ -1953,6 +2490,9 @@ struct zs_pool *zs_create_pool(const char *name)\n" + " \t/* debug only, don't abort if it fails */\n" + " \tzs_pool_stat_create(pool, name);\n" + " \n" + "+\tif (zs_register_migration(pool))\n" + "+\t\tgoto err;\n" + "+\n" + " \t/*\n" + " \t * Not critical, we still can use the pool\n" + " \t * and user can trigger compaction manually.\n" + "@@ -1972,6 +2512,7 @@ void zs_destroy_pool(struct zs_pool *pool)\n" + " \tint i;\n" + " \n" + " \tzs_unregister_shrinker(pool);\n" + "+\tzs_unregister_migration(pool);\n" + " \tzs_pool_stat_destroy(pool);\n" + " \n" + " \tfor (i = 0; i < zs_size_classes; i++) {\n" + "@@ -1984,7 +2525,7 @@ void zs_destroy_pool(struct zs_pool *pool)\n" + " \t\tif (class->index != i)\n" + " \t\t\tcontinue;\n" + " \n" + "-\t\tfor (fg = ZS_ALMOST_FULL; fg <= ZS_ALMOST_EMPTY; fg++) {\n" + "+\t\tfor (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {\n" + " \t\t\tif (!list_empty(&class->fullness_list[fg])) {\n" + " \t\t\t\tpr_info(\"Freeing non-empty class with size %db, fullness group %d\\n\",\n" + " \t\t\t\t\tclass->size, fg);\n" + "@@ -2002,7 +2543,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);\n" + " \n" + " static int __init zs_init(void)\n" + " {\n" + "-\tint ret = zs_register_cpu_notifier();\n" + "+\tint ret;\n" + "+\n" + "+\tret = zsmalloc_mount();\n" + "+\tif (ret)\n" + "+\t\tgoto out;\n" + "+\n" + "+\tret = zs_register_cpu_notifier();\n" + " \n" + " \tif (ret)\n" + " \t\tgoto notifier_fail;\n" + "@@ -2019,7 +2566,8 @@ static int __init zs_init(void)\n" + " \n" + " notifier_fail:\n" + " \tzs_unregister_cpu_notifier();\n" + "-\n" + "+\tzsmalloc_unmount();\n" + "+out:\n" + " \treturn ret;\n" + " }\n" + " \n" + "@@ -2028,6 +2576,7 @@ static void __exit zs_exit(void)\n" + " #ifdef CONFIG_ZPOOL\n" + " \tzpool_unregister_driver(&zs_zpool_driver);\n" + " #endif\n" + "+\tzsmalloc_unmount();\n" + " \tzs_unregister_cpu_notifier();\n" + " \n" + 
" \tzs_stat_exit();\n" + "-- \n" + 1.9.1 -a1c2acab743285edd666efc0d67fbd835126a7da73632a7c5eac03d9c209f0dc +6e6939e85ab37f8514cf4fca36b16778ab9643b1aada135fc6cbf491c610aaa9
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.