Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* [PATCH v3 06/16] zsmalloc: squeeze inuse into page->mapping
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

Currently, we store class:fullness into page->mapping.
The number of class we can support is 255 and fullness is 4 so
(8 + 2 = 10bit) is enough to represent them.
Meanwhile, the bits we need to store in-use objects in zspage
is that 11bit is enough.

For example, If we assume that 64K PAGE_SIZE, class_size 32
which is worst case, class->pages_per_zspage become 1 so
the number of objects in zspage is 2048 so 11bit is enough.
The next class is 32 + 256(i.e., ZS_SIZE_CLASS_DELTA).
With worst case that ZS_MAX_PAGES_PER_ZSPAGE, 64K * 4 /
(32 + 256) = 910 so 11bit is still enough.

So, we could squeeze inuse object count to page->mapping.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 103 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 32 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 8649d0243e6c..4dd72a803568 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -34,8 +34,7 @@
  *		metadata.
  *	page->lru: links together first pages of various zspages.
  *		Basically forming list of zspages in a fullness group.
- *	page->mapping: class index and fullness group of the zspage
- *	page->inuse: the number of objects that are used in this zspage
+ *	page->mapping: override by struct zs_meta
  *
  * Usage of struct page flags:
  *	PG_private: identifies the first component page
@@ -132,6 +131,13 @@
 /* each chunk includes extra space to keep handle */
 #define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
 
+#define CLASS_BITS	8
+#define CLASS_MASK	((1 << CLASS_BITS) - 1)
+#define FULLNESS_BITS	2
+#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)
+#define INUSE_BITS	11
+#define INUSE_MASK	((1 << INUSE_BITS) - 1)
+
 /*
  * On systems with 4K page size, this gives 255 size classes! There is a
  * trader-off here:
@@ -145,7 +151,7 @@
  *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
  *  (reason above)
  */
-#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> CLASS_BITS)
 
 /*
  * We do not maintain any list for completely empty or full pages
@@ -155,7 +161,7 @@ enum fullness_group {
 	ZS_ALMOST_EMPTY,
 	_ZS_NR_FULLNESS_GROUPS,
 
-	ZS_EMPTY,
+	ZS_EMPTY = _ZS_NR_FULLNESS_GROUPS,
 	ZS_FULL
 };
 
@@ -263,14 +269,11 @@ struct zs_pool {
 #endif
 };
 
-/*
- * A zspage's class index and fullness group
- * are encoded in its (first)page->mapping
- */
-#define CLASS_IDX_BITS	28
-#define FULLNESS_BITS	4
-#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
-#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)
+struct zs_meta {
+	unsigned long class:CLASS_BITS;
+	unsigned long fullness:FULLNESS_BITS;
+	unsigned long inuse:INUSE_BITS;
+};
 
 struct mapping_area {
 #ifdef CONFIG_PGTABLE_MAPPING
@@ -412,28 +415,61 @@ static int is_last_page(struct page *page)
 	return PagePrivate2(page);
 }
 
+static int get_zspage_inuse(struct page *first_page)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	m = (struct zs_meta *)&first_page->mapping;
+
+	return m->inuse;
+}
+
+static void set_zspage_inuse(struct page *first_page, int val)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	m = (struct zs_meta *)&first_page->mapping;
+	m->inuse = val;
+}
+
+static void mod_zspage_inuse(struct page *first_page, int val)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	m = (struct zs_meta *)&first_page->mapping;
+	m->inuse += val;
+}
+
 static void get_zspage_mapping(struct page *first_page,
 				unsigned int *class_idx,
 				enum fullness_group *fullness)
 {
-	unsigned long m;
+	struct zs_meta *m;
+
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (unsigned long)first_page->mapping;
-	*fullness = m & FULLNESS_MASK;
-	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+	m = (struct zs_meta *)&first_page->mapping;
+	*fullness = m->fullness;
+	*class_idx = m->class;
 }
 
 static void set_zspage_mapping(struct page *first_page,
 				unsigned int class_idx,
 				enum fullness_group fullness)
 {
-	unsigned long m;
+	struct zs_meta *m;
+
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
-			(fullness & FULLNESS_MASK);
-	first_page->mapping = (struct address_space *)m;
+	m = (struct zs_meta *)&first_page->mapping;
+	m->fullness = fullness;
+	m->class = class_idx;
 }
 
 /*
@@ -632,9 +668,7 @@ static enum fullness_group get_fullness_group(struct size_class *class,
 	int inuse, objs_per_zspage;
 	enum fullness_group fg;
 
-	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-	inuse = first_page->inuse;
+	inuse = get_zspage_inuse(first_page);
 	objs_per_zspage = class->objs_per_zspage;
 
 	if (inuse == 0)
@@ -677,10 +711,10 @@ static void insert_zspage(struct size_class *class,
 
 	/*
 	 * We want to see more ZS_FULL pages and less almost
-	 * empty/full. Put pages with higher ->inuse first.
+	 * empty/full. Put pages with higher inuse first.
 	 */
 	list_add_tail(&first_page->lru, &(*head)->lru);
-	if (first_page->inuse >= (*head)->inuse)
+	if (get_zspage_inuse(first_page) >= get_zspage_inuse(*head))
 		*head = first_page;
 }
 
@@ -896,7 +930,7 @@ static void free_zspage(struct page *first_page)
 	struct page *nextp, *tmp, *head_extra;
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-	VM_BUG_ON_PAGE(first_page->inuse, first_page);
+	VM_BUG_ON_PAGE(get_zspage_inuse(first_page), first_page);
 
 	head_extra = (struct page *)page_private(first_page);
 
@@ -992,7 +1026,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 			SetPagePrivate(page);
 			set_page_private(page, 0);
 			first_page = page;
-			first_page->inuse = 0;
+			set_zspage_inuse(page, 0);
 		}
 		if (i == 1)
 			set_page_private(first_page, (unsigned long)page);
@@ -1237,9 +1271,7 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 
 static bool zspage_full(struct size_class *class, struct page *first_page)
 {
-	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-	return first_page->inuse == class->objs_per_zspage;
+	return get_zspage_inuse(first_page) == class->objs_per_zspage;
 }
 
 unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1372,7 +1404,7 @@ static unsigned long obj_malloc(struct size_class *class,
 		/* record handle in first_page->private */
 		set_page_private(first_page, handle);
 	kunmap_atomic(vaddr);
-	first_page->inuse++;
+	mod_zspage_inuse(first_page, 1);
 	zs_stat_inc(class, OBJ_USED, 1);
 
 	return obj;
@@ -1457,7 +1489,7 @@ static void obj_free(struct size_class *class, unsigned long obj)
 		set_page_private(first_page, 0);
 	kunmap_atomic(vaddr);
 	first_page->freelist = (void *)obj;
-	first_page->inuse--;
+	mod_zspage_inuse(first_page, -1);
 	zs_stat_dec(class, OBJ_USED, 1);
 }
 
@@ -2002,6 +2034,13 @@ static int __init zs_init(void)
 	if (ret)
 		goto notifier_fail;
 
+	/*
+	 * A zspage's class index, fullness group, inuse object count are
+	 * encoded in its (first)page->mapping so sizeof(struct zs_meta)
+	 * should be less than sizeof(page->mapping(i.e., unsigned long)).
+	 */
+	BUILD_BUG_ON(sizeof(struct zs_meta) > sizeof(unsigned long));
+
 	init_zs_size_classes();
 
 #ifdef CONFIG_ZPOOL
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 07/16] zsmalloc: remove page_mapcount_reset
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

We don't use page->_mapcount any more so no need to reset.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4dd72a803568..0f6cce9b9119 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -922,7 +922,6 @@ static void reset_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	page->freelist = NULL;
-	page_mapcount_reset(page);
 }
 
 static void free_zspage(struct page *first_page)
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 08/16] zsmalloc: squeeze freelist into page->mapping
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

Zsmalloc stores first free object's position into first_page->freelist
in each zspage. If we change it with object index from first_page
instead of location, we could squeeze it into page->mapping because
the number of bit we need to store offset is at most 11bit.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 158 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 96 insertions(+), 62 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0f6cce9b9119..807998462539 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -18,9 +18,7 @@
  * Usage of struct page fields:
  *	page->private: points to the first component (0-order) page
  *	page->index (union with page->freelist): offset of the first object
- *		starting in this page. For the first page, this is
- *		always 0, so we use this field (aka freelist) to point
- *		to the first free object in zspage.
+ *		starting in this page.
  *	page->lru: links together all component pages (except the first page)
  *		of a zspage
  *
@@ -29,9 +27,6 @@
  *	page->private: refers to the component page after the first page
  *		If the page is first_page for huge object, it stores handle.
  *		Look at size_class->huge.
- *	page->freelist: points to the first free object in zspage.
- *		Free objects are linked together using in-place
- *		metadata.
  *	page->lru: links together first pages of various zspages.
  *		Basically forming list of zspages in a fullness group.
  *	page->mapping: override by struct zs_meta
@@ -131,6 +126,7 @@
 /* each chunk includes extra space to keep handle */
 #define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
 
+#define FREEOBJ_BITS 11
 #define CLASS_BITS	8
 #define CLASS_MASK	((1 << CLASS_BITS) - 1)
 #define FULLNESS_BITS	2
@@ -228,17 +224,17 @@ struct size_class {
 
 /*
  * Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
+ * For every zspage, first_page->freeobj gives head of this list.
  *
  * This must be power of 2 and less than or equal to ZS_ALIGN
  */
 struct link_free {
 	union {
 		/*
-		 * Position of next free chunk (encodes <PFN, obj_idx>)
+		 * free object list
 		 * It's valid for non-allocated object
 		 */
-		void *next;
+		unsigned long next;
 		/*
 		 * Handle of allocated object.
 		 */
@@ -270,6 +266,7 @@ struct zs_pool {
 };
 
 struct zs_meta {
+	unsigned long freeobj:FREEOBJ_BITS;
 	unsigned long class:CLASS_BITS;
 	unsigned long fullness:FULLNESS_BITS;
 	unsigned long inuse:INUSE_BITS;
@@ -446,6 +443,26 @@ static void mod_zspage_inuse(struct page *first_page, int val)
 	m->inuse += val;
 }
 
+static void set_freeobj(struct page *first_page, int idx)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	m = (struct zs_meta *)&first_page->mapping;
+	m->freeobj = idx;
+}
+
+static unsigned long get_freeobj(struct page *first_page)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	m = (struct zs_meta *)&first_page->mapping;
+	return m->freeobj;
+}
+
 static void get_zspage_mapping(struct page *first_page,
 				unsigned int *class_idx,
 				enum fullness_group *fullness)
@@ -837,30 +854,33 @@ static struct page *get_next_page(struct page *page)
 	return next;
 }
 
-/*
- * Encode <page, obj_idx> as a single handle value.
- * We use the least bit of handle for tagging.
- */
-static void *location_to_obj(struct page *page, unsigned long obj_idx)
+static void objidx_to_page_and_offset(struct size_class *class,
+				struct page *first_page,
+				unsigned long obj_idx,
+				struct page **obj_page,
+				unsigned long *offset_in_page)
 {
-	unsigned long obj;
+	int i;
+	unsigned long offset;
+	struct page *cursor;
+	int nr_page;
 
-	if (!page) {
-		VM_BUG_ON(obj_idx);
-		return NULL;
-	}
+	offset = obj_idx * class->size;
+	cursor = first_page;
+	nr_page = offset >> PAGE_SHIFT;
 
-	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
-	obj |= ((obj_idx) & OBJ_INDEX_MASK);
-	obj <<= OBJ_TAG_BITS;
+	*offset_in_page = offset & ~PAGE_MASK;
+
+	for (i = 0; i < nr_page; i++)
+		cursor = get_next_page(cursor);
 
-	return (void *)obj;
+	*obj_page = cursor;
 }
 
-/*
- * Decode <page, obj_idx> pair from the given object handle. We adjust the
- * decoded obj_idx back to its original value since it was adjusted in
- * location_to_obj().
+/**
+ * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @page: page object resides in zspage
+ * @obj_idx: object index
  */
 static void obj_to_location(unsigned long obj, struct page **page,
 				unsigned long *obj_idx)
@@ -870,6 +890,23 @@ static void obj_to_location(unsigned long obj, struct page **page,
 	*obj_idx = (obj & OBJ_INDEX_MASK);
 }
 
+/**
+ * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
+ * @page: page object resides in zspage
+ * @obj_idx: object index
+ */
+static unsigned long location_to_obj(struct page *page,
+				unsigned long obj_idx)
+{
+	unsigned long obj;
+
+	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+	obj |= obj_idx & OBJ_INDEX_MASK;
+	obj <<= OBJ_TAG_BITS;
+
+	return obj;
+}
+
 static unsigned long handle_to_obj(unsigned long handle)
 {
 	return *(unsigned long *)handle;
@@ -885,17 +922,6 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
 		return *(unsigned long *)obj;
 }
 
-static unsigned long obj_idx_to_offset(struct page *page,
-				unsigned long obj_idx, int class_size)
-{
-	unsigned long off = 0;
-
-	if (!is_first_page(page))
-		off = page->index;
-
-	return off + obj_idx * class_size;
-}
-
 static inline int trypin_tag(unsigned long handle)
 {
 	unsigned long *ptr = (unsigned long *)handle;
@@ -952,6 +978,7 @@ static void free_zspage(struct page *first_page)
 /* Initialize a newly allocated zspage */
 static void init_zspage(struct size_class *class, struct page *first_page)
 {
+	int freeobj = 1;
 	unsigned long off = 0;
 	struct page *page = first_page;
 
@@ -960,14 +987,11 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 	while (page) {
 		struct page *next_page;
 		struct link_free *link;
-		unsigned int i = 1;
 		void *vaddr;
 
 		/*
 		 * page->index stores offset of first object starting
-		 * in the page. For the first page, this is always 0,
-		 * so we use first_page->index (aka ->freelist) to store
-		 * head of corresponding zspage's freelist.
+		 * in the page.
 		 */
 		if (page != first_page)
 			page->index = off;
@@ -976,7 +1000,7 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 		link = (struct link_free *)vaddr + off / sizeof(*link);
 
 		while ((off += class->size) < PAGE_SIZE) {
-			link->next = location_to_obj(page, i++);
+			link->next = freeobj++ << OBJ_ALLOCATED_TAG;
 			link += class->size / sizeof(*link);
 		}
 
@@ -986,11 +1010,21 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 		 * page (if present)
 		 */
 		next_page = get_next_page(page);
-		link->next = location_to_obj(next_page, 0);
+		if (next_page) {
+			link->next = freeobj++ << OBJ_ALLOCATED_TAG;
+		} else {
+			/*
+			 * Reset OBJ_ALLOCATED_TAG bit to last link for
+			 * migration to know it is allocated object or not.
+			 */
+			link->next = -1 << OBJ_ALLOCATED_TAG;
+		}
 		kunmap_atomic(vaddr);
 		page = next_page;
 		off %= PAGE_SIZE;
 	}
+
+	set_freeobj(first_page, 0);
 }
 
 /*
@@ -1040,7 +1074,6 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 
 	init_zspage(class, first_page);
 
-	first_page->freelist = location_to_obj(first_page, 0);
 	error = 0; /* Success */
 
 cleanup:
@@ -1320,7 +1353,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
 	class = pool->size_class[class_idx];
-	off = obj_idx_to_offset(page, obj_idx, class->size);
+	off = (class->size * obj_idx) & ~PAGE_MASK;
 
 	area = &get_cpu_var(zs_map_area);
 	area->vm_mm = mm;
@@ -1359,7 +1392,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
 	class = pool->size_class[class_idx];
-	off = obj_idx_to_offset(page, obj_idx, class->size);
+	off = (class->size * obj_idx) & ~PAGE_MASK;
 
 	area = this_cpu_ptr(&zs_map_area);
 	if (off + class->size <= PAGE_SIZE)
@@ -1385,17 +1418,17 @@ static unsigned long obj_malloc(struct size_class *class,
 	struct link_free *link;
 
 	struct page *m_page;
-	unsigned long m_objidx, m_offset;
+	unsigned long m_offset;
 	void *vaddr;
 
 	handle |= OBJ_ALLOCATED_TAG;
-	obj = (unsigned long)first_page->freelist;
-	obj_to_location(obj, &m_page, &m_objidx);
-	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+	obj = get_freeobj(first_page);
+	objidx_to_page_and_offset(class, first_page, obj,
+				&m_page, &m_offset);
 
 	vaddr = kmap_atomic(m_page);
 	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
-	first_page->freelist = link->next;
+	set_freeobj(first_page, link->next >> OBJ_ALLOCATED_TAG);
 	if (!class->huge)
 		/* record handle in the header of allocated chunk */
 		link->handle = handle;
@@ -1406,6 +1439,8 @@ static unsigned long obj_malloc(struct size_class *class,
 	mod_zspage_inuse(first_page, 1);
 	zs_stat_inc(class, OBJ_USED, 1);
 
+	obj = location_to_obj(m_page, obj);
+
 	return obj;
 }
 
@@ -1475,19 +1510,17 @@ static void obj_free(struct size_class *class, unsigned long obj)
 
 	obj &= ~OBJ_ALLOCATED_TAG;
 	obj_to_location(obj, &f_page, &f_objidx);
+	f_offset = (class->size * f_objidx) & ~PAGE_MASK;
 	first_page = get_first_page(f_page);
-
-	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
-
 	vaddr = kmap_atomic(f_page);
 
 	/* Insert this object in containing zspage's freelist */
 	link = (struct link_free *)(vaddr + f_offset);
-	link->next = first_page->freelist;
+	link->next = get_freeobj(first_page) << OBJ_ALLOCATED_TAG;
 	if (class->huge)
 		set_page_private(first_page, 0);
 	kunmap_atomic(vaddr);
-	first_page->freelist = (void *)obj;
+	set_freeobj(first_page, f_objidx);
 	mod_zspage_inuse(first_page, -1);
 	zs_stat_dec(class, OBJ_USED, 1);
 }
@@ -1543,8 +1576,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
 	obj_to_location(src, &s_page, &s_objidx);
 	obj_to_location(dst, &d_page, &d_objidx);
 
-	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
-	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+	s_off = (class->size * s_objidx) & ~PAGE_MASK;
+	d_off = (class->size * d_objidx) & ~PAGE_MASK;
 
 	if (s_off + class->size > PAGE_SIZE)
 		s_size = PAGE_SIZE - s_off;
@@ -2034,9 +2067,10 @@ static int __init zs_init(void)
 		goto notifier_fail;
 
 	/*
-	 * A zspage's class index, fullness group, inuse object count are
-	 * encoded in its (first)page->mapping so sizeof(struct zs_meta)
-	 * should be less than sizeof(page->mapping(i.e., unsigned long)).
+	 * A zspage's a free object index, class index, fullness group,
+	 * inuse object count are encoded in its (first)page->mapping
+	 * so sizeof(struct zs_meta) should be less than
+	 * sizeof(page->mapping(i.e., unsigned long)).
 	 */
 	BUILD_BUG_ON(sizeof(struct zs_meta) > sizeof(unsigned long));
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 09/16] zsmalloc: move struct zs_meta from mapping to freelist
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

For supporting migration from VM, we need to have address_space
on every page so zsmalloc shouldn't use page->mapping. So,
this patch moves zs_meta from mapping to freelist.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 807998462539..d4d33a819832 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -29,7 +29,7 @@
  *		Look at size_class->huge.
  *	page->lru: links together first pages of various zspages.
  *		Basically forming list of zspages in a fullness group.
- *	page->mapping: override by struct zs_meta
+ *	page->freelist: override by struct zs_meta
  *
  * Usage of struct page flags:
  *	PG_private: identifies the first component page
@@ -418,7 +418,7 @@ static int get_zspage_inuse(struct page *first_page)
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 
 	return m->inuse;
 }
@@ -429,7 +429,7 @@ static void set_zspage_inuse(struct page *first_page, int val)
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	m->inuse = val;
 }
 
@@ -439,7 +439,7 @@ static void mod_zspage_inuse(struct page *first_page, int val)
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	m->inuse += val;
 }
 
@@ -449,7 +449,7 @@ static void set_freeobj(struct page *first_page, int idx)
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	m->freeobj = idx;
 }
 
@@ -459,7 +459,7 @@ static unsigned long get_freeobj(struct page *first_page)
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	return m->freeobj;
 }
 
@@ -471,7 +471,7 @@ static void get_zspage_mapping(struct page *first_page,
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	*fullness = m->fullness;
 	*class_idx = m->class;
 }
@@ -484,7 +484,7 @@ static void set_zspage_mapping(struct page *first_page,
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
-	m = (struct zs_meta *)&first_page->mapping;
+	m = (struct zs_meta *)&first_page->freelist;
 	m->fullness = fullness;
 	m->class = class_idx;
 }
@@ -946,7 +946,6 @@ static void reset_page(struct page *page)
 	clear_bit(PG_private, &page->flags);
 	clear_bit(PG_private_2, &page->flags);
 	set_page_private(page, 0);
-	page->mapping = NULL;
 	page->freelist = NULL;
 }
 
@@ -1056,6 +1055,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 
 		INIT_LIST_HEAD(&page->lru);
 		if (i == 0) {	/* first page */
+			page->freelist = NULL;
 			SetPagePrivate(page);
 			set_page_private(page, 0);
 			first_page = page;
@@ -2068,9 +2068,9 @@ static int __init zs_init(void)
 
 	/*
 	 * A zspage's a free object index, class index, fullness group,
-	 * inuse object count are encoded in its (first)page->mapping
+	 * inuse object count are encoded in its (first)page->freelist
 	 * so sizeof(struct zs_meta) should be less than
-	 * sizeof(page->mapping(i.e., unsigned long)).
+	 * sizeof(page->freelist(i.e., void *)).
 	 */
 	BUILD_BUG_ON(sizeof(struct zs_meta) > sizeof(unsigned long));
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 10/16] zsmalloc: factor page chain functionality out
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

For migration, we need to create sub-page chain of zspage
dynamically so this patch factors it out from alloc_zspage.

As a minor refactoring, it makes OBJ_ALLOCATED_TAG assign
more clear in obj_malloc(it could be another patch but it's
trivial so I want to put together in this patch).

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 80 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 46 insertions(+), 34 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d4d33a819832..14bcc741eead 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -981,7 +981,9 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 	unsigned long off = 0;
 	struct page *page = first_page;
 
-	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+	first_page->freelist = NULL;
+	INIT_LIST_HEAD(&first_page->lru);
+	set_zspage_inuse(first_page, 0);
 
 	while (page) {
 		struct page *next_page;
@@ -1026,13 +1028,44 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 	set_freeobj(first_page, 0);
 }
 
+static void create_page_chain(struct page *pages[], int nr_pages)
+{
+	int i;
+	struct page *page;
+	struct page *prev_page = NULL;
+	struct page *first_page = NULL;
+
+	for (i = 0; i < nr_pages; i++) {
+		page = pages[i];
+
+		INIT_LIST_HEAD(&page->lru);
+		if (i == 0) {
+			SetPagePrivate(page);
+			set_page_private(page, 0);
+			first_page = page;
+		}
+
+		if (i == 1)
+			set_page_private(first_page, (unsigned long)page);
+		if (i >= 1)
+			set_page_private(page, (unsigned long)first_page);
+		if (i >= 2)
+			list_add(&page->lru, &prev_page->lru);
+		if (i == nr_pages - 1)
+			SetPagePrivate2(page);
+
+		prev_page = page;
+	}
+}
+
 /*
  * Allocate a zspage for the given size class
  */
 static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 {
-	int i, error;
-	struct page *first_page = NULL, *uninitialized_var(prev_page);
+	int i;
+	struct page *first_page = NULL;
+	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
 
 	/*
 	 * Allocate individual pages and link them together as:
@@ -1045,43 +1078,23 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
 	 * identify the last page.
 	 */
-	error = -ENOMEM;
 	for (i = 0; i < class->pages_per_zspage; i++) {
 		struct page *page;
 
 		page = alloc_page(flags);
-		if (!page)
-			goto cleanup;
-
-		INIT_LIST_HEAD(&page->lru);
-		if (i == 0) {	/* first page */
-			page->freelist = NULL;
-			SetPagePrivate(page);
-			set_page_private(page, 0);
-			first_page = page;
-			set_zspage_inuse(page, 0);
+		if (!page) {
+			while (--i >= 0)
+				__free_page(pages[i]);
+			return NULL;
 		}
-		if (i == 1)
-			set_page_private(first_page, (unsigned long)page);
-		if (i >= 1)
-			set_page_private(page, (unsigned long)first_page);
-		if (i >= 2)
-			list_add(&page->lru, &prev_page->lru);
-		if (i == class->pages_per_zspage - 1)	/* last page */
-			SetPagePrivate2(page);
-		prev_page = page;
+
+		pages[i] = page;
 	}
 
+	create_page_chain(pages, class->pages_per_zspage);
+	first_page = pages[0];
 	init_zspage(class, first_page);
 
-	error = 0; /* Success */
-
-cleanup:
-	if (unlikely(error) && first_page) {
-		free_zspage(first_page);
-		first_page = NULL;
-	}
-
 	return first_page;
 }
 
@@ -1421,7 +1434,6 @@ static unsigned long obj_malloc(struct size_class *class,
 	unsigned long m_offset;
 	void *vaddr;
 
-	handle |= OBJ_ALLOCATED_TAG;
 	obj = get_freeobj(first_page);
 	objidx_to_page_and_offset(class, first_page, obj,
 				&m_page, &m_offset);
@@ -1431,10 +1443,10 @@ static unsigned long obj_malloc(struct size_class *class,
 	set_freeobj(first_page, link->next >> OBJ_ALLOCATED_TAG);
 	if (!class->huge)
 		/* record handle in the header of allocated chunk */
-		link->handle = handle;
+		link->handle = handle | OBJ_ALLOCATED_TAG;
 	else
 		/* record handle in first_page->private */
-		set_page_private(first_page, handle);
+		set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
 	kunmap_atomic(vaddr);
 	mod_zspage_inuse(first_page, 1);
 	zs_stat_inc(class, OBJ_USED, 1);
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 11/16] zsmalloc: separate free_zspage from putback_zspage
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

Currently, putback_zspage does free zspage under class->lock
if fullness become ZS_EMPTY but it makes trouble to implement
locking scheme for new zspage migration.
So, this patch is to separate free_zspage from putback_zspage
and free zspage out of class->lock which is preparation for
zspage migration.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 14bcc741eead..b11dcd718502 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -949,7 +949,8 @@ static void reset_page(struct page *page)
 	page->freelist = NULL;
 }
 
-static void free_zspage(struct page *first_page)
+static void free_zspage(struct zs_pool *pool, struct size_class *class,
+			struct page *first_page)
 {
 	struct page *nextp, *tmp, *head_extra;
 
@@ -972,6 +973,11 @@ static void free_zspage(struct page *first_page)
 	}
 	reset_page(head_extra);
 	__free_page(head_extra);
+
+	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+			class->size, class->pages_per_zspage));
+	atomic_long_sub(class->pages_per_zspage,
+				&pool->pages_allocated);
 }
 
 /* Initialize a newly allocated zspage */
@@ -1559,13 +1565,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	spin_lock(&class->lock);
 	obj_free(class, obj);
 	fullness = fix_fullness_group(class, first_page);
-	if (fullness == ZS_EMPTY) {
-		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-				class->size, class->pages_per_zspage));
-		atomic_long_sub(class->pages_per_zspage,
-				&pool->pages_allocated);
-		free_zspage(first_page);
-	}
+	if (fullness == ZS_EMPTY)
+		free_zspage(pool, class, first_page);
 	spin_unlock(&class->lock);
 	unpin_tag(handle);
 
@@ -1752,7 +1753,7 @@ static struct page *isolate_target_page(struct size_class *class)
  * @class: destination class
  * @first_page: target page
  *
- * Return @fist_page's fullness_group
+ * Return @first_page's updated fullness_group
  */
 static enum fullness_group putback_zspage(struct zs_pool *pool,
 			struct size_class *class,
@@ -1764,15 +1765,6 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
 	insert_zspage(class, fullness, first_page);
 	set_zspage_mapping(first_page, class->index, fullness);
 
-	if (fullness == ZS_EMPTY) {
-		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-			class->size, class->pages_per_zspage));
-		atomic_long_sub(class->pages_per_zspage,
-				&pool->pages_allocated);
-
-		free_zspage(first_page);
-	}
-
 	return fullness;
 }
 
@@ -1835,23 +1827,31 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 			if (!migrate_zspage(pool, class, &cc))
 				break;
 
-			putback_zspage(pool, class, dst_page);
+			VM_BUG_ON_PAGE(putback_zspage(pool, class,
+				dst_page) == ZS_EMPTY, dst_page);
 		}
 
 		/* Stop if we couldn't find slot */
 		if (dst_page == NULL)
 			break;
 
-		putback_zspage(pool, class, dst_page);
-		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+		VM_BUG_ON_PAGE(putback_zspage(pool, class,
+				dst_page) == ZS_EMPTY, dst_page);
+		if (putback_zspage(pool, class, src_page) == ZS_EMPTY) {
 			pool->stats.pages_compacted += class->pages_per_zspage;
-		spin_unlock(&class->lock);
+			spin_unlock(&class->lock);
+			free_zspage(pool, class, src_page);
+		} else {
+			spin_unlock(&class->lock);
+		}
+
 		cond_resched();
 		spin_lock(&class->lock);
 	}
 
 	if (src_page)
-		putback_zspage(pool, class, src_page);
+		VM_BUG_ON_PAGE(putback_zspage(pool, class,
+				src_page) == ZS_EMPTY, src_page);
 
 	spin_unlock(&class->lock);
 }
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 12/16] zsmalloc: zs_compact refactoring
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

Currently, we rely on class->lock to prevent zspage destruction.
It was okay until now because the critical section is short but
with run-time migration, it could be long so class->lock is not
a good apporach any more.

So, this patch introduces [un]freeze_zspage functions which
freeze allocated objects in the zspage with pinning tag so
user cannot free using object. With those functions, this patch
redesign compaction.

Those functions will be used for implementing zspage runtime
migrations, too.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 393 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 257 insertions(+), 136 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b11dcd718502..ac8ca7b10720 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -922,6 +922,13 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
 		return *(unsigned long *)obj;
 }
 
+static inline int testpin_tag(unsigned long handle)
+{
+	unsigned long *ptr = (unsigned long *)handle;
+
+	return test_bit(HANDLE_PIN_BIT, ptr);
+}
+
 static inline int trypin_tag(unsigned long handle)
 {
 	unsigned long *ptr = (unsigned long *)handle;
@@ -949,8 +956,7 @@ static void reset_page(struct page *page)
 	page->freelist = NULL;
 }
 
-static void free_zspage(struct zs_pool *pool, struct size_class *class,
-			struct page *first_page)
+static void free_zspage(struct zs_pool *pool, struct page *first_page)
 {
 	struct page *nextp, *tmp, *head_extra;
 
@@ -973,11 +979,6 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
 	}
 	reset_page(head_extra);
 	__free_page(head_extra);
-
-	zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-			class->size, class->pages_per_zspage));
-	atomic_long_sub(class->pages_per_zspage,
-				&pool->pages_allocated);
 }
 
 /* Initialize a newly allocated zspage */
@@ -1325,6 +1326,11 @@ static bool zspage_full(struct size_class *class, struct page *first_page)
 	return get_zspage_inuse(first_page) == class->objs_per_zspage;
 }
 
+static bool zspage_empty(struct size_class *class, struct page *first_page)
+{
+	return get_zspage_inuse(first_page) == 0;
+}
+
 unsigned long zs_get_total_pages(struct zs_pool *pool)
 {
 	return atomic_long_read(&pool->pages_allocated);
@@ -1455,7 +1461,6 @@ static unsigned long obj_malloc(struct size_class *class,
 		set_page_private(first_page, handle | OBJ_ALLOCATED_TAG);
 	kunmap_atomic(vaddr);
 	mod_zspage_inuse(first_page, 1);
-	zs_stat_inc(class, OBJ_USED, 1);
 
 	obj = location_to_obj(m_page, obj);
 
@@ -1510,6 +1515,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	}
 
 	obj = obj_malloc(class, first_page, handle);
+	zs_stat_inc(class, OBJ_USED, 1);
 	/* Now move the zspage to another fullness group, if required */
 	fix_fullness_group(class, first_page);
 	record_obj(handle, obj);
@@ -1540,7 +1546,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
 	kunmap_atomic(vaddr);
 	set_freeobj(first_page, f_objidx);
 	mod_zspage_inuse(first_page, -1);
-	zs_stat_dec(class, OBJ_USED, 1);
 }
 
 void zs_free(struct zs_pool *pool, unsigned long handle)
@@ -1564,10 +1569,19 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 
 	spin_lock(&class->lock);
 	obj_free(class, obj);
+	zs_stat_dec(class, OBJ_USED, 1);
 	fullness = fix_fullness_group(class, first_page);
-	if (fullness == ZS_EMPTY)
-		free_zspage(pool, class, first_page);
+	if (fullness == ZS_EMPTY) {
+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
+		spin_unlock(&class->lock);
+		atomic_long_sub(class->pages_per_zspage,
+					&pool->pages_allocated);
+		free_zspage(pool, first_page);
+		goto out;
+	}
 	spin_unlock(&class->lock);
+out:
 	unpin_tag(handle);
 
 	free_handle(pool, handle);
@@ -1637,127 +1651,66 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
 	kunmap_atomic(s_addr);
 }
 
-/*
- * Find alloced object in zspage from index object and
- * return handle.
- */
-static unsigned long find_alloced_obj(struct size_class *class,
-					struct page *page, int index)
+static unsigned long handle_from_obj(struct size_class *class,
+				struct page *first_page, int obj_idx)
 {
-	unsigned long head;
-	int offset = 0;
-	unsigned long handle = 0;
-	void *addr = kmap_atomic(page);
-
-	if (!is_first_page(page))
-		offset = page->index;
-	offset += class->size * index;
-
-	while (offset < PAGE_SIZE) {
-		head = obj_to_head(class, page, addr + offset);
-		if (head & OBJ_ALLOCATED_TAG) {
-			handle = head & ~OBJ_ALLOCATED_TAG;
-			if (trypin_tag(handle))
-				break;
-			handle = 0;
-		}
+	struct page *page;
+	unsigned long offset_in_page;
+	void *addr;
+	unsigned long head, handle = 0;
 
-		offset += class->size;
-		index++;
-	}
+	objidx_to_page_and_offset(class, first_page, obj_idx,
+			&page, &offset_in_page);
 
+	addr = kmap_atomic(page);
+	head = obj_to_head(class, page, addr + offset_in_page);
+	if (head & OBJ_ALLOCATED_TAG)
+		handle = head & ~OBJ_ALLOCATED_TAG;
 	kunmap_atomic(addr);
+
 	return handle;
 }
 
-struct zs_compact_control {
-	/* Source page for migration which could be a subpage of zspage. */
-	struct page *s_page;
-	/* Destination page for migration which should be a first page
-	 * of zspage. */
-	struct page *d_page;
-	 /* Starting object index within @s_page which used for live object
-	  * in the subpage. */
-	int index;
-};
-
-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
-				struct zs_compact_control *cc)
+static int migrate_zspage(struct size_class *class, struct page *dst_page,
+				struct page *src_page)
 {
-	unsigned long used_obj, free_obj;
 	unsigned long handle;
-	struct page *s_page = cc->s_page;
-	struct page *d_page = cc->d_page;
-	unsigned long index = cc->index;
-	int ret = 0;
+	unsigned long old_obj, new_obj;
+	int i;
+	int nr_migrated = 0;
 
-	while (1) {
-		handle = find_alloced_obj(class, s_page, index);
-		if (!handle) {
-			s_page = get_next_page(s_page);
-			if (!s_page)
-				break;
-			index = 0;
+	for (i = 0; i < class->objs_per_zspage; i++) {
+		handle = handle_from_obj(class, src_page, i);
+		if (!handle)
 			continue;
-		}
-
-		/* Stop if there is no more space */
-		if (zspage_full(class, d_page)) {
-			unpin_tag(handle);
-			ret = -ENOMEM;
+		if (zspage_full(class, dst_page))
 			break;
-		}
-
-		used_obj = handle_to_obj(handle);
-		free_obj = obj_malloc(class, d_page, handle);
-		zs_object_copy(class, free_obj, used_obj);
-		index++;
+		old_obj = handle_to_obj(handle);
+		new_obj = obj_malloc(class, dst_page, handle);
+		zs_object_copy(class, new_obj, old_obj);
+		nr_migrated++;
 		/*
 		 * record_obj updates handle's value to free_obj and it will
 		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
 		 * breaks synchronization using pin_tag(e,g, zs_free) so
 		 * let's keep the lock bit.
 		 */
-		free_obj |= BIT(HANDLE_PIN_BIT);
-		record_obj(handle, free_obj);
-		unpin_tag(handle);
-		obj_free(class, used_obj);
+		new_obj |= BIT(HANDLE_PIN_BIT);
+		record_obj(handle, new_obj);
+		obj_free(class, old_obj);
 	}
-
-	/* Remember last position in this iteration */
-	cc->s_page = s_page;
-	cc->index = index;
-
-	return ret;
-}
-
-static struct page *isolate_target_page(struct size_class *class)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
-		page = class->fullness_list[i];
-		if (page) {
-			remove_zspage(class, i, page);
-			break;
-		}
-	}
-
-	return page;
+	return nr_migrated;
 }
 
 /*
  * putback_zspage - add @first_page into right class's fullness list
- * @pool: target pool
  * @class: destination class
  * @first_page: target page
  *
  * Return @first_page's updated fullness_group
  */
-static enum fullness_group putback_zspage(struct zs_pool *pool,
-			struct size_class *class,
-			struct page *first_page)
+static enum fullness_group putback_zspage(struct size_class *class,
+					struct page *first_page)
 {
 	enum fullness_group fullness;
 
@@ -1768,17 +1721,155 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
 	return fullness;
 }
 
+/*
+ * freeze_zspage - freeze all objects in a zspage
+ * @class: size class of the page
+ * @first_page: first page of zspage
+ *
+ * Freeze all allocated objects in a zspage so objects couldn't be
+ * freed until unfreeze objects. It should be called under class->lock.
+ *
+ * RETURNS:
+ * the number of pinned objects
+ */
+static int freeze_zspage(struct size_class *class, struct page *first_page)
+{
+	unsigned long obj_idx;
+	struct page *obj_page;
+	unsigned long offset;
+	void *addr;
+	int nr_freeze = 0;
+
+	for (obj_idx = 0; obj_idx < class->objs_per_zspage; obj_idx++) {
+		unsigned long head;
+
+		objidx_to_page_and_offset(class, first_page, obj_idx,
+					&obj_page, &offset);
+		addr = kmap_atomic(obj_page);
+		head = obj_to_head(class, obj_page, addr + offset);
+		if (head & OBJ_ALLOCATED_TAG) {
+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
+
+			if (!trypin_tag(handle)) {
+				kunmap_atomic(addr);
+				break;
+			}
+			nr_freeze++;
+		}
+		kunmap_atomic(addr);
+	}
+
+	return nr_freeze;
+}
+
+/*
+ * unfreeze_page - unfreeze objects freezed by freeze_zspage in a zspage
+ * @class: size class of the page
+ * @first_page: freezed zspage to unfreeze
+ * @nr_obj: the number of objects to unfreeze
+ *
+ * unfreeze objects in a zspage.
+ */
+static void unfreeze_zspage(struct size_class *class, struct page *first_page,
+			int nr_obj)
+{
+	unsigned long obj_idx;
+	struct page *obj_page;
+	unsigned long offset;
+	void *addr;
+	int nr_unfreeze = 0;
+
+	for (obj_idx = 0; obj_idx < class->objs_per_zspage &&
+			nr_unfreeze < nr_obj; obj_idx++) {
+		unsigned long head;
+
+		objidx_to_page_and_offset(class, first_page, obj_idx,
+					&obj_page, &offset);
+		addr = kmap_atomic(obj_page);
+		head = obj_to_head(class, obj_page, addr + offset);
+		if (head & OBJ_ALLOCATED_TAG) {
+			unsigned long handle = head & ~OBJ_ALLOCATED_TAG;
+
+			VM_BUG_ON(!testpin_tag(handle));
+			unpin_tag(handle);
+			nr_unfreeze++;
+		}
+		kunmap_atomic(addr);
+	}
+}
+
+/*
+ * isolate_source_page - isolate a zspage for migration source
+ * @class: size class of zspage for isolation
+ *
+ * Returns a zspage which are isolated from list so anyone can
+ * allocate a object from that page. As well, freeze all objects
+ * allocated in the zspage so anyone cannot access that objects
+ * (e.g., zs_map_object, zs_free).
+ */
 static struct page *isolate_source_page(struct size_class *class)
 {
 	int i;
 	struct page *page = NULL;
 
 	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+		int inuse, freezed;
+
 		page = class->fullness_list[i];
 		if (!page)
 			continue;
 
 		remove_zspage(class, i, page);
+
+		inuse = get_zspage_inuse(page);
+		freezed = freeze_zspage(class, page);
+
+		if (inuse != freezed) {
+			unfreeze_zspage(class, page, freezed);
+			putback_zspage(class, page);
+			page = NULL;
+			continue;
+		}
+
+		break;
+	}
+
+	return page;
+}
+
+/*
+ * isolate_target_page - isolate a zspage for migration target
+ * @class: size class of zspage for isolation
+ *
+ * Returns a zspage which are isolated from list so anyone can
+ * allocate a object from that page. As well, freeze all objects
+ * allocated in the zspage so anyone cannot access that objects
+ * (e.g., zs_map_object, zs_free).
+ */
+static struct page *isolate_target_page(struct size_class *class)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+		int inuse, freezed;
+
+		page = class->fullness_list[i];
+		if (!page)
+			continue;
+
+		remove_zspage(class, i, page);
+
+		inuse = get_zspage_inuse(page);
+		freezed = freeze_zspage(class, page);
+
+		if (inuse != freezed) {
+			unfreeze_zspage(class, page, freezed);
+			putback_zspage(class, page);
+			page = NULL;
+			continue;
+		}
+
 		break;
 	}
 
@@ -1793,9 +1884,11 @@ static struct page *isolate_source_page(struct size_class *class)
 static unsigned long zs_can_compact(struct size_class *class)
 {
 	unsigned long obj_wasted;
+	unsigned long obj_allocated, obj_used;
 
-	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
-		zs_stat_get(class, OBJ_USED);
+	obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+	obj_used = zs_stat_get(class, OBJ_USED);
+	obj_wasted = obj_allocated - obj_used;
 
 	obj_wasted /= get_maxobj_per_zspage(class->size,
 			class->pages_per_zspage);
@@ -1805,53 +1898,81 @@ static unsigned long zs_can_compact(struct size_class *class)
 
 static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
-	struct zs_compact_control cc;
-	struct page *src_page;
+	struct page *src_page = NULL;
 	struct page *dst_page = NULL;
 
-	spin_lock(&class->lock);
-	while ((src_page = isolate_source_page(class))) {
+	while (1) {
+		int nr_migrated;
 
-		if (!zs_can_compact(class))
+		spin_lock(&class->lock);
+		if (!zs_can_compact(class)) {
+			spin_unlock(&class->lock);
 			break;
+		}
 
-		cc.index = 0;
-		cc.s_page = src_page;
+		/*
+		 * Isolate source page and freeze all objects in a zspage
+		 * to prevent zspage destroying.
+		 */
+		if (!src_page) {
+			src_page = isolate_source_page(class);
+			if (!src_page) {
+				spin_unlock(&class->lock);
+				break;
+			}
+		}
 
-		while ((dst_page = isolate_target_page(class))) {
-			cc.d_page = dst_page;
-			/*
-			 * If there is no more space in dst_page, resched
-			 * and see if anyone had allocated another zspage.
-			 */
-			if (!migrate_zspage(pool, class, &cc))
+		/* Isolate target page and freeze all objects in the zspage */
+		if (!dst_page) {
+			dst_page = isolate_target_page(class);
+			if (!dst_page) {
+				spin_unlock(&class->lock);
 				break;
+			}
+		}
+		spin_unlock(&class->lock);
+
+		nr_migrated = migrate_zspage(class, dst_page, src_page);
 
-			VM_BUG_ON_PAGE(putback_zspage(pool, class,
-				dst_page) == ZS_EMPTY, dst_page);
+		if (zspage_full(class, dst_page)) {
+			spin_lock(&class->lock);
+			putback_zspage(class, dst_page);
+			unfreeze_zspage(class, dst_page,
+				class->objs_per_zspage);
+			spin_unlock(&class->lock);
+			dst_page = NULL;
 		}
 
-		/* Stop if we couldn't find slot */
-		if (dst_page == NULL)
-			break;
+		if (zspage_empty(class, src_page)) {
+			free_zspage(pool, src_page);
+			spin_lock(&class->lock);
+			zs_stat_dec(class, OBJ_ALLOCATED,
+				get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
+			atomic_long_sub(class->pages_per_zspage,
+					&pool->pages_allocated);
 
-		VM_BUG_ON_PAGE(putback_zspage(pool, class,
-				dst_page) == ZS_EMPTY, dst_page);
-		if (putback_zspage(pool, class, src_page) == ZS_EMPTY) {
 			pool->stats.pages_compacted += class->pages_per_zspage;
 			spin_unlock(&class->lock);
-			free_zspage(pool, class, src_page);
-		} else {
-			spin_unlock(&class->lock);
+			src_page = NULL;
 		}
+	}
 
-		cond_resched();
-		spin_lock(&class->lock);
+	if (!src_page && !dst_page)
+		return;
+
+	spin_lock(&class->lock);
+	if (src_page) {
+		putback_zspage(class, src_page);
+		unfreeze_zspage(class, src_page,
+				class->objs_per_zspage);
 	}
 
-	if (src_page)
-		VM_BUG_ON_PAGE(putback_zspage(pool, class,
-				src_page) == ZS_EMPTY, src_page);
+	if (dst_page) {
+		putback_zspage(class, dst_page);
+		unfreeze_zspage(class, dst_page,
+				class->objs_per_zspage);
+	}
 
 	spin_unlock(&class->lock);
 }
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 13/16] zsmalloc: migrate head page of zspage
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

This patch introduces run-time migration feature for zspage.
To begin with, it supports only head page migration for
easy review(later patches will support tail page migration).

For migration, it supports three functions

* zs_page_isolate

It isolates a zspage which includes a subpage VM want to migrate
from class so anyone cannot allocate new object from the zspage.
IOW, allocation freeze

* zs_page_migrate

First of all, it freezes zspage to prevent zspage destrunction
so anyone cannot free object. Then, It copies content from oldpage
to newpage and create new page-chain with new page.
If it was successful, drop the refcount of old page to free
and putback new zspage to right data structure of zsmalloc.
Lastly, unfreeze zspages so we allows object allocation/free
from now on.

* zs_page_putback

It returns isolated zspage to right fullness_group list
if it fails to migrate a page.

NOTE: A hurdle to support migration is that destroying zspage
while migration is going on. Once a zspage is isolated,
anyone cannot allocate object from the zspage but can deallocate
object freely so a zspage could be destroyed until all of objects
in zspage are freezed to prevent deallocation. The problem is
large window betwwen zs_page_isolate and freeze_zspage
in zs_page_migrate so the zspage could be destroyed.

A easy approach to solve the problem is that object freezing
in zs_page_isolate but it has a drawback that any object cannot
be deallocated until migration fails after isolation. However,
There is large time gab between isolation and migration so
any object freeing in other CPU should spin by pin_tag which
would cause big latency. So, this patch introduces lock_zspage
which holds PG_lock of all pages in a zspage right before
freeing the zspage. VM migration locks the page, too right
before calling ->migratepage so such race doesn't exist any more.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/uapi/linux/magic.h |   1 +
 mm/zsmalloc.c              | 332 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 318 insertions(+), 15 deletions(-)

diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index e1fbe72c39c0..93b1affe4801 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -79,5 +79,6 @@
 #define NSFS_MAGIC		0x6e736673
 #define BPF_FS_MAGIC		0xcafe4a11
 #define BALLOON_KVM_MAGIC	0x13661366
+#define ZSMALLOC_MAGIC		0x58295829
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index ac8ca7b10720..f6c9138c3be0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -56,6 +56,8 @@
 #include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
+#include <linux/mount.h>
+#include <linux/migrate.h>
 
 /*
  * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -182,6 +184,8 @@ struct zs_size_stat {
 static struct dentry *zs_stat_root;
 #endif
 
+static struct vfsmount *zsmalloc_mnt;
+
 /*
  * number of size_classes
  */
@@ -263,6 +267,7 @@ struct zs_pool {
 #ifdef CONFIG_ZSMALLOC_STAT
 	struct dentry *stat_dentry;
 #endif
+	struct inode *inode;
 };
 
 struct zs_meta {
@@ -412,6 +417,29 @@ static int is_last_page(struct page *page)
 	return PagePrivate2(page);
 }
 
+/*
+ * Indicate that whether zspage is isolated for page migration.
+ * Protected by size_class lock
+ */
+static void SetZsPageIsolate(struct page *first_page)
+{
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+	SetPageUptodate(first_page);
+}
+
+static int ZsPageIsolate(struct page *first_page)
+{
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	return PageUptodate(first_page);
+}
+
+static void ClearZsPageIsolate(struct page *first_page)
+{
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+	ClearPageUptodate(first_page);
+}
+
 static int get_zspage_inuse(struct page *first_page)
 {
 	struct zs_meta *m;
@@ -783,8 +811,11 @@ static enum fullness_group fix_fullness_group(struct size_class *class,
 	if (newfg == currfg)
 		goto out;
 
-	remove_zspage(class, currfg, first_page);
-	insert_zspage(class, newfg, first_page);
+	/* Later, putback will insert page to right list */
+	if (!ZsPageIsolate(first_page)) {
+		remove_zspage(class, currfg, first_page);
+		insert_zspage(class, newfg, first_page);
+	}
 	set_zspage_mapping(first_page, class_idx, newfg);
 
 out:
@@ -950,12 +981,31 @@ static void unpin_tag(unsigned long handle)
 
 static void reset_page(struct page *page)
 {
+	if (!PageIsolated(page))
+		__ClearPageMovable(page);
+	ClearPageIsolated(page);
 	clear_bit(PG_private, &page->flags);
 	clear_bit(PG_private_2, &page->flags);
 	set_page_private(page, 0);
 	page->freelist = NULL;
 }
 
+/**
+ * lock_zspage - lock all pages in the zspage
+ * @first_page: head page of the zspage
+ *
+ * To prevent destroy during migration, zspage freeing should
+ * hold locks of all pages in a zspage
+ */
+void lock_zspage(struct page *first_page)
+{
+	struct page *cursor = first_page;
+
+	do {
+		while (!trylock_page(cursor));
+	} while ((cursor = get_next_page(cursor)) != NULL);
+}
+
 static void free_zspage(struct zs_pool *pool, struct page *first_page)
 {
 	struct page *nextp, *tmp, *head_extra;
@@ -963,26 +1013,31 @@ static void free_zspage(struct zs_pool *pool, struct page *first_page)
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 	VM_BUG_ON_PAGE(get_zspage_inuse(first_page), first_page);
 
+	lock_zspage(first_page);
 	head_extra = (struct page *)page_private(first_page);
 
-	reset_page(first_page);
-	__free_page(first_page);
-
 	/* zspage with only 1 system page */
 	if (!head_extra)
-		return;
+		goto out;
 
 	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
 		list_del(&nextp->lru);
 		reset_page(nextp);
-		__free_page(nextp);
+		unlock_page(nextp);
+		put_page(nextp);
 	}
 	reset_page(head_extra);
-	__free_page(head_extra);
+	unlock_page(head_extra);
+	put_page(head_extra);
+out:
+	reset_page(first_page);
+	unlock_page(first_page);
+	put_page(first_page);
 }
 
 /* Initialize a newly allocated zspage */
-static void init_zspage(struct size_class *class, struct page *first_page)
+static void init_zspage(struct size_class *class, struct page *first_page,
+			struct address_space *mapping)
 {
 	int freeobj = 1;
 	unsigned long off = 0;
@@ -991,6 +1046,9 @@ static void init_zspage(struct size_class *class, struct page *first_page)
 	first_page->freelist = NULL;
 	INIT_LIST_HEAD(&first_page->lru);
 	set_zspage_inuse(first_page, 0);
+	BUG_ON(!trylock_page(first_page));
+	__SetPageMovable(first_page, mapping);
+	unlock_page(first_page);
 
 	while (page) {
 		struct page *next_page;
@@ -1065,10 +1123,45 @@ static void create_page_chain(struct page *pages[], int nr_pages)
 	}
 }
 
+static void replace_sub_page(struct size_class *class, struct page *first_page,
+		struct page *newpage, struct page *oldpage)
+{
+	struct page *page;
+	struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL,};
+	int idx = 0;
+
+	page = first_page;
+	do {
+		if (page == oldpage)
+			pages[idx] = newpage;
+		else
+			pages[idx] = page;
+		idx++;
+	} while ((page = get_next_page(page)) != NULL);
+
+	create_page_chain(pages, class->pages_per_zspage);
+
+	if (is_first_page(oldpage)) {
+		enum fullness_group fg;
+		int class_idx;
+
+		SetZsPageIsolate(newpage);
+		get_zspage_mapping(oldpage, &class_idx, &fg);
+		set_zspage_mapping(newpage, class_idx, fg);
+		set_freeobj(newpage, get_freeobj(oldpage));
+		set_zspage_inuse(newpage, get_zspage_inuse(oldpage));
+		if (class->huge)
+			set_page_private(newpage,  page_private(oldpage));
+	}
+
+	__SetPageMovable(newpage, oldpage->mapping);
+}
+
 /*
  * Allocate a zspage for the given size class
  */
-static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+static struct page *alloc_zspage(struct zs_pool *pool,
+				struct size_class *class)
 {
 	int i;
 	struct page *first_page = NULL;
@@ -1088,7 +1181,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 	for (i = 0; i < class->pages_per_zspage; i++) {
 		struct page *page;
 
-		page = alloc_page(flags);
+		page = alloc_page(pool->flags);
 		if (!page) {
 			while (--i >= 0)
 				__free_page(pages[i]);
@@ -1100,7 +1193,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 
 	create_page_chain(pages, class->pages_per_zspage);
 	first_page = pages[0];
-	init_zspage(class, first_page);
+	init_zspage(class, first_page, pool->inode->i_mapping);
 
 	return first_page;
 }
@@ -1499,7 +1592,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 
 	if (!first_page) {
 		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
+		first_page = alloc_zspage(pool, class);
 		if (unlikely(!first_page)) {
 			free_handle(pool, handle);
 			return 0;
@@ -1559,6 +1652,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	if (unlikely(!handle))
 		return;
 
+	/* Once handle is pinned, page|object migration cannot work */
 	pin_tag(handle);
 	obj = handle_to_obj(handle);
 	obj_to_location(obj, &f_page, &f_objidx);
@@ -1714,6 +1808,9 @@ static enum fullness_group putback_zspage(struct size_class *class,
 {
 	enum fullness_group fullness;
 
+	VM_BUG_ON_PAGE(!list_empty(&first_page->lru), first_page);
+	VM_BUG_ON_PAGE(ZsPageIsolate(first_page), first_page);
+
 	fullness = get_fullness_group(class, first_page);
 	insert_zspage(class, fullness, first_page);
 	set_zspage_mapping(first_page, class->index, fullness);
@@ -2059,6 +2156,173 @@ static int zs_register_shrinker(struct zs_pool *pool)
 	return register_shrinker(&pool->shrinker);
 }
 
+bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+{
+	struct zs_pool *pool;
+	struct size_class *class;
+	int class_idx;
+	enum fullness_group fullness;
+	struct page *first_page;
+
+	/*
+	 * The page is locked so it couldn't be destroyed.
+	 * For detail, look at lock_zspage in free_zspage.
+	 */
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(PageIsolated(page), page);
+	/*
+	 * In this implementation, it allows only first page migration.
+	 */
+	VM_BUG_ON_PAGE(!is_first_page(page), page);
+	first_page = page;
+
+	/*
+	 * Without class lock, fullness is meaningless while constant
+	 * class_idx is okay. We will get it under class lock at below,
+	 * again.
+	 */
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	pool = page->mapping->private_data;
+	class = pool->size_class[class_idx];
+
+	if (!spin_trylock(&class->lock))
+		return false;
+
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	remove_zspage(class, fullness, first_page);
+	SetZsPageIsolate(first_page);
+	SetPageIsolated(page);
+	spin_unlock(&class->lock);
+
+	return true;
+}
+
+int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+		struct page *page, enum migrate_mode mode)
+{
+	struct zs_pool *pool;
+	struct size_class *class;
+	int class_idx;
+	enum fullness_group fullness;
+	struct page *first_page;
+	void *s_addr, *d_addr, *addr;
+	int ret = -EBUSY;
+	int offset = 0;
+	int freezed = 0;
+
+	VM_BUG_ON_PAGE(!PageMovable(page), page);
+	VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+	first_page = page;
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	pool = page->mapping->private_data;
+	class = pool->size_class[class_idx];
+
+	/*
+	 * Get stable fullness under class->lock
+	 */
+	if (!spin_trylock(&class->lock))
+		return ret;
+
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	if (get_zspage_inuse(first_page) == 0)
+		goto out_class_unlock;
+
+	freezed = freeze_zspage(class, first_page);
+	if (freezed != get_zspage_inuse(first_page))
+		goto out_unfreeze;
+
+	/* copy contents from page to newpage */
+	s_addr = kmap_atomic(page);
+	d_addr = kmap_atomic(newpage);
+	memcpy(d_addr, s_addr, PAGE_SIZE);
+	kunmap_atomic(d_addr);
+	kunmap_atomic(s_addr);
+
+	if (!is_first_page(page))
+		offset = page->index;
+
+	addr = kmap_atomic(page);
+	do {
+		unsigned long handle;
+		unsigned long head;
+		unsigned long new_obj, old_obj;
+		unsigned long obj_idx;
+		struct page *dummy;
+
+		head = obj_to_head(class, page, addr + offset);
+		if (head & OBJ_ALLOCATED_TAG) {
+			handle = head & ~OBJ_ALLOCATED_TAG;
+			if (!testpin_tag(handle))
+				BUG();
+
+			old_obj = handle_to_obj(handle);
+			obj_to_location(old_obj, &dummy, &obj_idx);
+			new_obj = location_to_obj(newpage, obj_idx);
+			new_obj |= BIT(HANDLE_PIN_BIT);
+			record_obj(handle, new_obj);
+		}
+		offset += class->size;
+	} while (offset < PAGE_SIZE);
+	kunmap_atomic(addr);
+
+	replace_sub_page(class, first_page, newpage, page);
+	first_page = newpage;
+	get_page(newpage);
+	VM_BUG_ON_PAGE(get_fullness_group(class, first_page) ==
+			ZS_EMPTY, first_page);
+	ClearZsPageIsolate(first_page);
+	putback_zspage(class, first_page);
+
+	/* Migration complete. Free old page */
+	ClearPageIsolated(page);
+	reset_page(page);
+	put_page(page);
+	ret = MIGRATEPAGE_SUCCESS;
+
+out_unfreeze:
+	unfreeze_zspage(class, first_page, freezed);
+out_class_unlock:
+	spin_unlock(&class->lock);
+
+	return ret;
+}
+
+void zs_page_putback(struct page *page)
+{
+	struct zs_pool *pool;
+	struct size_class *class;
+	int class_idx;
+	enum fullness_group fullness;
+	struct page *first_page;
+
+	VM_BUG_ON_PAGE(!PageMovable(page), page);
+	VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+	first_page = page;
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	pool = page->mapping->private_data;
+	class = pool->size_class[class_idx];
+
+	/*
+	 * If there is race betwwen zs_free and here, free_zspage
+	 * in zs_free will wait the page lock of @page without
+	 * destroying of zspage.
+	 */
+	INIT_LIST_HEAD(&first_page->lru);
+	spin_lock(&class->lock);
+	ClearPageIsolated(page);
+	ClearZsPageIsolate(first_page);
+	putback_zspage(class, first_page);
+	spin_unlock(&class->lock);
+}
+
+const struct address_space_operations zsmalloc_aops = {
+	.isolate_page = zs_page_isolate,
+	.migratepage = zs_page_migrate,
+	.putback_page = zs_page_putback,
+};
+
 /**
  * zs_create_pool - Creates an allocation pool to work from.
  * @flags: allocation flags used to allocate pool metadata
@@ -2145,6 +2409,15 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 	if (zs_pool_stat_create(pool, name))
 		goto err;
 
+	pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+	if (IS_ERR(pool->inode)) {
+		pool->inode = NULL;
+		goto err;
+	}
+
+	pool->inode->i_mapping->a_ops = &zsmalloc_aops;
+	pool->inode->i_mapping->private_data = pool;
+
 	/*
 	 * Not critical, we still can use the pool
 	 * and user can trigger compaction manually.
@@ -2164,6 +2437,8 @@ void zs_destroy_pool(struct zs_pool *pool)
 	int i;
 
 	zs_unregister_shrinker(pool);
+	if (pool->inode)
+		iput(pool->inode);
 	zs_pool_stat_destroy(pool);
 
 	for (i = 0; i < zs_size_classes; i++) {
@@ -2192,10 +2467,33 @@ void zs_destroy_pool(struct zs_pool *pool)
 }
 EXPORT_SYMBOL_GPL(zs_destroy_pool);
 
+static struct dentry *zs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
+{
+	static const struct dentry_operations ops = {
+		.d_dname = simple_dname,
+	};
+
+	return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+}
+
+static struct file_system_type zsmalloc_fs = {
+	.name		= "zsmalloc",
+	.mount		= zs_mount,
+	.kill_sb	= kill_anon_super,
+};
+
 static int __init zs_init(void)
 {
-	int ret = zs_register_cpu_notifier();
+	int ret;
+
+	zsmalloc_mnt = kern_mount(&zsmalloc_fs);
+	if (IS_ERR(zsmalloc_mnt)) {
+		ret = PTR_ERR(zsmalloc_mnt);
+		goto out;
+	}
 
+	ret = zs_register_cpu_notifier();
 	if (ret)
 		goto notifier_fail;
 
@@ -2218,6 +2516,7 @@ static int __init zs_init(void)
 		pr_err("zs stat initialization failed\n");
 		goto stat_fail;
 	}
+
 	return 0;
 
 stat_fail:
@@ -2226,7 +2525,8 @@ static int __init zs_init(void)
 #endif
 notifier_fail:
 	zs_unregister_cpu_notifier();
-
+	kern_unmount(zsmalloc_mnt);
+out:
 	return ret;
 }
 
@@ -2237,6 +2537,8 @@ static void __exit zs_exit(void)
 #endif
 	zs_unregister_cpu_notifier();
 
+	kern_unmount(zsmalloc_mnt);
+
 	zs_stat_exit();
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 14/16] zsmalloc: use single linked list for page chain
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

For tail page migration, we shouldn't use page->lru which
was used for page chaining because VM will use it for own
purpose so that we need another field for chaining.
For chaining, singly linked list is enough and page->index
of tail page to point first object offset in the page could
be replaced in run-time calculation.

So, this patch change page->lru list for chaining with singly
linked list via page->freelist squeeze and introduces
get_first_obj_ofs to get first object offset in a page.

With that, it could maintain page chaining without using
page->lru.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 119 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 78 insertions(+), 41 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f6c9138c3be0..a41cf3ef2077 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -17,10 +17,7 @@
  *
  * Usage of struct page fields:
  *	page->private: points to the first component (0-order) page
- *	page->index (union with page->freelist): offset of the first object
- *		starting in this page.
- *	page->lru: links together all component pages (except the first page)
- *		of a zspage
+ *	page->index (union with page->freelist): override by struct zs_meta
  *
  *	For _first_ page only:
  *
@@ -271,10 +268,19 @@ struct zs_pool {
 };
 
 struct zs_meta {
-	unsigned long freeobj:FREEOBJ_BITS;
-	unsigned long class:CLASS_BITS;
-	unsigned long fullness:FULLNESS_BITS;
-	unsigned long inuse:INUSE_BITS;
+	union {
+		/* first page */
+		struct {
+			unsigned long freeobj:FREEOBJ_BITS;
+			unsigned long class:CLASS_BITS;
+			unsigned long fullness:FULLNESS_BITS;
+			unsigned long inuse:INUSE_BITS;
+		};
+		/* tail pages */
+		struct {
+			struct page *next;
+		};
+	};
 };
 
 struct mapping_area {
@@ -491,6 +497,34 @@ static unsigned long get_freeobj(struct page *first_page)
 	return m->freeobj;
 }
 
+static void set_next_page(struct page *page, struct page *next)
+{
+	struct zs_meta *m;
+
+	VM_BUG_ON_PAGE(is_first_page(page), page);
+
+	m = (struct zs_meta *)&page->index;
+	m->next = next;
+}
+
+static struct page *get_next_page(struct page *page)
+{
+	struct page *next;
+
+	if (is_last_page(page))
+		next = NULL;
+	else if (is_first_page(page))
+		next = (struct page *)page_private(page);
+	else {
+		struct zs_meta *m = (struct zs_meta *)&page->index;
+
+		VM_BUG_ON(!m->next);
+		next = m->next;
+	}
+
+	return next;
+}
+
 static void get_zspage_mapping(struct page *first_page,
 				unsigned int *class_idx,
 				enum fullness_group *fullness)
@@ -871,18 +905,30 @@ static struct page *get_first_page(struct page *page)
 		return (struct page *)page_private(page);
 }
 
-static struct page *get_next_page(struct page *page)
+int get_first_obj_ofs(struct size_class *class, struct page *first_page,
+			struct page *page)
 {
-	struct page *next;
+	int pos, bound;
+	int page_idx = 0;
+	int ofs = 0;
+	struct page *cursor = first_page;
 
-	if (is_last_page(page))
-		next = NULL;
-	else if (is_first_page(page))
-		next = (struct page *)page_private(page);
-	else
-		next = list_entry(page->lru.next, struct page, lru);
+	if (first_page == page)
+		goto out;
 
-	return next;
+	while (page != cursor) {
+		page_idx++;
+		cursor = get_next_page(cursor);
+	}
+
+	bound = PAGE_SIZE * page_idx;
+	pos = (((class->objs_per_zspage * class->size) *
+		page_idx / class->pages_per_zspage) / class->size
+		) * class->size;
+
+	ofs = (pos + class->size) % PAGE_SIZE;
+out:
+	return ofs;
 }
 
 static void objidx_to_page_and_offset(struct size_class *class,
@@ -1008,27 +1054,25 @@ void lock_zspage(struct page *first_page)
 
 static void free_zspage(struct zs_pool *pool, struct page *first_page)
 {
-	struct page *nextp, *tmp, *head_extra;
+	struct page *nextp, *tmp;
 
 	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 	VM_BUG_ON_PAGE(get_zspage_inuse(first_page), first_page);
 
 	lock_zspage(first_page);
-	head_extra = (struct page *)page_private(first_page);
+	nextp = (struct page *)page_private(first_page);
 
 	/* zspage with only 1 system page */
-	if (!head_extra)
+	if (!nextp)
 		goto out;
 
-	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
-		list_del(&nextp->lru);
-		reset_page(nextp);
-		unlock_page(nextp);
-		put_page(nextp);
-	}
-	reset_page(head_extra);
-	unlock_page(head_extra);
-	put_page(head_extra);
+	do {
+		tmp = nextp;
+		nextp = get_next_page(nextp);
+		reset_page(tmp);
+		unlock_page(tmp);
+		put_page(tmp);
+	} while (nextp);
 out:
 	reset_page(first_page);
 	unlock_page(first_page);
@@ -1055,13 +1099,6 @@ static void init_zspage(struct size_class *class, struct page *first_page,
 		struct link_free *link;
 		void *vaddr;
 
-		/*
-		 * page->index stores offset of first object starting
-		 * in the page.
-		 */
-		if (page != first_page)
-			page->index = off;
-
 		vaddr = kmap_atomic(page);
 		link = (struct link_free *)vaddr + off / sizeof(*link);
 
@@ -1103,7 +1140,6 @@ static void create_page_chain(struct page *pages[], int nr_pages)
 	for (i = 0; i < nr_pages; i++) {
 		page = pages[i];
 
-		INIT_LIST_HEAD(&page->lru);
 		if (i == 0) {
 			SetPagePrivate(page);
 			set_page_private(page, 0);
@@ -1112,10 +1148,12 @@ static void create_page_chain(struct page *pages[], int nr_pages)
 
 		if (i == 1)
 			set_page_private(first_page, (unsigned long)page);
-		if (i >= 1)
+		if (i >= 1) {
+			set_next_page(page, NULL);
 			set_page_private(page, (unsigned long)first_page);
+		}
 		if (i >= 2)
-			list_add(&page->lru, &prev_page->lru);
+			set_next_page(prev_page, page);
 		if (i == nr_pages - 1)
 			SetPagePrivate2(page);
 
@@ -2239,8 +2277,7 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	kunmap_atomic(d_addr);
 	kunmap_atomic(s_addr);
 
-	if (!is_first_page(page))
-		offset = page->index;
+	offset = get_first_obj_ofs(class, first_page, page);
 
 	addr = kmap_atomic(page);
 	do {
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 15/16] zsmalloc: migrate tail pages in zspage
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

This patch enables tail page migration of zspage.

In this point, I tested zsmalloc regression with micro-benchmark
which does zs_malloc/map/unmap/zs_free for all size class
in every CPU(my system is 12) during 20 sec.

It shows 1% regression which is really small when we consider
the benefit of this feature and realworkload overhead(i.e.,
most overhead comes from compression).

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 mm/zsmalloc.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 114 insertions(+), 15 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index a41cf3ef2077..e24f4a160892 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -551,6 +551,19 @@ static void set_zspage_mapping(struct page *first_page,
 	m->class = class_idx;
 }
 
+static bool check_isolated_page(struct page *first_page)
+{
+	struct page *cursor;
+
+	for (cursor = first_page; cursor != NULL; cursor =
+					get_next_page(cursor)) {
+		if (PageIsolated(cursor))
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * zsmalloc divides the pool into various size classes where each
  * class maintains a list of zspages where each zspage is divided
@@ -1052,6 +1065,44 @@ void lock_zspage(struct page *first_page)
 	} while ((cursor = get_next_page(cursor)) != NULL);
 }
 
+int trylock_zspage(struct page *first_page, struct page *locked_page)
+{
+	struct page *cursor, *fail;
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
+	for (cursor = first_page; cursor != NULL; cursor =
+			get_next_page(cursor)) {
+		if (cursor != locked_page) {
+			if (!trylock_page(cursor)) {
+				fail = cursor;
+				goto unlock;
+			}
+		}
+	}
+
+	return 1;
+unlock:
+	for (cursor = first_page; cursor != fail; cursor =
+			get_next_page(cursor)) {
+		if (cursor != locked_page)
+			unlock_page(cursor);
+	}
+
+	return 0;
+}
+
+void unlock_zspage(struct page *first_page, struct page *locked_page)
+{
+	struct page *cursor = first_page;
+
+	for (; cursor != NULL; cursor = get_next_page(cursor)) {
+		VM_BUG_ON_PAGE(!PageLocked(cursor), cursor);
+		if (cursor != locked_page)
+			unlock_page(cursor);
+	}
+}
+
 static void free_zspage(struct zs_pool *pool, struct page *first_page)
 {
 	struct page *nextp, *tmp;
@@ -1090,15 +1141,16 @@ static void init_zspage(struct size_class *class, struct page *first_page,
 	first_page->freelist = NULL;
 	INIT_LIST_HEAD(&first_page->lru);
 	set_zspage_inuse(first_page, 0);
-	BUG_ON(!trylock_page(first_page));
-	__SetPageMovable(first_page, mapping);
-	unlock_page(first_page);
 
 	while (page) {
 		struct page *next_page;
 		struct link_free *link;
 		void *vaddr;
 
+		BUG_ON(!trylock_page(page));
+		__SetPageMovable(page, mapping);
+		unlock_page(page);
+
 		vaddr = kmap_atomic(page);
 		link = (struct link_free *)vaddr + off / sizeof(*link);
 
@@ -1848,6 +1900,7 @@ static enum fullness_group putback_zspage(struct size_class *class,
 
 	VM_BUG_ON_PAGE(!list_empty(&first_page->lru), first_page);
 	VM_BUG_ON_PAGE(ZsPageIsolate(first_page), first_page);
+	VM_BUG_ON_PAGE(check_isolated_page(first_page), first_page);
 
 	fullness = get_fullness_group(class, first_page);
 	insert_zspage(class, fullness, first_page);
@@ -1954,6 +2007,12 @@ static struct page *isolate_source_page(struct size_class *class)
 		if (!page)
 			continue;
 
+		/* To prevent race between object and page migration */
+		if (!trylock_zspage(page, NULL)) {
+			page = NULL;
+			continue;
+		}
+
 		remove_zspage(class, i, page);
 
 		inuse = get_zspage_inuse(page);
@@ -1962,6 +2021,7 @@ static struct page *isolate_source_page(struct size_class *class)
 		if (inuse != freezed) {
 			unfreeze_zspage(class, page, freezed);
 			putback_zspage(class, page);
+			unlock_zspage(page, NULL);
 			page = NULL;
 			continue;
 		}
@@ -1993,6 +2053,12 @@ static struct page *isolate_target_page(struct size_class *class)
 		if (!page)
 			continue;
 
+		/* To prevent race between object and page migration */
+		if (!trylock_zspage(page, NULL)) {
+			page = NULL;
+			continue;
+		}
+
 		remove_zspage(class, i, page);
 
 		inuse = get_zspage_inuse(page);
@@ -2001,6 +2067,7 @@ static struct page *isolate_target_page(struct size_class *class)
 		if (inuse != freezed) {
 			unfreeze_zspage(class, page, freezed);
 			putback_zspage(class, page);
+			unlock_zspage(page, NULL);
 			page = NULL;
 			continue;
 		}
@@ -2074,11 +2141,13 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 			putback_zspage(class, dst_page);
 			unfreeze_zspage(class, dst_page,
 				class->objs_per_zspage);
+			unlock_zspage(dst_page, NULL);
 			spin_unlock(&class->lock);
 			dst_page = NULL;
 		}
 
 		if (zspage_empty(class, src_page)) {
+			unlock_zspage(src_page, NULL);
 			free_zspage(pool, src_page);
 			spin_lock(&class->lock);
 			zs_stat_dec(class, OBJ_ALLOCATED,
@@ -2101,12 +2170,14 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 		putback_zspage(class, src_page);
 		unfreeze_zspage(class, src_page,
 				class->objs_per_zspage);
+		unlock_zspage(src_page, NULL);
 	}
 
 	if (dst_page) {
 		putback_zspage(class, dst_page);
 		unfreeze_zspage(class, dst_page,
 				class->objs_per_zspage);
+		unlock_zspage(dst_page, NULL);
 	}
 
 	spin_unlock(&class->lock);
@@ -2209,10 +2280,11 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageIsolated(page), page);
 	/*
-	 * In this implementation, it allows only first page migration.
+	 * first_page will not be destroyed by PG_lock of @page but it could
+	 * be migrated out. For prohibiting it, zs_page_migrate calls
+	 * trylock_zspage so it closes the race.
 	 */
-	VM_BUG_ON_PAGE(!is_first_page(page), page);
-	first_page = page;
+	first_page = get_first_page(page);
 
 	/*
 	 * Without class lock, fullness is meaningless while constant
@@ -2226,9 +2298,18 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	if (!spin_trylock(&class->lock))
 		return false;
 
+	if (check_isolated_page(first_page))
+		goto skip_isolate;
+
+	/*
+	 * If this is first time isolation for zspage, isolate zspage from
+	 * size_class to prevent further allocations from the zspage.
+	 */
 	get_zspage_mapping(first_page, &class_idx, &fullness);
 	remove_zspage(class, fullness, first_page);
 	SetZsPageIsolate(first_page);
+
+skip_isolate:
 	SetPageIsolated(page);
 	spin_unlock(&class->lock);
 
@@ -2251,7 +2332,7 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
-	first_page = page;
+	first_page = get_first_page(page);
 	get_zspage_mapping(first_page, &class_idx, &fullness);
 	pool = page->mapping->private_data;
 	class = pool->size_class[class_idx];
@@ -2266,6 +2347,13 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	if (get_zspage_inuse(first_page) == 0)
 		goto out_class_unlock;
 
+	/*
+	 * It prevents first_page migration during tail page opeartion for
+	 * get_first_page's stability.
+	 */
+	if (!trylock_zspage(first_page, page))
+		goto out_class_unlock;
+
 	freezed = freeze_zspage(class, first_page);
 	if (freezed != get_zspage_inuse(first_page))
 		goto out_unfreeze;
@@ -2304,21 +2392,26 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
 	kunmap_atomic(addr);
 
 	replace_sub_page(class, first_page, newpage, page);
-	first_page = newpage;
+	first_page = get_first_page(newpage);
 	get_page(newpage);
 	VM_BUG_ON_PAGE(get_fullness_group(class, first_page) ==
 			ZS_EMPTY, first_page);
-	ClearZsPageIsolate(first_page);
-	putback_zspage(class, first_page);
+	if (!check_isolated_page(first_page)) {
+		INIT_LIST_HEAD(&first_page->lru);
+		ClearZsPageIsolate(first_page);
+		putback_zspage(class, first_page);
+	}
+
 
 	/* Migration complete. Free old page */
 	ClearPageIsolated(page);
 	reset_page(page);
 	put_page(page);
 	ret = MIGRATEPAGE_SUCCESS;
-
+	page = newpage;
 out_unfreeze:
 	unfreeze_zspage(class, first_page, freezed);
+	unlock_zspage(first_page, page);
 out_class_unlock:
 	spin_unlock(&class->lock);
 
@@ -2336,7 +2429,7 @@ void zs_page_putback(struct page *page)
 	VM_BUG_ON_PAGE(!PageMovable(page), page);
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
-	first_page = page;
+	first_page = get_first_page(page);
 	get_zspage_mapping(first_page, &class_idx, &fullness);
 	pool = page->mapping->private_data;
 	class = pool->size_class[class_idx];
@@ -2346,11 +2439,17 @@ void zs_page_putback(struct page *page)
 	 * in zs_free will wait the page lock of @page without
 	 * destroying of zspage.
 	 */
-	INIT_LIST_HEAD(&first_page->lru);
 	spin_lock(&class->lock);
 	ClearPageIsolated(page);
-	ClearZsPageIsolate(first_page);
-	putback_zspage(class, first_page);
+	/*
+	 * putback zspage to right list if this is last isolated page
+	 * putback in the zspage.
+	 */
+	if (!check_isolated_page(first_page)) {
+		INIT_LIST_HEAD(&first_page->lru);
+		ClearZsPageIsolate(first_page);
+		putback_zspage(class, first_page);
+	}
 	spin_unlock(&class->lock);
 }
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH v3 16/16] zram: use __GFP_MOVABLE for memory allocation
From: Minchan Kim @ 2016-03-30  7:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Minchan Kim, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

Zsmalloc is ready for page migration so zram can use __GFP_MOVABLE
from now on.

I did test to see how it helps to make higher order pages.
Test scenario is as follows.

KVM guest, 1G memory, ext4 formated zram block device,

for i in `seq 1 8`;
do
        dd if=/dev/vda1 of=mnt/test$i.txt bs=128M count=1 &
done

wait `pidof dd`

for i in `seq 1 2 8`;
do
        rm -rf mnt/test$i.txt
done
fstrim -v mnt

echo "init"
cat /proc/buddyinfo

echo "compaction"
echo 1 > /proc/sys/vm/compact_memory
cat /proc/buddyinfo

old:

init
Node 0, zone      DMA    208    120     51     41     11      0      0      0      0      0      0
Node 0, zone    DMA32  16380  13777   9184   3805    789     54      3      0      0      0      0
compaction
Node 0, zone      DMA    132     82     40     39     16      2      1      0      0      0      0
Node 0, zone    DMA32   5219   5526   4969   3455   1831    677    139     15      0      0      0

new:

init
Node 0, zone      DMA    379    115     97     19      2      0      0      0      0      0      0
Node 0, zone    DMA32  18891  16774  10862   3947    637     21      0      0      0      0      0
compaction  1
Node 0, zone      DMA    214     66     87     29     10      3      0      0      0      0      0
Node 0, zone    DMA32   1612   3139   3154   2469   1745    990    384     94      7      0      0

As you can see, compaction made so many high-order pages. Yay!

Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 drivers/block/zram/zram_drv.c | 3 ++-
 mm/zsmalloc.c                 | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f76016d..10f6ff1cf6a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -514,7 +514,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
 		goto out_error;
 	}
 
-	meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+	meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO|__GFP_HIGHMEM
+						|__GFP_MOVABLE);
 	if (!meta->mem_pool) {
 		pr_err("Error creating memory pool\n");
 		goto out_error;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e24f4a160892..4b1ccb68f2ee 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -308,7 +308,7 @@ static void destroy_handle_cache(struct zs_pool *pool)
 static unsigned long alloc_handle(struct zs_pool *pool)
 {
 	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-		pool->flags & ~__GFP_HIGHMEM);
+		pool->flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v3 00/16] Support non-lru page migration
From: Andrew Morton @ 2016-03-30 23:11 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <1459321935-3655-1-git-send-email-minchan@kernel.org>

On Wed, 30 Mar 2016 16:11:59 +0900 Minchan Kim <minchan@kernel.org> wrote:

> Recently, I got many reports about perfermance degradation
> in embedded system(Android mobile phone, webOS TV and so on)
> and failed to fork easily.
> 
> The problem was fragmentation caused by zram and GPU driver
> pages. Their pages cannot be migrated so compaction cannot
> work well, either so reclaimer ends up shrinking all of working
> set pages. It made system very slow and even to fail to fork
> easily.
> 
> Other pain point is that they cannot work with CMA.
> Most of CMA memory space could be idle(ie, it could be used
> for movable pages unless driver is using) but if driver(i.e.,
> zram) cannot migrate his page, that memory space could be
> wasted. In our product which has big CMA memory, it reclaims
> zones too exccessively although there are lots of free space
> in CMA so system was very slow easily.
> 
> To solve these problem, this patch try to add facility to
> migrate non-lru pages via introducing new friend functions
> of migratepage in address_space_operation and new page flags.
> 
> 	(isolate_page, putback_page)
> 	(PG_movable, PG_isolated)
> 
> For details, please read description in
> "mm/compaction: support non-lru movable page migration".

OK, I grabbed all these.

I wonder about testing coverage during the -next period.  How many
people are likely to exercise these code paths in a serious way before
it all hits mainline?

^ permalink raw reply

* Re: [PATCH v3 00/16] Support non-lru page migration
From: Sergey Senozhatsky @ 2016-03-31  0:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, Sergey Senozhatsky, YiPing Xu, aquini, rknize,
	linux-mm, Chan Gyun Jeong, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, Minchan Kim, Gioh Kim, koct9i,
	Sangseok Lee, jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <20160330161141.4332b189e7a4930e117d765b@linux-foundation.org>

On (03/30/16 16:11), Andrew Morton wrote:
[..]
> > For details, please read description in
> > "mm/compaction: support non-lru movable page migration".
> 
> OK, I grabbed all these.
> 
> I wonder about testing coverage during the -next period.  How many
> people are likely to exercise these code paths in a serious way before
> it all hits mainline?

I'm hammering the zsmalloc part.

	-ss

^ permalink raw reply

* Re: [PATCH v3 00/16] Support non-lru page migration
From: Minchan Kim @ 2016-03-31  0:57 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	jlayton, Joonsoo Kim, Vlastimil Babka, Mel Gorman
In-Reply-To: <20160330161141.4332b189e7a4930e117d765b@linux-foundation.org>

On Wed, Mar 30, 2016 at 04:11:41PM -0700, Andrew Morton wrote:
> On Wed, 30 Mar 2016 16:11:59 +0900 Minchan Kim <minchan@kernel.org> wrote:
> 
> > Recently, I got many reports about perfermance degradation
> > in embedded system(Android mobile phone, webOS TV and so on)
> > and failed to fork easily.
> > 
> > The problem was fragmentation caused by zram and GPU driver
> > pages. Their pages cannot be migrated so compaction cannot
> > work well, either so reclaimer ends up shrinking all of working
> > set pages. It made system very slow and even to fail to fork
> > easily.
> > 
> > Other pain point is that they cannot work with CMA.
> > Most of CMA memory space could be idle(ie, it could be used
> > for movable pages unless driver is using) but if driver(i.e.,
> > zram) cannot migrate his page, that memory space could be
> > wasted. In our product which has big CMA memory, it reclaims
> > zones too exccessively although there are lots of free space
> > in CMA so system was very slow easily.
> > 
> > To solve these problem, this patch try to add facility to
> > migrate non-lru pages via introducing new friend functions
> > of migratepage in address_space_operation and new page flags.
> > 
> > 	(isolate_page, putback_page)
> > 	(PG_movable, PG_isolated)
> > 
> > For details, please read description in
> > "mm/compaction: support non-lru movable page migration".
> 
> OK, I grabbed all these.
> 
> I wonder about testing coverage during the -next period.  How many
> people are likely to exercise these code paths in a serious way before
> it all hits mainline?

I asked this patchset to production team in my company for stress
testing. They alaways catch zram/zsmalloc bugs I have missed so
I hope they help me well, too.

About ballooning part, I hope Rafael Aquini get a time to review
and test it.

Other than that, IOW, linux-next will have a enough time to
test common migration part modification, I guess. :)

Thanks.

^ permalink raw reply

* Re: [PATCH v3 00/16] Support non-lru page migration
From: Minchan Kim @ 2016-03-31  0:57 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Rik van Riel, YiPing Xu, aquini, rknize, Sergey Senozhatsky,
	Chan Gyun Jeong, Hugh Dickins, linux-kernel, Al Viro,
	virtualization, bfields, linux-mm, Gioh Kim, koct9i, Sangseok Lee,
	Joonsoo Kim, Andrew Morton, jlayton, Vlastimil Babka, Mel Gorman
In-Reply-To: <20160331002932.GA1758@swordfish>

On Thu, Mar 31, 2016 at 09:29:32AM +0900, Sergey Senozhatsky wrote:
> On (03/30/16 16:11), Andrew Morton wrote:
> [..]
> > > For details, please read description in
> > > "mm/compaction: support non-lru movable page migration".
> > 
> > OK, I grabbed all these.
> > 
> > I wonder about testing coverage during the -next period.  How many
> > people are likely to exercise these code paths in a serious way before
> > it all hits mainline?
> 
> I'm hammering the zsmalloc part.

Thanks, Sergey!

^ permalink raw reply

* Re: [PATCH 6/6] VMware balloon: Update vmw_balloon.c to use the VMW_PORT macro
From: Sinclair Yeh @ 2016-03-31 14:39 UTC (permalink / raw)
  To: Greg KH; +Cc: pv-drivers, Xavier Deguillard, x86, linux-kernel, virtualization
In-Reply-To: <20160208194157.GA10802@kroah.com>

Hi,

Does any one know when this series will be applied?

Sinclair

On Mon, Feb 08, 2016 at 11:41:57AM -0800, Greg KH wrote:
> On Tue, Jan 19, 2016 at 01:46:05PM -0800, Sinclair Yeh wrote:
> > Updated VMWARE_BALLOON_CMD to use the common VMW_PORT macro.
> > Doing this rather than replacing all instances of VMWARE_BALLOON_CMD
> > to minimize code change.
> > 
> > Signed-off-by: Sinclair Yeh <syeh@vmware.com>
> > Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
> > Reviewed-by: Alok N Kataria <akataria@vmware.com>
> > Acked-by: Xavier Deguillard <xdeguillard@vmware.com>
> > Cc: pv-drivers@vmware.com
> > Cc: Xavier Deguillard <xdeguillard@vmware.com>
> > Cc: linux-kernel@vger.kernel.org
> > Cc: virtualization@lists.linux-foundation.org
> > Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> 
> 
> Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> 
> > -#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)		\
> > -({								\
> > -	unsigned long __status, __dummy1, __dummy2, __dummy3;	\
> > -	__asm__ __volatile__ ("inl %%dx" :			\
> > -		"=a"(__status),					\
> > -		"=c"(__dummy1),					\
> > -		"=d"(__dummy2),					\
> > -		"=b"(result),					\
> > -		"=S" (__dummy3) :				\
> > -		"0"(VMW_BALLOON_HV_MAGIC),			\
> > -		"1"(VMW_BALLOON_CMD_##cmd),			\
> > -		"2"(VMW_BALLOON_HV_PORT),			\
> > -		"3"(arg1),					\
> > -		"4" (arg2) :					\
> > -		"memory");					\
> > -	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)	\
> > -		result = __dummy1;				\
> > -	result &= -1UL;						\
> > -	__status & -1UL;					\
> > +#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)			\
> > +({									\
> > +	unsigned long __status, __dummy1, __dummy2;			\
> > +	unsigned long __si, __di;					\
> > +	VMW_PORT(VMW_BALLOON_CMD_##cmd, arg1, arg2, 0,			\
> > +		 VMW_BALLOON_HV_PORT, VMW_BALLOON_HV_MAGIC,		\
> > +		 __status, result, __dummy1, __dummy2, __si, __di);	\
> > +	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)		\
> > +		result = __dummy1;					\
> > +	result &= -1UL;							\
> > +	__status & -1UL;						\
> >  })
> 
> Honestly, it doesn't look anymore "readable" to me, but it's your code
> to maintain, not mine... :)
> 

^ permalink raw reply

* Re: [PATCH 6/6] VMware balloon: Update vmw_balloon.c to use the VMW_PORT macro
From: Greg KH @ 2016-03-31 16:30 UTC (permalink / raw)
  To: Sinclair Yeh
  Cc: pv-drivers, Xavier Deguillard, x86, linux-kernel, virtualization
In-Reply-To: <20160331143953.GA23662@syeh-linux>

On Thu, Mar 31, 2016 at 07:39:53AM -0700, Sinclair Yeh wrote:
> Hi,
> 
> Does any one know when this series will be applied?

No idea, it's not going through my tree, as I don't have these in my
queue anymore...

greg k-h

^ permalink raw reply

* Re: [PATCH 6/6] VMware balloon: Update vmw_balloon.c to use the VMW_PORT macro
From: Sinclair Yeh @ 2016-03-31 17:32 UTC (permalink / raw)
  To: Greg KH; +Cc: pv-drivers, Xavier Deguillard, x86, linux-kernel, virtualization
In-Reply-To: <20160331163037.GB10256@kroah.com>

On Thu, Mar 31, 2016 at 09:30:37AM -0700, Greg KH wrote:
> On Thu, Mar 31, 2016 at 07:39:53AM -0700, Sinclair Yeh wrote:
> > Hi,
> > 
> > Does any one know when this series will be applied?
> 
> No idea, it's not going through my tree, as I don't have these in my
> queue anymore...

Ok, I'm not sure what else to do.  I've already updated the series once
because it didn't get picked up last December.  Can anyone on the x86 list
pick this up?

Sinclair

^ permalink raw reply

* [PATCH v3 0/6] Support calling functions on dedicated physical cpu
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux

Some hardware (e.g. Dell Studio laptops) require special functions to
be called on physical cpu 0 in order to avoid occasional hangs. When
running as dom0 under Xen this could be achieved only via special boot
parameters (vcpu pinning) limiting the hypervisor in it's scheduling
decisions.

This patch series is adding a generic function to be able to temporarily
pin a (virtual) cpu to a dedicated physical cpu for executing above
mentioned functions on that specific cpu. The drivers (dcdbas and i8k)
requiring this functionality are modified accordingly.

Changes in V3:
- use get_cpu()/put_cpu() as suggested by David Vrabel

Changes in V2:
- instead of manipulating the allowed set of cpus use cpu specific
  workqueue as requested by Peter Zijlstra
- add include/linux/hypervisor.h to hide architecture specific stuff
  from generic kernel code

Juergen Gross (6):
  xen: sync xen header
  smp: add function to execute a function synchronously on a physical
    cpu
  dcdbas: make use of smp_call_sync_on_phys_cpu()
  hwmon: use smp_call_sync_on_phys_cpu() for dell-smm i8k
  virt, sched: add cpu pinning to smp_call_sync_on_phys_cpu()
  xen: add xen_pin_vcpu() to support calling functions on a dedicated
    pcpu

 MAINTAINERS                       |   1 +
 arch/x86/include/asm/hypervisor.h |   9 ++++
 arch/x86/xen/enlighten.c          |  40 +++++++++++++++
 drivers/firmware/dcdbas.c         |  46 ++++++++----------
 drivers/hwmon/dell-smm-hwmon.c    |  27 +++++-----
 include/linux/hypervisor.h        |  17 +++++++
 include/linux/smp.h               |   2 +
 include/xen/interface/sched.h     | 100 +++++++++++++++++++++++++++++++-------
 kernel/smp.c                      |  50 +++++++++++++++++++
 kernel/up.c                       |  18 +++++++
 10 files changed, 251 insertions(+), 59 deletions(-)
 create mode 100644 include/linux/hypervisor.h

-- 
2.6.2

^ permalink raw reply

* [PATCH v3 1/6] xen: sync xen header
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

Import the actual version of include/xen/interface/sched.h from Xen.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 include/xen/interface/sched.h | 100 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 82 insertions(+), 18 deletions(-)

diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
index f184909..a4c4d73 100644
--- a/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@ -3,6 +3,24 @@
  *
  * Scheduler state interactions
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
  */
 
@@ -12,18 +30,30 @@
 #include <xen/interface/event_channel.h>
 
 /*
+ * Guest Scheduler Operations
+ *
+ * The SCHEDOP interface provides mechanisms for a guest to interact
+ * with the scheduler, including yield, blocking and shutting itself
+ * down.
+ */
+
+/*
  * The prototype for this hypercall is:
- *  long sched_op_new(int cmd, void *arg)
+ * long HYPERVISOR_sched_op(enum sched_op cmd, void *arg, ...)
+ *
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == Operation-specific extra argument(s), as described below.
+ * ...  == Additional Operation-specific extra arguments, described below.
  *
- * **NOTE**:
- * Versions of Xen prior to 3.0.2 provide only the following legacy version
+ * Versions of Xen prior to 3.0.2 provided only the following legacy version
  * of this hypercall, supporting only the commands yield, block and shutdown:
  *  long sched_op(int cmd, unsigned long arg)
  * @cmd == SCHEDOP_??? (scheduler operation).
  * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
  *      == SHUTDOWN_* code (SCHEDOP_shutdown)
+ *
+ * This legacy version is available to new guests as:
+ * long HYPERVISOR_sched_op_compat(enum sched_op cmd, unsigned long arg)
  */
 
 /*
@@ -44,12 +74,17 @@
 /*
  * Halt execution of this domain (all VCPUs) and notify the system controller.
  * @arg == pointer to sched_shutdown structure.
+ *
+ * If the sched_shutdown_t reason is SHUTDOWN_suspend then
+ * x86 PV guests must also set RDX (EDX for 32-bit guests) to the MFN
+ * of the guest's start info page.  RDX/EDX is the third hypercall
+ * argument.
+ *
+ * In addition, which reason is SHUTDOWN_suspend this hypercall
+ * returns 1 if suspend was cancelled or the domain was merely
+ * checkpointed, and 0 if it is resuming in a new domain.
  */
 #define SCHEDOP_shutdown    2
-struct sched_shutdown {
-    unsigned int reason; /* SHUTDOWN_* */
-};
-DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
 
 /*
  * Poll a set of event-channel ports. Return when one or more are pending. An
@@ -57,12 +92,6 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
  * @arg == pointer to sched_poll structure.
  */
 #define SCHEDOP_poll        3
-struct sched_poll {
-    GUEST_HANDLE(evtchn_port_t) ports;
-    unsigned int nr_ports;
-    uint64_t timeout;
-};
-DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
 
 /*
  * Declare a shutdown for another domain. The main use of this function is
@@ -71,15 +100,11 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
  * @arg == pointer to sched_remote_shutdown structure.
  */
 #define SCHEDOP_remote_shutdown        4
-struct sched_remote_shutdown {
-    domid_t domain_id;         /* Remote domain ID */
-    unsigned int reason;       /* SHUTDOWN_xxx reason */
-};
 
 /*
  * Latch a shutdown code, so that when the domain later shuts down it
  * reports this code to the control tools.
- * @arg == as for SCHEDOP_shutdown.
+ * @arg == sched_shutdown, as for SCHEDOP_shutdown.
  */
 #define SCHEDOP_shutdown_code 5
 
@@ -92,10 +117,47 @@ struct sched_remote_shutdown {
  * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
  */
 #define SCHEDOP_watchdog    6
+
+/*
+ * Override the current vcpu affinity by pinning it to one physical cpu or
+ * undo this override restoring the previous affinity.
+ * @arg == pointer to sched_pin_override structure.
+ *
+ * A negative pcpu value will undo a previous pin override and restore the
+ * previous cpu affinity.
+ * This call is allowed for the hardware domain only and requires the cpu
+ * to be part of the domain's cpupool.
+ */
+#define SCHEDOP_pin_override 7
+
+struct sched_shutdown {
+    unsigned int reason; /* SHUTDOWN_* => shutdown reason */
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
+
+struct sched_poll {
+    GUEST_HANDLE(evtchn_port_t) ports;
+    unsigned int nr_ports;
+    uint64_t timeout;
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+
+struct sched_remote_shutdown {
+    domid_t domain_id;         /* Remote domain ID */
+    unsigned int reason;       /* SHUTDOWN_* => shutdown reason */
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_remote_shutdown);
+
 struct sched_watchdog {
     uint32_t id;                /* watchdog ID */
     uint32_t timeout;           /* timeout */
 };
+DEFINE_GUEST_HANDLE_STRUCT(sched_watchdog);
+
+struct sched_pin_override {
+    int32_t pcpu;
+};
+DEFINE_GUEST_HANDLE_STRUCT(sched_pin_override);
 
 /*
  * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
@@ -107,6 +169,7 @@ struct sched_watchdog {
 #define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
 #define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
 #define SHUTDOWN_watchdog   4  /* Restart because watchdog time expired.     */
+
 /*
  * Domain asked to perform 'soft reset' for it. The expected behavior is to
  * reset internal Xen state for the domain returning it to the point where it
@@ -115,5 +178,6 @@ struct sched_watchdog {
  * interfaces again.
  */
 #define SHUTDOWN_soft_reset 5
+#define SHUTDOWN_MAX        5  /* Maximum valid shutdown reason.             */
 
 #endif /* __XEN_PUBLIC_SCHED_H__ */
-- 
2.6.2

^ permalink raw reply related

* [PATCH v3 2/6] smp: add function to execute a function synchronously on a physical cpu
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

On some hardware models (e.g. Dell Studio 1555 laptop) some hardware
related functions (e.g. SMIs) are to be executed on physical cpu 0
only. Instead of open coding such a functionality multiple times in
the kernel add a service function for this purpose. This will enable
the possibility to take special measures in virtualized environments
like Xen, too.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2: instead of manipulating the allowed set of cpus use cpu specific
    workqueue as requested by Peter Zijlstra
---
 include/linux/smp.h |  2 ++
 kernel/smp.c        | 44 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/up.c         |  9 +++++++++
 3 files changed, 55 insertions(+)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index c441407..fc9d21b 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -196,4 +196,6 @@ extern void arch_enable_nonboot_cpus_end(void);
 
 void smp_setup_processor_id(void);
 
+int smp_call_sync_on_phys_cpu(unsigned int cpu, int (*func)(void *), void *par);
+
 #endif /* __LINUX_SMP_H */
diff --git a/kernel/smp.c b/kernel/smp.c
index 7416544..62da74b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -739,3 +739,47 @@ void wake_up_all_idle_cpus(void)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
+
+/**
+ * smp_call_sync_on_phys_cpu - Call a function on a specific physical cpu
+ *
+ * Used to call a function on a specific physical cpu. Even if the specified
+ * cpu isn't the current one, return only after the called function has
+ * returned.
+ */
+struct smp_sync_call_struct {
+	struct work_struct	work;
+	struct completion	done;
+	int			(*func)(void *);
+	void			*data;
+	int			ret;
+};
+
+static void smp_call_sync_callback(struct work_struct *work)
+{
+	struct smp_sync_call_struct *sscs;
+
+	sscs = container_of(work, struct smp_sync_call_struct, work);
+	sscs->ret = sscs->func(sscs->data);
+
+	complete(&sscs->done);
+}
+
+int smp_call_sync_on_phys_cpu(unsigned int cpu, int (*func)(void *), void *par)
+{
+	struct smp_sync_call_struct sscs = {
+		.work = __WORK_INITIALIZER(sscs.work, smp_call_sync_callback),
+		.done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
+		.func = func,
+		.data = par,
+	};
+
+	if (cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	queue_work_on(cpu, system_wq, &sscs.work);
+	wait_for_completion(&sscs.done);
+
+	return sscs.ret;
+}
+EXPORT_SYMBOL_GPL(smp_call_sync_on_phys_cpu);
diff --git a/kernel/up.c b/kernel/up.c
index 1760bf3..afd395c 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -82,3 +82,12 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 	preempt_enable();
 }
 EXPORT_SYMBOL(on_each_cpu_cond);
+
+int smp_call_sync_on_phys_cpu(unsigned int cpu, int (*func)(void *), void *par)
+{
+	if (cpu != 0)
+		return -EINVAL;
+
+	return func(par);
+}
+EXPORT_SYMBOL_GPL(smp_call_sync_on_phys_cpu);
-- 
2.6.2

^ permalink raw reply related

* [PATCH v3 3/6] dcdbas: make use of smp_call_sync_on_phys_cpu()
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

Use smp_call_sync_on_phys_cpu() to raise SMI on cpu 0.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/firmware/dcdbas.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c
index 829eec8..d15ad0b 100644
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -238,33 +238,14 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
 	return count;
 }
 
-/**
- * dcdbas_smi_request: generate SMI request
- *
- * Called with smi_data_lock.
- */
-int dcdbas_smi_request(struct smi_cmd *smi_cmd)
+static int raise_smi(void *par)
 {
-	cpumask_var_t old_mask;
-	int ret = 0;
+	struct smi_cmd *smi_cmd = par;
 
-	if (smi_cmd->magic != SMI_CMD_MAGIC) {
-		dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n",
-			 __func__);
-		return -EBADR;
-	}
-
-	/* SMI requires CPU 0 */
-	if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
-		return -ENOMEM;
-
-	cpumask_copy(old_mask, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpumask_of(0));
 	if (smp_processor_id() != 0) {
 		dev_dbg(&dcdbas_pdev->dev, "%s: failed to get CPU 0\n",
 			__func__);
-		ret = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 
 	/* generate SMI */
@@ -280,10 +261,23 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
 		: "memory"
 	);
 
-out:
-	set_cpus_allowed_ptr(current, old_mask);
-	free_cpumask_var(old_mask);
-	return ret;
+	return 0;
+}
+/**
+ * dcdbas_smi_request: generate SMI request
+ *
+ * Called with smi_data_lock.
+ */
+int dcdbas_smi_request(struct smi_cmd *smi_cmd)
+{
+	if (smi_cmd->magic != SMI_CMD_MAGIC) {
+		dev_info(&dcdbas_pdev->dev, "%s: invalid magic value\n",
+			 __func__);
+		return -EBADR;
+	}
+
+	/* SMI requires CPU 0 */
+	return smp_call_sync_on_phys_cpu(0, raise_smi, smi_cmd);
 }
 
 /**
-- 
2.6.2

^ permalink raw reply related

* [PATCH v3 4/6] hwmon: use smp_call_sync_on_phys_cpu() for dell-smm i8k
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

Use the smp_call_sync_on_phys_cpu() function to call system management
mode on cpu 0.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/hwmon/dell-smm-hwmon.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c
index c43318d..4875462 100644
--- a/drivers/hwmon/dell-smm-hwmon.c
+++ b/drivers/hwmon/dell-smm-hwmon.c
@@ -130,23 +130,15 @@ static inline const char *i8k_get_dmi_data(int field)
 /*
  * Call the System Management Mode BIOS. Code provided by Jonathan Buzzard.
  */
-static int i8k_smm(struct smm_regs *regs)
+static int i8k_smm_func(void *par)
 {
 	int rc;
+	struct smm_regs *regs = par;
 	int eax = regs->eax;
-	cpumask_var_t old_mask;
 
 	/* SMM requires CPU 0 */
-	if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
-		return -ENOMEM;
-	cpumask_copy(old_mask, &current->cpus_allowed);
-	rc = set_cpus_allowed_ptr(current, cpumask_of(0));
-	if (rc)
-		goto out;
-	if (smp_processor_id() != 0) {
-		rc = -EBUSY;
-		goto out;
-	}
+	if (smp_processor_id() != 0)
+		return -EBUSY;
 
 #if defined(CONFIG_X86_64)
 	asm volatile("pushq %%rax\n\t"
@@ -204,13 +196,18 @@ static int i8k_smm(struct smm_regs *regs)
 	if (rc != 0 || (regs->eax & 0xffff) == 0xffff || regs->eax == eax)
 		rc = -EINVAL;
 
-out:
-	set_cpus_allowed_ptr(current, old_mask);
-	free_cpumask_var(old_mask);
 	return rc;
 }
 
 /*
+ * Call the System Management Mode BIOS.
+ */
+static int i8k_smm(struct smm_regs *regs)
+{
+	return smp_call_sync_on_phys_cpu(0, i8k_smm_func, regs);
+}
+
+/*
  * Read the fan status.
  */
 static int i8k_get_fan_status(int fan)
-- 
2.6.2

^ permalink raw reply related

* [PATCH v3 5/6] virt, sched: add cpu pinning to smp_call_sync_on_phys_cpu()
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

Add generic virtualization support for pinning the current vcpu to a
specified physical cpu. As this operation isn't performance critical
(a very limited set of operations like BIOS calls and SMIs is expected
to need this) just add a hypervisor specific indirection.

Such a pinning should last as short as possible as it might block
sensible vcpu scheduling and maybe other hypervisor functions like
suspending the system which rely on scheduling. To ensure this don't
let the current thread be preempted while the vcpu is pinned in
smp_call_sync_on_phys_cpu().

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V3: use getc_cpu()/put_cpu() as suggested by David Vrabel

V2: adapt to using workqueues
    add include/linux/hypervisor.h to hide architecture specific stuff
    from generic kernel code

In case paravirt maintainers don't want to be responsible for
include/linux/hypervisor.h I could take it.
---
 MAINTAINERS                       |  1 +
 arch/x86/include/asm/hypervisor.h |  9 +++++++++
 include/linux/hypervisor.h        | 17 +++++++++++++++++
 kernel/smp.c                      |  6 ++++++
 kernel/up.c                       | 11 ++++++++++-
 5 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hypervisor.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 378ebff..3571dff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8329,6 +8329,7 @@ S:	Supported
 F:	Documentation/virtual/paravirt_ops.txt
 F:	arch/*/kernel/paravirt*
 F:	arch/*/include/asm/paravirt.h
+F:	include/linux/hypervisor.h
 
 PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES
 M:	Tim Waugh <tim@cyberelk.net>
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 055ea99..13f80a2 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
 
 	/* X2APIC detection (run once per boot) */
 	bool		(*x2apic_available)(void);
+
+	/* pin current vcpu to specified physical cpu (run rarely) */
+	void		(*pin_vcpu)(int);
 };
 
 extern const struct hypervisor_x86 *x86_hyper;
@@ -56,6 +59,12 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
 extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 extern bool hypervisor_x2apic_available(void);
+
+static inline void hypervisor_pin_vcpu(int cpu)
+{
+	if (x86_hyper->pin_vcpu)
+		x86_hyper->pin_vcpu(cpu);
+}
 #else
 static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
 static inline void init_hypervisor_platform(void) { }
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
new file mode 100644
index 0000000..3fa5ef2
--- /dev/null
+++ b/include/linux/hypervisor.h
@@ -0,0 +1,17 @@
+#ifndef __LINUX_HYPEVISOR_H
+#define __LINUX_HYPEVISOR_H
+
+/*
+ *	Generic Hypervisor support
+ *		Juergen Gross <jgross@suse.com>
+ */
+
+#ifdef CONFIG_HYPERVISOR_GUEST
+#include <asm/hypervisor.h>
+#else
+static inline void hypervisor_pin_vcpu(int cpu)
+{
+}
+#endif
+
+#endif /* __LINUX_HYPEVISOR_H */
diff --git a/kernel/smp.c b/kernel/smp.c
index 62da74b..f9cfb7d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -14,6 +14,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/hypervisor.h>
 
 #include "smpboot.h"
 
@@ -758,9 +759,14 @@ struct smp_sync_call_struct {
 static void smp_call_sync_callback(struct work_struct *work)
 {
 	struct smp_sync_call_struct *sscs;
+	unsigned int cpu;
 
 	sscs = container_of(work, struct smp_sync_call_struct, work);
+	cpu = get_cpu();
+	hypervisor_pin_vcpu(cpu);
 	sscs->ret = sscs->func(sscs->data);
+	hypervisor_pin_vcpu(-1);
+	put_cpu();
 
 	complete(&sscs->done);
 }
diff --git a/kernel/up.c b/kernel/up.c
index afd395c..725ec44 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <linux/smp.h>
+#include <linux/hypervisor.h>
 
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 				int wait)
@@ -85,9 +86,17 @@ EXPORT_SYMBOL(on_each_cpu_cond);
 
 int smp_call_sync_on_phys_cpu(unsigned int cpu, int (*func)(void *), void *par)
 {
+	int ret;
+
 	if (cpu != 0)
 		return -EINVAL;
 
-	return func(par);
+	preempt_disable();
+	hypervisor_pin_vcpu(0);
+	ret = func(par);
+	hypervisor_pin_vcpu(-1);
+	preempt_enable();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(smp_call_sync_on_phys_cpu);
-- 
2.6.2

^ permalink raw reply related

* [PATCH v3 6/6] xen: add xen_pin_vcpu() to support calling functions on a dedicated pcpu
From: Juergen Gross @ 2016-04-01  7:14 UTC (permalink / raw)
  To: linux-kernel, xen-devel
  Cc: Juergen Gross, jeremy, jdelvare, konrad.wilk, peterz, hpa,
	akataria, x86, virtualization, chrisw, mingo, david.vrabel,
	Douglas_Warzecha, pali.rohar, boris.ostrovsky, tglx, linux
In-Reply-To: <1459494874-12194-1-git-send-email-jgross@suse.com>

Some hardware models (e.g. Dell Studio 1555 laptops) require calls to
the firmware to be issued on cpu 0 only. As Dom0 might have to use
these calls, add xen_pin_vcpu() to achieve this functionality.

In case either the domain doesn't have the privilege to make the
related hypercall or the hypervisor isn't supporting it, issue a
warning once and disable further pinning attempts.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/enlighten.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 880862c..7907bcf8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1885,6 +1885,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void xen_pin_vcpu(int cpu)
+{
+	static bool disable_pinning;
+	struct sched_pin_override pin_override;
+	int ret;
+
+	if (disable_pinning)
+		return;
+
+	pin_override.pcpu = cpu;
+	ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+	if (cpu < 0)
+		return;
+
+	switch (ret) {
+	case -ENOSYS:
+		pr_warn("The kernel tried to call a function on physical cpu %d, but Xen isn't\n"
+			"supporting this. In case of problems you might consider vcpu pinning.\n",
+			cpu);
+		disable_pinning = true;
+		break;
+	case -EPERM:
+		WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+		disable_pinning = true;
+		break;
+	case -EINVAL:
+	case -EBUSY:
+		pr_warn("The kernel tried to call a function on physical cpu %d, but this cpu\n"
+			"seems not to be available. Please check your Xen cpu configuration.\n",
+			cpu);
+		break;
+	case 0:
+		break;
+	default:
+		WARN(1, "rc %d while trying to pin vcpu\n", ret);
+		disable_pinning = true;
+	}
+}
+
 const struct hypervisor_x86 x86_hyper_xen = {
 	.name			= "Xen",
 	.detect			= xen_platform,
@@ -1893,6 +1932,7 @@ const struct hypervisor_x86 x86_hyper_xen = {
 #endif
 	.x2apic_available	= xen_x2apic_para_available,
 	.set_cpu_features       = xen_set_cpu_features,
+	.pin_vcpu               = xen_pin_vcpu,
 };
 EXPORT_SYMBOL(x86_hyper_xen);
 
-- 
2.6.2

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox