* [PATCH] mm/zblock: use vmalloc for page allocations @ 2025-05-02 8:01 Vitaly Wool 2025-05-02 8:07 ` Igor Belousov ` (2 more replies) 0 siblings, 3 replies; 22+ messages in thread From: Vitaly Wool @ 2025-05-02 8:01 UTC (permalink / raw) To: linux-mm Cc: akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky, Igor Belousov, Vitaly Wool From: Igor Belousov <igor.b@beldev.am> Use vmalloc for page allocations for zblock blocks to avoid extra pressure on the memmory subsystem with multiple higher order allocations. While at it, introduce a module parameter to opportunistically allocate pages of lower orders via try_page_alloc() for faster allocations whenever possible. Since vmalloc works fine with non-power of 2 numbers of pages, rewrite the block size tables to use that opportunity. Signed-off-by: Igor Belousov <igor.b@beldev.am> Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se> --- Tests run on qemu-arm64 (8 CPUs, 1.5G RAM, 4K pages): 1. zblock 43205.38user 7320.53system 2:12:04elapsed zswpin 346127 zswpout 1642438 2. zsmalloc 47194.61user 7978.48system 2:25:03elapsed zswpin 448031 zswpout 1810485 So zblock gives a nearly 10% advantage. Please note that zsmalloc *crashes* on 16K page tests so I couldn't compare performance in that case. mm/zblock.c | 101 ++++++++++++++++++++++------------ mm/zblock.h | 153 ++++++++++++++++++++++++++++++---------------------- 2 files changed, 156 insertions(+), 98 deletions(-) diff --git a/mm/zblock.c b/mm/zblock.c index e2036a6e1617..38468028e129 100644 --- a/mm/zblock.c +++ b/mm/zblock.c @@ -24,12 +24,17 @@ #include <linux/preempt.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/vmalloc.h> #include <linux/zpool.h> #include "zblock.h" static struct rb_root block_desc_tree = RB_ROOT; static struct dentry *zblock_debugfs_root; +/* allocate order 0 blocks using vmalloc? <-- disabled by default */ +static bool vmalloc_small_blocks; +module_param_named(vmalloc_small_blocks, vmalloc_small_blocks, bool, 0644); + /* Encode handle of a particular slot in the pool using metadata */ static inline unsigned long metadata_to_handle(struct zblock_block *block, unsigned int block_type, unsigned int slot) @@ -56,13 +61,14 @@ static inline struct zblock_block *find_and_claim_block(struct block_list *b, struct list_head *l = &b->active_list; unsigned int slot; - if (!list_empty(l)) { + spin_lock(&b->lock); + if (likely(!list_empty(l))) { struct zblock_block *z = list_first_entry(l, typeof(*z), link); if (--z->free_slots == 0) - list_move(&z->link, &b->full_list); + __list_del_clearprev(&z->link); /* - * There is a slot in the block and we just made sure it would + * There is a slot in the block and we just made sure it will * remain. * Find that slot and set the busy bit. 
*/ @@ -74,31 +80,57 @@ static inline struct zblock_block *find_and_claim_block(struct block_list *b, slot)) { if (!test_and_set_bit(slot, z->slot_info)) break; - barrier(); } + spin_unlock(&b->lock); - WARN_ON(slot >= block_desc[block_type].slots_per_block); *handle = metadata_to_handle(z, block_type, slot); return z; } + spin_unlock(&b->lock); return NULL; } +static inline int zblock_get_order(unsigned int num_pages) +{ + switch (num_pages) { + case 1: + case 2: + return num_pages - 1; + case 4: + return 2; + default: + break; + } + return -1; +} /* * allocate new block and add it to corresponding block list */ static struct zblock_block *alloc_block(struct zblock_pool *pool, int block_type, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, + unsigned int nid) { + struct block_list *block_list = &pool->block_lists[block_type]; + unsigned int num_pages = block_desc[block_type].num_pages; struct zblock_block *block; - struct block_list *block_list; + struct page *page = NULL; - block = (void *)__get_free_pages(gfp, block_desc[block_type].order); - if (!block) - return NULL; + if (!vmalloc_small_blocks && zblock_get_order(num_pages) >= 0) { + page = try_alloc_pages(nid, zblock_get_order(num_pages)); + if (page) { + page->private = PAGE_SMALL_BLOCK; + block = page_address(page); + } + } + if (!page) { + block = __vmalloc_node(PAGE_SIZE * num_pages, PAGE_SIZE, gfp, nid, NULL); + if (!block) + return NULL; - block_list = &pool->block_lists[block_type]; + page = vmalloc_to_page(block); + page->private = 0; + } /* init block data */ block->free_slots = block_desc[block_type].slots_per_block - 1; @@ -122,8 +154,8 @@ static int zblock_blocks_show(struct seq_file *s, void *v) struct block_list *block_list = &pool->block_lists[i]; seq_printf(s, "%d: %ld blocks of %d pages (total %ld pages)\n", - i, block_list->block_count, 1 << block_desc[i].order, - block_list->block_count << block_desc[i].order); + i, block_list->block_count, block_desc[i].num_pages, + block_list->block_count * block_desc[i].num_pages); } return 0; } @@ -142,19 +174,17 @@ DEFINE_SHOW_ATTRIBUTE(zblock_blocks); */ static struct zblock_pool *zblock_create_pool(gfp_t gfp) { - struct zblock_pool *pool; - struct block_list *block_list; + struct zblock_pool *pool = kmalloc(sizeof(struct zblock_pool), gfp); int i; - pool = kmalloc(sizeof(struct zblock_pool), gfp); if (!pool) return NULL; /* init each block list */ for (i = 0; i < ARRAY_SIZE(block_desc); i++) { - block_list = &pool->block_lists[i]; + struct block_list *block_list = &pool->block_lists[i]; + spin_lock_init(&block_list->lock); - INIT_LIST_HEAD(&block_list->full_list); INIT_LIST_HEAD(&block_list->active_list); block_list->block_count = 0; } @@ -187,7 +217,7 @@ static void zblock_destroy_pool(struct zblock_pool *pool) * a new slot. 
*/ static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, unsigned int nid) { int block_type = -1; struct zblock_block *block; @@ -196,7 +226,7 @@ static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, if (!size) return -EINVAL; - if (size > PAGE_SIZE) + if (size > block_desc[ARRAY_SIZE(block_desc) - 1].slot_size) return -ENOSPC; /* find basic block type with suitable slot size */ @@ -220,19 +250,15 @@ static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, } if (WARN_ON(block_type < 0)) return -EINVAL; - if (block_type >= ARRAY_SIZE(block_desc)) - return -ENOSPC; block_list = &pool->block_lists[block_type]; - spin_lock(&block_list->lock); block = find_and_claim_block(block_list, block_type, handle); - spin_unlock(&block_list->lock); if (block) return 0; /* not found block with free slots try to allocate new empty block */ - block = alloc_block(pool, block_type, gfp & ~(__GFP_MOVABLE | __GFP_HIGHMEM), handle); + block = alloc_block(pool, block_type, gfp, handle, nid); return block ? 0 : -ENOMEM; } @@ -251,17 +277,25 @@ static void zblock_free(struct zblock_pool *pool, unsigned long handle) block = handle_to_metadata(handle, &block_type, &slot); block_list = &pool->block_lists[block_type]; + /* clear bit early, this will shorten the search */ + clear_bit(slot, block->slot_info); + spin_lock(&block_list->lock); - /* if all slots in block are empty delete whole block */ + /* if all slots in block are empty delete the whole block */ if (++block->free_slots == block_desc[block_type].slots_per_block) { + struct page *page = vmalloc_to_page(block); + int num_pages = block_desc[block_type].num_pages; + block_list->block_count--; - list_del(&block->link); + __list_del_clearprev(&block->link); spin_unlock(&block_list->lock); - free_pages((unsigned long)block, block_desc[block_type].order); + if (page->private & PAGE_SMALL_BLOCK) + __free_pages(page, zblock_get_order(num_pages)); + else + vfree(block); return; } else if (block->free_slots == 1) - list_move_tail(&block->link, &block_list->active_list); - clear_bit(slot, block->slot_info); + list_add(&block->link, &block_list->active_list); spin_unlock(&block_list->lock); } @@ -329,7 +363,7 @@ static u64 zblock_get_total_pages(struct zblock_pool *pool) total_size = 0; for (i = 0; i < ARRAY_SIZE(block_desc); i++) - total_size += pool->block_lists[i].block_count << block_desc[i].order; + total_size += pool->block_lists[i].block_count * block_desc[i].num_pages; return total_size; } @@ -351,7 +385,7 @@ static void zblock_zpool_destroy(void *pool) static int zblock_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle, const int nid) { - return zblock_alloc(pool, size, gfp, handle); + return zblock_alloc(pool, size, gfp, handle, nid); } static void zblock_zpool_free(void *pool, unsigned long handle) @@ -407,6 +441,7 @@ static int __init create_rbtree(void) { int i; + BUILD_BUG_ON(ARRAY_SIZE(block_desc) > MAX_TABLE_SIZE); for (i = 0; i < ARRAY_SIZE(block_desc); i++) { struct block_desc_node *block_node = kmalloc(sizeof(*block_node), GFP_KERNEL); @@ -425,7 +460,7 @@ static int __init create_rbtree(void) block_node->this_slot_size = block_desc[i].slot_size; block_node->block_idx = i; if (i == ARRAY_SIZE(block_desc) - 1) - block_node->next_slot_size = PAGE_SIZE; + block_node->next_slot_size = PAGE_SIZE * 2; else block_node->next_slot_size = block_desc[i+1].slot_size; while (*new) { diff --git a/mm/zblock.h b/mm/zblock.h index 
9af11f392f97..d433237d6ad4 100644 --- a/mm/zblock.h +++ b/mm/zblock.h @@ -10,13 +10,11 @@ #include <linux/rbtree.h> #include <linux/types.h> -#define SLOT_FREE 0 -#define BIT_SLOT_OCCUPIED 0 -#define BIT_SLOT_MAPPED 1 +#define PAGE_SMALL_BLOCK 1 #if PAGE_SIZE == 0x1000 -/* max 128 slots per block, max table size 32 */ -#define SLOT_BITS 7 +/* max 64 slots per block, max table size 64 */ +#define SLOT_BITS 6 #elif PAGE_SIZE == 0x4000 /* max 256 slots per block, max table size 64 */ #define SLOT_BITS 8 @@ -25,24 +23,26 @@ #define SLOT_BITS 8 #endif +#define MAX_TABLE_SIZE (1 << (PAGE_SHIFT - SLOT_BITS)) + #define MAX_SLOTS (1 << SLOT_BITS) #define SLOT_MASK ((0x1UL << SLOT_BITS) - 1) #define ZBLOCK_HEADER_SIZE round_up(sizeof(struct zblock_block), sizeof(long)) -#define BLOCK_DATA_SIZE(order) ((PAGE_SIZE << order) - ZBLOCK_HEADER_SIZE) -#define SLOT_SIZE(nslots, order) (round_down((BLOCK_DATA_SIZE(order) / nslots), sizeof(long))) +#define BLOCK_DATA_SIZE(num) ((PAGE_SIZE * (num)) - ZBLOCK_HEADER_SIZE) +#define SLOT_SIZE(nslots, num) (round_down((BLOCK_DATA_SIZE(num) / nslots), sizeof(long))) /** * struct zblock_block - block metadata - * Block consists of several (1/2/4/8) pages and contains fixed + * Block consists of several pages and contains fixed * integer number of slots for allocating compressed pages. * * free_slots: number of free slots in the block * slot_info: contains data about free/occupied slots */ struct zblock_block { - struct list_head link; DECLARE_BITMAP(slot_info, 1 << SLOT_BITS); + struct list_head link; u32 free_slots; }; @@ -54,12 +54,12 @@ struct zblock_block { * * slot_size: size of slot for this list * slots_per_block: number of slots per block for this list - * order: order for __get_free_pages + * num_pages: number of pages per block */ struct block_desc { unsigned int slot_size; unsigned short slots_per_block; - unsigned short order; + unsigned short num_pages; }; struct block_desc_node { @@ -71,78 +71,103 @@ struct block_desc_node { static const struct block_desc block_desc[] = { #if PAGE_SIZE == 0x1000 - { SLOT_SIZE(63, 0), 63, 0 }, - { SLOT_SIZE(32, 0), 32, 0 }, - { SLOT_SIZE(21, 0), 21, 0 }, - { SLOT_SIZE(15, 0), 15, 0 }, - { SLOT_SIZE(12, 0), 12, 0 }, - { SLOT_SIZE(10, 0), 10, 0 }, - { SLOT_SIZE(9, 0), 9, 0 }, - { SLOT_SIZE(8, 0), 8, 0 }, - { SLOT_SIZE(29, 2), 29, 2 }, - { SLOT_SIZE(13, 1), 13, 1 }, - { SLOT_SIZE(6, 0), 6, 0 }, - { SLOT_SIZE(11, 1), 11, 1 }, - { SLOT_SIZE(5, 0), 5, 0 }, - { SLOT_SIZE(9, 1), 9, 1 }, - { SLOT_SIZE(8, 1), 8, 1 }, - { SLOT_SIZE(29, 3), 29, 3 }, + { SLOT_SIZE(28, 1), 28, 1 }, + { SLOT_SIZE(18, 1), 18, 1 }, + { SLOT_SIZE(12, 1), 12, 1 }, + { SLOT_SIZE(10, 1), 10, 1 }, + { SLOT_SIZE(17, 2), 17, 2 }, + { SLOT_SIZE(15, 2), 15, 2 }, { SLOT_SIZE(13, 2), 13, 2 }, - { SLOT_SIZE(12, 2), 12, 2 }, + { SLOT_SIZE(6, 1), 6, 1 }, { SLOT_SIZE(11, 2), 11, 2 }, - { SLOT_SIZE(10, 2), 10, 2 }, + { SLOT_SIZE(5, 1), 5, 1 }, + { SLOT_SIZE(19, 4), 19, 4 }, { SLOT_SIZE(9, 2), 9, 2 }, - { SLOT_SIZE(17, 3), 17, 3 }, - { SLOT_SIZE(8, 2), 8, 2 }, - { SLOT_SIZE(15, 3), 15, 3 }, - { SLOT_SIZE(14, 3), 14, 3 }, - { SLOT_SIZE(13, 3), 13, 3 }, - { SLOT_SIZE(6, 2), 6, 2 }, + { SLOT_SIZE(17, 4), 17, 4 }, + { SLOT_SIZE(4, 1), 4, 1 }, + { SLOT_SIZE(23, 6), 23, 6 }, { SLOT_SIZE(11, 3), 11, 3 }, + { SLOT_SIZE(7, 2), 7, 2 }, { SLOT_SIZE(10, 3), 10, 3 }, - { SLOT_SIZE(9, 3), 9, 3 }, - { SLOT_SIZE(4, 2), 4, 2 }, + { SLOT_SIZE(19, 6), 19, 6 }, + { SLOT_SIZE(6, 2), 6, 2 }, + { SLOT_SIZE(14, 5), 14, 5 }, + { SLOT_SIZE(8, 3), 8, 3 }, + { SLOT_SIZE(5, 2), 5, 2 }, + { 
SLOT_SIZE(12, 5), 12, 5 }, + { SLOT_SIZE(9, 4), 9, 4 }, + { SLOT_SIZE(15, 7), 15, 7 }, + { SLOT_SIZE(2, 1), 2, 1 }, + { SLOT_SIZE(15, 8), 15, 8 }, + { SLOT_SIZE(9, 5), 9, 5 }, + { SLOT_SIZE(12, 7), 12, 7 }, + { SLOT_SIZE(13, 8), 13, 8 }, + { SLOT_SIZE(6, 4), 6, 4 }, + { SLOT_SIZE(11, 8), 11, 8 }, + { SLOT_SIZE(9, 7), 9, 7 }, + { SLOT_SIZE(6, 5), 6, 5 }, + { SLOT_SIZE(9, 8), 9, 8 }, + { SLOT_SIZE(4, 4), 4, 4 }, #else - { SLOT_SIZE(255, 0), 255, 0 }, - { SLOT_SIZE(185, 0), 185, 0 }, - { SLOT_SIZE(145, 0), 145, 0 }, - { SLOT_SIZE(113, 0), 113, 0 }, - { SLOT_SIZE(92, 0), 92, 0 }, - { SLOT_SIZE(75, 0), 75, 0 }, - { SLOT_SIZE(60, 0), 60, 0 }, - { SLOT_SIZE(51, 0), 51, 0 }, - { SLOT_SIZE(43, 0), 43, 0 }, - { SLOT_SIZE(37, 0), 37, 0 }, - { SLOT_SIZE(32, 0), 32, 0 }, - { SLOT_SIZE(27, 0), 27, 0 }, - { SLOT_SIZE(23, 0), 23, 0 }, - { SLOT_SIZE(19, 0), 19, 0 }, - { SLOT_SIZE(17, 0), 17, 0 }, - { SLOT_SIZE(15, 0), 15, 0 }, - { SLOT_SIZE(13, 0), 13, 0 }, - { SLOT_SIZE(11, 0), 11, 0 }, - { SLOT_SIZE(10, 0), 10, 0 }, - { SLOT_SIZE(9, 0), 9, 0 }, - { SLOT_SIZE(8, 0), 8, 0 }, - { SLOT_SIZE(15, 1), 15, 1 }, - { SLOT_SIZE(14, 1), 14, 1 }, - { SLOT_SIZE(13, 1), 13, 1 }, + { SLOT_SIZE(185, 1), 185, 1 }, + { SLOT_SIZE(113, 1), 113, 1 }, + { SLOT_SIZE(86, 1), 86, 1 }, + { SLOT_SIZE(72, 1), 72, 1 }, + { SLOT_SIZE(58, 1), 58, 1 }, + { SLOT_SIZE(49, 1), 49, 1 }, + { SLOT_SIZE(42, 1), 42, 1 }, + { SLOT_SIZE(37, 1), 37, 1 }, + { SLOT_SIZE(33, 1), 33, 1 }, + { SLOT_SIZE(59, 2), 59, 2 }, + { SLOT_SIZE(27, 1), 27, 1 }, + { SLOT_SIZE(25, 1), 25, 1 }, + { SLOT_SIZE(23, 1), 23, 1 }, + { SLOT_SIZE(21, 1), 21, 1 }, + { SLOT_SIZE(39, 2), 39, 2 }, + { SLOT_SIZE(37, 2), 37, 2 }, + { SLOT_SIZE(35, 2), 35, 2 }, + { SLOT_SIZE(33, 2), 33, 2 }, + { SLOT_SIZE(31, 2), 31, 2 }, + { SLOT_SIZE(29, 2), 29, 2 }, + { SLOT_SIZE(27, 2), 27, 2 }, + { SLOT_SIZE(25, 2), 25, 2 }, { SLOT_SIZE(12, 1), 12, 1 }, { SLOT_SIZE(11, 1), 11, 1 }, + { SLOT_SIZE(21, 2), 21, 2 }, { SLOT_SIZE(10, 1), 10, 1 }, + { SLOT_SIZE(19, 2), 19, 2 }, { SLOT_SIZE(9, 1), 9, 1 }, + { SLOT_SIZE(17, 2), 17, 2 }, { SLOT_SIZE(8, 1), 8, 1 }, { SLOT_SIZE(15, 2), 15, 2 }, { SLOT_SIZE(14, 2), 14, 2 }, { SLOT_SIZE(13, 2), 13, 2 }, { SLOT_SIZE(12, 2), 12, 2 }, + { SLOT_SIZE(23, 4), 23, 4 }, { SLOT_SIZE(11, 2), 11, 2 }, + { SLOT_SIZE(21, 4), 21, 4 }, { SLOT_SIZE(10, 2), 10, 2 }, + { SLOT_SIZE(19, 4), 19, 4 }, { SLOT_SIZE(9, 2), 9, 2 }, + { SLOT_SIZE(17, 4), 17, 4 }, { SLOT_SIZE(8, 2), 8, 2 }, - { SLOT_SIZE(7, 2), 7, 2 }, - { SLOT_SIZE(6, 2), 6, 2 }, + { SLOT_SIZE(15, 4), 15, 4 }, + { SLOT_SIZE(14, 4), 14, 4 }, + { SLOT_SIZE(10, 3), 10, 3 }, + { SLOT_SIZE(16, 5), 16, 5 }, + { SLOT_SIZE(12, 4), 12, 4 }, + { SLOT_SIZE(11, 4), 11, 4 }, + { SLOT_SIZE(8, 3), 8, 3 }, { SLOT_SIZE(5, 2), 5, 2 }, + { SLOT_SIZE(7, 3), 7, 3 }, + { SLOT_SIZE(11, 5), 11, 5 }, + { SLOT_SIZE(4, 2), 4, 2 }, + { SLOT_SIZE(9, 5), 9, 5 }, + { SLOT_SIZE(8, 5), 8, 5 }, + { SLOT_SIZE(3, 2), 3, 2 }, + { SLOT_SIZE(4, 3), 4, 3 }, + { SLOT_SIZE(7, 6), 7, 6 }, + { SLOT_SIZE(4, 4), 4, 4 }, #endif /* PAGE_SIZE */ }; @@ -150,13 +175,11 @@ static const struct block_desc block_desc[] = { * struct block_list - stores metadata of particular list * lock: protects the list of blocks * active_list: linked list of active (non-full) blocks - * full_list: linked list of full blocks * block_count: total number of blocks in the list */ struct block_list { spinlock_t lock; struct list_head active_list; - struct list_head full_list; unsigned long block_count; }; -- 2.39.2 ^ permalink raw reply related [flat|nested] 22+ messages in thread
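For reference, here is how the new table entries translate into byte sizes on a 4K-page, 64-bit build. The 32-byte header is an assumption derived from the struct zblock_block layout in this patch (8-byte slot_info bitmap for SLOT_BITS = 6, 16-byte list_head, 4-byte free_slots, padded to 32):

    ZBLOCK_HEADER_SIZE = round_up(sizeof(struct zblock_block), 8)  = 32
    BLOCK_DATA_SIZE(1) = 1 * 4096 - 32                             = 4064
    SLOT_SIZE(28, 1)   = round_down(4064 / 28, 8)
                       = round_down(145, 8)                        = 144
    SLOT_SIZE(4, 4)    = round_down((4 * 4096 - 32) / 4, 8)        = 4088

So the first class packs 28 objects of up to 144 bytes into a single page, and the largest class stores objects of up to 4088 bytes, which matches the new size check in zblock_alloc() (anything above block_desc[last].slot_size returns -ENOSPC).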
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-02 8:01 [PATCH] mm/zblock: use vmalloc for page allocations Vitaly Wool @ 2025-05-02 8:07 ` Igor Belousov 2025-05-03 18:46 ` Vitaly Wool 2025-05-05 14:29 ` Johannes Weiner 2025-05-06 13:13 ` Yosry Ahmed 2 siblings, 1 reply; 22+ messages in thread From: Igor Belousov @ 2025-05-02 8:07 UTC (permalink / raw) To: Vitaly Wool Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky On 2025-05-02 12:01, Vitaly Wool wrote: > From: Igor Belousov <igor.b@beldev.am> > > Use vmalloc for page allocations for zblock blocks to avoid extra > pressure on the memmory subsystem with multiple higher order > allocations. > > While at it, introduce a module parameter to opportunistically > allocate pages of lower orders via try_page_alloc() for faster > allocations whenever possible. > > Since vmalloc works fine with non-power of 2 numbers of pages, > rewrite the block size tables to use that opportunity. > > Signed-off-by: Igor Belousov <igor.b@beldev.am> > Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se> > --- > > Tests run on qemu-arm64 (8 CPUs, 1.5G RAM, 4K pages): > 1. zblock > 43205.38user > 7320.53system > 2:12:04elapsed > zswpin 346127 > zswpout 1642438 > > 2. zsmalloc > 47194.61user > 7978.48system > 2:25:03elapsed > zswpin 448031 > zswpout 1810485 > > So zblock gives a nearly 10% advantage. > > Please note that zsmalloc *crashes* on 16K page tests so I couldn't > compare performance in that case. Right, and it looks like this: [ 762.499278] bug_handler+0x0/0xa8 [ 762.499433] die_kernel_fault+0x1c4/0x36c [ 762.499616] fault_from_pkey+0x0/0x98 [ 762.499784] do_translation_fault+0x3c/0x94 [ 762.499969] do_mem_abort+0x44/0x94 [ 762.500140] el1_abort+0x40/0x64 [ 762.500306] el1h_64_sync_handler+0xa4/0x120 [ 762.500502] el1h_64_sync+0x6c/0x70 [ 762.500718] __pi_memcpy_generic+0x1e4/0x22c (P) [ 762.500931] zs_zpool_obj_write+0x10/0x1c [ 762.501117] zpool_obj_write+0x18/0x24 [ 762.501305] zswap_store+0x490/0x7c4 [ 762.501474] swap_writepage+0x260/0x448 [ 762.501654] pageout+0x148/0x340 [ 762.501816] shrink_folio_list+0xa7c/0xf34 [ 762.502008] shrink_lruvec+0x6fc/0xbd0 [ 762.502189] shrink_node+0x52c/0x960 [ 762.502359] balance_pgdat+0x344/0x738 [ 762.502537] kswapd+0x210/0x37c [ 762.502691] kthread+0x12c/0x204 [ 762.502920] ret_from_fork+0x10/0x20 Thanks, Igor ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-02 8:07 ` Igor Belousov @ 2025-05-03 18:46 ` Vitaly Wool 2025-05-04 5:02 ` Sergey Senozhatsky 2025-05-05 14:08 ` Johannes Weiner 0 siblings, 2 replies; 22+ messages in thread From: Vitaly Wool @ 2025-05-03 18:46 UTC (permalink / raw) To: Igor Belousov Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky > On May 2, 2025, at 10:07 AM, Igor Belousov <igor.b@beldev.am> wrote: > > On 2025-05-02 12:01, Vitaly Wool wrote: >> From: Igor Belousov <igor.b@beldev.am> >> Use vmalloc for page allocations for zblock blocks to avoid extra >> pressure on the memmory subsystem with multiple higher order >> allocations. >> While at it, introduce a module parameter to opportunistically >> allocate pages of lower orders via try_page_alloc() for faster >> allocations whenever possible. >> Since vmalloc works fine with non-power of 2 numbers of pages, >> rewrite the block size tables to use that opportunity. >> Signed-off-by: Igor Belousov <igor.b@beldev.am> >> Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se> >> --- >> Tests run on qemu-arm64 (8 CPUs, 1.5G RAM, 4K pages): >> 1. zblock >> 43205.38user >> 7320.53system >> 2:12:04elapsed >> zswpin 346127 >> zswpout 1642438 >> 2. zsmalloc >> 47194.61user >> 7978.48system >> 2:25:03elapsed >> zswpin 448031 >> zswpout 1810485 >> So zblock gives a nearly 10% advantage. >> Please note that zsmalloc *crashes* on 16K page tests so I couldn't >> compare performance in that case. > > Right, and it looks like this: > > [ 762.499278] bug_handler+0x0/0xa8 > [ 762.499433] die_kernel_fault+0x1c4/0x36c > [ 762.499616] fault_from_pkey+0x0/0x98 > [ 762.499784] do_translation_fault+0x3c/0x94 > [ 762.499969] do_mem_abort+0x44/0x94 > [ 762.500140] el1_abort+0x40/0x64 > [ 762.500306] el1h_64_sync_handler+0xa4/0x120 > [ 762.500502] el1h_64_sync+0x6c/0x70 > [ 762.500718] __pi_memcpy_generic+0x1e4/0x22c (P) > [ 762.500931] zs_zpool_obj_write+0x10/0x1c > [ 762.501117] zpool_obj_write+0x18/0x24 > [ 762.501305] zswap_store+0x490/0x7c4 > [ 762.501474] swap_writepage+0x260/0x448 > [ 762.501654] pageout+0x148/0x340 > [ 762.501816] shrink_folio_list+0xa7c/0xf34 > [ 762.502008] shrink_lruvec+0x6fc/0xbd0 > [ 762.502189] shrink_node+0x52c/0x960 > [ 762.502359] balance_pgdat+0x344/0x738 > [ 762.502537] kswapd+0x210/0x37c > [ 762.502691] kthread+0x12c/0x204 > [ 762.502920] ret_from_fork+0x10/0x20 In fact we don’t know if zsmalloc is actually supposed to work with 16K pages. That’s the question to Sergey and Minchan. If it is indeed supposed to handle 16K pages, I would suggest that you submitted a full report with reproduction steps and/or provided a fix if possible. ~Vitaly ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-03 18:46 ` Vitaly Wool @ 2025-05-04 5:02 ` Sergey Senozhatsky 2025-05-04 6:14 ` Sergey Senozhatsky 2025-05-05 14:08 ` Johannes Weiner 1 sibling, 1 reply; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-04 5:02 UTC (permalink / raw) To: Vitaly Wool Cc: Igor Belousov, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky On (25/05/03 20:46), Vitaly Wool wrote: > > Right, and it looks like this: > > > > [ 762.499278] bug_handler+0x0/0xa8 > > [ 762.499433] die_kernel_fault+0x1c4/0x36c > > [ 762.499616] fault_from_pkey+0x0/0x98 > > [ 762.499784] do_translation_fault+0x3c/0x94 > > [ 762.499969] do_mem_abort+0x44/0x94 > > [ 762.500140] el1_abort+0x40/0x64 > > [ 762.500306] el1h_64_sync_handler+0xa4/0x120 > > [ 762.500502] el1h_64_sync+0x6c/0x70 > > [ 762.500718] __pi_memcpy_generic+0x1e4/0x22c (P) > > [ 762.500931] zs_zpool_obj_write+0x10/0x1c > > [ 762.501117] zpool_obj_write+0x18/0x24 > > [ 762.501305] zswap_store+0x490/0x7c4 > > [ 762.501474] swap_writepage+0x260/0x448 > > [ 762.501654] pageout+0x148/0x340 > > [ 762.501816] shrink_folio_list+0xa7c/0xf34 > > [ 762.502008] shrink_lruvec+0x6fc/0xbd0 > > [ 762.502189] shrink_node+0x52c/0x960 > > [ 762.502359] balance_pgdat+0x344/0x738 > > [ 762.502537] kswapd+0x210/0x37c > > [ 762.502691] kthread+0x12c/0x204 > > [ 762.502920] ret_from_fork+0x10/0x20 > > In fact we don’t know if zsmalloc is actually supposed to work with > 16K pages. Hmm I think it is supposed to work, can't think of a reason why it shouldn't. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-04 5:02 ` Sergey Senozhatsky @ 2025-05-04 6:14 ` Sergey Senozhatsky 0 siblings, 0 replies; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-04 6:14 UTC (permalink / raw) To: Vitaly Wool, Igor Belousov Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky On (25/05/04 14:02), Sergey Senozhatsky wrote: > On (25/05/03 20:46), Vitaly Wool wrote: > > > Right, and it looks like this: > > > > > > [ 762.499278] bug_handler+0x0/0xa8 > > > [ 762.499433] die_kernel_fault+0x1c4/0x36c > > > [ 762.499616] fault_from_pkey+0x0/0x98 > > > [ 762.499784] do_translation_fault+0x3c/0x94 > > > [ 762.499969] do_mem_abort+0x44/0x94 > > > [ 762.500140] el1_abort+0x40/0x64 > > > [ 762.500306] el1h_64_sync_handler+0xa4/0x120 > > > [ 762.500502] el1h_64_sync+0x6c/0x70 > > > [ 762.500718] __pi_memcpy_generic+0x1e4/0x22c (P) > > > [ 762.500931] zs_zpool_obj_write+0x10/0x1c > > > [ 762.501117] zpool_obj_write+0x18/0x24 > > > [ 762.501305] zswap_store+0x490/0x7c4 > > > [ 762.501474] swap_writepage+0x260/0x448 > > > [ 762.501654] pageout+0x148/0x340 > > > [ 762.501816] shrink_folio_list+0xa7c/0xf34 > > > [ 762.502008] shrink_lruvec+0x6fc/0xbd0 > > > [ 762.502189] shrink_node+0x52c/0x960 > > > [ 762.502359] balance_pgdat+0x344/0x738 > > > [ 762.502537] kswapd+0x210/0x37c > > > [ 762.502691] kthread+0x12c/0x204 > > > [ 762.502920] ret_from_fork+0x10/0x20 > > > > In fact we don’t know if zsmalloc is actually supposed to work with > > 16K pages. > > Hmm I think it is supposed to work, can't think of a reason why it > shouldn't. I'm able to repro, I think. Will try to take a look later today/tonight. Thank you for the report. // Feel free to send a patch if you have a fix already. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-03 18:46 ` Vitaly Wool 2025-05-04 5:02 ` Sergey Senozhatsky @ 2025-05-05 14:08 ` Johannes Weiner 2025-05-06 2:13 ` Sergey Senozhatsky 1 sibling, 1 reply; 22+ messages in thread From: Johannes Weiner @ 2025-05-05 14:08 UTC (permalink / raw) To: Vitaly Wool Cc: Igor Belousov, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky On Sat, May 03, 2025 at 08:46:07PM +0200, Vitaly Wool wrote: > > > > On May 2, 2025, at 10:07 AM, Igor Belousov <igor.b@beldev.am> wrote: > > > > On 2025-05-02 12:01, Vitaly Wool wrote: > >> From: Igor Belousov <igor.b@beldev.am> > >> Use vmalloc for page allocations for zblock blocks to avoid extra > >> pressure on the memmory subsystem with multiple higher order > >> allocations. > >> While at it, introduce a module parameter to opportunistically > >> allocate pages of lower orders via try_page_alloc() for faster > >> allocations whenever possible. > >> Since vmalloc works fine with non-power of 2 numbers of pages, > >> rewrite the block size tables to use that opportunity. > >> Signed-off-by: Igor Belousov <igor.b@beldev.am> > >> Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se> > >> --- > >> Tests run on qemu-arm64 (8 CPUs, 1.5G RAM, 4K pages): > >> 1. zblock > >> 43205.38user > >> 7320.53system > >> 2:12:04elapsed > >> zswpin 346127 > >> zswpout 1642438 > >> 2. zsmalloc > >> 47194.61user > >> 7978.48system > >> 2:25:03elapsed > >> zswpin 448031 > >> zswpout 1810485 > >> So zblock gives a nearly 10% advantage. > >> Please note that zsmalloc *crashes* on 16K page tests so I couldn't > >> compare performance in that case. > > > > Right, and it looks like this: > > > > [ 762.499278] bug_handler+0x0/0xa8 > > [ 762.499433] die_kernel_fault+0x1c4/0x36c > > [ 762.499616] fault_from_pkey+0x0/0x98 > > [ 762.499784] do_translation_fault+0x3c/0x94 > > [ 762.499969] do_mem_abort+0x44/0x94 > > [ 762.500140] el1_abort+0x40/0x64 > > [ 762.500306] el1h_64_sync_handler+0xa4/0x120 > > [ 762.500502] el1h_64_sync+0x6c/0x70 > > [ 762.500718] __pi_memcpy_generic+0x1e4/0x22c (P) > > [ 762.500931] zs_zpool_obj_write+0x10/0x1c > > [ 762.501117] zpool_obj_write+0x18/0x24 > > [ 762.501305] zswap_store+0x490/0x7c4 > > [ 762.501474] swap_writepage+0x260/0x448 > > [ 762.501654] pageout+0x148/0x340 > > [ 762.501816] shrink_folio_list+0xa7c/0xf34 > > [ 762.502008] shrink_lruvec+0x6fc/0xbd0 > > [ 762.502189] shrink_node+0x52c/0x960 > > [ 762.502359] balance_pgdat+0x344/0x738 > > [ 762.502537] kswapd+0x210/0x37c > > [ 762.502691] kthread+0x12c/0x204 > > [ 762.502920] ret_from_fork+0x10/0x20 > > In fact we don’t know if zsmalloc is actually supposed to work with > 16K pages. That’s the question to Sergey and Minchan. If it is > indeed supposed to handle 16K pages, I would suggest that you > submitted a full report with reproduction steps and/or provided a > fix if possible. I've been using zsmalloc with 16k pages just fine for ~a year, currently running it on 6.14.2-asahi. This machine sees a lot of memory pressure, too. Could this be a more recent regression, maybe in the new obj_write()? ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-05 14:08 ` Johannes Weiner @ 2025-05-06 2:13 ` Sergey Senozhatsky 0 siblings, 0 replies; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-06 2:13 UTC (permalink / raw) To: Johannes Weiner Cc: Vitaly Wool, Igor Belousov, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky On (25/05/05 10:08), Johannes Weiner wrote: > I've been using zsmalloc with 16k pages just fine for ~a year, > currently running it on 6.14.2-asahi. This machine sees a lot of > memory pressure, too. > > Could this be a more recent regression, maybe in the new obj_write()? This looks like a recent regression. In the old code we had something like __zs_map_object(area, zpdescs, off, class->size) which would use class->size for all memcpy() calculations: sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); So we sometimes would memcpy() more than the actual payload (the object size can be smaller than class->size), which would work because the compressed buffer is large enough. In the new code we use the object size, but only for write(). read_begin()/end() still use class->size, so I think in some cases we can "unnecessarily" go into "object spans two pages, memcpy() from both pages into a local copy" even if the actual object fits on one page. We may also want to pass the object size (which we know) to read_begin()/end(), which can potentially save some memcpy() calls. ^ permalink raw reply [flat|nested] 22+ messages in thread
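In other words, the fix Sergey outlines is to derive the copy sizes from the object's actual length rather than class->size. A minimal sketch of that idea (not the actual zsmalloc code; the function and parameter names here are illustrative):

#include <linux/highmem.h>

/*
 * Sketch only: pick the copy path from the object's real length, so an
 * object that ends before the page boundary never takes the two-page
 * memcpy path even if class->size would cross it.
 */
static void obj_read_sketch(char *buf, struct page *pages[2],
			    unsigned int off, size_t obj_size)
{
	if (off + obj_size <= PAGE_SIZE) {
		/* object fits entirely in the first page: single copy */
		memcpy_from_page(buf, pages[0], off, obj_size);
	} else {
		size_t first = PAGE_SIZE - off;

		/* object genuinely spans the page boundary */
		memcpy_from_page(buf, pages[0], off, first);
		memcpy_from_page(buf + first, pages[1], 0, obj_size - first);
	}
}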
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-02 8:01 [PATCH] mm/zblock: use vmalloc for page allocations Vitaly Wool 2025-05-02 8:07 ` Igor Belousov @ 2025-05-05 14:29 ` Johannes Weiner 2025-05-06 9:42 ` Uladzislau Rezki 2025-05-06 13:13 ` Yosry Ahmed 2 siblings, 1 reply; 22+ messages in thread From: Johannes Weiner @ 2025-05-05 14:29 UTC (permalink / raw) To: Vitaly Wool Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky, Igor Belousov On Fri, May 02, 2025 at 10:01:56AM +0200, Vitaly Wool wrote: > static struct zblock_block *alloc_block(struct zblock_pool *pool, > int block_type, gfp_t gfp, > - unsigned long *handle) > + unsigned long *handle, > + unsigned int nid) > { > + struct block_list *block_list = &pool->block_lists[block_type]; > + unsigned int num_pages = block_desc[block_type].num_pages; > struct zblock_block *block; > - struct block_list *block_list; > + struct page *page = NULL; > > - block = (void *)__get_free_pages(gfp, block_desc[block_type].order); > - if (!block) > - return NULL; > + if (!vmalloc_small_blocks && zblock_get_order(num_pages) >= 0) { > + page = try_alloc_pages(nid, zblock_get_order(num_pages)); This is broken in several ways. The function is meant for NMI contexts - the "try" refers to trylocking the freelists, in case whatever got interrupted was inside the allocator already. This will fall back to vmalloc unpredictably. It also doesn't take a gfp parameter, which ignores the zswap ones, and substitutes a set that doesn't make any sense in this context: __GFP_NOMEMALLOC is counter productive inside reclaim; __GFP_ACCOUNT wreaks complete havoc on how compressed memory is charged to cgroups (double charging the wrong groups for shared blocks). ^ permalink raw reply [flat|nested] 22+ messages in thread
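For illustration, one shape of an allocation path that keeps the caller-supplied gfp flags and only falls back to vmalloc when the buddy allocator fails. This is a rough sketch, not a tested replacement for the hunk above: it reuses zblock_get_order() and vmalloc_small_blocks from the patch and omits the PAGE_SMALL_BLOCK bookkeeping needed on the free path.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: honour the zswap-provided gfp flags instead of try_alloc_pages() */
static void *zblock_alloc_backing(unsigned int num_pages, gfp_t gfp, int nid)
{
	int order = zblock_get_order(num_pages);

	if (!vmalloc_small_blocks && order >= 0) {
		/* opportunistic: no retries/warnings, vmalloc is the fallback */
		struct page *page = alloc_pages_node(nid,
				gfp | __GFP_NORETRY | __GFP_NOWARN, order);

		if (page)
			return page_address(page);
	}
	return __vmalloc_node(PAGE_SIZE * num_pages, PAGE_SIZE, gfp, nid, NULL);
}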
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-05 14:29 ` Johannes Weiner @ 2025-05-06 9:42 ` Uladzislau Rezki 0 siblings, 0 replies; 22+ messages in thread From: Uladzislau Rezki @ 2025-05-06 9:42 UTC (permalink / raw) To: Johannes Weiner, Vitaly Wool Cc: Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Yosry Ahmed, Minchan Kim, Sergey Senozhatsky, Igor Belousov On Mon, May 05, 2025 at 10:29:49AM -0400, Johannes Weiner wrote: > On Fri, May 02, 2025 at 10:01:56AM +0200, Vitaly Wool wrote: > > static struct zblock_block *alloc_block(struct zblock_pool *pool, > > int block_type, gfp_t gfp, > > - unsigned long *handle) > > + unsigned long *handle, > > + unsigned int nid) > > { > > + struct block_list *block_list = &pool->block_lists[block_type]; > > + unsigned int num_pages = block_desc[block_type].num_pages; > > struct zblock_block *block; > > - struct block_list *block_list; > > + struct page *page = NULL; > > > > - block = (void *)__get_free_pages(gfp, block_desc[block_type].order); > > - if (!block) > > - return NULL; > > + if (!vmalloc_small_blocks && zblock_get_order(num_pages) >= 0) { > > + page = try_alloc_pages(nid, zblock_get_order(num_pages)); > > This is broken in several ways. > > The function is meant for NMI contexts - the "try" refers to > trylocking the freelists, in case whatever got interrupted was inside > the allocator already. This will fall back to vmalloc unpredictably. > > It also doesn't take a gfp parameter, which ignores the zswap ones, > and substitutes a set that doesn't make any sense in this context: > __GFP_NOMEMALLOC is counter productive inside reclaim; __GFP_ACCOUNT > wreaks complete havoc on how compressed memory is charged to cgroups > (double charging the wrong groups for shared blocks). > + "&& zblock_get_order(num_pages) >= 0" is always true? A fallback makes sense to use when order > 0, IMO. Or just stick fully to vmalloc. Another option is kvmalloc()/kvfree(). -- Uladzislau Rezki ^ permalink raw reply [flat|nested] 22+ messages in thread
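A minimal sketch of the kvmalloc()/kvfree() option mentioned above (illustrative only; the function names are made up, and zblock would still need something like is_vmalloc_addr() — or the page->private flag from the patch — to find the struct page for the kmalloc-backed case):

#include <linux/slab.h>

/* Sketch: kmalloc first, transparent vmalloc fallback for larger blocks */
static void *zblock_kvalloc_block(unsigned int num_pages, gfp_t gfp, int nid)
{
	return kvmalloc_node(PAGE_SIZE * num_pages, gfp, nid);
}

static void zblock_kvfree_block(void *block)
{
	kvfree(block);	/* handles both kmalloc and vmalloc backing */
}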
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-02 8:01 [PATCH] mm/zblock: use vmalloc for page allocations Vitaly Wool 2025-05-02 8:07 ` Igor Belousov 2025-05-05 14:29 ` Johannes Weiner @ 2025-05-06 13:13 ` Yosry Ahmed 2025-05-06 13:27 ` Herbert Xu 2025-05-07 5:57 ` Sergey Senozhatsky 2 siblings, 2 replies; 22+ messages in thread From: Yosry Ahmed @ 2025-05-06 13:13 UTC (permalink / raw) To: Vitaly Wool Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Sergey Senozhatsky, Igor Belousov, Herbert Xu On Fri, May 02, 2025 at 10:01:56AM +0200, Vitaly Wool wrote: > From: Igor Belousov <igor.b@beldev.am> > > Use vmalloc for page allocations for zblock blocks to avoid extra > pressure on the memmory subsystem with multiple higher order > allocations. > > While at it, introduce a module parameter to opportunistically > allocate pages of lower orders via try_page_alloc() for faster > allocations whenever possible. > > Since vmalloc works fine with non-power of 2 numbers of pages, > rewrite the block size tables to use that opportunity. > > Signed-off-by: Igor Belousov <igor.b@beldev.am> > Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se> Keep in mind that zswap_decompress() will always do an extra copy if the address returned by zpool_obj_read_begin() is a vmalloc address. To avoid this we need to enlighten the scatterlist API to work with vmalloc addresses. (CC'ing Herbert as he was looking into this) If we can use vmalloc for zblock, then we can probably also use vmalloc in zsmalloc and get rid of the chaining logic completely. This would make zsmalloc simpler and closer to zblock in that regard. Sergey, WDYT? > --- > > Tests run on qemu-arm64 (8 CPUs, 1.5G RAM, 4K pages): > 1. zblock > 43205.38user > 7320.53system > 2:12:04elapsed > zswpin 346127 > zswpout 1642438 > > 2. zsmalloc > 47194.61user > 7978.48system > 2:25:03elapsed > zswpin 448031 > zswpout 1810485 > > So zblock gives a nearly 10% advantage. > > Please note that zsmalloc *crashes* on 16K page tests so I couldn't > compare performance in that case. > > mm/zblock.c | 101 ++++++++++++++++++++++------------ > mm/zblock.h | 153 ++++++++++++++++++++++++++++++---------------------- > 2 files changed, 156 insertions(+), 98 deletions(-) > > diff --git a/mm/zblock.c b/mm/zblock.c > index e2036a6e1617..38468028e129 100644 > --- a/mm/zblock.c > +++ b/mm/zblock.c > @@ -24,12 +24,17 @@ > #include <linux/preempt.h> > #include <linux/slab.h> > #include <linux/spinlock.h> > +#include <linux/vmalloc.h> > #include <linux/zpool.h> > #include "zblock.h" > > static struct rb_root block_desc_tree = RB_ROOT; > static struct dentry *zblock_debugfs_root; > > +/* allocate order 0 blocks using vmalloc? 
<-- disabled by default */ > +static bool vmalloc_small_blocks; > +module_param_named(vmalloc_small_blocks, vmalloc_small_blocks, bool, 0644); > + > /* Encode handle of a particular slot in the pool using metadata */ > static inline unsigned long metadata_to_handle(struct zblock_block *block, > unsigned int block_type, unsigned int slot) > @@ -56,13 +61,14 @@ static inline struct zblock_block *find_and_claim_block(struct block_list *b, > struct list_head *l = &b->active_list; > unsigned int slot; > > - if (!list_empty(l)) { > + spin_lock(&b->lock); > + if (likely(!list_empty(l))) { > struct zblock_block *z = list_first_entry(l, typeof(*z), link); > > if (--z->free_slots == 0) > - list_move(&z->link, &b->full_list); > + __list_del_clearprev(&z->link); > /* > - * There is a slot in the block and we just made sure it would > + * There is a slot in the block and we just made sure it will > * remain. > * Find that slot and set the busy bit. > */ > @@ -74,31 +80,57 @@ static inline struct zblock_block *find_and_claim_block(struct block_list *b, > slot)) { > if (!test_and_set_bit(slot, z->slot_info)) > break; > - barrier(); > } > + spin_unlock(&b->lock); > > - WARN_ON(slot >= block_desc[block_type].slots_per_block); > *handle = metadata_to_handle(z, block_type, slot); > return z; > } > + spin_unlock(&b->lock); > return NULL; > } > > +static inline int zblock_get_order(unsigned int num_pages) > +{ > + switch (num_pages) { > + case 1: > + case 2: > + return num_pages - 1; > + case 4: > + return 2; > + default: > + break; > + } > + return -1; > +} > /* > * allocate new block and add it to corresponding block list > */ > static struct zblock_block *alloc_block(struct zblock_pool *pool, > int block_type, gfp_t gfp, > - unsigned long *handle) > + unsigned long *handle, > + unsigned int nid) > { > + struct block_list *block_list = &pool->block_lists[block_type]; > + unsigned int num_pages = block_desc[block_type].num_pages; > struct zblock_block *block; > - struct block_list *block_list; > + struct page *page = NULL; > > - block = (void *)__get_free_pages(gfp, block_desc[block_type].order); > - if (!block) > - return NULL; > + if (!vmalloc_small_blocks && zblock_get_order(num_pages) >= 0) { > + page = try_alloc_pages(nid, zblock_get_order(num_pages)); > + if (page) { > + page->private = PAGE_SMALL_BLOCK; > + block = page_address(page); > + } > + } > + if (!page) { > + block = __vmalloc_node(PAGE_SIZE * num_pages, PAGE_SIZE, gfp, nid, NULL); > + if (!block) > + return NULL; > > - block_list = &pool->block_lists[block_type]; > + page = vmalloc_to_page(block); > + page->private = 0; > + } > > /* init block data */ > block->free_slots = block_desc[block_type].slots_per_block - 1; > @@ -122,8 +154,8 @@ static int zblock_blocks_show(struct seq_file *s, void *v) > struct block_list *block_list = &pool->block_lists[i]; > > seq_printf(s, "%d: %ld blocks of %d pages (total %ld pages)\n", > - i, block_list->block_count, 1 << block_desc[i].order, > - block_list->block_count << block_desc[i].order); > + i, block_list->block_count, block_desc[i].num_pages, > + block_list->block_count * block_desc[i].num_pages); > } > return 0; > } > @@ -142,19 +174,17 @@ DEFINE_SHOW_ATTRIBUTE(zblock_blocks); > */ > static struct zblock_pool *zblock_create_pool(gfp_t gfp) > { > - struct zblock_pool *pool; > - struct block_list *block_list; > + struct zblock_pool *pool = kmalloc(sizeof(struct zblock_pool), gfp); > int i; > > - pool = kmalloc(sizeof(struct zblock_pool), gfp); > if (!pool) > return NULL; > > /* init each block list */ > 
for (i = 0; i < ARRAY_SIZE(block_desc); i++) { > - block_list = &pool->block_lists[i]; > + struct block_list *block_list = &pool->block_lists[i]; > + > spin_lock_init(&block_list->lock); > - INIT_LIST_HEAD(&block_list->full_list); > INIT_LIST_HEAD(&block_list->active_list); > block_list->block_count = 0; > } > @@ -187,7 +217,7 @@ static void zblock_destroy_pool(struct zblock_pool *pool) > * a new slot. > */ > static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, > - unsigned long *handle) > + unsigned long *handle, unsigned int nid) > { > int block_type = -1; > struct zblock_block *block; > @@ -196,7 +226,7 @@ static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, > if (!size) > return -EINVAL; > > - if (size > PAGE_SIZE) > + if (size > block_desc[ARRAY_SIZE(block_desc) - 1].slot_size) > return -ENOSPC; > > /* find basic block type with suitable slot size */ > @@ -220,19 +250,15 @@ static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp, > } > if (WARN_ON(block_type < 0)) > return -EINVAL; > - if (block_type >= ARRAY_SIZE(block_desc)) > - return -ENOSPC; > > block_list = &pool->block_lists[block_type]; > > - spin_lock(&block_list->lock); > block = find_and_claim_block(block_list, block_type, handle); > - spin_unlock(&block_list->lock); > if (block) > return 0; > > /* not found block with free slots try to allocate new empty block */ > - block = alloc_block(pool, block_type, gfp & ~(__GFP_MOVABLE | __GFP_HIGHMEM), handle); > + block = alloc_block(pool, block_type, gfp, handle, nid); > return block ? 0 : -ENOMEM; > } > > @@ -251,17 +277,25 @@ static void zblock_free(struct zblock_pool *pool, unsigned long handle) > block = handle_to_metadata(handle, &block_type, &slot); > block_list = &pool->block_lists[block_type]; > > + /* clear bit early, this will shorten the search */ > + clear_bit(slot, block->slot_info); > + > spin_lock(&block_list->lock); > - /* if all slots in block are empty delete whole block */ > + /* if all slots in block are empty delete the whole block */ > if (++block->free_slots == block_desc[block_type].slots_per_block) { > + struct page *page = vmalloc_to_page(block); > + int num_pages = block_desc[block_type].num_pages; > + > block_list->block_count--; > - list_del(&block->link); > + __list_del_clearprev(&block->link); > spin_unlock(&block_list->lock); > - free_pages((unsigned long)block, block_desc[block_type].order); > + if (page->private & PAGE_SMALL_BLOCK) > + __free_pages(page, zblock_get_order(num_pages)); > + else > + vfree(block); > return; > } else if (block->free_slots == 1) > - list_move_tail(&block->link, &block_list->active_list); > - clear_bit(slot, block->slot_info); > + list_add(&block->link, &block_list->active_list); > spin_unlock(&block_list->lock); > } > > @@ -329,7 +363,7 @@ static u64 zblock_get_total_pages(struct zblock_pool *pool) > > total_size = 0; > for (i = 0; i < ARRAY_SIZE(block_desc); i++) > - total_size += pool->block_lists[i].block_count << block_desc[i].order; > + total_size += pool->block_lists[i].block_count * block_desc[i].num_pages; > > return total_size; > } > @@ -351,7 +385,7 @@ static void zblock_zpool_destroy(void *pool) > static int zblock_zpool_malloc(void *pool, size_t size, gfp_t gfp, > unsigned long *handle, const int nid) > { > - return zblock_alloc(pool, size, gfp, handle); > + return zblock_alloc(pool, size, gfp, handle, nid); > } > > static void zblock_zpool_free(void *pool, unsigned long handle) > @@ -407,6 +441,7 @@ static int __init create_rbtree(void) > { > int 
i; > > + BUILD_BUG_ON(ARRAY_SIZE(block_desc) > MAX_TABLE_SIZE); > for (i = 0; i < ARRAY_SIZE(block_desc); i++) { > struct block_desc_node *block_node = kmalloc(sizeof(*block_node), > GFP_KERNEL); > @@ -425,7 +460,7 @@ static int __init create_rbtree(void) > block_node->this_slot_size = block_desc[i].slot_size; > block_node->block_idx = i; > if (i == ARRAY_SIZE(block_desc) - 1) > - block_node->next_slot_size = PAGE_SIZE; > + block_node->next_slot_size = PAGE_SIZE * 2; > else > block_node->next_slot_size = block_desc[i+1].slot_size; > while (*new) { > diff --git a/mm/zblock.h b/mm/zblock.h > index 9af11f392f97..d433237d6ad4 100644 > --- a/mm/zblock.h > +++ b/mm/zblock.h > @@ -10,13 +10,11 @@ > #include <linux/rbtree.h> > #include <linux/types.h> > > -#define SLOT_FREE 0 > -#define BIT_SLOT_OCCUPIED 0 > -#define BIT_SLOT_MAPPED 1 > +#define PAGE_SMALL_BLOCK 1 > > #if PAGE_SIZE == 0x1000 > -/* max 128 slots per block, max table size 32 */ > -#define SLOT_BITS 7 > +/* max 64 slots per block, max table size 64 */ > +#define SLOT_BITS 6 > #elif PAGE_SIZE == 0x4000 > /* max 256 slots per block, max table size 64 */ > #define SLOT_BITS 8 > @@ -25,24 +23,26 @@ > #define SLOT_BITS 8 > #endif > > +#define MAX_TABLE_SIZE (1 << (PAGE_SHIFT - SLOT_BITS)) > + > #define MAX_SLOTS (1 << SLOT_BITS) > #define SLOT_MASK ((0x1UL << SLOT_BITS) - 1) > > #define ZBLOCK_HEADER_SIZE round_up(sizeof(struct zblock_block), sizeof(long)) > -#define BLOCK_DATA_SIZE(order) ((PAGE_SIZE << order) - ZBLOCK_HEADER_SIZE) > -#define SLOT_SIZE(nslots, order) (round_down((BLOCK_DATA_SIZE(order) / nslots), sizeof(long))) > +#define BLOCK_DATA_SIZE(num) ((PAGE_SIZE * (num)) - ZBLOCK_HEADER_SIZE) > +#define SLOT_SIZE(nslots, num) (round_down((BLOCK_DATA_SIZE(num) / nslots), sizeof(long))) > > /** > * struct zblock_block - block metadata > - * Block consists of several (1/2/4/8) pages and contains fixed > + * Block consists of several pages and contains fixed > * integer number of slots for allocating compressed pages. 
> * > * free_slots: number of free slots in the block > * slot_info: contains data about free/occupied slots > */ > struct zblock_block { > - struct list_head link; > DECLARE_BITMAP(slot_info, 1 << SLOT_BITS); > + struct list_head link; > u32 free_slots; > }; > > @@ -54,12 +54,12 @@ struct zblock_block { > * > * slot_size: size of slot for this list > * slots_per_block: number of slots per block for this list > - * order: order for __get_free_pages > + * num_pages: number of pages per block > */ > struct block_desc { > unsigned int slot_size; > unsigned short slots_per_block; > - unsigned short order; > + unsigned short num_pages; > }; > > struct block_desc_node { > @@ -71,78 +71,103 @@ struct block_desc_node { > > static const struct block_desc block_desc[] = { > #if PAGE_SIZE == 0x1000 > - { SLOT_SIZE(63, 0), 63, 0 }, > - { SLOT_SIZE(32, 0), 32, 0 }, > - { SLOT_SIZE(21, 0), 21, 0 }, > - { SLOT_SIZE(15, 0), 15, 0 }, > - { SLOT_SIZE(12, 0), 12, 0 }, > - { SLOT_SIZE(10, 0), 10, 0 }, > - { SLOT_SIZE(9, 0), 9, 0 }, > - { SLOT_SIZE(8, 0), 8, 0 }, > - { SLOT_SIZE(29, 2), 29, 2 }, > - { SLOT_SIZE(13, 1), 13, 1 }, > - { SLOT_SIZE(6, 0), 6, 0 }, > - { SLOT_SIZE(11, 1), 11, 1 }, > - { SLOT_SIZE(5, 0), 5, 0 }, > - { SLOT_SIZE(9, 1), 9, 1 }, > - { SLOT_SIZE(8, 1), 8, 1 }, > - { SLOT_SIZE(29, 3), 29, 3 }, > + { SLOT_SIZE(28, 1), 28, 1 }, > + { SLOT_SIZE(18, 1), 18, 1 }, > + { SLOT_SIZE(12, 1), 12, 1 }, > + { SLOT_SIZE(10, 1), 10, 1 }, > + { SLOT_SIZE(17, 2), 17, 2 }, > + { SLOT_SIZE(15, 2), 15, 2 }, > { SLOT_SIZE(13, 2), 13, 2 }, > - { SLOT_SIZE(12, 2), 12, 2 }, > + { SLOT_SIZE(6, 1), 6, 1 }, > { SLOT_SIZE(11, 2), 11, 2 }, > - { SLOT_SIZE(10, 2), 10, 2 }, > + { SLOT_SIZE(5, 1), 5, 1 }, > + { SLOT_SIZE(19, 4), 19, 4 }, > { SLOT_SIZE(9, 2), 9, 2 }, > - { SLOT_SIZE(17, 3), 17, 3 }, > - { SLOT_SIZE(8, 2), 8, 2 }, > - { SLOT_SIZE(15, 3), 15, 3 }, > - { SLOT_SIZE(14, 3), 14, 3 }, > - { SLOT_SIZE(13, 3), 13, 3 }, > - { SLOT_SIZE(6, 2), 6, 2 }, > + { SLOT_SIZE(17, 4), 17, 4 }, > + { SLOT_SIZE(4, 1), 4, 1 }, > + { SLOT_SIZE(23, 6), 23, 6 }, > { SLOT_SIZE(11, 3), 11, 3 }, > + { SLOT_SIZE(7, 2), 7, 2 }, > { SLOT_SIZE(10, 3), 10, 3 }, > - { SLOT_SIZE(9, 3), 9, 3 }, > - { SLOT_SIZE(4, 2), 4, 2 }, > + { SLOT_SIZE(19, 6), 19, 6 }, > + { SLOT_SIZE(6, 2), 6, 2 }, > + { SLOT_SIZE(14, 5), 14, 5 }, > + { SLOT_SIZE(8, 3), 8, 3 }, > + { SLOT_SIZE(5, 2), 5, 2 }, > + { SLOT_SIZE(12, 5), 12, 5 }, > + { SLOT_SIZE(9, 4), 9, 4 }, > + { SLOT_SIZE(15, 7), 15, 7 }, > + { SLOT_SIZE(2, 1), 2, 1 }, > + { SLOT_SIZE(15, 8), 15, 8 }, > + { SLOT_SIZE(9, 5), 9, 5 }, > + { SLOT_SIZE(12, 7), 12, 7 }, > + { SLOT_SIZE(13, 8), 13, 8 }, > + { SLOT_SIZE(6, 4), 6, 4 }, > + { SLOT_SIZE(11, 8), 11, 8 }, > + { SLOT_SIZE(9, 7), 9, 7 }, > + { SLOT_SIZE(6, 5), 6, 5 }, > + { SLOT_SIZE(9, 8), 9, 8 }, > + { SLOT_SIZE(4, 4), 4, 4 }, > #else > - { SLOT_SIZE(255, 0), 255, 0 }, > - { SLOT_SIZE(185, 0), 185, 0 }, > - { SLOT_SIZE(145, 0), 145, 0 }, > - { SLOT_SIZE(113, 0), 113, 0 }, > - { SLOT_SIZE(92, 0), 92, 0 }, > - { SLOT_SIZE(75, 0), 75, 0 }, > - { SLOT_SIZE(60, 0), 60, 0 }, > - { SLOT_SIZE(51, 0), 51, 0 }, > - { SLOT_SIZE(43, 0), 43, 0 }, > - { SLOT_SIZE(37, 0), 37, 0 }, > - { SLOT_SIZE(32, 0), 32, 0 }, > - { SLOT_SIZE(27, 0), 27, 0 }, > - { SLOT_SIZE(23, 0), 23, 0 }, > - { SLOT_SIZE(19, 0), 19, 0 }, > - { SLOT_SIZE(17, 0), 17, 0 }, > - { SLOT_SIZE(15, 0), 15, 0 }, > - { SLOT_SIZE(13, 0), 13, 0 }, > - { SLOT_SIZE(11, 0), 11, 0 }, > - { SLOT_SIZE(10, 0), 10, 0 }, > - { SLOT_SIZE(9, 0), 9, 0 }, > - { SLOT_SIZE(8, 0), 8, 0 }, > - { SLOT_SIZE(15, 1), 15, 1 }, 
> - { SLOT_SIZE(14, 1), 14, 1 }, > - { SLOT_SIZE(13, 1), 13, 1 }, > + { SLOT_SIZE(185, 1), 185, 1 }, > + { SLOT_SIZE(113, 1), 113, 1 }, > + { SLOT_SIZE(86, 1), 86, 1 }, > + { SLOT_SIZE(72, 1), 72, 1 }, > + { SLOT_SIZE(58, 1), 58, 1 }, > + { SLOT_SIZE(49, 1), 49, 1 }, > + { SLOT_SIZE(42, 1), 42, 1 }, > + { SLOT_SIZE(37, 1), 37, 1 }, > + { SLOT_SIZE(33, 1), 33, 1 }, > + { SLOT_SIZE(59, 2), 59, 2 }, > + { SLOT_SIZE(27, 1), 27, 1 }, > + { SLOT_SIZE(25, 1), 25, 1 }, > + { SLOT_SIZE(23, 1), 23, 1 }, > + { SLOT_SIZE(21, 1), 21, 1 }, > + { SLOT_SIZE(39, 2), 39, 2 }, > + { SLOT_SIZE(37, 2), 37, 2 }, > + { SLOT_SIZE(35, 2), 35, 2 }, > + { SLOT_SIZE(33, 2), 33, 2 }, > + { SLOT_SIZE(31, 2), 31, 2 }, > + { SLOT_SIZE(29, 2), 29, 2 }, > + { SLOT_SIZE(27, 2), 27, 2 }, > + { SLOT_SIZE(25, 2), 25, 2 }, > { SLOT_SIZE(12, 1), 12, 1 }, > { SLOT_SIZE(11, 1), 11, 1 }, > + { SLOT_SIZE(21, 2), 21, 2 }, > { SLOT_SIZE(10, 1), 10, 1 }, > + { SLOT_SIZE(19, 2), 19, 2 }, > { SLOT_SIZE(9, 1), 9, 1 }, > + { SLOT_SIZE(17, 2), 17, 2 }, > { SLOT_SIZE(8, 1), 8, 1 }, > { SLOT_SIZE(15, 2), 15, 2 }, > { SLOT_SIZE(14, 2), 14, 2 }, > { SLOT_SIZE(13, 2), 13, 2 }, > { SLOT_SIZE(12, 2), 12, 2 }, > + { SLOT_SIZE(23, 4), 23, 4 }, > { SLOT_SIZE(11, 2), 11, 2 }, > + { SLOT_SIZE(21, 4), 21, 4 }, > { SLOT_SIZE(10, 2), 10, 2 }, > + { SLOT_SIZE(19, 4), 19, 4 }, > { SLOT_SIZE(9, 2), 9, 2 }, > + { SLOT_SIZE(17, 4), 17, 4 }, > { SLOT_SIZE(8, 2), 8, 2 }, > - { SLOT_SIZE(7, 2), 7, 2 }, > - { SLOT_SIZE(6, 2), 6, 2 }, > + { SLOT_SIZE(15, 4), 15, 4 }, > + { SLOT_SIZE(14, 4), 14, 4 }, > + { SLOT_SIZE(10, 3), 10, 3 }, > + { SLOT_SIZE(16, 5), 16, 5 }, > + { SLOT_SIZE(12, 4), 12, 4 }, > + { SLOT_SIZE(11, 4), 11, 4 }, > + { SLOT_SIZE(8, 3), 8, 3 }, > { SLOT_SIZE(5, 2), 5, 2 }, > + { SLOT_SIZE(7, 3), 7, 3 }, > + { SLOT_SIZE(11, 5), 11, 5 }, > + { SLOT_SIZE(4, 2), 4, 2 }, > + { SLOT_SIZE(9, 5), 9, 5 }, > + { SLOT_SIZE(8, 5), 8, 5 }, > + { SLOT_SIZE(3, 2), 3, 2 }, > + { SLOT_SIZE(4, 3), 4, 3 }, > + { SLOT_SIZE(7, 6), 7, 6 }, > + { SLOT_SIZE(4, 4), 4, 4 }, > #endif /* PAGE_SIZE */ > }; > > @@ -150,13 +175,11 @@ static const struct block_desc block_desc[] = { > * struct block_list - stores metadata of particular list > * lock: protects the list of blocks > * active_list: linked list of active (non-full) blocks > - * full_list: linked list of full blocks > * block_count: total number of blocks in the list > */ > struct block_list { > spinlock_t lock; > struct list_head active_list; > - struct list_head full_list; > unsigned long block_count; > }; > > -- > 2.39.2 > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-06 13:13 ` Yosry Ahmed @ 2025-05-06 13:27 ` Herbert Xu 2025-05-06 13:37 ` Christoph Hellwig 2025-05-07 5:57 ` Sergey Senozhatsky 1 sibling, 1 reply; 22+ messages in thread From: Herbert Xu @ 2025-05-06 13:27 UTC (permalink / raw) To: Yosry Ahmed Cc: Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Sergey Senozhatsky, Igor Belousov, Christoph Hellwig On Tue, May 06, 2025 at 01:13:17PM +0000, Yosry Ahmed wrote: > > Keep in mind that zswap_decompress() will always do an extra copy if the > address returned by zpool_obj_read_begin() is a vmalloc address. To > avoid this we need to enlighten the scatterlist API to work with vmalloc > addresses. > > (CC'ing Herbert as he was looking into this) acomp now supports linear addresses so vmalloc can be sent through directly. It will fail if you use hardware offload though since you can't DMA directly to vmalloc virtual addresses. If you wish to support hardware offload, then you'll need to break the vmalloc memory down page-by-page to create an SG list. Adding Christoph to the cc list in case he knows of any updates in this area (using vmalloc memory with DMA). Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 22+ messages in thread
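A minimal sketch of the page-by-page approach Herbert describes: walk the vmalloc buffer with vmalloc_to_page() and pack the pages into a scatterlist. It assumes a page-aligned buffer and length, with error handling trimmed:

#include <linux/scatterlist.h>
#include <linux/vmalloc.h>

/* Sketch: build an SG list over a page-aligned vmalloc buffer */
static int sg_from_vmalloc(struct scatterlist *sgl, void *buf, size_t len)
{
	unsigned int nents = len >> PAGE_SHIFT;
	unsigned int i;

	sg_init_table(sgl, nents);
	for (i = 0; i < nents; i++) {
		struct page *page = vmalloc_to_page(buf + i * PAGE_SIZE);

		if (!page)
			return -EINVAL;
		sg_set_page(&sgl[i], page, PAGE_SIZE, 0);
	}
	return 0;
}

Such a list lets the buffer be handed to offload hardware that does DMA, which the linear-address path cannot cover.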
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-06 13:27 ` Herbert Xu @ 2025-05-06 13:37 ` Christoph Hellwig 0 siblings, 0 replies; 22+ messages in thread From: Christoph Hellwig @ 2025-05-06 13:37 UTC (permalink / raw) To: Herbert Xu Cc: Yosry Ahmed, Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Sergey Senozhatsky, Igor Belousov, Christoph Hellwig On Tue, May 06, 2025 at 09:27:49PM +0800, Herbert Xu wrote: > On Tue, May 06, 2025 at 01:13:17PM +0000, Yosry Ahmed wrote: > > > > Keep in mind that zswap_decompress() will always do an extra copy if the > > address returned by zpool_obj_read_begin() is a vmalloc address. To > > avoid this we need to enlighten the scatterlist API to work with vmalloc > > addresses. > > > > (CC'ing Herbert as he was looking into this) > > acomp now supports linear addresses so vmalloc can be sent through > directly. It will fail if you use hardware offload though since > you can't DMA directly to vmalloc virtual addresses. You absolutely can and lots of code does. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-06 13:13 ` Yosry Ahmed 2025-05-06 13:27 ` Herbert Xu @ 2025-05-07 5:57 ` Sergey Senozhatsky 2025-05-07 6:08 ` Sergey Senozhatsky 1 sibling, 1 reply; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-07 5:57 UTC (permalink / raw) To: Yosry Ahmed Cc: Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Sergey Senozhatsky, Igor Belousov, Herbert Xu On (25/05/06 13:13), Yosry Ahmed wrote: > If we can use vmalloc for zblock, then we can probably also use vmalloc > in zsmalloc and get rid of the chaining logic completely. This would > make zsmalloc simpler and closer to zblock in that regard. > > Sergey, WDYT? This sounds interesting. We might get rid of lots of memcpy() in object read/write paths, and so on. I don't know if 0-order chaining was the only option for zsmalloc, or just happened to be the first one. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-07 5:57 ` Sergey Senozhatsky @ 2025-05-07 6:08 ` Sergey Senozhatsky 2025-05-07 6:14 ` Sergey Senozhatsky ` (2 more replies) 0 siblings, 3 replies; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-07 6:08 UTC (permalink / raw) To: Yosry Ahmed Cc: Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Igor Belousov, Herbert Xu, Sergey Senozhatsky On (25/05/07 14:57), Sergey Senozhatsky wrote: > On (25/05/06 13:13), Yosry Ahmed wrote: > > If we can use vmalloc for zblock, then we can probably also use vmalloc > > in zsmalloc and get rid of the chaining logic completely. This would > > make zsmalloc simpler and closer to zblock in that regard. > > > > Sergey, WDYT? > > This sounds interesting. We might get rid of lots of memcpy() > in object read/write paths, and so on. I don't know if 0-order > chaining was the only option for zsmalloc, or just happened to > be the first one. I assume we might have problems with zspage release path. vfree() should break .swap_slot_free_notify, as far as I can see. .swap_slot_free_notify is called under swap-cluster spin-lock, so if we free the last object in the zspage we cannot immediately free that zspage, because vfree() might_sleep(). ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] mm/zblock: use vmalloc for page allocations 2025-05-07 6:08 ` Sergey Senozhatsky @ 2025-05-07 6:14 ` Sergey Senozhatsky 2025-05-07 6:54 ` Christoph Hellwig 2025-05-07 8:50 ` Uladzislau Rezki 2 siblings, 0 replies; 22+ messages in thread From: Sergey Senozhatsky @ 2025-05-07 6:14 UTC (permalink / raw) To: Sergey Senozhatsky Cc: Yosry Ahmed, Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Igor Belousov, Herbert Xu On (25/05/07 15:08), Sergey Senozhatsky wrote: > On (25/05/07 14:57), Sergey Senozhatsky wrote: > > On (25/05/06 13:13), Yosry Ahmed wrote: > > > If we can use vmalloc for zblock, then we can probably also use vmalloc > > > in zsmalloc and get rid of the chaining logic completely. This would > > > make zsmalloc simpler and closer to zblock in that regard. > > > > > > Sergey, WDYT? > > > > This sounds interesting. We might get rid of lots of memcpy() > > in object read/write paths, and so on. I don't know if 0-order > > chaining was the only option for zsmalloc, or just happened to > > be the first one. > > I assume we might have problems with zspage release path. vfree() > should break .swap_slot_free_notify, as far as I can see. > .swap_slot_free_notify is called under swap-cluster spin-lock, > so if we free the last object in the zspage we cannot immediately > free that zspage, because vfree() might_sleep(). ... but there is a deferred zspage free already, we can switch to it entirely, I assume. ^ permalink raw reply [flat|nested] 22+ messages in thread
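A minimal sketch of that deferred-free shape: the atomic path (e.g. under the swap-cluster spinlock) only queues the empty block, and a worker performs the sleeping vfree() later. Names are illustrative, and this is not the existing zsmalloc deferred-free code; the dead block's own header is reused as the list node so nothing is allocated in atomic context:

#include <linux/llist.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>

static LLIST_HEAD(defer_free_list);

static void deferred_free_work(struct work_struct *work)
{
	struct llist_node *pos, *n;

	llist_for_each_safe(pos, n, llist_del_all(&defer_free_list))
		vfree(pos);	/* node sits at the start of the block */
}
static DECLARE_WORK(defer_free, deferred_free_work);

/* atomic-safe: overwrite the now-unused block header with a list node */
static void queue_block_free(void *block)
{
	llist_add((struct llist_node *)block, &defer_free_list);
	schedule_work(&defer_free);
}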
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Christoph Hellwig @ 2025-05-07 6:54 UTC
To: Sergey Senozhatsky
Cc: Yosry Ahmed, Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham,
    Shakeel Butt, Johannes Weiner, Minchan Kim, Igor Belousov, Herbert Xu

On Wed, May 07, 2025 at 03:08:08PM +0900, Sergey Senozhatsky wrote:
> > This sounds interesting. We might get rid of lots of memcpy()
> > in object read/write paths, and so on. I don't know if 0-order
> > chaining was the only option for zsmalloc, or just happened to
> > be the first one.
>
> I assume we might have problems with zspage release path. vfree()
> should break .swap_slot_free_notify, as far as I can see.
> .swap_slot_free_notify is called under swap-cluster spin-lock,
> so if we free the last object in the zspage we cannot immediately
> free that zspage, because vfree() might_sleep().

Note that swap_slot_free_notify really needs to go away in favor
of just sending a discard bio. Having special block ops for a
single user bypassing the proper block interface is not sustainable.
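For context, issuing a discard through the regular block interface could look
roughly like the sketch below (illustrative only; the helper's exact signature
varies between kernel versions, and this is not the actual proposed swap-layer
change):

	#include <linux/blkdev.h>

	/*
	 * Notify the backing device that a range of swap slots is unused via
	 * the normal block interface rather than a special
	 * ->swap_slot_free_notify() callback. Must run in a context that may
	 * sleep.
	 */
	static int discard_swap_range(struct block_device *bdev, sector_t start,
				      sector_t nr_sects)
	{
		return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL);
	}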
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Sergey Senozhatsky @ 2025-05-08 5:58 UTC
To: Christoph Hellwig
Cc: Sergey Senozhatsky, Yosry Ahmed, Vitaly Wool, linux-mm, akpm,
    linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim,
    Igor Belousov, Herbert Xu

On (25/05/06 23:54), Christoph Hellwig wrote:
> On Wed, May 07, 2025 at 03:08:08PM +0900, Sergey Senozhatsky wrote:
> > > This sounds interesting. We might get rid of lots of memcpy()
> > > in object read/write paths, and so on. I don't know if 0-order
> > > chaining was the only option for zsmalloc, or just happened to
> > > be the first one.
> >
> > I assume we might have problems with zspage release path. vfree()
> > should break .swap_slot_free_notify, as far as I can see.
> > .swap_slot_free_notify is called under swap-cluster spin-lock,
> > so if we free the last object in the zspage we cannot immediately
> > free that zspage, because vfree() might_sleep().
>
> Note that swap_slot_free_notify really needs to go away in favor
> of just sending a discard bio. Having special block ops for a
> single user bypassing the proper block interface is not sustainable.

Oh, I didn't realize that zram was the only swap_slot_free_notify
user. zram already handles REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES so
I guess only swap-cluster needs some work. Are there any
blockers/complications on the swap-cluster side?
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Christoph Hellwig @ 2025-05-08 6:00 UTC
To: Sergey Senozhatsky
Cc: Christoph Hellwig, Yosry Ahmed, Vitaly Wool, linux-mm, akpm,
    linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim,
    Igor Belousov, Herbert Xu

On Thu, May 08, 2025 at 02:58:14PM +0900, Sergey Senozhatsky wrote:
> Oh, I didn't realize that zram was the only swap_slot_free_notify
> user. zram already handles REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES so
> I guess only swap-cluster needs some work. Are there any
> blockers/complications on the swap-cluster side?

I think the reason it was added was so that the discard can be
done non-blocking with a spinlock held. Which seems a bit sketchy
when calling into a driver anyway...
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Sergey Senozhatsky @ 2025-05-08 6:17 UTC
To: Christoph Hellwig
Cc: Sergey Senozhatsky, Yosry Ahmed, Vitaly Wool, linux-mm, akpm,
    linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim,
    Igor Belousov, Herbert Xu

On (25/05/07 23:00), Christoph Hellwig wrote:
> On Thu, May 08, 2025 at 02:58:14PM +0900, Sergey Senozhatsky wrote:
> > Oh, I didn't realize that zram was the only swap_slot_free_notify
> > user. zram already handles REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES so
> > I guess only swap-cluster needs some work. Are there any
> > blockers/complications on the swap-cluster side?
>
> I think the reason it was added was so that the discard can be
> done non-blocking with a spinlock held. Which seems a bit sketchy
> when calling into a driver anyway...

swap_slot_free_notify is not guaranteed to free anything on the
zram/zsmalloc side. zram attempts to trylock the entry, and if it
fails to acquire ownership, swap_slot_free_notify for that entry
becomes a .miss_free. So we just keep stale data in the zspage
(potentially preventing it from being released if that was the
last allocated object). I don't know if .miss_free happens often
in real life.
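A simplified sketch of the trylock-or-count-a-miss behaviour described above
(structure and function names here are illustrative, not zram's actual code):

	#include <linux/atomic.h>
	#include <linux/spinlock.h>

	/* Illustrative only: not zram's real data structures or helpers. */
	struct zdev_example {
		spinlock_t *entry_locks;	/* per-entry locks */
		atomic64_t miss_free;		/* slots we failed to free eagerly */
	};

	static void example_slot_free_notify(struct zdev_example *zdev,
					     unsigned long index)
	{
		/*
		 * Called from atomic context, so only try to take the entry
		 * lock. If someone else holds it, count a miss and leave the
		 * stale data in place until the slot is overwritten or reset.
		 */
		if (!spin_trylock(&zdev->entry_locks[index])) {
			atomic64_inc(&zdev->miss_free);
			return;
		}

		/* ... free the compressed object backing this entry ... */

		spin_unlock(&zdev->entry_locks[index]);
	}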
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Sergey Senozhatsky @ 2025-05-08 6:33 UTC
To: Sergey Senozhatsky
Cc: Christoph Hellwig, Yosry Ahmed, Vitaly Wool, linux-mm, akpm,
    linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim,
    Igor Belousov, Herbert Xu

On (25/05/08 15:17), Sergey Senozhatsky wrote:
> On (25/05/07 23:00), Christoph Hellwig wrote:
> > On Thu, May 08, 2025 at 02:58:14PM +0900, Sergey Senozhatsky wrote:
> > > Oh, I didn't realize that zram was the only swap_slot_free_notify
> > > user. zram already handles REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES so
> > > I guess only swap-cluster needs some work. Are there any
> > > blockers/complications on the swap-cluster side?
> >
> > I think the reason it was added was so that the discard can be
> > done non-blocking with a spinlock held. Which seems a bit sketchy
> > when calling into a driver anyway...
>
> swap_slot_free_notify is not guaranteed to free anything on the zram/zsmalloc
> side. zram attempts to trylock the entry ..

I keep forgetting that slot-free can be called from an IRQ. That's
some complication.
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Uladzislau Rezki @ 2025-05-07 8:50 UTC
To: Sergey Senozhatsky
Cc: Yosry Ahmed, Vitaly Wool, linux-mm, akpm, linux-kernel, Nhat Pham,
    Shakeel Butt, Johannes Weiner, Minchan Kim, Igor Belousov, Herbert Xu

On Wed, May 07, 2025 at 03:08:08PM +0900, Sergey Senozhatsky wrote:
> On (25/05/07 14:57), Sergey Senozhatsky wrote:
> > On (25/05/06 13:13), Yosry Ahmed wrote:
> > > If we can use vmalloc for zblock, then we can probably also use vmalloc
> > > in zsmalloc and get rid of the chaining logic completely. This would
> > > make zsmalloc simpler and closer to zblock in that regard.
> > >
> > > Sergey, WDYT?
> >
> > This sounds interesting. We might get rid of lots of memcpy()
> > in object read/write paths, and so on. I don't know if 0-order
> > chaining was the only option for zsmalloc, or just happened to
> > be the first one.
>
> I assume we might have problems with zspage release path. vfree()
> should break .swap_slot_free_notify, as far as I can see.
> .swap_slot_free_notify is called under swap-cluster spin-lock,
> so if we free the last object in the zspage we cannot immediately
> free that zspage, because vfree() might_sleep().

You can use vfree_atomic(); it can be called from any atomic
context, but not from NMI.

--
Uladzislau Rezki
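A minimal sketch of the suggested alternative, assuming a caller that holds a
spinlock (illustrative only; vfree_atomic() defers the actual freeing to
process context, which is what makes it safe here, though it cannot be used
from NMI context):

	#include <linux/spinlock.h>
	#include <linux/vmalloc.h>

	/* Illustrative caller that must free a vmalloc'd zspage under a spinlock. */
	static void example_release_zspage(spinlock_t *cluster_lock, void *area)
	{
		spin_lock(cluster_lock);

		/*
		 * vfree() may sleep and would be a bug here; vfree_atomic()
		 * only queues the area and the actual free happens later from
		 * process context.
		 */
		vfree_atomic(area);

		spin_unlock(cluster_lock);
	}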
* Re: [PATCH] mm/zblock: use vmalloc for page allocations
From: Sergey Senozhatsky @ 2025-05-08 6:07 UTC
To: Uladzislau Rezki, Yosry Ahmed
Cc: Sergey Senozhatsky, Vitaly Wool, linux-mm, akpm, linux-kernel,
    Nhat Pham, Shakeel Butt, Johannes Weiner, Minchan Kim, Igor Belousov,
    Herbert Xu

On (25/05/07 10:50), Uladzislau Rezki wrote:
> On Wed, May 07, 2025 at 03:08:08PM +0900, Sergey Senozhatsky wrote:
> > On (25/05/07 14:57), Sergey Senozhatsky wrote:
> > > On (25/05/06 13:13), Yosry Ahmed wrote:
> > > > If we can use vmalloc for zblock, then we can probably also use vmalloc
> > > > in zsmalloc and get rid of the chaining logic completely. This would
> > > > make zsmalloc simpler and closer to zblock in that regard.
> > > >
> > > > Sergey, WDYT?
> > >
> > > This sounds interesting. We might get rid of lots of memcpy()
> > > in object read/write paths, and so on. I don't know if 0-order
> > > chaining was the only option for zsmalloc, or just happened to
> > > be the first one.
> >
> > I assume we might have problems with zspage release path. vfree()
> > should break .swap_slot_free_notify, as far as I can see.
> > .swap_slot_free_notify is called under swap-cluster spin-lock,
> > so if we free the last object in the zspage we cannot immediately
> > free that zspage, because vfree() might_sleep().
>
> You can use vfree_atomic(); it can be called from any atomic
> context, but not from NMI.

Indeed, thanks. A bigger problem than the zspage release path is
losing GFP_MOVABLE, I suspect.
End of thread (newest: 2025-05-08 6:33 UTC)

Thread overview: 22+ messages

  2025-05-02  8:01 [PATCH] mm/zblock: use vmalloc for page allocations Vitaly Wool
  2025-05-02  8:07 ` Igor Belousov
  2025-05-03 18:46 ` Vitaly Wool
  2025-05-04  5:02 ` Sergey Senozhatsky
  2025-05-04  6:14 ` Sergey Senozhatsky
  2025-05-05 14:08 ` Johannes Weiner
  2025-05-06  2:13 ` Sergey Senozhatsky
  2025-05-05 14:29 ` Johannes Weiner
  2025-05-06  9:42 ` Uladzislau Rezki
  2025-05-06 13:13 ` Yosry Ahmed
  2025-05-06 13:27 ` Herbert Xu
  2025-05-06 13:37 ` Christoph Hellwig
  2025-05-07  5:57 ` Sergey Senozhatsky
  2025-05-07  6:08 ` Sergey Senozhatsky
  2025-05-07  6:14 ` Sergey Senozhatsky
  2025-05-07  6:54 ` Christoph Hellwig
  2025-05-08  5:58 ` Sergey Senozhatsky
  2025-05-08  6:00 ` Christoph Hellwig
  2025-05-08  6:17 ` Sergey Senozhatsky
  2025-05-08  6:33 ` Sergey Senozhatsky
  2025-05-07  8:50 ` Uladzislau Rezki
  2025-05-08  6:07 ` Sergey Senozhatsky