linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v4] mm: add zblock allocator
@ 2025-04-12 15:42 Vitaly Wool
  2025-04-12 19:25 ` Igor Belousov
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Vitaly Wool @ 2025-04-12 15:42 UTC (permalink / raw)
  To: linux-mm
  Cc: akpm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner,
	Vitaly Wool, Igor Belousov

zblock is a special purpose allocator for storing compressed pages.
It stores integer number of same size objects per its block. These
blocks consist of several physical pages (2**n, i. e. 1/2/4/8).

With zblock, it is possible to densely arrange objects of various sizes
resulting in low internal fragmentation. Also this allocator tries to
fill incomplete blocks instead of adding new ones, in many cases
providing a compression ratio comparable to zmalloc's.

zblock is also in most cases superior to zsmalloc with regard to
average performance and worst execution times, thus allowing for better
response time and real-time characteristics of the whole system.

High memory and page migration are currently not supported by zblock.

Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se>
Signed-off-by: Igor Belousov <igor.b@beldev.am>
---
v3: https://patchwork.kernel.org/project/linux-mm/patch/20250408125211.1611879-1-vitaly.wool@konsulko.se/
Changes since v3:
- rebased and tested against the latest -mm tree
- the block descriptors table was updated for better compression ratio
- fixed the bug with wrong SLOT_BITS value
- slot search moved to find_and_claim_block()

Test results (zstd compressor, 8 core Ryzen 9 VM, make bzImage):
- zblock:
    real	6m52.621s
    user	33m41.771s
    sys 	6m28.825s
    Zswap:            162328 kB
    Zswapped:         754468 kB
    zswpin 93851
    zswpout 542481
    zswpwb 935
- zsmalloc:
    real	7m4.355s
    user	34m37.538s
    sys 	6m22.086s
    zswpin 101243
    zswpout 448217
    zswpwb 640
    Zswap:            175704 kB
    Zswapped:         778692 kB

 Documentation/mm/zblock.rst |  24 ++
 MAINTAINERS                 |   7 +
 mm/Kconfig                  |  12 +
 mm/Makefile                 |   1 +
 mm/zblock.c                 | 443 ++++++++++++++++++++++++++++++++++++
 mm/zblock.h                 | 176 ++++++++++++++
 6 files changed, 663 insertions(+)
 create mode 100644 Documentation/mm/zblock.rst
 create mode 100644 mm/zblock.c
 create mode 100644 mm/zblock.h

diff --git a/Documentation/mm/zblock.rst b/Documentation/mm/zblock.rst
new file mode 100644
index 000000000000..9751434d0b76
--- /dev/null
+++ b/Documentation/mm/zblock.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+zblock
+======
+
+zblock is a special purpose allocator for storing compressed pages.
+It stores integer number of compressed objects per its block. These
+blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
+
+With zblock, it is possible to densely arrange objects of various sizes
+resulting in low internal fragmentation. Also this allocator tries to
+fill incomplete blocks instead of adding new ones,  in many cases
+providing a compression ratio substantially higher than z3fold and zbud
+(though lower than zmalloc's).
+
+zblock does not require MMU to operate and also is superior to zsmalloc
+with regard to average performance and worst execution times, thus
+allowing for better response time and real-time characteristics of the
+whole system.
+
+E. g. on a series of stress-ng tests run on a Raspberry Pi 5, we get
+5-10% higher value for bogo ops/s in zblock/zsmalloc comparison.
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 96b827049501..46465c986005 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -26640,6 +26640,13 @@ F:	Documentation/networking/device_drivers/hamradio/z8530drv.rst
 F:	drivers/net/hamradio/*scc.c
 F:	drivers/net/hamradio/z8530.h
 
+ZBLOCK COMPRESSED SLAB MEMORY ALLOCATOR
+M:	Vitaly Wool <vitaly.wool@konsulko.se>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	Documentation/mm/zblock.rst
+F:	mm/zblock.[ch]
+
 ZD1211RW WIRELESS DRIVER
 L:	linux-wireless@vger.kernel.org
 S:	Orphan
diff --git a/mm/Kconfig b/mm/Kconfig
index e113f713b493..5aa1479151ec 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -152,6 +152,18 @@ config ZSWAP_ZPOOL_DEFAULT
        default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
        default ""
 
+config ZBLOCK
+	tristate "Fast compression allocator with high density"
+	depends on ZPOOL
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It stores integer number of same size compressed objects per
+	  its block. These blocks consist of several physical pages
+	  (2**n, i. e. 1/2/4/8).
+
+	  With zblock, it is possible to densely arrange objects of
+	  various sizes resulting in low internal fragmentation.
+
 config ZSMALLOC
 	tristate
 	prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
diff --git a/mm/Makefile b/mm/Makefile
index e7f6bbf8ae5f..9d7e5b5bb694 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -116,6 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL)	+= zpool.o
+obj-$(CONFIG_ZBLOCK)	+= zblock.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
diff --git a/mm/zblock.c b/mm/zblock.c
new file mode 100644
index 000000000000..ecc7aeb611af
--- /dev/null
+++ b/mm/zblock.c
@@ -0,0 +1,443 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * zblock.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.se>
+ * Based on the work from Ananda Badmaev <a.badmaev@clicknet.pro>
+ * Copyright (C) 2022-2025, Konsulko AB.
+ *
+ * Zblock is a small object allocator with the intention to serve as a
+ * zpool backend. It operates on page blocks which consist of number
+ * of physical pages being a power of 2 and store integer number of
+ * compressed pages per block which results in determinism and simplicity.
+ *
+ * zblock doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zpool.h>
+#include "zblock.h"
+
+static struct rb_root block_desc_tree = RB_ROOT;
+
+/* Encode handle of a particular slot in the pool using metadata */
+static inline unsigned long metadata_to_handle(struct zblock_block *block,
+				unsigned int block_type, unsigned int slot)
+{
+	return (unsigned long)(block) | (block_type << SLOT_BITS) | slot;
+}
+
+/* Return block, block type and slot in the pool corresponding to handle */
+static inline struct zblock_block *handle_to_metadata(unsigned long handle,
+				unsigned int *block_type, unsigned int *slot)
+{
+	*block_type = (handle & (PAGE_SIZE - 1)) >> SLOT_BITS;
+	*slot = handle & SLOT_MASK;
+	return (struct zblock_block *)(handle & PAGE_MASK);
+}
+
+/*
+ * Find a block with at least one free slot and claim it.
+ * We make sure that the first block, if exists, will always work.
+ */
+static inline struct zblock_block *find_and_claim_block(struct block_list *b,
+		int block_type, unsigned long *handle)
+{
+	struct list_head *l = &b->active_list;
+	unsigned int slot;
+
+	if (!list_empty(l)) {
+		struct zblock_block *z = list_first_entry(l, typeof(*z), link);
+
+		if (--z->free_slots == 0)
+			list_move(&z->link, &b->full_list);
+		/*
+		 * There is a slot in the block and we just made sure it would
+		 * remain.
+		 * Find that slot and set the busy bit.
+		 */
+		for (slot = find_first_zero_bit(z->slot_info,
+					block_desc[block_type].slots_per_block);
+		     slot < block_desc[block_type].slots_per_block;
+		     slot = find_next_zero_bit(z->slot_info,
+					block_desc[block_type].slots_per_block,
+					slot)) {
+			if (!test_and_set_bit(slot, z->slot_info))
+				break;
+			barrier();
+		}
+
+		WARN_ON(slot >= block_desc[block_type].slots_per_block);
+		*handle = metadata_to_handle(z, block_type, slot);
+		return z;
+	}
+	return NULL;
+}
+
+/*
+ * allocate new block and add it to corresponding block list
+ */
+static struct zblock_block *alloc_block(struct zblock_pool *pool,
+					int block_type, gfp_t gfp,
+					unsigned long *handle)
+{
+	struct zblock_block *block;
+	struct block_list *block_list;
+
+	block = (void *)__get_free_pages(gfp, block_desc[block_type].order);
+	if (!block)
+		return NULL;
+
+	block_list = &pool->block_lists[block_type];
+
+	/* init block data  */
+	block->free_slots = block_desc[block_type].slots_per_block - 1;
+	memset(&block->slot_info, 0, sizeof(block->slot_info));
+	set_bit(0, block->slot_info);
+	*handle = metadata_to_handle(block, block_type, 0);
+
+	spin_lock(&block_list->lock);
+	list_add(&block->link, &block_list->active_list);
+	block_list->block_count++;
+	spin_unlock(&block_list->lock);
+	return block;
+}
+
+/*****************
+ * API Functions
+ *****************/
+/**
+ * zblock_create_pool() - create a new zblock pool
+ * @gfp:	gfp flags when allocating the zblock pool structure
+ * @ops:	user-defined operations for the zblock pool
+ *
+ * Return: pointer to the new zblock pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct zblock_pool *zblock_create_pool(gfp_t gfp)
+{
+	struct zblock_pool *pool;
+	struct block_list *block_list;
+	int i;
+
+	pool = kmalloc(sizeof(struct zblock_pool), gfp);
+	if (!pool)
+		return NULL;
+
+	/* init each block list */
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+		block_list = &pool->block_lists[i];
+		spin_lock_init(&block_list->lock);
+		INIT_LIST_HEAD(&block_list->full_list);
+		INIT_LIST_HEAD(&block_list->active_list);
+		block_list->block_count = 0;
+	}
+	return pool;
+}
+
+/**
+ * zblock_destroy_pool() - destroys an existing zblock pool
+ * @pool:	the zblock pool to be destroyed
+ *
+ */
+static void zblock_destroy_pool(struct zblock_pool *pool)
+{
+	kfree(pool);
+}
+
+
+/**
+ * zblock_alloc() - allocates a slot of appropriate size
+ * @pool:	zblock pool from which to allocate
+ * @size:	size in bytes of the desired allocation
+ * @gfp:	gfp flags used if the pool needs to grow
+ * @handle:	handle of the new allocation
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new slot.
+ */
+static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	int block_type = -1;
+	struct zblock_block *block;
+	struct block_list *block_list;
+
+	if (!size)
+		return -EINVAL;
+
+	if (size > PAGE_SIZE)
+		return -ENOSPC;
+
+	/* find basic block type with suitable slot size */
+	if (size < block_desc[0].slot_size)
+		block_type = 0;
+	else {
+		struct block_desc_node *block_node;
+		struct rb_node *node = block_desc_tree.rb_node;
+
+		while (node) {
+			block_node = container_of(node, typeof(*block_node), node);
+			if (size < block_node->this_slot_size)
+				node = node->rb_left;
+			else if (size >= block_node->next_slot_size)
+				node = node->rb_right;
+			else {
+				block_type = block_node->block_idx + 1;
+				break;
+			}
+		}
+	}
+	if (WARN_ON(block_type < 0))
+		return -EINVAL;
+	if (block_type >= ARRAY_SIZE(block_desc))
+		return -ENOSPC;
+
+	block_list = &pool->block_lists[block_type];
+
+	spin_lock(&block_list->lock);
+	block = find_and_claim_block(block_list, block_type, handle);
+	spin_unlock(&block_list->lock);
+	if (block)
+		return 0;
+
+	/* not found block with free slots try to allocate new empty block */
+	block = alloc_block(pool, block_type, gfp & ~(__GFP_MOVABLE | __GFP_HIGHMEM), handle);
+	return block ? 0 : -ENOMEM;
+}
+
+/**
+ * zblock_free() - frees the allocation associated with the given handle
+ * @pool:	pool in which the allocation resided
+ * @handle:	handle associated with the allocation returned by zblock_alloc()
+ *
+ */
+static void zblock_free(struct zblock_pool *pool, unsigned long handle)
+{
+	unsigned int slot, block_type;
+	struct zblock_block *block;
+	struct block_list *block_list;
+
+	block = handle_to_metadata(handle, &block_type, &slot);
+	block_list = &pool->block_lists[block_type];
+
+	spin_lock(&block_list->lock);
+	/* if all slots in block are empty delete whole block */
+	if (++block->free_slots == block_desc[block_type].slots_per_block) {
+		block_list->block_count--;
+		list_del(&block->link);
+		spin_unlock(&block_list->lock);
+		free_pages((unsigned long)block, block_desc[block_type].order);
+		return;
+	} else if (block->free_slots == 1)
+		list_move_tail(&block->link, &block_list->active_list);
+	clear_bit(slot, block->slot_info);
+	spin_unlock(&block_list->lock);
+}
+
+/**
+ * zblock_map() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be mapped
+ *
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *zblock_map(struct zblock_pool *pool, unsigned long handle)
+{
+	unsigned int block_type, slot;
+	struct zblock_block *block;
+	unsigned long offs;
+	void *p;
+
+	block = handle_to_metadata(handle, &block_type, &slot);
+	offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block_type].slot_size;
+	p = (void *)block + offs;
+	return p;
+}
+
+/**
+ * zblock_unmap() - unmaps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be unmapped
+ */
+static void zblock_unmap(struct zblock_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zblock_write() - write to the memory area defined by handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation
+ * @handle_mem: pointer to source memory block
+ * @mem_len:	length of the memory block to write
+ */
+static void zblock_write(struct zblock_pool *pool, unsigned long handle,
+			 void *handle_mem, size_t mem_len)
+{
+	unsigned int block_type, slot;
+	struct zblock_block *block;
+	unsigned long offs;
+	void *p;
+
+	block = handle_to_metadata(handle, &block_type, &slot);
+	offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block_type].slot_size;
+	p = (void *)block + offs;
+	memcpy(p, handle_mem, mem_len);
+}
+
+/**
+ * zblock_get_total_pages() - gets the zblock pool size in pages
+ * @pool:	pool being queried
+ *
+ * Returns: size in bytes of the given pool.
+ */
+static u64 zblock_get_total_pages(struct zblock_pool *pool)
+{
+	u64 total_size;
+	int i;
+
+	total_size = 0;
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++)
+		total_size += pool->block_lists[i].block_count << block_desc[i].order;
+
+	return total_size;
+}
+
+/*****************
+ * zpool
+ ****************/
+
+static void *zblock_zpool_create(const char *name, gfp_t gfp)
+{
+	return zblock_create_pool(gfp);
+}
+
+static void zblock_zpool_destroy(void *pool)
+{
+	zblock_destroy_pool(pool);
+}
+
+static int zblock_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	return zblock_alloc(pool, size, gfp, handle);
+}
+
+static void zblock_zpool_free(void *pool, unsigned long handle)
+{
+	zblock_free(pool, handle);
+}
+
+static void *zblock_zpool_read_begin(void *pool, unsigned long handle,
+				void *local_copy)
+{
+	return zblock_map(pool, handle);
+}
+
+static void zblock_zpool_obj_write(void *pool, unsigned long handle,
+				void *handle_mem, size_t mem_len)
+{
+	zblock_write(pool, handle, handle_mem, mem_len);
+}
+
+static void zblock_zpool_read_end(void *pool, unsigned long handle,
+				void *handle_mem)
+{
+	zblock_unmap(pool, handle);
+}
+
+static u64 zblock_zpool_total_pages(void *pool)
+{
+	return zblock_get_total_pages(pool);
+}
+
+static struct zpool_driver zblock_zpool_driver = {
+	.type =			"zblock",
+	.owner =		THIS_MODULE,
+	.create =		zblock_zpool_create,
+	.destroy =		zblock_zpool_destroy,
+	.malloc =		zblock_zpool_malloc,
+	.free =			zblock_zpool_free,
+	.obj_read_begin =	zblock_zpool_read_begin,
+	.obj_read_end =		zblock_zpool_read_end,
+	.obj_write =		zblock_zpool_obj_write,
+	.total_pages =		zblock_zpool_total_pages,
+};
+
+MODULE_ALIAS("zpool-zblock");
+
+static void delete_rbtree(void)
+{
+	while (!RB_EMPTY_ROOT(&block_desc_tree))
+		rb_erase(block_desc_tree.rb_node, &block_desc_tree);
+}
+
+static int __init create_rbtree(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+		struct block_desc_node *block_node = kmalloc(sizeof(*block_node),
+							GFP_KERNEL);
+		struct rb_node **new = &block_desc_tree.rb_node, *parent = NULL;
+
+		if (!block_node) {
+			delete_rbtree();
+			return -ENOMEM;
+		}
+		if (i > 0 && block_desc[i].slot_size <= block_desc[i-1].slot_size) {
+			pr_err("%s: block descriptors not in ascending order\n",
+				__func__);
+			delete_rbtree();
+			return -EINVAL;
+		}
+		block_node->this_slot_size = block_desc[i].slot_size;
+		block_node->block_idx = i;
+		if (i == ARRAY_SIZE(block_desc) - 1)
+			block_node->next_slot_size = PAGE_SIZE;
+		else
+			block_node->next_slot_size = block_desc[i+1].slot_size;
+		while (*new) {
+			parent = *new;
+			/* the array is sorted so we will always go to the right */
+			new = &((*new)->rb_right);
+		}
+		rb_link_node(&block_node->node, parent, new);
+		rb_insert_color(&block_node->node, &block_desc_tree);
+	}
+	return 0;
+}
+
+static int __init init_zblock(void)
+{
+	int ret = create_rbtree();
+
+	if (ret)
+		return ret;
+
+	zpool_register_driver(&zblock_zpool_driver);
+	return 0;
+}
+
+static void __exit exit_zblock(void)
+{
+	zpool_unregister_driver(&zblock_zpool_driver);
+	delete_rbtree();
+}
+
+module_init(init_zblock);
+module_exit(exit_zblock);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitaly.wool@konsulko.se>");
+MODULE_DESCRIPTION("Block allocator for compressed pages");
diff --git a/mm/zblock.h b/mm/zblock.h
new file mode 100644
index 000000000000..fd72961c077a
--- /dev/null
+++ b/mm/zblock.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2025, Konsulko AB.
+ */
+#ifndef __ZBLOCK_H__
+#define __ZBLOCK_H__
+
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/types.h>
+
+#define SLOT_FREE 0
+#define BIT_SLOT_OCCUPIED 0
+#define BIT_SLOT_MAPPED 1
+
+#if PAGE_SIZE == 0x1000
+/* max 128 slots per block, max table size 32 */
+#define SLOT_BITS 7
+#elif PAGE_SIZE == 0x4000
+/* max 256 slots per block, max table size 64 */
+#define SLOT_BITS 8
+#else
+#error Unsupported PAGE_SIZE
+#endif
+
+#define MAX_SLOTS (1 << SLOT_BITS)
+#define SLOT_MASK ((0x1UL << SLOT_BITS) - 1)
+
+#define ZBLOCK_HEADER_SIZE	round_up(sizeof(struct zblock_block), sizeof(long))
+#define BLOCK_DATA_SIZE(order) ((PAGE_SIZE << order) - ZBLOCK_HEADER_SIZE)
+#define SLOT_SIZE(nslots, order) (round_down((BLOCK_DATA_SIZE(order) / nslots), sizeof(long)))
+
+/**
+ * struct zblock_block - block metadata
+ * Block consists of several (1/2/4/8) pages and contains fixed
+ * integer number of slots for allocating compressed pages.
+ *
+ * free_slots:	number of free slots in the block
+ * slot_info:	contains data about free/occupied slots
+ */
+struct zblock_block {
+	struct list_head link;
+	DECLARE_BITMAP(slot_info, 1 << SLOT_BITS);
+	u32 free_slots;
+};
+
+/**
+ * struct block_desc - general metadata for block lists
+ * Each block list stores only blocks of corresponding type which means
+ * that all blocks in it have the same number and size of slots.
+ * All slots are aligned to size of long.
+ *
+ * slot_size:		size of slot for this list
+ * slots_per_block:	number of slots per block for this list
+ * order:		order for __get_free_pages
+ */
+struct block_desc {
+	unsigned int slot_size;
+	unsigned short slots_per_block;
+	unsigned short order;
+};
+
+struct block_desc_node {
+	struct rb_node node;
+	unsigned int this_slot_size;
+	unsigned int next_slot_size;
+	unsigned int block_idx;
+};
+
+static const struct block_desc block_desc[] = {
+#if PAGE_SIZE == 0x1000
+	{ SLOT_SIZE(63, 0), 63, 0 },
+	{ SLOT_SIZE(32, 0), 32, 0 },
+	{ SLOT_SIZE(21, 0), 21, 0 },
+	{ SLOT_SIZE(15, 0), 15, 0 },
+	{ SLOT_SIZE(12, 0), 12, 0 },
+	{ SLOT_SIZE(10, 0), 10, 0 },
+	{ SLOT_SIZE(9, 0), 9, 0 },
+	{ SLOT_SIZE(8, 0), 8, 0 },
+	{ SLOT_SIZE(29, 2), 29, 2 },
+	{ SLOT_SIZE(13, 1), 13, 1 },
+	{ SLOT_SIZE(6, 0), 6, 0 },
+	{ SLOT_SIZE(11, 1), 11, 1 },
+	{ SLOT_SIZE(5, 0), 5, 0 },
+	{ SLOT_SIZE(9, 1), 9, 1 },
+	{ SLOT_SIZE(8, 1), 8, 1 },
+	{ SLOT_SIZE(29, 3), 29, 3 },
+	{ SLOT_SIZE(13, 2), 13, 2 },
+	{ SLOT_SIZE(12, 2), 12, 2 },
+	{ SLOT_SIZE(11, 2), 11, 2 },
+	{ SLOT_SIZE(10, 2), 10, 2 },
+	{ SLOT_SIZE(9, 2), 9, 2 },
+	{ SLOT_SIZE(17, 3), 17, 3 },
+	{ SLOT_SIZE(8, 2), 8, 2 },
+	{ SLOT_SIZE(15, 3), 15, 3 },
+	{ SLOT_SIZE(14, 3), 14, 3 },
+	{ SLOT_SIZE(13, 3), 13, 3 },
+	{ SLOT_SIZE(6, 2), 6, 2 },
+	{ SLOT_SIZE(11, 3), 11, 3 },
+	{ SLOT_SIZE(10, 3), 10, 3 },
+	{ SLOT_SIZE(9, 3), 9, 3 },
+	{ SLOT_SIZE(4, 2), 4, 2 },
+#elif PAGE_SIZE == 0x4000
+	{ SLOT_SIZE(255, 0), 255, 0 },
+	{ SLOT_SIZE(185, 0), 185, 0 },
+	{ SLOT_SIZE(145, 0), 145, 0 },
+	{ SLOT_SIZE(113, 0), 113, 0 },
+	{ SLOT_SIZE(92, 0), 92, 0 },
+	{ SLOT_SIZE(75, 0), 75, 0 },
+	{ SLOT_SIZE(60, 0), 60, 0 },
+	{ SLOT_SIZE(51, 0), 51, 0 },
+	{ SLOT_SIZE(43, 0), 43, 0 },
+	{ SLOT_SIZE(37, 0), 37, 0 },
+	{ SLOT_SIZE(32, 0), 32, 0 },
+	{ SLOT_SIZE(27, 0), 27, 0 },
+	{ SLOT_SIZE(23, 0), 23, 0 },
+	{ SLOT_SIZE(19, 0), 19, 0 },
+	{ SLOT_SIZE(17, 0), 17, 0 },
+	{ SLOT_SIZE(15, 0), 15, 0 },
+	{ SLOT_SIZE(13, 0), 13, 0 },
+	{ SLOT_SIZE(11, 0), 11, 0 },
+	{ SLOT_SIZE(10, 0), 10, 0 },
+	{ SLOT_SIZE(9, 0), 9, 0 },
+	{ SLOT_SIZE(8, 0), 8, 0 },
+	{ SLOT_SIZE(15, 1), 15, 1 },
+	{ SLOT_SIZE(14, 1), 14, 1 },
+	{ SLOT_SIZE(13, 1), 13, 1 },
+	{ SLOT_SIZE(12, 1), 12, 1 },
+	{ SLOT_SIZE(11, 1), 11, 1 },
+	{ SLOT_SIZE(10, 1), 10, 1 },
+	{ SLOT_SIZE(9, 1), 9, 1 },
+	{ SLOT_SIZE(8, 1), 8, 1 },
+	{ SLOT_SIZE(15, 2), 15, 2 },
+	{ SLOT_SIZE(14, 2), 14, 2 },
+	{ SLOT_SIZE(13, 2), 13, 2 },
+	{ SLOT_SIZE(12, 2), 12, 2 },
+	{ SLOT_SIZE(11, 2), 11, 2 },
+	{ SLOT_SIZE(10, 2), 10, 2 },
+	{ SLOT_SIZE(9, 2), 9, 2 },
+	{ SLOT_SIZE(8, 2), 8, 2 },
+	{ SLOT_SIZE(7, 2), 7, 2 },
+	{ SLOT_SIZE(6, 2), 6, 2 },
+	{ SLOT_SIZE(5, 2), 5, 2 },
+#endif /* PAGE_SIZE */
+};
+
+/**
+ * struct block_list - stores metadata of particular list
+ * lock:		protects the list of blocks
+ * active_list:		linked list of active (non-full) blocks
+ * full_list:		linked list of full blocks
+ * block_count:		total number of blocks in the list
+ */
+struct block_list {
+	spinlock_t lock;
+	struct list_head active_list;
+	struct list_head full_list;
+	unsigned long block_count;
+};
+
+/**
+ * struct zblock_pool - stores metadata for each zblock pool
+ * @block_lists:	array of block lists
+ * @zpool:		zpool driver
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * for a particular zblock pool.
+ */
+struct zblock_pool {
+	struct block_list block_lists[ARRAY_SIZE(block_desc)];
+	struct zpool *zpool;
+};
+
+
+#endif
-- 
2.49.0



^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
@ 2025-04-12 19:25 ` Igor Belousov
  2025-04-16 12:09 ` Johannes Weiner
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 20+ messages in thread
From: Igor Belousov @ 2025-04-12 19:25 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner

2025-04-12 19:42 skrev Vitaly Wool:
> zblock is a special purpose allocator for storing compressed pages.
> It stores integer number of same size objects per its block. These
> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> 
> With zblock, it is possible to densely arrange objects of various sizes
> resulting in low internal fragmentation. Also this allocator tries to
> fill incomplete blocks instead of adding new ones, in many cases
> providing a compression ratio comparable to zmalloc's.
> 
> zblock is also in most cases superior to zsmalloc with regard to
> average performance and worst execution times, thus allowing for better
> response time and real-time characteristics of the whole system.
> 
> High memory and page migration are currently not supported by zblock.
> 
> Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se>
> Signed-off-by: Igor Belousov <igor.b@beldev.am>

Tested-by: Igor Belousov <igor.b@beldev.am>

> ---
> v3: 
> https://patchwork.kernel.org/project/linux-mm/patch/20250408125211.1611879-1-vitaly.wool@konsulko.se/
> Changes since v3:
> - rebased and tested against the latest -mm tree
> - the block descriptors table was updated for better compression ratio
> - fixed the bug with wrong SLOT_BITS value
> - slot search moved to find_and_claim_block()
> 
> Test results (zstd compressor, 8 core Ryzen 9 VM, make bzImage):
> - zblock:
>     real	6m52.621s
>     user	33m41.771s
>     sys 	6m28.825s
>     Zswap:            162328 kB
>     Zswapped:         754468 kB
>     zswpin 93851
>     zswpout 542481
>     zswpwb 935
> - zsmalloc:
>     real	7m4.355s
>     user	34m37.538s
>     sys 	6m22.086s
>     zswpin 101243
>     zswpout 448217
>     zswpwb 640
>     Zswap:            175704 kB
>     Zswapped:         778692 kB
> 
>  Documentation/mm/zblock.rst |  24 ++
>  MAINTAINERS                 |   7 +
>  mm/Kconfig                  |  12 +
>  mm/Makefile                 |   1 +
>  mm/zblock.c                 | 443 ++++++++++++++++++++++++++++++++++++
>  mm/zblock.h                 | 176 ++++++++++++++
>  6 files changed, 663 insertions(+)
>  create mode 100644 Documentation/mm/zblock.rst
>  create mode 100644 mm/zblock.c
>  create mode 100644 mm/zblock.h
> 
> diff --git a/Documentation/mm/zblock.rst b/Documentation/mm/zblock.rst
> new file mode 100644
> index 000000000000..9751434d0b76
> --- /dev/null
> +++ b/Documentation/mm/zblock.rst
> @@ -0,0 +1,24 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +======
> +zblock
> +======
> +
> +zblock is a special purpose allocator for storing compressed pages.
> +It stores integer number of compressed objects per its block. These
> +blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> +
> +With zblock, it is possible to densely arrange objects of various 
> sizes
> +resulting in low internal fragmentation. Also this allocator tries to
> +fill incomplete blocks instead of adding new ones,  in many cases
> +providing a compression ratio substantially higher than z3fold and 
> zbud
> +(though lower than zmalloc's).
> +
> +zblock does not require MMU to operate and also is superior to 
> zsmalloc
> +with regard to average performance and worst execution times, thus
> +allowing for better response time and real-time characteristics of the
> +whole system.
> +
> +E. g. on a series of stress-ng tests run on a Raspberry Pi 5, we get
> +5-10% higher value for bogo ops/s in zblock/zsmalloc comparison.
> +
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 96b827049501..46465c986005 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -26640,6 +26640,13 @@ 
> F:	Documentation/networking/device_drivers/hamradio/z8530drv.rst
>  F:	drivers/net/hamradio/*scc.c
>  F:	drivers/net/hamradio/z8530.h
> 
> +ZBLOCK COMPRESSED SLAB MEMORY ALLOCATOR
> +M:	Vitaly Wool <vitaly.wool@konsulko.se>
> +L:	linux-mm@kvack.org
> +S:	Maintained
> +F:	Documentation/mm/zblock.rst
> +F:	mm/zblock.[ch]
> +
>  ZD1211RW WIRELESS DRIVER
>  L:	linux-wireless@vger.kernel.org
>  S:	Orphan
> diff --git a/mm/Kconfig b/mm/Kconfig
> index e113f713b493..5aa1479151ec 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -152,6 +152,18 @@ config ZSWAP_ZPOOL_DEFAULT
>         default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
>         default ""
> 
> +config ZBLOCK
> +	tristate "Fast compression allocator with high density"
> +	depends on ZPOOL
> +	help
> +	  A special purpose allocator for storing compressed pages.
> +	  It stores integer number of same size compressed objects per
> +	  its block. These blocks consist of several physical pages
> +	  (2**n, i. e. 1/2/4/8).
> +
> +	  With zblock, it is possible to densely arrange objects of
> +	  various sizes resulting in low internal fragmentation.
> +
>  config ZSMALLOC
>  	tristate
>  	prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
> diff --git a/mm/Makefile b/mm/Makefile
> index e7f6bbf8ae5f..9d7e5b5bb694 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -116,6 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += 
> debug_vm_pgtable.o
>  obj-$(CONFIG_PAGE_OWNER) += page_owner.o
>  obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
>  obj-$(CONFIG_ZPOOL)	+= zpool.o
> +obj-$(CONFIG_ZBLOCK)	+= zblock.o
>  obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
>  obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
>  obj-$(CONFIG_CMA)	+= cma.o
> diff --git a/mm/zblock.c b/mm/zblock.c
> new file mode 100644
> index 000000000000..ecc7aeb611af
> --- /dev/null
> +++ b/mm/zblock.c
> @@ -0,0 +1,443 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * zblock.c
> + *
> + * Author: Vitaly Wool <vitaly.wool@konsulko.se>
> + * Based on the work from Ananda Badmaev <a.badmaev@clicknet.pro>
> + * Copyright (C) 2022-2025, Konsulko AB.
> + *
> + * Zblock is a small object allocator with the intention to serve as a
> + * zpool backend. It operates on page blocks which consist of number
> + * of physical pages being a power of 2 and store integer number of
> + * compressed pages per block which results in determinism and 
> simplicity.
> + *
> + * zblock doesn't export any API and is meant to be used via zpool 
> API.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/atomic.h>
> +#include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/preempt.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/zpool.h>
> +#include "zblock.h"
> +
> +static struct rb_root block_desc_tree = RB_ROOT;
> +
> +/* Encode handle of a particular slot in the pool using metadata */
> +static inline unsigned long metadata_to_handle(struct zblock_block 
> *block,
> +				unsigned int block_type, unsigned int slot)
> +{
> +	return (unsigned long)(block) | (block_type << SLOT_BITS) | slot;
> +}
> +
> +/* Return block, block type and slot in the pool corresponding to 
> handle */
> +static inline struct zblock_block *handle_to_metadata(unsigned long 
> handle,
> +				unsigned int *block_type, unsigned int *slot)
> +{
> +	*block_type = (handle & (PAGE_SIZE - 1)) >> SLOT_BITS;
> +	*slot = handle & SLOT_MASK;
> +	return (struct zblock_block *)(handle & PAGE_MASK);
> +}
> +
> +/*
> + * Find a block with at least one free slot and claim it.
> + * We make sure that the first block, if exists, will always work.
> + */
> +static inline struct zblock_block *find_and_claim_block(struct 
> block_list *b,
> +		int block_type, unsigned long *handle)
> +{
> +	struct list_head *l = &b->active_list;
> +	unsigned int slot;
> +
> +	if (!list_empty(l)) {
> +		struct zblock_block *z = list_first_entry(l, typeof(*z), link);
> +
> +		if (--z->free_slots == 0)
> +			list_move(&z->link, &b->full_list);
> +		/*
> +		 * There is a slot in the block and we just made sure it would
> +		 * remain.
> +		 * Find that slot and set the busy bit.
> +		 */
> +		for (slot = find_first_zero_bit(z->slot_info,
> +					block_desc[block_type].slots_per_block);
> +		     slot < block_desc[block_type].slots_per_block;
> +		     slot = find_next_zero_bit(z->slot_info,
> +					block_desc[block_type].slots_per_block,
> +					slot)) {
> +			if (!test_and_set_bit(slot, z->slot_info))
> +				break;
> +			barrier();
> +		}
> +
> +		WARN_ON(slot >= block_desc[block_type].slots_per_block);
> +		*handle = metadata_to_handle(z, block_type, slot);
> +		return z;
> +	}
> +	return NULL;
> +}
> +
> +/*
> + * allocate new block and add it to corresponding block list
> + */
> +static struct zblock_block *alloc_block(struct zblock_pool *pool,
> +					int block_type, gfp_t gfp,
> +					unsigned long *handle)
> +{
> +	struct zblock_block *block;
> +	struct block_list *block_list;
> +
> +	block = (void *)__get_free_pages(gfp, block_desc[block_type].order);
> +	if (!block)
> +		return NULL;
> +
> +	block_list = &pool->block_lists[block_type];
> +
> +	/* init block data  */
> +	block->free_slots = block_desc[block_type].slots_per_block - 1;
> +	memset(&block->slot_info, 0, sizeof(block->slot_info));
> +	set_bit(0, block->slot_info);
> +	*handle = metadata_to_handle(block, block_type, 0);
> +
> +	spin_lock(&block_list->lock);
> +	list_add(&block->link, &block_list->active_list);
> +	block_list->block_count++;
> +	spin_unlock(&block_list->lock);
> +	return block;
> +}
> +
> +/*****************
> + * API Functions
> + *****************/
> +/**
> + * zblock_create_pool() - create a new zblock pool
> + * @gfp:	gfp flags when allocating the zblock pool structure
> + * @ops:	user-defined operations for the zblock pool
> + *
> + * Return: pointer to the new zblock pool or NULL if the metadata 
> allocation
> + * failed.
> + */
> +static struct zblock_pool *zblock_create_pool(gfp_t gfp)
> +{
> +	struct zblock_pool *pool;
> +	struct block_list *block_list;
> +	int i;
> +
> +	pool = kmalloc(sizeof(struct zblock_pool), gfp);
> +	if (!pool)
> +		return NULL;
> +
> +	/* init each block list */
> +	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
> +		block_list = &pool->block_lists[i];
> +		spin_lock_init(&block_list->lock);
> +		INIT_LIST_HEAD(&block_list->full_list);
> +		INIT_LIST_HEAD(&block_list->active_list);
> +		block_list->block_count = 0;
> +	}
> +	return pool;
> +}
> +
> +/**
> + * zblock_destroy_pool() - destroys an existing zblock pool
> + * @pool:	the zblock pool to be destroyed
> + *
> + */
> +static void zblock_destroy_pool(struct zblock_pool *pool)
> +{
> +	kfree(pool);
> +}
> +
> +
> +/**
> + * zblock_alloc() - allocates a slot of appropriate size
> + * @pool:	zblock pool from which to allocate
> + * @size:	size in bytes of the desired allocation
> + * @gfp:	gfp flags used if the pool needs to grow
> + * @handle:	handle of the new allocation
> + *
> + * Return: 0 if success and handle is set, otherwise -EINVAL if the 
> size or
> + * gfp arguments are invalid or -ENOMEM if the pool was unable to 
> allocate
> + * a new slot.
> + */
> +static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t 
> gfp,
> +			unsigned long *handle)
> +{
> +	int block_type = -1;
> +	struct zblock_block *block;
> +	struct block_list *block_list;
> +
> +	if (!size)
> +		return -EINVAL;
> +
> +	if (size > PAGE_SIZE)
> +		return -ENOSPC;
> +
> +	/* find basic block type with suitable slot size */
> +	if (size < block_desc[0].slot_size)
> +		block_type = 0;
> +	else {
> +		struct block_desc_node *block_node;
> +		struct rb_node *node = block_desc_tree.rb_node;
> +
> +		while (node) {
> +			block_node = container_of(node, typeof(*block_node), node);
> +			if (size < block_node->this_slot_size)
> +				node = node->rb_left;
> +			else if (size >= block_node->next_slot_size)
> +				node = node->rb_right;
> +			else {
> +				block_type = block_node->block_idx + 1;
> +				break;
> +			}
> +		}
> +	}
> +	if (WARN_ON(block_type < 0))
> +		return -EINVAL;
> +	if (block_type >= ARRAY_SIZE(block_desc))
> +		return -ENOSPC;
> +
> +	block_list = &pool->block_lists[block_type];
> +
> +	spin_lock(&block_list->lock);
> +	block = find_and_claim_block(block_list, block_type, handle);
> +	spin_unlock(&block_list->lock);
> +	if (block)
> +		return 0;
> +
> +	/* not found block with free slots try to allocate new empty block */
> +	block = alloc_block(pool, block_type, gfp & ~(__GFP_MOVABLE | 
> __GFP_HIGHMEM), handle);
> +	return block ? 0 : -ENOMEM;
> +}
> +
> +/**
> + * zblock_free() - frees the allocation associated with the given 
> handle
> + * @pool:	pool in which the allocation resided
> + * @handle:	handle associated with the allocation returned by 
> zblock_alloc()
> + *
> + */
> +static void zblock_free(struct zblock_pool *pool, unsigned long 
> handle)
> +{
> +	unsigned int slot, block_type;
> +	struct zblock_block *block;
> +	struct block_list *block_list;
> +
> +	block = handle_to_metadata(handle, &block_type, &slot);
> +	block_list = &pool->block_lists[block_type];
> +
> +	spin_lock(&block_list->lock);
> +	/* if all slots in block are empty delete whole block */
> +	if (++block->free_slots == block_desc[block_type].slots_per_block) {
> +		block_list->block_count--;
> +		list_del(&block->link);
> +		spin_unlock(&block_list->lock);
> +		free_pages((unsigned long)block, block_desc[block_type].order);
> +		return;
> +	} else if (block->free_slots == 1)
> +		list_move_tail(&block->link, &block_list->active_list);
> +	clear_bit(slot, block->slot_info);
> +	spin_unlock(&block_list->lock);
> +}
> +
> +/**
> + * zblock_map() - maps the allocation associated with the given handle
> + * @pool:	pool in which the allocation resides
> + * @handle:	handle associated with the allocation to be mapped
> + *
> + *
> + * Returns: a pointer to the mapped allocation
> + */
> +static void *zblock_map(struct zblock_pool *pool, unsigned long 
> handle)
> +{
> +	unsigned int block_type, slot;
> +	struct zblock_block *block;
> +	unsigned long offs;
> +	void *p;
> +
> +	block = handle_to_metadata(handle, &block_type, &slot);
> +	offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block_type].slot_size;
> +	p = (void *)block + offs;
> +	return p;
> +}
> +
> +/**
> + * zblock_unmap() - unmaps the allocation associated with the given 
> handle
> + * @pool:	pool in which the allocation resides
> + * @handle:	handle associated with the allocation to be unmapped
> + */
> +static void zblock_unmap(struct zblock_pool *pool, unsigned long 
> handle)
> +{
> +}
> +
> +/**
> + * zblock_write() - write to the memory area defined by handle
> + * @pool:	pool in which the allocation resides
> + * @handle:	handle associated with the allocation
> + * @handle_mem: pointer to source memory block
> + * @mem_len:	length of the memory block to write
> + */
> +static void zblock_write(struct zblock_pool *pool, unsigned long 
> handle,
> +			 void *handle_mem, size_t mem_len)
> +{
> +	unsigned int block_type, slot;
> +	struct zblock_block *block;
> +	unsigned long offs;
> +	void *p;
> +
> +	block = handle_to_metadata(handle, &block_type, &slot);
> +	offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block_type].slot_size;
> +	p = (void *)block + offs;
> +	memcpy(p, handle_mem, mem_len);
> +}
> +
> +/**
> + * zblock_get_total_pages() - gets the zblock pool size in pages
> + * @pool:	pool being queried
> + *
> + * Returns: size in bytes of the given pool.
> + */
> +static u64 zblock_get_total_pages(struct zblock_pool *pool)
> +{
> +	u64 total_size;
> +	int i;
> +
> +	total_size = 0;
> +	for (i = 0; i < ARRAY_SIZE(block_desc); i++)
> +		total_size += pool->block_lists[i].block_count << 
> block_desc[i].order;
> +
> +	return total_size;
> +}
> +
> +/*****************
> + * zpool
> + ****************/
> +
> +static void *zblock_zpool_create(const char *name, gfp_t gfp)
> +{
> +	return zblock_create_pool(gfp);
> +}
> +
> +static void zblock_zpool_destroy(void *pool)
> +{
> +	zblock_destroy_pool(pool);
> +}
> +
> +static int zblock_zpool_malloc(void *pool, size_t size, gfp_t gfp,
> +			unsigned long *handle)
> +{
> +	return zblock_alloc(pool, size, gfp, handle);
> +}
> +
> +static void zblock_zpool_free(void *pool, unsigned long handle)
> +{
> +	zblock_free(pool, handle);
> +}
> +
> +static void *zblock_zpool_read_begin(void *pool, unsigned long handle,
> +				void *local_copy)
> +{
> +	return zblock_map(pool, handle);
> +}
> +
> +static void zblock_zpool_obj_write(void *pool, unsigned long handle,
> +				void *handle_mem, size_t mem_len)
> +{
> +	zblock_write(pool, handle, handle_mem, mem_len);
> +}
> +
> +static void zblock_zpool_read_end(void *pool, unsigned long handle,
> +				void *handle_mem)
> +{
> +	zblock_unmap(pool, handle);
> +}
> +
> +static u64 zblock_zpool_total_pages(void *pool)
> +{
> +	return zblock_get_total_pages(pool);
> +}
> +
> +static struct zpool_driver zblock_zpool_driver = {
> +	.type =			"zblock",
> +	.owner =		THIS_MODULE,
> +	.create =		zblock_zpool_create,
> +	.destroy =		zblock_zpool_destroy,
> +	.malloc =		zblock_zpool_malloc,
> +	.free =			zblock_zpool_free,
> +	.obj_read_begin =	zblock_zpool_read_begin,
> +	.obj_read_end =		zblock_zpool_read_end,
> +	.obj_write =		zblock_zpool_obj_write,
> +	.total_pages =		zblock_zpool_total_pages,
> +};
> +
> +MODULE_ALIAS("zpool-zblock");
> +
> +static void delete_rbtree(void)
> +{
> +	while (!RB_EMPTY_ROOT(&block_desc_tree))
> +		rb_erase(block_desc_tree.rb_node, &block_desc_tree);
> +}
> +
> +static int __init create_rbtree(void)
> +{
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
> +		struct block_desc_node *block_node = kmalloc(sizeof(*block_node),
> +							GFP_KERNEL);
> +		struct rb_node **new = &block_desc_tree.rb_node, *parent = NULL;
> +
> +		if (!block_node) {
> +			delete_rbtree();
> +			return -ENOMEM;
> +		}
> +		if (i > 0 && block_desc[i].slot_size <= block_desc[i-1].slot_size) {
> +			pr_err("%s: block descriptors not in ascending order\n",
> +				__func__);
> +			delete_rbtree();
> +			return -EINVAL;
> +		}
> +		block_node->this_slot_size = block_desc[i].slot_size;
> +		block_node->block_idx = i;
> +		if (i == ARRAY_SIZE(block_desc) - 1)
> +			block_node->next_slot_size = PAGE_SIZE;
> +		else
> +			block_node->next_slot_size = block_desc[i+1].slot_size;
> +		while (*new) {
> +			parent = *new;
> +			/* the array is sorted so we will always go to the right */
> +			new = &((*new)->rb_right);
> +		}
> +		rb_link_node(&block_node->node, parent, new);
> +		rb_insert_color(&block_node->node, &block_desc_tree);
> +	}
> +	return 0;
> +}
> +
> +static int __init init_zblock(void)
> +{
> +	int ret = create_rbtree();
> +
> +	if (ret)
> +		return ret;
> +
> +	zpool_register_driver(&zblock_zpool_driver);
> +	return 0;
> +}
> +
> +static void __exit exit_zblock(void)
> +{
> +	zpool_unregister_driver(&zblock_zpool_driver);
> +	delete_rbtree();
> +}
> +
> +module_init(init_zblock);
> +module_exit(exit_zblock);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Vitaly Wool <vitaly.wool@konsulko.se>");
> +MODULE_DESCRIPTION("Block allocator for compressed pages");
> diff --git a/mm/zblock.h b/mm/zblock.h
> new file mode 100644
> index 000000000000..fd72961c077a
> --- /dev/null
> +++ b/mm/zblock.h
> @@ -0,0 +1,176 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Author: Vitaly Wool <vitaly.wool@konsulko.com>
> + * Copyright (C) 2025, Konsulko AB.
> + */
> +#ifndef __ZBLOCK_H__
> +#define __ZBLOCK_H__
> +
> +#include <linux/mm.h>
> +#include <linux/rbtree.h>
> +#include <linux/types.h>
> +
> +#define SLOT_FREE 0
> +#define BIT_SLOT_OCCUPIED 0
> +#define BIT_SLOT_MAPPED 1
> +
> +#if PAGE_SIZE == 0x1000
> +/* max 128 slots per block, max table size 32 */
> +#define SLOT_BITS 7
> +#elif PAGE_SIZE == 0x4000
> +/* max 256 slots per block, max table size 64 */
> +#define SLOT_BITS 8
> +#else
> +#error Unsupported PAGE_SIZE
> +#endif
> +
> +#define MAX_SLOTS (1 << SLOT_BITS)
> +#define SLOT_MASK ((0x1UL << SLOT_BITS) - 1)
> +
> +#define ZBLOCK_HEADER_SIZE	round_up(sizeof(struct zblock_block), 
> sizeof(long))
> +#define BLOCK_DATA_SIZE(order) ((PAGE_SIZE << order) - 
> ZBLOCK_HEADER_SIZE)
> +#define SLOT_SIZE(nslots, order) (round_down((BLOCK_DATA_SIZE(order) / 
> nslots), sizeof(long)))
> +
> +/**
> + * struct zblock_block - block metadata
> + * Block consists of several (1/2/4/8) pages and contains fixed
> + * integer number of slots for allocating compressed pages.
> + *
> + * free_slots:	number of free slots in the block
> + * slot_info:	contains data about free/occupied slots
> + */
> +struct zblock_block {
> +	struct list_head link;
> +	DECLARE_BITMAP(slot_info, 1 << SLOT_BITS);
> +	u32 free_slots;
> +};
> +
> +/**
> + * struct block_desc - general metadata for block lists
> + * Each block list stores only blocks of corresponding type which 
> means
> + * that all blocks in it have the same number and size of slots.
> + * All slots are aligned to size of long.
> + *
> + * slot_size:		size of slot for this list
> + * slots_per_block:	number of slots per block for this list
> + * order:		order for __get_free_pages
> + */
> +struct block_desc {
> +	unsigned int slot_size;
> +	unsigned short slots_per_block;
> +	unsigned short order;
> +};
> +
> +struct block_desc_node {
> +	struct rb_node node;
> +	unsigned int this_slot_size;
> +	unsigned int next_slot_size;
> +	unsigned int block_idx;
> +};
> +
> +static const struct block_desc block_desc[] = {
> +#if PAGE_SIZE == 0x1000
> +	{ SLOT_SIZE(63, 0), 63, 0 },
> +	{ SLOT_SIZE(32, 0), 32, 0 },
> +	{ SLOT_SIZE(21, 0), 21, 0 },
> +	{ SLOT_SIZE(15, 0), 15, 0 },
> +	{ SLOT_SIZE(12, 0), 12, 0 },
> +	{ SLOT_SIZE(10, 0), 10, 0 },
> +	{ SLOT_SIZE(9, 0), 9, 0 },
> +	{ SLOT_SIZE(8, 0), 8, 0 },
> +	{ SLOT_SIZE(29, 2), 29, 2 },
> +	{ SLOT_SIZE(13, 1), 13, 1 },
> +	{ SLOT_SIZE(6, 0), 6, 0 },
> +	{ SLOT_SIZE(11, 1), 11, 1 },
> +	{ SLOT_SIZE(5, 0), 5, 0 },
> +	{ SLOT_SIZE(9, 1), 9, 1 },
> +	{ SLOT_SIZE(8, 1), 8, 1 },
> +	{ SLOT_SIZE(29, 3), 29, 3 },
> +	{ SLOT_SIZE(13, 2), 13, 2 },
> +	{ SLOT_SIZE(12, 2), 12, 2 },
> +	{ SLOT_SIZE(11, 2), 11, 2 },
> +	{ SLOT_SIZE(10, 2), 10, 2 },
> +	{ SLOT_SIZE(9, 2), 9, 2 },
> +	{ SLOT_SIZE(17, 3), 17, 3 },
> +	{ SLOT_SIZE(8, 2), 8, 2 },
> +	{ SLOT_SIZE(15, 3), 15, 3 },
> +	{ SLOT_SIZE(14, 3), 14, 3 },
> +	{ SLOT_SIZE(13, 3), 13, 3 },
> +	{ SLOT_SIZE(6, 2), 6, 2 },
> +	{ SLOT_SIZE(11, 3), 11, 3 },
> +	{ SLOT_SIZE(10, 3), 10, 3 },
> +	{ SLOT_SIZE(9, 3), 9, 3 },
> +	{ SLOT_SIZE(4, 2), 4, 2 },
> +#elif PAGE_SIZE == 0x4000
> +	{ SLOT_SIZE(255, 0), 255, 0 },
> +	{ SLOT_SIZE(185, 0), 185, 0 },
> +	{ SLOT_SIZE(145, 0), 145, 0 },
> +	{ SLOT_SIZE(113, 0), 113, 0 },
> +	{ SLOT_SIZE(92, 0), 92, 0 },
> +	{ SLOT_SIZE(75, 0), 75, 0 },
> +	{ SLOT_SIZE(60, 0), 60, 0 },
> +	{ SLOT_SIZE(51, 0), 51, 0 },
> +	{ SLOT_SIZE(43, 0), 43, 0 },
> +	{ SLOT_SIZE(37, 0), 37, 0 },
> +	{ SLOT_SIZE(32, 0), 32, 0 },
> +	{ SLOT_SIZE(27, 0), 27, 0 },
> +	{ SLOT_SIZE(23, 0), 23, 0 },
> +	{ SLOT_SIZE(19, 0), 19, 0 },
> +	{ SLOT_SIZE(17, 0), 17, 0 },
> +	{ SLOT_SIZE(15, 0), 15, 0 },
> +	{ SLOT_SIZE(13, 0), 13, 0 },
> +	{ SLOT_SIZE(11, 0), 11, 0 },
> +	{ SLOT_SIZE(10, 0), 10, 0 },
> +	{ SLOT_SIZE(9, 0), 9, 0 },
> +	{ SLOT_SIZE(8, 0), 8, 0 },
> +	{ SLOT_SIZE(15, 1), 15, 1 },
> +	{ SLOT_SIZE(14, 1), 14, 1 },
> +	{ SLOT_SIZE(13, 1), 13, 1 },
> +	{ SLOT_SIZE(12, 1), 12, 1 },
> +	{ SLOT_SIZE(11, 1), 11, 1 },
> +	{ SLOT_SIZE(10, 1), 10, 1 },
> +	{ SLOT_SIZE(9, 1), 9, 1 },
> +	{ SLOT_SIZE(8, 1), 8, 1 },
> +	{ SLOT_SIZE(15, 2), 15, 2 },
> +	{ SLOT_SIZE(14, 2), 14, 2 },
> +	{ SLOT_SIZE(13, 2), 13, 2 },
> +	{ SLOT_SIZE(12, 2), 12, 2 },
> +	{ SLOT_SIZE(11, 2), 11, 2 },
> +	{ SLOT_SIZE(10, 2), 10, 2 },
> +	{ SLOT_SIZE(9, 2), 9, 2 },
> +	{ SLOT_SIZE(8, 2), 8, 2 },
> +	{ SLOT_SIZE(7, 2), 7, 2 },
> +	{ SLOT_SIZE(6, 2), 6, 2 },
> +	{ SLOT_SIZE(5, 2), 5, 2 },
> +#endif /* PAGE_SIZE */
> +};
> +
> +/**
> + * struct block_list - stores metadata of particular list
> + * lock:		protects the list of blocks
> + * active_list:		linked list of active (non-full) blocks
> + * full_list:		linked list of full blocks
> + * block_count:		total number of blocks in the list
> + */
> +struct block_list {
> +	spinlock_t lock;
> +	struct list_head active_list;
> +	struct list_head full_list;
> +	unsigned long block_count;
> +};
> +
> +/**
> + * struct zblock_pool - stores metadata for each zblock pool
> + * @block_lists:	array of block lists
> + * @zpool:		zpool driver
> + *
> + * This structure is allocated at pool creation time and maintains 
> metadata
> + * for a particular zblock pool.
> + */
> +struct zblock_pool {
> +	struct block_list block_lists[ARRAY_SIZE(block_desc)];
> +	struct zpool *zpool;
> +};
> +
> +
> +#endif


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
  2025-04-12 19:25 ` Igor Belousov
@ 2025-04-16 12:09 ` Johannes Weiner
  2025-04-16 20:10   ` Vitaly
  2025-04-18 10:52   ` David Hildenbrand
  2025-04-22 10:46 ` Yosry Ahmed
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 20+ messages in thread
From: Johannes Weiner @ 2025-04-16 12:09 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Igor Belousov

On Sat, Apr 12, 2025 at 05:42:07PM +0200, Vitaly Wool wrote:
> zblock is a special purpose allocator for storing compressed pages.
> It stores integer number of same size objects per its block. These
> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> 
> With zblock, it is possible to densely arrange objects of various sizes
> resulting in low internal fragmentation. Also this allocator tries to
> fill incomplete blocks instead of adding new ones, in many cases
> providing a compression ratio comparable to zmalloc's.
> 
> zblock is also in most cases superior to zsmalloc with regard to
> average performance and worst execution times, thus allowing for better
> response time and real-time characteristics of the whole system.

Is there a reason not to use this allocation scheme in zsmalloc then?

I'm curious what others think, but I'm still not convinced a second
allocator makes sense. It's maintenance overhead, a permanent struggle
to match feature parity, and it fragments development and testing base.

Not long ago several slab allocators were removed for those
reasons. Likewise, we just deleted zbud and z3fold because they didn't
get any attention and bitrotted, but not before years of inflicting
pain through the zpool interface, users accidentally making very
suboptimal choices, reporting the same bugs over and over again etc.

If you discovered a better allocation scheme, that's excellent. But I
don't see why it warrants forking the entire allocator.

I also don't buy the fragmentation argument. Even if you are better at
packing during allocation time (although, citation needed), the
workload can unmap swap pages such that they leave partial blocks
behind indefinitely if you don't have block compaction.

Then there is the migration support, which you said is planned, but
which would presumably require the same indirection between handle and
the backing pages that zsmalloc has. How much will this eat into the
performance advantage?

I'd much rather you'd focus on making zsmalloc better. Improve the
packing scheme, make expensive features optional/configurable etc.
That would be easier on developers and users alike.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-16 12:09 ` Johannes Weiner
@ 2025-04-16 20:10   ` Vitaly
  2025-04-17 14:16     ` Johannes Weiner
  2025-04-18 10:52   ` David Hildenbrand
  1 sibling, 1 reply; 20+ messages in thread
From: Vitaly @ 2025-04-16 20:10 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Igor Belousov

[-- Attachment #1: Type: text/plain, Size: 2805 bytes --]


On Wednesday, April 16, 2025 at 2:09:12 pm +02:00, Johannes Weiner <hannes@cmpxchg.org> wrote:

>> zblock is also in most cases superior to zsmalloc with regard to
>> average performance and worst execution times, thus allowing for better
>> response time and real-time characteristics of the whole system.

>> Is there a reason not to use this allocation scheme in zsmalloc then?

Introducing such a scheme in zsmalloc is theoretically possible but it appears to be more complicated than implementing it from scratch, which is exactly what was done.

> I'm curious what others think, but I'm still not convinced a second
> allocator makes sense. It's maintenance overhead, a permanent struggle
> to match feature parity, and it fragments development and testing base.

> Not long ago several slab allocators were removed for those
> reasons. Likewise, we just deleted zbud and z3fold because they didn't
> get any attention and bitrotted, but not before years of inflicting
> pain through the zpool interface, users accidentally making very
> suboptimal choices, reporting the same bugs over and over again etc.

I'm not sure what pain you are talking about. There were reasons why z3fold and zbud existed. z3fold and zbud were the ones that supported page reclaim, zsmalloc wasn't quite usable with zswap until recently. When we did z3fold it was outperforming zsmalloc.

With that said, I didn't object to removing z3fold because I did understand that it made no sense to keep it at that point.

> I also don't buy the fragmentation argument. Even if you are better at
> packing during allocation time (although, citation needed), the
> workload can unmap swap pages such that they leave partial blocks
> behind indefinitely if you don't have block compaction.

We published Zswap/Zswapped values for zblock/zsmalloc after stress loads and those were on par, basically.

> Then there is the migration support, which you said is planned, but
> which would presumably require the same indirection between handle and
> the backing pages that zsmalloc has. How much will this eat into the
> performance advantage?


I don't think that will be necessary. We're working on supporting GFP_MOVABLE and minimising high order allocations
> I'd much rather you'd focus on making zsmalloc better. Improve the
> packing scheme, make expensive features optional/configurable etc.
> That would be easier on developers and users alike.

zblock's source code is almost 5x smaller in size than zsmalloc's and yet zblock works better in many cases with just a few bottlenecks. Why would you mind that we'd focus on making zblock better instead and possibly retire zsmalloc when that mission is accomplished, just like we retired z3fold a while ago?

Best regards,

Vitaly

[-- Attachment #2.1: Type: text/html, Size: 3520 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-16 20:10   ` Vitaly
@ 2025-04-17 14:16     ` Johannes Weiner
  2025-04-18  7:43       ` Vitaly Wool
  0 siblings, 1 reply; 20+ messages in thread
From: Johannes Weiner @ 2025-04-17 14:16 UTC (permalink / raw)
  To: Vitaly; +Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Igor Belousov

On Wed, Apr 16, 2025 at 10:10:23PM +0200, Vitaly wrote:
> 
> On Wednesday, April 16, 2025 at 2:09:12 pm +02:00, Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> >> zblock is also in most cases superior to zsmalloc with regard to
> >> average performance and worst execution times, thus allowing for better
> >> response time and real-time characteristics of the whole system.
> 
> >> Is there a reason not to use this allocation scheme in zsmalloc then?
> 
> Introducing such a scheme in zsmalloc is theoretically possible but
> it appears to be more complicated than implementing it from scratch,
> which is exactly what was done.

Sure, but having two options adds sizable complexity as well.

> > I'm curious what others think, but I'm still not convinced a second
> > allocator makes sense. It's maintenance overhead, a permanent struggle
> > to match feature parity, and it fragments development and testing base.
> 
> > Not long ago several slab allocators were removed for those
> > reasons. Likewise, we just deleted zbud and z3fold because they didn't
> > get any attention and bitrotted, but not before years of inflicting
> > pain through the zpool interface, users accidentally making very
> > suboptimal choices, reporting the same bugs over and over again etc.
> 
> I'm not sure what pain you are talking about.

I list them in the paragraph you're replying to, and I've previously
outlined the burden on developers, maintainers, and admins to support
multiple implementations of the same functionality.

There is a real cost to doing this that you seem to dismiss.

> There were reasons why z3fold and zbud existed. z3fold and zbud were
> the ones that supported page reclaim, zsmalloc wasn't quite usable
> with zswap until recently. When we did z3fold it was outperforming
> zsmalloc.

We see a higher compression ratio than 3 on a wide variety of
workloads, so I'm a little doubtful z3fold ever outperformed zsmalloc
in general-purpose environments.

When Meta started using zswap, certainly zsmalloc was the only real
contender. zbud's storage density was almost not worth the cost of
compression. z3fold was also not dense enough, and we ran into
stability issues and crashes. The page reclaim/writeback
implementation was not very useful either - take a look at the history
of changes from Nhat and Domenico. These weren't just issues specific
to our usecases, but much more blatant "how was this ever supposed to
work?" problems.

There is nothing wrong with the evolution from zbud to more
sophisticated allocators. But there is a pretty mature and
feature-rich allocator now, and that sets a floor on what new
allocators need to support to be considered general-purpose.

> With that said, I didn't object to removing z3fold because I did
> understand that it made no sense to keep it at that point.

But you're proposing to do the same thing again, when multiple people
just got done phasing out and cleaning up your previous experiments.

> > I also don't buy the fragmentation argument. Even if you are better at
> > packing during allocation time (although, citation needed), the
> > workload can unmap swap pages such that they leave partial blocks
> > behind indefinitely if you don't have block compaction.
> 
> We published Zswap/Zswapped values for zblock/zsmalloc after stress
> loads and those were on par, basically.

Those are accounted in zswap, so unfortunately don't capture backend
fragmentation. You'd need to implement some of the debugging features
and memory counters that zsmalloc has in order to compare them.

> > Then there is the migration support, which you said is planned, but
> > which would presumably require the same indirection between handle and
> > the backing pages that zsmalloc has. How much will this eat into the
> > performance advantage?
> 
> I don't think that will be necessary. We're working on supporting
> GFP_MOVABLE and minimising high order allocations

> > I'd much rather you'd focus on making zsmalloc better. Improve the
> > packing scheme, make expensive features optional/configurable etc.
> > That would be easier on developers and users alike.
> 
> zblock's source code is almost 5x smaller in size than zsmalloc's

It's an apple-to-oranges comparison.

zsmalloc has memory pressure handling and a rich debugging
infrastructure that was added over time based on what people thought
necessary and useful from production experience.

Implement the same functionality in zblock and we can compare lines
and performance.

> and yet zblock works better in many cases with just a few
> bottlenecks. Why would you mind that we'd focus on making zblock
> better instead and possibly retire zsmalloc when that mission is
> accomplished, just like we retired z3fold a while ago?

You're proposing a significant, open-ended maintenance burden for
everybody else. I'm just asking for some justification stronger than
"the small subset of the backend allocator that we implemented is
slightly faster in a limited number of benchmarks."

The fact that zstd - a very commonly used compressor - immediately
surfaced bugs that made it *much slower* is not reassuring.

The freelist issues that came up suggest strongly that you haven't
looked too closely at zsmalloc and actually tried to find out why it
does things the way it does. Which in turn suggests to me that this is
not going to be the only cornercase lesson that zblock will go through
for things that have been addressed in zsmalloc a long time ago.

All new research is expected to address prior work in its space. A new
allocator should at least come with some analysis of where exactly the
current allocator is flawed and why fixing that would likely amount to
an entire rewrite anyway. Or be a more realistic drop-in replacement
for the existing allocator already.

Making incremental improvements is the default. Forking needs a very
good reason.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-17 14:16     ` Johannes Weiner
@ 2025-04-18  7:43       ` Vitaly Wool
  0 siblings, 0 replies; 20+ messages in thread
From: Vitaly Wool @ 2025-04-18  7:43 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Igor Belousov



> On Apr 17, 2025, at 4:16 PM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> 
> On Wed, Apr 16, 2025 at 10:10:23PM +0200, Vitaly wrote:
>> 
>> On Wednesday, April 16, 2025 at 2:09:12 pm +02:00, Johannes Weiner <hannes@cmpxchg.org> wrote:
>> 
>>>> zblock is also in most cases superior to zsmalloc with regard to
>>>> average performance and worst execution times, thus allowing for better
>>>> response time and real-time characteristics of the whole system.
>> 
>>>> Is there a reason not to use this allocation scheme in zsmalloc then?
>> 
>> Introducing such a scheme in zsmalloc is theoretically possible but
>> it appears to be more complicated than implementing it from scratch,
>> which is exactly what was done.
> 
> Sure, but having two options adds sizable complexity as well.
> 
>>> I'm curious what others think, but I'm still not convinced a second
>>> allocator makes sense. It's maintenance overhead, a permanent struggle
>>> to match feature parity, and it fragments development and testing base.
>> 
>>> Not long ago several slab allocators were removed for those
>>> reasons. Likewise, we just deleted zbud and z3fold because they didn't
>>> get any attention and bitrotted, but not before years of inflicting
>>> pain through the zpool interface, users accidentally making very
>>> suboptimal choices, reporting the same bugs over and over again etc.
>> 
>> I'm not sure what pain you are talking about.
> 
> I list them in the paragraph you're replying to, and I've previously
> outlined the burden on developers, maintainers, and admins to support
> multiple implementations of the same functionality.
> 
> There is a real cost to doing this that you seem to dismiss.

I don’t dismiss the cost of maintenance, I just don’t buy this “years of pain” point because it’s very subjective. There are many people who still think that using Linux is a pain. OTOH back in 2016-2017 mobile devices using z3fold had better response metrics than those using zsmalloc.
> 
>> There were reasons why z3fold and zbud existed. z3fold and zbud were
>> the ones that supported page reclaim, zsmalloc wasn't quite usable
>> with zswap until recently. When we did z3fold it was outperforming
>> zsmalloc.
> 
> We see a higher compression ratio than 3 on a wide variety of
> workloads, so I'm a little doubtful z3fold ever outperformed zsmalloc
> in general-purpose environments.

z3fold never outperformed zsmalloc in terms of allocation density but before zsmalloc got a *working* compaction implementation it was on par.
Besides, compression ratios higher than 3 had been pretty rare before zstd was introduced, and if you ever tried zstd on msm8009 which was the SoC for many mobile devices back then you’d have probably understood how pointless it was back then for most of the embedded world.

And at that time z3fold was indeed faster than zsmalloc on multi-core systems, and especially on big.LITTLE.

> 
> When Meta started using zswap, certainly zsmalloc was the only real
> contender. zbud's storage density was almost not worth the cost of
> compression. z3fold was also not dense enough, and we ran into
> stability issues and crashes. The page reclaim/writeback
> implementation was not very useful either - take a look at the history
> of changes from Nhat and Domenico. These weren't just issues specific
> to our usecases, but much more blatant "how was this ever supposed to
> work?" problems.
> 
> There is nothing wrong with the evolution from zbud to more
> sophisticated allocators. But there is a pretty mature and
> feature-rich allocator now, and that sets a floor on what new
> allocators need to support to be considered general-purpose.
> 
>> With that said, I didn't object to removing z3fold because I did
>> understand that it made no sense to keep it at that point.
> 
> But you're proposing to do the same thing again, when multiple people
> just got done phasing out and cleaning up your previous experiments.

Oh well, the conversation is taking a twist. Do you mean that any new submission is “the same thing again”? Or what exactly do you mean here?
 
> 
>>> I also don't buy the fragmentation argument. Even if you are better at
>>> packing during allocation time (although, citation needed), the
>>> workload can unmap swap pages such that they leave partial blocks
>>> behind indefinitely if you don't have block compaction.
>> 
>> We published Zswap/Zswapped values for zblock/zsmalloc after stress
>> loads and those were on par, basically.
> 
> Those are accounted in zswap, so unfortunately don't capture backend
> fragmentation. You'd need to implement some of the debugging features
> and memory counters that zsmalloc has in order to compare them.

zblock reports total_pages in an honest way, i. e. the amount of pages it allocated, no matter how full or empty these are, and the numbers clearly show there’s no substantial internal fragmentation for any real life workloads we could come up with. You can of course allocate a bunch of slots and then free every second one and not do anything else at all and then the fragmentation will be large, but this is not how zswap operates. 

FWIW it is possible to make zsmalloc constantly alternate between compaction and fragmentation but this will not mimic either zram or zswap operation too.


>>> Then there is the migration support, which you said is planned, but
>>> which would presumably require the same indirection between handle and
>>> the backing pages that zsmalloc has. How much will this eat into the
>>> performance advantage?
>> 
>> I don't think that will be necessary. We're working on supporting
>> GFP_MOVABLE and minimising high order allocations
> 
>>> I'd much rather you'd focus on making zsmalloc better. Improve the
>>> packing scheme, make expensive features optional/configurable etc.
>>> That would be easier on developers and users alike.
>> 
>> zblock's source code is almost 5x smaller in size than zsmalloc's
> 
> It's an apple-to-oranges comparison.
> 
> zsmalloc has memory pressure handling and a rich debugging
> infrastructure that was added over time based on what people thought
> necessary and useful from production experience.
> 
> Implement the same functionality in zblock and we can compare lines
> and performance.

You assume that e.g. memory pressure handing implemented in zsmalloc is necessary for zblock, which I don’t think is the case. 
Debug facilitating additions are good but I _really_ _doubt_ these will bloat zblock code by 5x.
> 
>> and yet zblock works better in many cases with just a few
>> bottlenecks. Why would you mind that we'd focus on making zblock
>> better instead and possibly retire zsmalloc when that mission is
>> accomplished, just like we retired z3fold a while ago?
> 
> You're proposing a significant, open-ended maintenance burden for
> everybody else. I'm just asking for some justification stronger than
> "the small subset of the backend allocator that we implemented is
> slightly faster in a limited number of benchmarks."

Obviously we will be maintaining and extending the code, we have already discussed that.

> 
> The fact that zstd - a very commonly used compressor - immediately
> surfaced bugs that made it *much slower* is not reassuring.

I would expect that someone stating that first would first ask what the problem actually was. :)

~Vitaly




^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-16 12:09 ` Johannes Weiner
  2025-04-16 20:10   ` Vitaly
@ 2025-04-18 10:52   ` David Hildenbrand
  2025-04-18 10:56     ` Vitaly Wool
  1 sibling, 1 reply; 20+ messages in thread
From: David Hildenbrand @ 2025-04-18 10:52 UTC (permalink / raw)
  To: Johannes Weiner, Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Igor Belousov

On 16.04.25 14:09, Johannes Weiner wrote:
> On Sat, Apr 12, 2025 at 05:42:07PM +0200, Vitaly Wool wrote:
>> zblock is a special purpose allocator for storing compressed pages.
>> It stores integer number of same size objects per its block. These
>> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
>>
>> With zblock, it is possible to densely arrange objects of various sizes
>> resulting in low internal fragmentation. Also this allocator tries to
>> fill incomplete blocks instead of adding new ones, in many cases
>> providing a compression ratio comparable to zmalloc's.
>>
>> zblock is also in most cases superior to zsmalloc with regard to
>> average performance and worst execution times, thus allowing for better
>> response time and real-time characteristics of the whole system.
> 
> Is there a reason not to use this allocation scheme in zsmalloc then?
> 
> I'm curious what others think, but I'm still not convinced a second
> allocator makes sense. It's maintenance overhead, a permanent struggle
> to match feature parity, and it fragments development and testing base.
> 
> Not long ago several slab allocators were removed for those
> reasons. Likewise, we just deleted zbud and z3fold because they didn't
> get any attention and bitrotted, but not before years of inflicting
> pain through the zpool interface, users accidentally making very
> suboptimal choices, reporting the same bugs over and over again etc.
> 
> If you discovered a better allocation scheme, that's excellent. But I
> don't see why it warrants forking the entire allocator.

Just curious, I see a review on v4 happening on something that was 
nacked by two people in v2 [1].

Do these nack's still apply or were something clarified and they no 
longer apply?


[1] 
https://lore.kernel.org/linux-mm/CAKEwX=Ma9phmURz5nyJm0MQrWmXGFLFBPwr8-Cx=zbc473rx9A@mail.gmail.com/

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-18 10:52   ` David Hildenbrand
@ 2025-04-18 10:56     ` Vitaly Wool
  2025-04-18 11:03       ` David Hildenbrand
  0 siblings, 1 reply; 20+ messages in thread
From: Vitaly Wool @ 2025-04-18 10:56 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Johannes Weiner, linux-mm, akpm, linux-kernel, Nhat Pham,
	Shakeel Butt, Igor Belousov



> On Apr 18, 2025, at 12:52 PM, David Hildenbrand <david@redhat.com> wrote:
> 
> On 16.04.25 14:09, Johannes Weiner wrote:
>> On Sat, Apr 12, 2025 at 05:42:07PM +0200, Vitaly Wool wrote:
>>> zblock is a special purpose allocator for storing compressed pages.
>>> It stores integer number of same size objects per its block. These
>>> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
>>> 
>>> With zblock, it is possible to densely arrange objects of various sizes
>>> resulting in low internal fragmentation. Also this allocator tries to
>>> fill incomplete blocks instead of adding new ones, in many cases
>>> providing a compression ratio comparable to zmalloc's.
>>> 
>>> zblock is also in most cases superior to zsmalloc with regard to
>>> average performance and worst execution times, thus allowing for better
>>> response time and real-time characteristics of the whole system.
>> Is there a reason not to use this allocation scheme in zsmalloc then?
>> I'm curious what others think, but I'm still not convinced a second
>> allocator makes sense. It's maintenance overhead, a permanent struggle
>> to match feature parity, and it fragments development and testing base.
>> Not long ago several slab allocators were removed for those
>> reasons. Likewise, we just deleted zbud and z3fold because they didn't
>> get any attention and bitrotted, but not before years of inflicting
>> pain through the zpool interface, users accidentally making very
>> suboptimal choices, reporting the same bugs over and over again etc.
>> If you discovered a better allocation scheme, that's excellent. But I
>> don't see why it warrants forking the entire allocator.
> 
> Just curious, I see a review on v4 happening on something that was nacked by two people in v2 [1].
> 
> Do these nack's still apply or were something clarified and they no longer apply?

The reasons for both NAKs are no longer valid (since v3).

~Vitaly

> 
> 
> [1] https://lore.kernel.org/linux-mm/CAKEwX=Ma9phmURz5nyJm0MQrWmXGFLFBPwr8-Cx=zbc473rx9A@mail.gmail.com/
> 
> -- 
> Cheers,
> 
> David / dhildenb
> 



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-18 10:56     ` Vitaly Wool
@ 2025-04-18 11:03       ` David Hildenbrand
  0 siblings, 0 replies; 20+ messages in thread
From: David Hildenbrand @ 2025-04-18 11:03 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: Johannes Weiner, linux-mm, akpm, linux-kernel, Nhat Pham,
	Shakeel Butt, Igor Belousov

On 18.04.25 12:56, Vitaly Wool wrote:
> 
> 
>> On Apr 18, 2025, at 12:52 PM, David Hildenbrand <david@redhat.com> wrote:
>>
>> On 16.04.25 14:09, Johannes Weiner wrote:
>>> On Sat, Apr 12, 2025 at 05:42:07PM +0200, Vitaly Wool wrote:
>>>> zblock is a special purpose allocator for storing compressed pages.
>>>> It stores integer number of same size objects per its block. These
>>>> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
>>>>
>>>> With zblock, it is possible to densely arrange objects of various sizes
>>>> resulting in low internal fragmentation. Also this allocator tries to
>>>> fill incomplete blocks instead of adding new ones, in many cases
>>>> providing a compression ratio comparable to zmalloc's.
>>>>
>>>> zblock is also in most cases superior to zsmalloc with regard to
>>>> average performance and worst execution times, thus allowing for better
>>>> response time and real-time characteristics of the whole system.
>>> Is there a reason not to use this allocation scheme in zsmalloc then?
>>> I'm curious what others think, but I'm still not convinced a second
>>> allocator makes sense. It's maintenance overhead, a permanent struggle
>>> to match feature parity, and it fragments development and testing base.
>>> Not long ago several slab allocators were removed for those
>>> reasons. Likewise, we just deleted zbud and z3fold because they didn't
>>> get any attention and bitrotted, but not before years of inflicting
>>> pain through the zpool interface, users accidentally making very
>>> suboptimal choices, reporting the same bugs over and over again etc.
>>> If you discovered a better allocation scheme, that's excellent. But I
>>> don't see why it warrants forking the entire allocator.
>>
>> Just curious, I see a review on v4 happening on something that was nacked by two people in v2 [1].
>>
>> Do these nack's still apply or were something clarified and they no longer apply?
> 
> The reasons for both NAKs are no longer valid (since v3).

Well, there I read [2]

"This is a general NAK from me on any new allocators"

So now I am confused. But maybe that's a different NAK :)

Anyhow, I was just stumbling over this after skimming v2 when it was 
under review, and I did not find a clue about that in the version change 
info under --- (e.g., NAK addressed because of Y).

Have to leave now for the long weekend, Happy Easter / Happy Holidays!

[2] https://lore.kernel.org/linux-mm/20250408195533.GA99052@cmpxchg.org/

-- 
Cheers,

David / dhildenb



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
  2025-04-12 19:25 ` Igor Belousov
  2025-04-16 12:09 ` Johannes Weiner
@ 2025-04-22 10:46 ` Yosry Ahmed
  2025-04-23 19:53   ` Vitaly Wool
  2025-05-04  9:26 ` Andrew Morton
  2025-07-20  2:56 ` Andrew Morton
  4 siblings, 1 reply; 20+ messages in thread
From: Yosry Ahmed @ 2025-04-22 10:46 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky

On Sat, Apr 12, 2025 at 05:42:07PM +0200, Vitaly Wool wrote:
> zblock is a special purpose allocator for storing compressed pages.
> It stores integer number of same size objects per its block. These
> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> 
> With zblock, it is possible to densely arrange objects of various sizes
> resulting in low internal fragmentation. Also this allocator tries to
> fill incomplete blocks instead of adding new ones, in many cases
> providing a compression ratio comparable to zmalloc's.
> 
> zblock is also in most cases superior to zsmalloc with regard to
> average performance and worst execution times, thus allowing for better
> response time and real-time characteristics of the whole system.
> 
> High memory and page migration are currently not supported by zblock.
> 
> Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se>
> Signed-off-by: Igor Belousov <igor.b@beldev.am>

Please CC me when sending zswap-related patches, not sure how you're
picking the CC list. Also, CC'ing the zsmalloc maintainers here for the
discussion of adding a new allocator vs improving zsmalloc.

I didn't look too closely but I generally agree that we should improve
zsmalloc where possible rather than add a new allocator. We are trying
not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
getting a lot of mileage from both zswap and zram, and is more-or-less
battle-tested. Let's work toward building upon that instead of starting
over.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-22 10:46 ` Yosry Ahmed
@ 2025-04-23 19:53   ` Vitaly Wool
  2025-04-30 12:27     ` Yosry Ahmed
  2025-05-01 23:49     ` Sergey Senozhatsky
  0 siblings, 2 replies; 20+ messages in thread
From: Vitaly Wool @ 2025-04-23 19:53 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky

On 4/22/25 12:46, Yosry Ahmed wrote:
> I didn't look too closely but I generally agree that we should improve
> zsmalloc where possible rather than add a new allocator. We are trying
> not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
> getting a lot of mileage from both zswap and zram, and is more-or-less
> battle-tested. Let's work toward building upon that instead of starting
> over.

The thing here is, zblock is using a very different approach to small 
object allocation. The idea is: we have an array of descriptors which 
correspond to multi-page blocks divided in chunks of equal size 
(block_size[i]). For each object of size x we find the descriptor n such as:
	block_size[n-1] < n < block_size[n]
and then we store that object in an empty slot in one of the blocks. 
Thus, the density is high, the search is fast (rbtree based) and there 
are no objects spanning over 2 pages, so no extra memcpy involved.

And with the latest zblock, we see that it has a clear advantage in 
performance over zsmalloc, retaining roughly the same allocation density 
for 4K pages and scoring better on 16K pages. E. g. on a kernel compilation:

* zsmalloc/zstd/make -j32 bzImage
	real	8m0.594s
	user	39m37.783s
	sys	8m24.262s
	Zswap:            200600 kB <-- after build completion
	Zswapped:         854072 kB <-- after build completion
	zswpin 309774
	zswpout 1538332

* zblock/zstd/make -j32 bzImage
	real	7m35.546s
	user	38m03.475s
	sys	7m47.407s
	Zswap:            250940 kB <-- after build completion
	Zswapped:         870660 kB <-- after build completion
	zswpin 248606
	zswpout 1277319

So what we see here is that zblock is definitely faster and at least not 
worse with regard to allocation density under heavy load. It has 
slightly worse _idle_ allocation density but since it will quickly catch 
up under load it is not really important. What is important is that its 
characteristics don't deteriorate over time. Overall, zblock is simple 
and efficient and there is /raison d'etre/ for it.

Now, it is indeed possible to partially rework zsmalloc using zblock's 
algorithm but this will be a rather substantial change, equal or bigger 
in effort to implementing the approach described above from scratch (and 
this is what we did), and with such drastic changes most of the testing 
that has been done with zsmalloc would be invalidated, and we'll be out 
in the wild anyway. So even though I see your point, I don't think it 
applies in this particular case.

~Vitaly


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-23 19:53   ` Vitaly Wool
@ 2025-04-30 12:27     ` Yosry Ahmed
  2025-05-01 12:41       ` Vitaly Wool
  2025-05-01 23:49     ` Sergey Senozhatsky
  1 sibling, 1 reply; 20+ messages in thread
From: Yosry Ahmed @ 2025-04-30 12:27 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky

On Wed, Apr 23, 2025 at 09:53:48PM +0200, Vitaly Wool wrote:
> On 4/22/25 12:46, Yosry Ahmed wrote:
> > I didn't look too closely but I generally agree that we should improve
> > zsmalloc where possible rather than add a new allocator. We are trying
> > not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
> > getting a lot of mileage from both zswap and zram, and is more-or-less
> > battle-tested. Let's work toward building upon that instead of starting
> > over.
> 
> The thing here is, zblock is using a very different approach to small object
> allocation. The idea is: we have an array of descriptors which correspond to
> multi-page blocks divided in chunks of equal size (block_size[i]). For each
> object of size x we find the descriptor n such as:
> 	block_size[n-1] < n < block_size[n]
> and then we store that object in an empty slot in one of the blocks. Thus,
> the density is high, the search is fast (rbtree based) and there are no
> objects spanning over 2 pages, so no extra memcpy involved.

The block sizes seem to be similar in principle to class sizes in
zsmalloc. It seems to me that there are two apparent differentiating
properties to zblock:

- Block lookup uses an rbtree, so it's faster than zsmalloc's list
  iteration. On the other hand, zsmalloc divides each class into
  fullness groups and tries to pack almost full groups first. Not sure
  if zblock's approach is strictly better.

- Zblock uses higher order allocations vs. zsmalloc always using order-0
  allocations. I think this may be the main advantage and I remember
  asking if zsmalloc can support this. Always using order-0 pages is
  more reliable but may not always be the best choice.

On the other hand, zblock is lacking in other regards. For example:
- The lack of compaction means that certain workloads will see a lot of
  fragmentation. It purely depends on the access patterns. We could end
  up with a lot of blocks each containing a single object and there is
  no way to recover AFAICT.

- Zblock will fail if a high order allocation cannot be satisfied, which
  is more likely to happen under memory pressure, and it's usually when
  zblock is needed in the first place.

- There's probably more, I didn't check too closely, and I am hoping
  that Minchan and Sergey will chime in here.

> 
> And with the latest zblock, we see that it has a clear advantage in
> performance over zsmalloc, retaining roughly the same allocation density for
> 4K pages and scoring better on 16K pages. E. g. on a kernel compilation:
> 
> * zsmalloc/zstd/make -j32 bzImage
> 	real	8m0.594s
> 	user	39m37.783s
> 	sys	8m24.262s
> 	Zswap:            200600 kB <-- after build completion
> 	Zswapped:         854072 kB <-- after build completion
> 	zswpin 309774
> 	zswpout 1538332
> 
> * zblock/zstd/make -j32 bzImage
> 	real	7m35.546s
> 	user	38m03.475s
> 	sys	7m47.407s
> 	Zswap:            250940 kB <-- after build completion
> 	Zswapped:         870660 kB <-- after build completion
> 	zswpin 248606
> 	zswpout 1277319
> 
> So what we see here is that zblock is definitely faster and at least not
> worse with regard to allocation density under heavy load. It has slightly
> worse _idle_ allocation density but since it will quickly catch up under
> load it is not really important. What is important is that its
> characteristics don't deteriorate over time. Overall, zblock is simple and
> efficient and there is /raison d'etre/ for it.

Zblock is performing better for this specific workload, but as I
mentioned earlier there are other aspects that zblock is missing.
Zsmalloc has seen a very large range of workloads of different types,
and we cannot just dismiss this.

> 
> Now, it is indeed possible to partially rework zsmalloc using zblock's
> algorithm but this will be a rather substantial change, equal or bigger in
> effort to implementing the approach described above from scratch (and this
> is what we did), and with such drastic changes most of the testing that has
> been done with zsmalloc would be invalidated, and we'll be out in the wild
> anyway. So even though I see your point, I don't think it applies in this
> particular case.


Well, we should start by breaking down the differences and finding out
why zblock is performing better, as I mentioned above. If it's the
faster lookups or higher order allocations, we can work to support that
in zsmalloc. Similarly, if zsmalloc has unnecessary complexity it'd be
great to get rid of it rather than starting over.

Also, we don't have to do it all at once and invalidate the testing that
zsmalloc has seen. These can be incremental changes that get spread over
multiple releases, getting incremental exposure in the process.

> 
> ~Vitaly


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-30 12:27     ` Yosry Ahmed
@ 2025-05-01 12:41       ` Vitaly Wool
  2025-05-01 23:43         ` Sergey Senozhatsky
  2025-05-06 13:04         ` Yosry Ahmed
  0 siblings, 2 replies; 20+ messages in thread
From: Vitaly Wool @ 2025-05-01 12:41 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky

Hi Yosry,

On 4/30/25 14:27, Yosry Ahmed wrote:
> On Wed, Apr 23, 2025 at 09:53:48PM +0200, Vitaly Wool wrote:
>> On 4/22/25 12:46, Yosry Ahmed wrote:
>>> I didn't look too closely but I generally agree that we should improve
>>> zsmalloc where possible rather than add a new allocator. We are trying
>>> not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
>>> getting a lot of mileage from both zswap and zram, and is more-or-less
>>> battle-tested. Let's work toward building upon that instead of starting
>>> over.
>>
>> The thing here is, zblock is using a very different approach to small object
>> allocation. The idea is: we have an array of descriptors which correspond to
>> multi-page blocks divided in chunks of equal size (block_size[i]). For each
>> object of size x we find the descriptor n such as:
>> 	block_size[n-1] < n < block_size[n]
>> and then we store that object in an empty slot in one of the blocks. Thus,
>> the density is high, the search is fast (rbtree based) and there are no
>> objects spanning over 2 pages, so no extra memcpy involved.
> 
> The block sizes seem to be similar in principle to class sizes in
> zsmalloc. It seems to me that there are two apparent differentiating
> properties to zblock:
> 
> - Block lookup uses an rbtree, so it's faster than zsmalloc's list
>    iteration. On the other hand, zsmalloc divides each class into
>    fullness groups and tries to pack almost full groups first. Not sure
>    if zblock's approach is strictly better.

If we free a slot in a fully packed block we put it on top of the list. 
zswap's normal operation pattern is that there will be more free slots 
in that block so it's roughly the same.

> - Zblock uses higher order allocations vs. zsmalloc always using order-0
>    allocations. I think this may be the main advantage and I remember
>    asking if zsmalloc can support this. Always using order-0 pages is
>    more reliable but may not always be the best choice.

There's a patch we'll be posting soon with "opportunistic" high order 
allocations (i. e. if try_alloc_pages fails, allocate order-0 pages 
instead). This will leverage the benefits of higher order allocations 
without putting too much stress on the system.

> On the other hand, zblock is lacking in other regards. For example:
> - The lack of compaction means that certain workloads will see a lot of
>    fragmentation. It purely depends on the access patterns. We could end
>    up with a lot of blocks each containing a single object and there is
>    no way to recover AFAICT.

We have been giving many variants of stress load on the memory subsystem 
and the worst compression ratio *after* the stress load was 2.8x using 
zstd as the compressor (and about 4x under load). With zsmalloc under 
the same conditions the ratio was 3.6x after and 4x under load.

With more normal (but still stressing) usage patterns the numbers 
*after* the stress load were around 3.8x and 4.1x, respectively.

Bottom line, ending up with a lot of blocks each containing a single 
object is not a real life scenario. With that said, we have a quite 
simple solution in the making that will get zblock on par with zsmalloc 
even in the cases described above.

> - Zblock will fail if a high order allocation cannot be satisfied, which
>    is more likely to happen under memory pressure, and it's usually when
>    zblock is needed in the first place.

See above, this issue will be addressed in the patch coming in a really 
short while.

> - There's probably more, I didn't check too closely, and I am hoping
>    that Minchan and Sergey will chime in here.
> 
>>
>> And with the latest zblock, we see that it has a clear advantage in
>> performance over zsmalloc, retaining roughly the same allocation density for
>> 4K pages and scoring better on 16K pages. E. g. on a kernel compilation:
>>
>> * zsmalloc/zstd/make -j32 bzImage
>> 	real	8m0.594s
>> 	user	39m37.783s
>> 	sys	8m24.262s
>> 	Zswap:            200600 kB <-- after build completion
>> 	Zswapped:         854072 kB <-- after build completion
>> 	zswpin 309774
>> 	zswpout 1538332
>>
>> * zblock/zstd/make -j32 bzImage
>> 	real	7m35.546s
>> 	user	38m03.475s
>> 	sys	7m47.407s
>> 	Zswap:            250940 kB <-- after build completion
>> 	Zswapped:         870660 kB <-- after build completion
>> 	zswpin 248606
>> 	zswpout 1277319
>>
>> So what we see here is that zblock is definitely faster and at least not
>> worse with regard to allocation density under heavy load. It has slightly
>> worse _idle_ allocation density but since it will quickly catch up under
>> load it is not really important. What is important is that its
>> characteristics don't deteriorate over time. Overall, zblock is simple and
>> efficient and there is /raison d'etre/ for it.
> 
> Zblock is performing better for this specific workload, but as I
> mentioned earlier there are other aspects that zblock is missing.
> Zsmalloc has seen a very large range of workloads of different types,
> and we cannot just dismiss this.

We've been running many different work loads with both allocators but 
posting all the results in the patch description will go well beyond the 
purpose of a patch submission. If there are some workloads you are 
interested in in particular, please let me know, odds are high we have 
some results for those too.

>> Now, it is indeed possible to partially rework zsmalloc using zblock's
>> algorithm but this will be a rather substantial change, equal or bigger in
>> effort to implementing the approach described above from scratch (and this
>> is what we did), and with such drastic changes most of the testing that has
>> been done with zsmalloc would be invalidated, and we'll be out in the wild
>> anyway. So even though I see your point, I don't think it applies in this
>> particular case.
> 
> 
> Well, we should start by breaking down the differences and finding out
> why zblock is performing better, as I mentioned above. If it's the
> faster lookups or higher order allocations, we can work to support that
> in zsmalloc. Similarly, if zsmalloc has unnecessary complexity it'd be
> great to get rid of it rather than starting over.
> 
> Also, we don't have to do it all at once and invalidate the testing that
> zsmalloc has seen. These can be incremental changes that get spread over
> multiple releases, getting incremental exposure in the process.

I believe we are a lot closer now to having a zblock without the initial 
drawbacks you have pointed out than a faster zsmalloc, retaining the 
code simplicity of the former.

~Vitaly



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-05-01 12:41       ` Vitaly Wool
@ 2025-05-01 23:43         ` Sergey Senozhatsky
  2025-05-06 13:04         ` Yosry Ahmed
  1 sibling, 0 replies; 20+ messages in thread
From: Sergey Senozhatsky @ 2025-05-01 23:43 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: Yosry Ahmed, linux-mm, akpm, linux-kernel, Nhat Pham,
	Shakeel Butt, Johannes Weiner, Igor Belousov, Minchan Kim,
	Sergey Senozhatsky

On (25/05/01 14:41), Vitaly Wool wrote:
[..]
> Bottom line, ending up with a lot of blocks each containing a single object
> is not a real life scenario.

Why not?  What if the data patterns do not favor compressible objects?
E.g. what if the distribution of compressed objects looks like this (a real
zsmalloc stats):

size-class   objs-allocated
---------------------------
...
1904         3315
1920         3468
1936         3515
2048         25816
2144         22363
2160         3230
2176         3075
2192         2990
2224         5665
2272         8118
2304         5040
2336         4529
2384         6132
2400         1768
2448         4825
2512         5135
2560         2944
2592         1562
2624         1512
2720         3813
2832         3315
2864         820
...

Notice tenth of thousands of 2048-2144 bytes objects.  zsmalloc size
classes are fundamentally independent of each other and, hence, of
the compressed objects distribution: the lack of objects, or even
"the absence of" for that matter, in size-class 256 does not change
a thing for size-class 2048.  This is a very important property.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-23 19:53   ` Vitaly Wool
  2025-04-30 12:27     ` Yosry Ahmed
@ 2025-05-01 23:49     ` Sergey Senozhatsky
  2025-05-03  8:27       ` Vitaly
  1 sibling, 1 reply; 20+ messages in thread
From: Sergey Senozhatsky @ 2025-05-01 23:49 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: Yosry Ahmed, linux-mm, akpm, linux-kernel, Nhat Pham,
	Shakeel Butt, Johannes Weiner, Igor Belousov, Minchan Kim,
	Sergey Senozhatsky

On (25/04/23 21:53), Vitaly Wool wrote:
[..]
> * zsmalloc/zstd/make -j32 bzImage
> 	real	8m0.594s
> 	user	39m37.783s
> 	sys	8m24.262s
> 	Zswap:            200600 kB <-- after build completion
> 	Zswapped:         854072 kB <-- after build completion
> 	zswpin 309774
> 	zswpout 1538332
> 
> * zblock/zstd/make -j32 bzImage
> 	real	7m35.546s
> 	user	38m03.475s
> 	sys	7m47.407s
> 	Zswap:            250940 kB <-- after build completion
> 	Zswapped:         870660 kB <-- after build completion
> 	zswpin 248606
> 	zswpout 1277319

I'm sorry but what does this test test?  That under memory pressure the
kernel swaps out different pages with different compression characteristics?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-05-01 23:49     ` Sergey Senozhatsky
@ 2025-05-03  8:27       ` Vitaly
  0 siblings, 0 replies; 20+ messages in thread
From: Vitaly @ 2025-05-03  8:27 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Yosry Ahmed, linux-mm, akpm, linux-kernel, Nhat Pham,
	Shakeel Butt, Johannes Weiner, Igor Belousov, Minchan Kim,
	Sergey Senozhatsky

[-- Attachment #1: Type: text/plain, Size: 908 bytes --]






On Friday, May 2, 2025 at 1:49:08 am +02:00, Sergey Senozhatsky <senozhatsky@chromium.org> wrote:

> On (25/04/23 21:53), Vitaly Wool wrote:
> [..]
> 
> > * zsmalloc/zstd/make -j32 bzImage
> > real	8m0.594s
> > user	39m37.783s
> > sys	8m24.262s
> > Zswap: 200600 kB <-- after build completion
> > Zswapped: 854072 kB <-- after build completion
> > zswpin 309774
> > zswpout 1538332
> > 
> > * zblock/zstd/make -j32 bzImage
> > real	7m35.546s
> > user	38m03.475s
> > sys	7m47.407s
> > Zswap: 250940 kB <-- after build completion
> > Zswapped: 870660 kB <-- after build completion
> > zswpin 248606
> > zswpout 1277319
> > 
> I'm sorry but what does this test test? That under memory pressure the
> kernel swaps out different pages with different compression characteristics?
> This test illustrates that zblock is faster than zsmalloc. No rocket science here.


~Vitaly

[-- Attachment #2.1: Type: text/html, Size: 1425 bytes --]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
                   ` (2 preceding siblings ...)
  2025-04-22 10:46 ` Yosry Ahmed
@ 2025-05-04  9:26 ` Andrew Morton
  2025-07-20  2:56 ` Andrew Morton
  4 siblings, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2025-05-04  9:26 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner,
	Igor Belousov

On Sat, 12 Apr 2025 17:42:07 +0200 Vitaly Wool <vitaly.wool@konsulko.se> wrote:

> zblock is a special purpose allocator for storing compressed pages.
> It stores integer number of same size objects per its block. These
> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> 
> With zblock, it is possible to densely arrange objects of various sizes
> resulting in low internal fragmentation. Also this allocator tries to
> fill incomplete blocks instead of adding new ones, in many cases
> providing a compression ratio comparable to zmalloc's.
> 
> zblock is also in most cases superior to zsmalloc with regard to
> average performance and worst execution times, thus allowing for better
> response time and real-time characteristics of the whole system.
> 
> High memory and page migration are currently not supported by zblock.
> 

My x86_64 allmodconfig build failed.

  MODPOST Module.symvers
ERROR: modpost: "try_alloc_pages_noprof" [mm/zblock.ko] undefined!

I don't understand why this wasn't encountered earlier.


From: Andrew Morton <akpm@linux-foundation.org>
Subject: mm-add-zblock-allocator-fix-2
Date: Sun May  4 02:13:54 AM PDT 2025

export try_alloc_pages_noprof() to modules for CONFIG_ZBLOCK=m

Cc: Vitaly Wool <vitaly.wool@konsulko.se>
Cc: Igor Belousov <igor.b@beldev.am>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 mm/page_alloc.c |    1 +
 1 file changed, 1 insertion(+)

--- a/mm/page_alloc.c~a
+++ a/mm/page_alloc.c
@@ -7470,3 +7470,4 @@ struct page *try_alloc_pages_noprof(int
 	kmsan_alloc_page(page, order, alloc_gfp);
 	return page;
 }
+EXPORT_SYMBOL_GPL(try_alloc_pages_noprof);
_



^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-05-01 12:41       ` Vitaly Wool
  2025-05-01 23:43         ` Sergey Senozhatsky
@ 2025-05-06 13:04         ` Yosry Ahmed
  2025-06-11 17:11           ` Vitaly Wool
  1 sibling, 1 reply; 20+ messages in thread
From: Yosry Ahmed @ 2025-05-06 13:04 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky

On Thu, May 01, 2025 at 02:41:29PM +0200, Vitaly Wool wrote:
> Hi Yosry,
> 
> On 4/30/25 14:27, Yosry Ahmed wrote:
> > On Wed, Apr 23, 2025 at 09:53:48PM +0200, Vitaly Wool wrote:
> > > On 4/22/25 12:46, Yosry Ahmed wrote:
> > > > I didn't look too closely but I generally agree that we should improve
> > > > zsmalloc where possible rather than add a new allocator. We are trying
> > > > not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
> > > > getting a lot of mileage from both zswap and zram, and is more-or-less
> > > > battle-tested. Let's work toward building upon that instead of starting
> > > > over.
> > > 
> > > The thing here is, zblock is using a very different approach to small object
> > > allocation. The idea is: we have an array of descriptors which correspond to
> > > multi-page blocks divided in chunks of equal size (block_size[i]). For each
> > > object of size x we find the descriptor n such as:
> > > 	block_size[n-1] < n < block_size[n]
> > > and then we store that object in an empty slot in one of the blocks. Thus,
> > > the density is high, the search is fast (rbtree based) and there are no
> > > objects spanning over 2 pages, so no extra memcpy involved.
> > 
> > The block sizes seem to be similar in principle to class sizes in
> > zsmalloc. It seems to me that there are two apparent differentiating
> > properties to zblock:
> > 
> > - Block lookup uses an rbtree, so it's faster than zsmalloc's list
> >    iteration. On the other hand, zsmalloc divides each class into
> >    fullness groups and tries to pack almost full groups first. Not sure
> >    if zblock's approach is strictly better.
> 
> If we free a slot in a fully packed block we put it on top of the list.
> zswap's normal operation pattern is that there will be more free slots in
> that block so it's roughly the same.

How so? IIUC the order in which slots are freed depends on the LRU (for
writeback) and swapins (for loads). Why do we expect that slots from the
same block will be freed in close succession?

> 
> > - Zblock uses higher order allocations vs. zsmalloc always using order-0
> >    allocations. I think this may be the main advantage and I remember
> >    asking if zsmalloc can support this. Always using order-0 pages is
> >    more reliable but may not always be the best choice.
> 
> There's a patch we'll be posting soon with "opportunistic" high order
> allocations (i. e. if try_alloc_pages fails, allocate order-0 pages
> instead). This will leverage the benefits of higher order allocations
> without putting too much stress on the system.
> 
> > On the other hand, zblock is lacking in other regards. For example:
> > - The lack of compaction means that certain workloads will see a lot of
> >    fragmentation. It purely depends on the access patterns. We could end
> >    up with a lot of blocks each containing a single object and there is
> >    no way to recover AFAICT.
> 
> We have been giving many variants of stress load on the memory subsystem and
> the worst compression ratio *after* the stress load was 2.8x using zstd as
> the compressor (and about 4x under load). With zsmalloc under the same
> conditions the ratio was 3.6x after and 4x under load.
> 
> With more normal (but still stressing) usage patterns the numbers *after*
> the stress load were around 3.8x and 4.1x, respectively.
> 
> Bottom line, ending up with a lot of blocks each containing a single object
> is not a real life scenario. With that said, we have a quite simple solution
> in the making that will get zblock on par with zsmalloc even in the cases
> described above.

Could you share a high-level description of how this issue will be
addressed in zblock? I am trying to understand why/how zblock can handle
this better/simpler than zsmalloc.

> 
> > - Zblock will fail if a high order allocation cannot be satisfied, which
> >    is more likely to happen under memory pressure, and it's usually when
> >    zblock is needed in the first place.
> 
> See above, this issue will be addressed in the patch coming in a really
> short while.
> 
> > - There's probably more, I didn't check too closely, and I am hoping
> >    that Minchan and Sergey will chime in here.
> > 
> > > 
> > > And with the latest zblock, we see that it has a clear advantage in
> > > performance over zsmalloc, retaining roughly the same allocation density for
> > > 4K pages and scoring better on 16K pages. E. g. on a kernel compilation:
> > > 
> > > * zsmalloc/zstd/make -j32 bzImage
> > > 	real	8m0.594s
> > > 	user	39m37.783s
> > > 	sys	8m24.262s
> > > 	Zswap:            200600 kB <-- after build completion
> > > 	Zswapped:         854072 kB <-- after build completion
> > > 	zswpin 309774
> > > 	zswpout 1538332
> > > 
> > > * zblock/zstd/make -j32 bzImage
> > > 	real	7m35.546s
> > > 	user	38m03.475s
> > > 	sys	7m47.407s
> > > 	Zswap:            250940 kB <-- after build completion
> > > 	Zswapped:         870660 kB <-- after build completion
> > > 	zswpin 248606
> > > 	zswpout 1277319
> > > 
> > > So what we see here is that zblock is definitely faster and at least not
> > > worse with regard to allocation density under heavy load. It has slightly
> > > worse _idle_ allocation density but since it will quickly catch up under
> > > load it is not really important. What is important is that its
> > > characteristics don't deteriorate over time. Overall, zblock is simple and
> > > efficient and there is /raison d'etre/ for it.
> > 
> > Zblock is performing better for this specific workload, but as I
> > mentioned earlier there are other aspects that zblock is missing.
> > Zsmalloc has seen a very large range of workloads of different types,
> > and we cannot just dismiss this.
> 
> We've been running many different work loads with both allocators but
> posting all the results in the patch description will go well beyond the
> purpose of a patch submission. If there are some workloads you are
> interested in in particular, please let me know, odds are high we have some
> results for those too.

That's good to know. I don't have specific workloads in mind, was just
stating the fact that zsmalloc has been tested with a variety of
workloads in production environments.

> 
> > > Now, it is indeed possible to partially rework zsmalloc using zblock's
> > > algorithm but this will be a rather substantial change, equal or bigger in
> > > effort to implementing the approach described above from scratch (and this
> > > is what we did), and with such drastic changes most of the testing that has
> > > been done with zsmalloc would be invalidated, and we'll be out in the wild
> > > anyway. So even though I see your point, I don't think it applies in this
> > > particular case.
> > 
> > 
> > Well, we should start by breaking down the differences and finding out
> > why zblock is performing better, as I mentioned above. If it's the
> > faster lookups or higher order allocations, we can work to support that
> > in zsmalloc. Similarly, if zsmalloc has unnecessary complexity it'd be
> > great to get rid of it rather than starting over.
> > 
> > Also, we don't have to do it all at once and invalidate the testing that
> > zsmalloc has seen. These can be incremental changes that get spread over
> > multiple releases, getting incremental exposure in the process.
> 
> I believe we are a lot closer now to having a zblock without the initial
> drawbacks you have pointed out than a faster zsmalloc, retaining the code
> simplicity of the former.

This does not answer my question tho. I am trying to understand what
makes zblock faster than zsmalloc.

If it's the (optionally opportunistic) higher order allocations, we can
probably do the same in zsmalloc and avoid memcpys as well. I guess we
can try to allocate zspages as a high-order contiguous allocation first,
and fallback to the current "chain" structure on failure.

If it's the lack of overhead from compaction (e.g. less locking), then
the question should be why does zsmalloc need compaction and zblock
allegedly doesn't?

I am not arguing that you're seeing better results from zblock, what I
am saying is let's pinpoint what makes zblock better first, then decide
whether the same can be applied to zsmalloc or if it's really better to
create a new allocator instead. Right now we don't have full
information.

WDYT?


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-05-06 13:04         ` Yosry Ahmed
@ 2025-06-11 17:11           ` Vitaly Wool
  0 siblings, 0 replies; 20+ messages in thread
From: Vitaly Wool @ 2025-06-11 17:11 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: linux-mm, akpm, linux-kernel, Nhat Pham, Shakeel Butt,
	Johannes Weiner, Igor Belousov, Minchan Kim, Sergey Senozhatsky



> On May 6, 2025, at 3:04 PM, Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> 
> On Thu, May 01, 2025 at 02:41:29PM +0200, Vitaly Wool wrote:
>> Hi Yosry,
>> 
>> On 4/30/25 14:27, Yosry Ahmed wrote:
>>> On Wed, Apr 23, 2025 at 09:53:48PM +0200, Vitaly Wool wrote:
>>>> On 4/22/25 12:46, Yosry Ahmed wrote:
>>>>> I didn't look too closely but I generally agree that we should improve
>>>>> zsmalloc where possible rather than add a new allocator. We are trying
>>>>> not to repeat the zbud/z3fold or slub/slob stories here. Zsmalloc is
>>>>> getting a lot of mileage from both zswap and zram, and is more-or-less
>>>>> battle-tested. Let's work toward building upon that instead of starting
>>>>> over.
>>>> 
>>>> The thing here is, zblock is using a very different approach to small object
>>>> allocation. The idea is: we have an array of descriptors which correspond to
>>>> multi-page blocks divided in chunks of equal size (block_size[i]). For each
>>>> object of size x we find the descriptor n such as:
>>>> block_size[n-1] < n < block_size[n]
>>>> and then we store that object in an empty slot in one of the blocks. Thus,
>>>> the density is high, the search is fast (rbtree based) and there are no
>>>> objects spanning over 2 pages, so no extra memcpy involved.
>>> 
>>> The block sizes seem to be similar in principle to class sizes in
>>> zsmalloc. It seems to me that there are two apparent differentiating
>>> properties to zblock:
>>> 
>>> - Block lookup uses an rbtree, so it's faster than zsmalloc's list
>>>   iteration. On the other hand, zsmalloc divides each class into
>>>   fullness groups and tries to pack almost full groups first. Not sure
>>>   if zblock's approach is strictly better.
>> 
>> If we free a slot in a fully packed block we put it on top of the list.
>> zswap's normal operation pattern is that there will be more free slots in
>> that block so it's roughly the same.
> 
> How so? IIUC the order in which slots are freed depends on the LRU (for
> writeback) and swapins (for loads). Why do we expect that slots from the
> same block will be freed in close succession?
> 
>> 
>>> - Zblock uses higher order allocations vs. zsmalloc always using order-0
>>>   allocations. I think this may be the main advantage and I remember
>>>   asking if zsmalloc can support this. Always using order-0 pages is
>>>   more reliable but may not always be the best choice.
>> 
>> There's a patch we'll be posting soon with "opportunistic" high order
>> allocations (i. e. if try_alloc_pages fails, allocate order-0 pages
>> instead). This will leverage the benefits of higher order allocations
>> without putting too much stress on the system.
>> 
>>> On the other hand, zblock is lacking in other regards. For example:
>>> - The lack of compaction means that certain workloads will see a lot of
>>>   fragmentation. It purely depends on the access patterns. We could end
>>>   up with a lot of blocks each containing a single object and there is
>>>   no way to recover AFAICT.
>> 
>> We have been giving many variants of stress load on the memory subsystem and
>> the worst compression ratio *after* the stress load was 2.8x using zstd as
>> the compressor (and about 4x under load). With zsmalloc under the same
>> conditions the ratio was 3.6x after and 4x under load.
>> 
>> With more normal (but still stressing) usage patterns the numbers *after*
>> the stress load were around 3.8x and 4.1x, respectively.
>> 
>> Bottom line, ending up with a lot of blocks each containing a single object
>> is not a real life scenario. With that said, we have a quite simple solution
>> in the making that will get zblock on par with zsmalloc even in the cases
>> described above.
> 
> Could you share a high-level description of how this issue will be
> addressed in zblock? I am trying to understand why/how zblock can handle
> this better/simpler than zsmalloc.
> 
>> 
>>> - Zblock will fail if a high order allocation cannot be satisfied, which
>>>   is more likely to happen under memory pressure, and it's usually when
>>>   zblock is needed in the first place.
>> 
>> See above, this issue will be addressed in the patch coming in a really
>> short while.
>> 
>>> - There's probably more, I didn't check too closely, and I am hoping
>>>   that Minchan and Sergey will chime in here.
>>> 
>>>> 
>>>> And with the latest zblock, we see that it has a clear advantage in
>>>> performance over zsmalloc, retaining roughly the same allocation density for
>>>> 4K pages and scoring better on 16K pages. E. g. on a kernel compilation:
>>>> 
>>>> * zsmalloc/zstd/make -j32 bzImage
>>>> real 8m0.594s
>>>> user 39m37.783s
>>>> sys 8m24.262s
>>>> Zswap:            200600 kB <-- after build completion
>>>> Zswapped:         854072 kB <-- after build completion
>>>> zswpin 309774
>>>> zswpout 1538332
>>>> 
>>>> * zblock/zstd/make -j32 bzImage
>>>> real 7m35.546s
>>>> user 38m03.475s
>>>> sys 7m47.407s
>>>> Zswap:            250940 kB <-- after build completion
>>>> Zswapped:         870660 kB <-- after build completion
>>>> zswpin 248606
>>>> zswpout 1277319
>>>> 
>>>> So what we see here is that zblock is definitely faster and at least not
>>>> worse with regard to allocation density under heavy load. It has slightly
>>>> worse _idle_ allocation density but since it will quickly catch up under
>>>> load it is not really important. What is important is that its
>>>> characteristics don't deteriorate over time. Overall, zblock is simple and
>>>> efficient and there is /raison d'etre/ for it.
>>> 
>>> Zblock is performing better for this specific workload, but as I
>>> mentioned earlier there are other aspects that zblock is missing.
>>> Zsmalloc has seen a very large range of workloads of different types,
>>> and we cannot just dismiss this.
>> 
>> We've been running many different work loads with both allocators but
>> posting all the results in the patch description will go well beyond the
>> purpose of a patch submission. If there are some workloads you are
>> interested in in particular, please let me know, odds are high we have some
>> results for those too.
> 
> That's good to know. I don't have specific workloads in mind, was just
> stating the fact that zsmalloc has been tested with a variety of
> workloads in production environments.
> 
>> 
>>>> Now, it is indeed possible to partially rework zsmalloc using zblock's
>>>> algorithm but this will be a rather substantial change, equal or bigger in
>>>> effort to implementing the approach described above from scratch (and this
>>>> is what we did), and with such drastic changes most of the testing that has
>>>> been done with zsmalloc would be invalidated, and we'll be out in the wild
>>>> anyway. So even though I see your point, I don't think it applies in this
>>>> particular case.
>>> 
>>> 
>>> Well, we should start by breaking down the differences and finding out
>>> why zblock is performing better, as I mentioned above. If it's the
>>> faster lookups or higher order allocations, we can work to support that
>>> in zsmalloc. Similarly, if zsmalloc has unnecessary complexity it'd be
>>> great to get rid of it rather than starting over.
>>> 
>>> Also, we don't have to do it all at once and invalidate the testing that
>>> zsmalloc has seen. These can be incremental changes that get spread over
>>> multiple releases, getting incremental exposure in the process.
>> 
>> I believe we are a lot closer now to having a zblock without the initial
>> drawbacks you have pointed out than a faster zsmalloc, retaining the code
>> simplicity of the former.
> 
> This does not answer my question tho. I am trying to understand what
> makes zblock faster than zsmalloc.
> 
> If it's the (optionally opportunistic) higher order allocations, we can
> probably do the same in zsmalloc and avoid memcpys as well. I guess we
> can try to allocate zspages as a high-order contiguous allocation first,
> and fallback to the current "chain" structure on failure.

We got rid of high order allocations and see better results now. vmalloc works well for our purpose.


> I am not arguing that you're seeing better results from zblock, what I
> am saying is let's pinpoint what makes zblock better first, then decide
> whether the same can be applied to zsmalloc or if it's really better to
> create a new allocator instead. Right now we don't have full
> information.
> 
> WDYT?

I would suggest that we focused on what zblock could deliver what zsmalloc doesn’t seem to be able to. E. g. it’s easy to add block descriptors at runtime basing on the statistics from the existing allocations, and it will lead to even higher compression density. Since zsmalloc only operates on 2^n sized objects, it doesn’t have that flexibility and it’s not possible to add it without making it essentially a zblock. I believe this is also why zsmalloc is losing to zblock in compression density on 16K pages already.

~Vitaly

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] mm: add zblock allocator
  2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
                   ` (3 preceding siblings ...)
  2025-05-04  9:26 ` Andrew Morton
@ 2025-07-20  2:56 ` Andrew Morton
  4 siblings, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2025-07-20  2:56 UTC (permalink / raw)
  To: Vitaly Wool
  Cc: linux-mm, linux-kernel, Nhat Pham, Shakeel Butt, Johannes Weiner,
	Igor Belousov

On Sat, 12 Apr 2025 17:42:07 +0200 Vitaly Wool <vitaly.wool@konsulko.se> wrote:

> zblock is a special purpose allocator for storing compressed pages.
> It stores integer number of same size objects per its block. These
> blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
> 
> With zblock, it is possible to densely arrange objects of various sizes
> resulting in low internal fragmentation. Also this allocator tries to
> fill incomplete blocks instead of adding new ones, in many cases
> providing a compression ratio comparable to zmalloc's.
> 
> zblock is also in most cases superior to zsmalloc with regard to
> average performance and worst execution times, thus allowing for better
> response time and real-time characteristics of the whole system.
> 
> High memory and page migration are currently not supported by zblock.

Well dang, I was about to drop zblock as this version (and its nine
-fix patches) have been in mm.git for three months without reported
issues and review is very stuck.

But I now realize that it has been in mm.git's mm-new branch for all
that time and has had no -next exposure.  Sorry about that.

So after the 6.17 release I'll move the patch into mm-unstable (and
hence linux-next) to get you some linux-next exposure.  After some time
there I expect I will drop the patch.

If you wish to persist with this I suggest you update the changelog to
address all review comments which have been received thus far and
resend.

Current version:

From: Vitaly Wool <vitaly.wool@konsulko.se>
Subject: mm: add zblock allocator
Date: Sat, 12 Apr 2025 17:42:07 +0200

zblock is a special purpose allocator for storing compressed pages.  It
stores integer number of same size objects per its block.  These blocks
consist of several physical pages (2**n, i.  e.  1/2/4/8).

With zblock, it is possible to densely arrange objects of various sizes
resulting in low internal fragmentation.  Also this allocator tries to
fill incomplete blocks instead of adding new ones, in many cases providing
a compression ratio comparable to zmalloc's.

zblock is also in most cases superior to zsmalloc with regard to average
performance and worst execution times, thus allowing for better response
time and real-time characteristics of the whole system.

High memory and page migration are currently not supported by zblock.

Test results (zstd compressor, 8 core Ryzen 9 VM, make bzImage):
- zblock:
    real	6m52.621s
    user	33m41.771s
    sys 	6m28.825s
    Zswap:            162328 kB
    Zswapped:         754468 kB
    zswpin 93851
    zswpout 542481
    zswpwb 935
- zsmalloc:
    real	7m4.355s
    user	34m37.538s
    sys 	6m22.086s
    zswpin 101243
    zswpout 448217
    zswpwb 640
    Zswap:            175704 kB
    Zswapped:         778692 kB

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: export try_alloc_pages_noprof() to modules for CONFIG_ZBLOCK=m]
[akpm@linux-foundation.org: try_alloc_pages() was renamed]
[akpm@linux-foundation.org: fix kerneldoc for zblock_create_pool()]
[igor.b@beldev.am: add debugfs]
  Link: https://lkml.kernel.org/r/20250428064924.53496-1-igor.b@beldev.am
[igor.b@beldev.am: avoid failing the build]
  Link: https://lkml.kernel.org/r/20250428065727.57990-1-igor.b@beldev.am
[akpm@linux-foundation.org: s/#warn/#warning/]
[igor.b@beldev.am: use vmalloc for page allocations]
  Link: https://lkml.kernel.org/r/20250506220329.2355482-1-vitaly.wool@konsulko.se
[vitaly.wool@konsulko.se: make active_list rcu_list]
  Link: https://lkml.kernel.org/r/20250508122925.2600217-1-vitaly.wool@konsulko.se
Link: https://lkml.kernel.org/r/20250412154207.2152667-1-vitaly.wool@konsulko.se
Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.se>
Signed-off-by: Igor Belousov <igor.b@beldev.am>
Tested-by: Igor Belousov <igor.b@beldev.am>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/mm/zblock.rst |   24 +
 MAINTAINERS                 |    7 
 mm/Kconfig                  |   12 
 mm/Makefile                 |    1 
 mm/page_alloc.c             |    1 
 mm/zblock.c                 |  473 ++++++++++++++++++++++++++++++++++
 mm/zblock.h                 |  205 ++++++++++++++
 7 files changed, 723 insertions(+)

diff --git a/Documentation/mm/zblock.rst a/Documentation/mm/zblock.rst
new file mode 100644
--- /dev/null
+++ a/Documentation/mm/zblock.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+zblock
+======
+
+zblock is a special purpose allocator for storing compressed pages.
+It stores integer number of compressed objects per its block. These
+blocks consist of several physical pages (2**n, i. e. 1/2/4/8).
+
+With zblock, it is possible to densely arrange objects of various sizes
+resulting in low internal fragmentation. Also this allocator tries to
+fill incomplete blocks instead of adding new ones,  in many cases
+providing a compression ratio substantially higher than z3fold and zbud
+(though lower than zmalloc's).
+
+zblock does not require MMU to operate and also is superior to zsmalloc
+with regard to average performance and worst execution times, thus
+allowing for better response time and real-time characteristics of the
+whole system.
+
+E. g. on a series of stress-ng tests run on a Raspberry Pi 5, we get
+5-10% higher value for bogo ops/s in zblock/zsmalloc comparison.
+
--- a/MAINTAINERS~mm-add-zblock-allocator
+++ a/MAINTAINERS
@@ -27383,6 +27383,13 @@ F:	Documentation/networking/device_drive
 F:	drivers/net/hamradio/*scc.c
 F:	drivers/net/hamradio/z8530.h
 
+ZBLOCK COMPRESSED SLAB MEMORY ALLOCATOR
+M:	Vitaly Wool <vitaly.wool@konsulko.se>
+L:	linux-mm@kvack.org
+S:	Maintained
+F:	Documentation/mm/zblock.rst
+F:	mm/zblock.[ch]
+
 ZD1211RW WIRELESS DRIVER
 L:	linux-wireless@vger.kernel.org
 S:	Orphan
--- a/mm/Kconfig~mm-add-zblock-allocator
+++ a/mm/Kconfig
@@ -152,6 +152,18 @@ config ZSWAP_ZPOOL_DEFAULT
        default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
        default ""
 
+config ZBLOCK
+	tristate "Fast compression allocator with high density"
+	depends on ZPOOL
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It stores integer number of same size compressed objects per
+	  its block. These blocks consist of several physical pages
+	  (2**n, i. e. 1/2/4/8).
+
+	  With zblock, it is possible to densely arrange objects of
+	  various sizes resulting in low internal fragmentation.
+
 config ZSMALLOC
 	tristate
 	prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
--- a/mm/Makefile~mm-add-zblock-allocator
+++ a/mm/Makefile
@@ -116,6 +116,7 @@ obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL)	+= zpool.o
+obj-$(CONFIG_ZBLOCK)	+= zblock.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
--- a/mm/page_alloc.c~mm-add-zblock-allocator
+++ a/mm/page_alloc.c
@@ -7593,3 +7593,4 @@ struct page *alloc_pages_nolock_noprof(i
 	kmsan_alloc_page(page, order, alloc_gfp);
 	return page;
 }
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
diff --git a/mm/zblock.c a/mm/zblock.c
new file mode 100644
--- /dev/null
+++ a/mm/zblock.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * zblock.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.se>
+ * Based on the work from Ananda Badmaev <a.badmaev@clicknet.pro>
+ * Copyright (C) 2022-2025, Konsulko AB.
+ *
+ * Zblock is a small object allocator with the intention to serve as a
+ * zpool backend. It operates on page blocks which consist of number
+ * of physical pages being a power of 2 and store integer number of
+ * compressed pages per block which results in determinism and simplicity.
+ *
+ * zblock doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/zpool.h>
+#include "zblock.h"
+
+static struct rb_root block_desc_tree = RB_ROOT;
+static struct dentry *zblock_debugfs_root;
+
+/* Encode handle of a particular slot in the pool using metadata */
+static inline unsigned long metadata_to_handle(struct zblock_block *block, unsigned int slot)
+{
+	return (unsigned long)(block) | slot;
+}
+
+/* Return block, block type and slot in the pool corresponding to handle */
+static inline struct zblock_block *handle_to_metadata(unsigned long handle, unsigned int *slot)
+{
+	*slot = handle & SLOT_MASK;
+	return (struct zblock_block *)(handle & ~SLOT_MASK);
+}
+
+/*
+ * Find a block with at least one free slot and claim it.
+ * We make sure that the first block, if exists, will always work.
+ */
+static inline struct zblock_block *find_and_claim_block(struct block_list *b,
+		int block_type, unsigned long *handle)
+{
+	struct list_head *l = &b->active_list;
+	unsigned int slot;
+	struct zblock_block *z;
+
+	rcu_read_lock();
+retry_claim:
+	z = list_first_or_null_rcu(l, typeof(*z), link);
+	if (z) {
+		spin_lock(&b->lock);
+		if (unlikely(!z->free_slots)) {
+			spin_unlock(&b->lock);
+			goto retry_claim;
+		}
+		if (--z->free_slots == 0)
+			list_bidir_del_rcu(&z->link);
+		spin_unlock(&b->lock);
+		/*
+		 * There is a slot in the block and we just made sure it will
+		 * remain.
+		 * Find that slot and set the busy bit.
+		 */
+		for (slot = find_first_zero_bit(z->slot_info,
+					block_desc[block_type].slots_per_block);
+		     slot < block_desc[block_type].slots_per_block;
+		     slot = find_next_zero_bit(z->slot_info,
+					block_desc[block_type].slots_per_block,
+					slot)) {
+			if (!test_and_set_bit(slot, z->slot_info))
+				break;
+		}
+
+		*handle = metadata_to_handle(z, slot);
+	}
+	rcu_read_unlock();
+	return z;
+}
+
+/*
+ * allocate new block and add it to corresponding block list
+ */
+static struct zblock_block *alloc_block(struct zblock_pool *pool,
+					int block_type, gfp_t gfp,
+					unsigned long *handle,
+					unsigned int nid)
+{
+	struct block_list *block_list = &pool->block_lists[block_type];
+	unsigned int num_pages = block_desc[block_type].num_pages;
+	struct zblock_block *block;
+
+	block = __vmalloc_node(PAGE_SIZE * num_pages, PAGE_SIZE, gfp, nid, NULL);
+	if (!block)
+		return NULL;
+
+	/* init block data */
+	init_rcu_head(&block->rcu);
+	block->block_type = block_type;
+	block->free_slots = block_desc[block_type].slots_per_block - 1;
+	memset(&block->slot_info, 0, sizeof(block->slot_info));
+	set_bit(0, block->slot_info);
+	*handle = metadata_to_handle(block, 0);
+
+	spin_lock(&block_list->lock);
+	list_add_rcu(&block->link, &block_list->active_list);
+	block_list->block_count++;
+	spin_unlock(&block_list->lock);
+	return block;
+}
+
+static int zblock_blocks_show(struct seq_file *s, void *v)
+{
+	struct zblock_pool *pool = s->private;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+		struct block_list *block_list = &pool->block_lists[i];
+
+		seq_printf(s, "%d: %ld blocks of %d pages (total %ld pages)\n",
+			i, block_list->block_count, block_desc[i].num_pages,
+			block_list->block_count * block_desc[i].num_pages);
+	}
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(zblock_blocks);
+
+/*****************
+ * API Functions
+ *****************/
+/**
+ * zblock_create_pool() - create a new zblock pool
+ * @gfp:	gfp flags when allocating the zblock pool structure
+ *
+ * Return: pointer to the new zblock pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct zblock_pool *zblock_create_pool(gfp_t gfp)
+{
+	struct zblock_pool *pool = kmalloc(sizeof(struct zblock_pool), gfp);
+	int i;
+
+	if (!pool)
+		return NULL;
+
+	/* init each block list */
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+		struct block_list *block_list = &pool->block_lists[i];
+
+		spin_lock_init(&block_list->lock);
+		INIT_LIST_HEAD(&block_list->active_list);
+		block_list->block_count = 0;
+	}
+
+	pool->block_header_cache = kmem_cache_create("zblock",
+						     sizeof(struct zblock_block),
+						     (1 << SLOT_BITS), 0, NULL);
+	if (!pool->block_header_cache)
+		goto out;
+
+	debugfs_create_file("blocks", S_IFREG | 0444, zblock_debugfs_root,
+			    pool, &zblock_blocks_fops);
+	return pool;
+
+out:
+	kfree(pool);
+	return NULL;
+}
+
+/**
+ * zblock_destroy_pool() - destroys an existing zblock pool
+ * @pool:	the zblock pool to be destroyed
+ *
+ */
+static void zblock_destroy_pool(struct zblock_pool *pool)
+{
+	kmem_cache_destroy(pool->block_header_cache);
+	kfree(pool);
+}
+
+
+/**
+ * zblock_alloc() - allocates a slot of appropriate size
+ * @pool:	zblock pool from which to allocate
+ * @size:	size in bytes of the desired allocation
+ * @gfp:	gfp flags used if the pool needs to grow
+ * @handle:	handle of the new allocation
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new slot.
+ */
+static int zblock_alloc(struct zblock_pool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle, unsigned int nid)
+{
+	int block_type = -1;
+	struct zblock_block *block;
+	struct block_list *block_list;
+
+	if (!size)
+		return -EINVAL;
+
+	if (size > block_desc[ARRAY_SIZE(block_desc) - 1].slot_size)
+		return -ENOSPC;
+
+	/* find basic block type with suitable slot size */
+	if (size < block_desc[0].slot_size)
+		block_type = 0;
+	else {
+		struct block_desc_node *block_node;
+		struct rb_node *node = block_desc_tree.rb_node;
+
+		while (node) {
+			block_node = container_of(node, typeof(*block_node), node);
+			if (size < block_node->this_slot_size)
+				node = node->rb_left;
+			else if (size >= block_node->next_slot_size)
+				node = node->rb_right;
+			else {
+				block_type = block_node->block_idx + 1;
+				break;
+			}
+		}
+	}
+	if (WARN_ON(block_type < 0))
+		return -EINVAL;
+
+	block_list = &pool->block_lists[block_type];
+
+	block = find_and_claim_block(block_list, block_type, handle);
+	if (block)
+		return 0;
+
+	/* not found block with free slots try to allocate new empty block */
+	block = alloc_block(pool, block_type, gfp, handle, nid);
+	return block ? 0 : -ENOMEM;
+}
+
+/**
+ * zblock_free() - frees the allocation associated with the given handle
+ * @pool:	pool in which the allocation resided
+ * @handle:	handle associated with the allocation returned by zblock_alloc()
+ *
+ */
+static void zblock_free(struct zblock_pool *pool, unsigned long handle)
+{
+	unsigned int slot, block_type;
+	struct zblock_block *block;
+	struct block_list *block_list;
+
+	block = handle_to_metadata(handle, &slot);
+	block_type = block->block_type;
+	block_list = &pool->block_lists[block_type];
+
+	/* clear bit early, this will shorten the search */
+	clear_bit(slot, block->slot_info);
+
+	spin_lock(&block_list->lock);
+	/* if all slots in block are empty delete the whole block */
+	if (++block->free_slots == block_desc[block_type].slots_per_block) {
+		block_list->block_count--;
+		list_bidir_del_rcu(&block->link);
+		spin_unlock(&block_list->lock);
+		kvfree_rcu(block, rcu);
+		return;
+	} else if (block->free_slots == 1)
+		list_add_tail_rcu(&block->link, &block_list->active_list);
+	spin_unlock(&block_list->lock);
+}
+
+/**
+ * zblock_map() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be mapped
+ *
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *zblock_map(struct zblock_pool *pool, unsigned long handle)
+{
+	unsigned int slot;
+	struct zblock_block *block = handle_to_metadata(handle, &slot);
+	unsigned long offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block->block_type].slot_size;
+
+	return (void *)block + offs;
+}
+
+/**
+ * zblock_unmap() - unmaps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be unmapped
+ */
+static void zblock_unmap(struct zblock_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zblock_write() - write to the memory area defined by handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation
+ * @handle_mem: pointer to source memory block
+ * @mem_len:	length of the memory block to write
+ */
+static void zblock_write(struct zblock_pool *pool, unsigned long handle,
+			 void *handle_mem, size_t mem_len)
+{
+	unsigned int slot;
+	struct zblock_block *block = handle_to_metadata(handle, &slot);
+	unsigned long offs = ZBLOCK_HEADER_SIZE + slot * block_desc[block->block_type].slot_size;
+
+	memcpy((void *)block + offs, handle_mem, mem_len);
+}
+
+/**
+ * zblock_get_total_pages() - gets the zblock pool size in pages
+ * @pool:	pool being queried
+ *
+ * Returns: size in bytes of the given pool.
+ */
+static u64 zblock_get_total_pages(struct zblock_pool *pool)
+{
+	u64 total_size;
+	int i;
+
+	total_size = 0;
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++)
+		total_size += pool->block_lists[i].block_count * block_desc[i].num_pages;
+
+	return total_size;
+}
+
+/*****************
+ * zpool
+ ****************/
+
+static void *zblock_zpool_create(const char *name, gfp_t gfp)
+{
+	return zblock_create_pool(gfp);
+}
+
+static void zblock_zpool_destroy(void *pool)
+{
+	zblock_destroy_pool(pool);
+}
+
+static int zblock_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle, const int nid)
+{
+	return zblock_alloc(pool, size, gfp, handle, nid);
+}
+
+static void zblock_zpool_free(void *pool, unsigned long handle)
+{
+	zblock_free(pool, handle);
+}
+
+static void *zblock_zpool_read_begin(void *pool, unsigned long handle,
+				void *local_copy)
+{
+	return zblock_map(pool, handle);
+}
+
+static void zblock_zpool_obj_write(void *pool, unsigned long handle,
+				void *handle_mem, size_t mem_len)
+{
+	zblock_write(pool, handle, handle_mem, mem_len);
+}
+
+static void zblock_zpool_read_end(void *pool, unsigned long handle,
+				void *handle_mem)
+{
+	zblock_unmap(pool, handle);
+}
+
+static u64 zblock_zpool_total_pages(void *pool)
+{
+	return zblock_get_total_pages(pool);
+}
+
+static struct zpool_driver zblock_zpool_driver = {
+	.type =			"zblock",
+	.owner =		THIS_MODULE,
+	.create =		zblock_zpool_create,
+	.destroy =		zblock_zpool_destroy,
+	.malloc =		zblock_zpool_malloc,
+	.free =			zblock_zpool_free,
+	.obj_read_begin =	zblock_zpool_read_begin,
+	.obj_read_end =		zblock_zpool_read_end,
+	.obj_write =		zblock_zpool_obj_write,
+	.total_pages =		zblock_zpool_total_pages,
+};
+
+MODULE_ALIAS("zpool-zblock");
+
+static void delete_rbtree(void)
+{
+	while (!RB_EMPTY_ROOT(&block_desc_tree))
+		rb_erase(block_desc_tree.rb_node, &block_desc_tree);
+}
+
+static int __init create_rbtree(void)
+{
+	int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(block_desc) > MAX_TABLE_SIZE);
+	for (i = 0; i < ARRAY_SIZE(block_desc); i++) {
+		struct block_desc_node *block_node = kmalloc(sizeof(*block_node),
+							GFP_KERNEL);
+		struct rb_node **new = &block_desc_tree.rb_node, *parent = NULL;
+
+		if (!block_node) {
+			delete_rbtree();
+			return -ENOMEM;
+		}
+		if (i > 0 && block_desc[i].slot_size <= block_desc[i-1].slot_size) {
+			pr_err("%s: block descriptors not in ascending order\n",
+				__func__);
+			delete_rbtree();
+			return -EINVAL;
+		}
+		block_node->this_slot_size = block_desc[i].slot_size;
+		block_node->block_idx = i;
+		if (i == ARRAY_SIZE(block_desc) - 1)
+			block_node->next_slot_size = PAGE_SIZE * 2;
+		else
+			block_node->next_slot_size = block_desc[i+1].slot_size;
+		while (*new) {
+			parent = *new;
+			/* the array is sorted so we will always go to the right */
+			new = &((*new)->rb_right);
+		}
+		rb_link_node(&block_node->node, parent, new);
+		rb_insert_color(&block_node->node, &block_desc_tree);
+	}
+	return 0;
+}
+
+static int __init init_zblock(void)
+{
+	int ret = create_rbtree();
+
+	if (ret)
+		return ret;
+
+	zpool_register_driver(&zblock_zpool_driver);
+
+	zblock_debugfs_root = debugfs_create_dir("zblock", NULL);
+	return 0;
+}
+
+static void __exit exit_zblock(void)
+{
+	zpool_unregister_driver(&zblock_zpool_driver);
+	debugfs_remove_recursive(zblock_debugfs_root);
+	delete_rbtree();
+}
+
+module_init(init_zblock);
+module_exit(exit_zblock);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitaly.wool@konsulko.se>");
+MODULE_DESCRIPTION("Block allocator for compressed pages");
diff --git a/mm/zblock.h a/mm/zblock.h
new file mode 100644
--- /dev/null
+++ a/mm/zblock.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2025, Konsulko AB.
+ */
+#ifndef __ZBLOCK_H__
+#define __ZBLOCK_H__
+
+#include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/rculist.h>
+#include <linux/types.h>
+
+#if PAGE_SIZE == 0x1000
+/* max 64 slots per block, max table size 64 */
+#define SLOT_BITS 6
+#elif PAGE_SIZE == 0x4000
+/* max 256 slots per block, max table size 64 */
+#define SLOT_BITS 8
+#else
+#warning This PAGE_SIZE is not quite supported yet
+#define SLOT_BITS 8
+#endif
+
+#define MAX_TABLE_SIZE (1 << (PAGE_SHIFT - SLOT_BITS))
+
+#define MAX_SLOTS (1 << SLOT_BITS)
+#define SLOT_MASK ((0x1UL << SLOT_BITS) - 1)
+
+#define ZBLOCK_HEADER_SIZE	round_up(sizeof(struct zblock_block), sizeof(long))
+#define BLOCK_DATA_SIZE(num) ((PAGE_SIZE * (num)) - ZBLOCK_HEADER_SIZE)
+#define SLOT_SIZE(nslots, num) (round_down((BLOCK_DATA_SIZE(num) / nslots), sizeof(long)))
+#define ZBLOCK_MAX_PAGES_PER_BLOCK	8
+
+/**
+ * struct zblock_block - block metadata
+ * Block consists of several pages and contains fixed
+ * integer number of slots for allocating compressed pages.
+ *
+ * free_slots:	number of free slots in the block
+ * slot_info:	contains data about free/occupied slots
+ */
+struct zblock_block {
+	DECLARE_BITMAP(slot_info, 1 << SLOT_BITS);
+	struct list_head link;
+	struct rcu_head rcu;
+	unsigned short block_type;
+	unsigned short free_slots;
+};
+
+/**
+ * struct block_desc - general metadata for block lists
+ * Each block list stores only blocks of corresponding type which means
+ * that all blocks in it have the same number and size of slots.
+ * All slots are aligned to size of long.
+ *
+ * slot_size:		size of slot for this list
+ * slots_per_block:	number of slots per block for this list
+ * num_pages:		number of pages per block
+ */
+struct block_desc {
+	unsigned int slot_size;
+	unsigned short slots_per_block;
+	unsigned short num_pages;
+};
+
+struct block_desc_node {
+	struct rb_node node;
+	unsigned int this_slot_size;
+	unsigned int next_slot_size;
+	unsigned int block_idx;
+};
+
+static const struct block_desc block_desc[] = {
+#if PAGE_SIZE == 0x1000
+	{ SLOT_SIZE(28, 1), 28, 1 },
+	{ SLOT_SIZE(18, 1), 18, 1 },
+	{ SLOT_SIZE(12, 1), 12, 1 },
+	{ SLOT_SIZE(10, 1), 10, 1 },
+	{ SLOT_SIZE(17, 2), 17, 2 },
+	{ SLOT_SIZE(15, 2), 15, 2 },
+	{ SLOT_SIZE(13, 2), 13, 2 },
+	{ SLOT_SIZE(6, 1), 6, 1 },
+	{ SLOT_SIZE(11, 2), 11, 2 },
+	{ SLOT_SIZE(5, 1), 5, 1 },
+	{ SLOT_SIZE(19, 4), 19, 4 },
+	{ SLOT_SIZE(9, 2), 9, 2 },
+	{ SLOT_SIZE(17, 4), 17, 4 },
+	{ SLOT_SIZE(4, 1), 4, 1 },
+	{ SLOT_SIZE(23, 6), 23, 6 },
+	{ SLOT_SIZE(11, 3), 11, 3 },
+	{ SLOT_SIZE(7, 2), 7, 2 },
+	{ SLOT_SIZE(10, 3), 10, 3 },
+	{ SLOT_SIZE(19, 6), 19, 6 },
+	{ SLOT_SIZE(6, 2), 6, 2 },
+	{ SLOT_SIZE(14, 5), 14, 5 },
+	{ SLOT_SIZE(8, 3), 8, 3 },
+	{ SLOT_SIZE(5, 2), 5, 2 },
+	{ SLOT_SIZE(12, 5), 12, 5 },
+	{ SLOT_SIZE(9, 4), 9, 4 },
+	{ SLOT_SIZE(15, 7), 15, 7 },
+	{ SLOT_SIZE(2, 1), 2, 1 },
+	{ SLOT_SIZE(15, 8), 15, 8 },
+	{ SLOT_SIZE(9, 5), 9, 5 },
+	{ SLOT_SIZE(12, 7), 12, 7 },
+	{ SLOT_SIZE(13, 8), 13, 8 },
+	{ SLOT_SIZE(6, 4), 6, 4 },
+	{ SLOT_SIZE(11, 8), 11, 8 },
+	{ SLOT_SIZE(9, 7), 9, 7 },
+	{ SLOT_SIZE(6, 5), 6, 5 },
+	{ SLOT_SIZE(9, 8), 9, 8 },
+	{ SLOT_SIZE(4, 4), 4, 4 },
+#else
+	{ SLOT_SIZE(185, 1), 185, 1 },
+	{ SLOT_SIZE(113, 1), 113, 1 },
+	{ SLOT_SIZE(86, 1), 86, 1 },
+	{ SLOT_SIZE(72, 1), 72, 1 },
+	{ SLOT_SIZE(58, 1), 58, 1 },
+	{ SLOT_SIZE(49, 1), 49, 1 },
+	{ SLOT_SIZE(42, 1), 42, 1 },
+	{ SLOT_SIZE(37, 1), 37, 1 },
+	{ SLOT_SIZE(33, 1), 33, 1 },
+	{ SLOT_SIZE(59, 2), 59, 2 },
+	{ SLOT_SIZE(27, 1), 27, 1 },
+	{ SLOT_SIZE(25, 1), 25, 1 },
+	{ SLOT_SIZE(23, 1), 23, 1 },
+	{ SLOT_SIZE(21, 1), 21, 1 },
+	{ SLOT_SIZE(39, 2), 39, 2 },
+	{ SLOT_SIZE(37, 2), 37, 2 },
+	{ SLOT_SIZE(35, 2), 35, 2 },
+	{ SLOT_SIZE(33, 2), 33, 2 },
+	{ SLOT_SIZE(31, 2), 31, 2 },
+	{ SLOT_SIZE(29, 2), 29, 2 },
+	{ SLOT_SIZE(27, 2), 27, 2 },
+	{ SLOT_SIZE(25, 2), 25, 2 },
+	{ SLOT_SIZE(12, 1), 12, 1 },
+	{ SLOT_SIZE(11, 1), 11, 1 },
+	{ SLOT_SIZE(21, 2), 21, 2 },
+	{ SLOT_SIZE(10, 1), 10, 1 },
+	{ SLOT_SIZE(19, 2), 19, 2 },
+	{ SLOT_SIZE(9, 1), 9, 1 },
+	{ SLOT_SIZE(17, 2), 17, 2 },
+	{ SLOT_SIZE(8, 1), 8, 1 },
+	{ SLOT_SIZE(15, 2), 15, 2 },
+	{ SLOT_SIZE(14, 2), 14, 2 },
+	{ SLOT_SIZE(27, 4), 27, 4 },
+	{ SLOT_SIZE(13, 2), 13, 2 },
+	{ SLOT_SIZE(25, 4), 25, 4 },
+	{ SLOT_SIZE(12, 2), 12, 2 },
+	{ SLOT_SIZE(23, 4), 23, 4 },
+	{ SLOT_SIZE(11, 2), 11, 2 },
+	{ SLOT_SIZE(21, 4), 21, 4 },
+	{ SLOT_SIZE(10, 2), 10, 2 },
+	{ SLOT_SIZE(19, 4), 19, 4 },
+	{ SLOT_SIZE(9, 2), 9, 2 },
+	{ SLOT_SIZE(17, 4), 17, 4 },
+	{ SLOT_SIZE(4, 1), 4, 1 },
+	{ SLOT_SIZE(23, 6), 23, 6 },
+	{ SLOT_SIZE(11, 3), 11, 3 },
+	{ SLOT_SIZE(7, 2), 7, 2 },
+	{ SLOT_SIZE(10, 3), 10, 3 },
+	{ SLOT_SIZE(16, 5), 16, 5 },
+	{ SLOT_SIZE(6, 2), 6, 2 },
+	{ SLOT_SIZE(11, 4), 11, 4 },
+	{ SLOT_SIZE(8, 3), 8, 3 },
+	{ SLOT_SIZE(5, 2), 5, 2 },
+	{ SLOT_SIZE(7, 3), 7, 3 },
+	{ SLOT_SIZE(11, 5), 11, 5 },
+	{ SLOT_SIZE(4, 2), 4, 2 },
+	{ SLOT_SIZE(9, 5), 9, 5 },
+	{ SLOT_SIZE(8, 5), 8, 5 },
+	{ SLOT_SIZE(3, 2), 3, 2 },
+	{ SLOT_SIZE(7, 6), 7, 6 },
+	{ SLOT_SIZE(4, 4), 4, 4 },
+#endif /* PAGE_SIZE */
+};
+
+/**
+ * struct block_list - stores metadata of particular list
+ * lock:		protects the list of blocks
+ * active_list:		linked list of active (non-full) blocks
+ * block_count:		total number of blocks in the list
+ */
+struct block_list {
+	spinlock_t lock;
+	struct list_head active_list;
+	unsigned long block_count;
+};
+
+/**
+ * struct zblock_pool - stores metadata for each zblock pool
+ * @block_lists:	array of block lists
+ * @zpool:		zpool driver
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * for a particular zblock pool.
+ */
+struct zblock_pool {
+	struct block_list block_lists[ARRAY_SIZE(block_desc)];
+	struct kmem_cache *block_header_cache;
+	struct zpool *zpool;
+};
+
+
+#endif
_



^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2025-07-20  2:57 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-12 15:42 [PATCH v4] mm: add zblock allocator Vitaly Wool
2025-04-12 19:25 ` Igor Belousov
2025-04-16 12:09 ` Johannes Weiner
2025-04-16 20:10   ` Vitaly
2025-04-17 14:16     ` Johannes Weiner
2025-04-18  7:43       ` Vitaly Wool
2025-04-18 10:52   ` David Hildenbrand
2025-04-18 10:56     ` Vitaly Wool
2025-04-18 11:03       ` David Hildenbrand
2025-04-22 10:46 ` Yosry Ahmed
2025-04-23 19:53   ` Vitaly Wool
2025-04-30 12:27     ` Yosry Ahmed
2025-05-01 12:41       ` Vitaly Wool
2025-05-01 23:43         ` Sergey Senozhatsky
2025-05-06 13:04         ` Yosry Ahmed
2025-06-11 17:11           ` Vitaly Wool
2025-05-01 23:49     ` Sergey Senozhatsky
2025-05-03  8:27       ` Vitaly
2025-05-04  9:26 ` Andrew Morton
2025-07-20  2:56 ` Andrew Morton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).