[patch] kmalloc_percpu -- 1 of 2

All of lore.kernel.org
 help / color / mirror / Atom feed

* [patch] kmalloc_percpu  -- 1 of 2
@ 2002-12-04 12:12 Ravikiran G Thirumalai
  2002-12-04 12:15 ` [patch] kmalloc_percpu -- 2 " Ravikiran G Thirumalai
  0 siblings, 1 reply; 17+ messages in thread
From: Ravikiran G Thirumalai @ 2002-12-04 12:12 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton, Rusty Russell, dipankar

Here's a 2.5.50 version of kmalloc_percpu originally submitted by Dipankar.  
This one incorporates Rusty's suggestions to rearrange code and sync 
up with its static counterpart. This version needs exposure of malloc_sizes 
in order to move the interfaces to percpu.c from slab.c (kmalloc_percpu and
kfree_percpu do not have anything to do with the slab allocator itself).
First of the two patches is to expose the malloc_sizes and the second
one is the actual allocator.  I'll follow this up with patchsets to enable 
networking mibs use kmalloc_percpu. We'll have to use kmalloc_percpu
for mibs since DEFINE_PER_CPU won't work with modules (and ipv6 stuff
can be compiled in as modules)

Following is the 1 of 2 patches.

D: Name: slabchange-2.5.50-1.patch
D: Description: Exposes malloc_sizes for kmalloc_percpu
D: Author: Ravikiran Thirumalai


 include/linux/slab.h |   13 +++++++++++++
 mm/slab.c            |    9 +--------
 2 files changed, 14 insertions(+), 8 deletions(-)


diff -ruN linux-2.5.50/include/linux/slab.h kmalloc_percpu-2.5.50/include/linux/slab.h
--- linux-2.5.50/include/linux/slab.h	Thu Nov 28 04:06:23 2002
+++ kmalloc_percpu-2.5.50/include/linux/slab.h	Sun Dec  1 11:13:49 2002
@@ -75,6 +75,19 @@
 extern kmem_cache_t	*sigact_cachep;
 extern kmem_cache_t	*bio_cachep;
 
+/* 
+ * Size description struct for general caches. 
+ * This had to be exposed for kmalloc_percpu.
+ */
+
+struct cache_sizes {
+	size_t           cs_size;
+	kmem_cache_t    *cs_cachep;
+	kmem_cache_t    *cs_dmacachep;
+};
+
+extern struct cache_sizes malloc_sizes[];
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _LINUX_SLAB_H */
diff -ruN linux-2.5.50/mm/slab.c kmalloc_percpu-2.5.50/mm/slab.c
--- linux-2.5.50/mm/slab.c	Thu Nov 28 04:06:17 2002
+++ kmalloc_percpu-2.5.50/mm/slab.c	Sun Dec  1 11:13:49 2002
@@ -369,15 +369,8 @@
 #define	SET_PAGE_SLAB(pg,x)   ((pg)->list.prev = (struct list_head *)(x))
 #define	GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->list.prev)
 
-/* Size description struct for general caches. */
-struct cache_sizes {
-	size_t		 cs_size;
-	kmem_cache_t	*cs_cachep;
-	kmem_cache_t	*cs_dmacachep;
-};
-
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
-static struct cache_sizes malloc_sizes[] = {
+struct cache_sizes malloc_sizes[] = {
 #if PAGE_SIZE == 4096
 	{    32,	NULL, NULL},
 #endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-04 12:12 [patch] kmalloc_percpu -- 1 of 2 Ravikiran G Thirumalai
@ 2002-12-04 12:15 ` Ravikiran G Thirumalai
  2002-12-04 19:34   ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Ravikiran G Thirumalai @ 2002-12-04 12:15 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrew Morton, Rusty Russell, dipankar

Here's a 2.5.50 version of kmalloc_percpu originally submitted by Dipankar.
This one incorporates Rusty's suggestions to rearrange code and sync
up with its static counterpart. This version needs exposure of malloc_sizes
in order to move the interfaces to percpu.c from slab.c (kmalloc_percpu and
kfree_percpu do not have anything to do with the slab allocator itself).
First of the two patches is to expose the malloc_sizes and the second
one is the actual allocator.  I'll follow this up with patchsets to enable
networking mibs use kmalloc_percpu. We'll have to use kmalloc_percpu
for mibs since DEFINE_PER_CPU won't work with modules (and ipv6 stuff
can be compiled in as modules)
 
Following is the 2 of 2 patches.
 
D: Name: kmalloc_percpu-2.5.50-1.patch
D: Description: Dynamic per-cpu kernel memory allocator
D: Author: Dipankar Sarma & Ravikiran Thirumalai


 include/linux/percpu.h |   39 ++++
 init/main.c            |    1 
 kernel/Makefile        |    4 
 kernel/ksyms.c         |    4 
 kernel/percpu.c        |  452 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 498 insertions(+), 2 deletions(-)


diff -ruN linux-2.5.50/include/linux/percpu.h kmalloc_percpu-2.5.50/include/linux/percpu.h
--- linux-2.5.50/include/linux/percpu.h	Thu Nov 28 04:06:19 2002
+++ kmalloc_percpu-2.5.50/include/linux/percpu.h	Sun Dec  1 11:54:49 2002
@@ -1,10 +1,49 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
 #include <linux/spinlock.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc_percpu() */
 #include <asm/percpu.h>
 
 /* Must be an lvalue. */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
+#ifdef CONFIG_SMP
+
+struct percpu_data {
+	void *ptrs[NR_CPUS];
+	void *blkp;
+};
+
+/* Use this with kmalloc_percpu */
+#define per_cpu_ptr(ptr, cpu)                   \
+({                                              \
+        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
+        (__typeof__(ptr))__p->ptrs[(cpu)];	\
+})
+
+extern void *kmalloc_percpu(size_t size, int flags);
+extern void kfree_percpu(const void *);
+extern void kmalloc_percpu_init(void);
+
+#else /* CONFIG_SMP */
+/* Evaluate (cpu) even on UP: get_cpu_ptr() relies on get_cpu() running */
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+
+static inline void *kmalloc_percpu(size_t size, int flags)
+{
+	return(kmalloc(size, flags));
+}
+static inline void kfree_percpu(const void *ptr)
+{	
+	kfree(ptr);
+}
+static inline void kmalloc_percpu_init(void) { }
+
+#endif /* CONFIG_SMP */
+
+/* Use these with kmalloc_percpu */
+#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
+#define put_cpu_ptr(ptr) put_cpu()
+
 #endif /* __LINUX_PERCPU_H */
diff -ruN linux-2.5.50/init/main.c kmalloc_percpu-2.5.50/init/main.c
--- linux-2.5.50/init/main.c	Thu Nov 28 04:05:51 2002
+++ kmalloc_percpu-2.5.50/init/main.c	Sun Dec  1 11:54:49 2002
@@ -423,6 +423,7 @@
 	page_address_init();
 	mem_init();
 	kmem_cache_sizes_init();
+	kmalloc_percpu_init();
 	pidhash_init();
 	pgtable_cache_init();
 	pte_chain_init();
diff -ruN linux-2.5.50/kernel/Makefile kmalloc_percpu-2.5.50/kernel/Makefile
--- linux-2.5.50/kernel/Makefile	Thu Nov 28 04:05:51 2002
+++ kmalloc_percpu-2.5.50/kernel/Makefile	Sun Dec  1 11:54:49 2002
@@ -4,7 +4,7 @@
 
 export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \
 		printk.o platform.o suspend.o dma.o module.o cpufreq.o \
-		profile.o rcupdate.o intermodule.o
+		profile.o rcupdate.o intermodule.o percpu.o
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
@@ -13,7 +13,7 @@
 	    rcupdate.o intermodule.o extable.o
 
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o
+obj-$(CONFIG_SMP) += cpu.o percpu.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += ksyms.o module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
diff -ruN linux-2.5.50/kernel/ksyms.c kmalloc_percpu-2.5.50/kernel/ksyms.c
--- linux-2.5.50/kernel/ksyms.c	Thu Nov 28 04:05:47 2002
+++ kmalloc_percpu-2.5.50/kernel/ksyms.c	Sun Dec  1 11:54:49 2002
@@ -97,6 +97,10 @@
 EXPORT_SYMBOL(remove_shrinker);
 EXPORT_SYMBOL(kmalloc);
 EXPORT_SYMBOL(kfree);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(kfree_percpu);
+#endif
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(__vmalloc);
 EXPORT_SYMBOL(vmalloc);
diff -ruN linux-2.5.50/kernel/percpu.c kmalloc_percpu-2.5.50/kernel/percpu.c
--- linux-2.5.50/kernel/percpu.c	Thu Jan  1 05:30:00 1970
+++ kmalloc_percpu-2.5.50/kernel/percpu.c	Sun Dec  1 11:54:49 2002
@@ -0,0 +1,452 @@
+/*
+ * Dynamic Per-CPU Data Allocator.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) IBM Corporation, 2002
+ *
+ * Author:              Dipankar Sarma <dipankar@in.ibm.com>
+ * 			Ravikiran G. Thirumalai <kiran@in.ibm.com>
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+
+struct percpu_data_blklist {
+	struct list_head blks;
+	struct list_head *firstnotfull;
+	spinlock_t lock;
+	size_t objsize;
+	size_t blksize;
+	kmem_cache_t *cachep;
+	char *cachename;
+};
+
+struct percpu_data_blk {
+	struct list_head linkage;
+	void *blkaddr[NR_CPUS];
+	unsigned int usecount;
+	int *freearr;
+	int freehead;
+	struct percpu_data_blklist *blklist;
+};
+
+static struct percpu_data_blklist data_blklist[] = {
+	{
+	 .blks = LIST_HEAD_INIT(data_blklist[0].blks),
+	 .firstnotfull = &data_blklist[0].blks,
+	 .lock = SPIN_LOCK_UNLOCKED,
+	 .objsize = 4,
+	 .blksize = ALIGN(4, SMP_CACHE_BYTES),
+	 .cachep = NULL,
+	 .cachename = "percpu_data_4"},
+	{
+	 .blks = LIST_HEAD_INIT(data_blklist[1].blks),
+	 .firstnotfull = &data_blklist[1].blks,
+	 .lock = SPIN_LOCK_UNLOCKED,
+	 .objsize = 8,
+	 .blksize = ALIGN(8, SMP_CACHE_BYTES),
+	 .cachep = NULL,
+	 .cachename = "percpu_data_8"},
+	{
+	 .blks = LIST_HEAD_INIT(data_blklist[2].blks),
+	 .firstnotfull = &data_blklist[2].blks,
+	 .lock = SPIN_LOCK_UNLOCKED,
+	 .objsize = 16,
+	 .blksize = ALIGN(16, SMP_CACHE_BYTES),
+	 .cachep = NULL,
+	 .cachename = "percpu_data_16"},
+#if PAGE_SIZE != 4096
+	{
+	 .blks = LIST_HEAD_INIT(data_blklist[3].blks),
+	 .firstnotfull = &data_blklist[3].blks,
+	 .lock = SPIN_LOCK_UNLOCKED,
+	 .objsize = 32,
+	 .blksize = ALIGN(32, SMP_CACHE_BYTES),
+	 .cachep = NULL,
+	 .cachename = "percpu_data_32"}
+#endif
+};
+
+static int data_blklist_count =
+    sizeof (data_blklist) / sizeof (struct percpu_data_blklist);
+
+/*
+ * Allocate a block descriptor and its free-index array (one int per object
+ * slot in a block).  Returns the descriptor, or NULL if either kmalloc fails.
+ */
+static struct percpu_data_blk *
+percpu_data_blk_alloc(struct percpu_data_blklist *blklist, int flags)
+{
+	struct percpu_data_blk *blkp;
+	int i;
+	int count;
+
+	if (!(blkp = kmalloc(sizeof (struct percpu_data_blk), flags)))
+		goto out1;
+	INIT_LIST_HEAD(&blkp->linkage);
+	blkp->usecount = 0;
+	count = blklist->blksize / blklist->objsize;
+	blkp->freearr = kmalloc(count * sizeof (*blkp->freearr), flags);
+	if (!blkp->freearr)
+		goto out;
+	blkp->freehead = 0;	/* freearr[i] holds the index of the next free slot */
+	for (i = 0; i < count; i++)
+		blkp->freearr[i] = i + 1;
+	blkp->freearr[i - 1] = -1;	/* Marks the end of the array */
+	blkp->blklist = blklist;
+	return blkp;
+out:
+	kfree(blkp);
+out1:
+	return NULL;
+}
+
+/* Frees the block descriptor structure along with the free-index
+ * array (freearr) that percpu_data_blk_alloc() attached to it. */
+static void
+percpu_data_blk_free(struct percpu_data_blk *blkp)
+{
+	kfree(blkp->freearr);
+	kfree(blkp);
+}
+
+/*
+ * Add a block to the percpu data object memory pool.
+ * Returns 0 on failure and 1 on success
+ */
+static int
+percpu_data_mem_grow(struct percpu_data_blklist *blklist, int flags)
+{
+	struct percpu_data_blk *blkp;
+	unsigned long save_flags;
+	int i;
+
+	if (!(blkp = percpu_data_blk_alloc(blklist, flags)))
+		goto out;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		blkp->blkaddr[i] = kmem_cache_alloc(blklist->cachep, flags);
+		if (!(blkp->blkaddr[i]))
+			goto out1;
+		memset(blkp->blkaddr[i], 0, blklist->blksize);
+	}
+
+	/* 
+	 * Now that we have the block successfully allocated 
+	 * and instantiated..  add it.....
+	 */
+	spin_lock_irqsave(&blklist->lock, save_flags);
+	list_add_tail(&blkp->linkage, &blklist->blks);
+	if (blklist->firstnotfull == &blklist->blks)
+		blklist->firstnotfull = &blkp->linkage;
+	spin_unlock_irqrestore(&blklist->lock, save_flags);
+	return 1;
+
+out1:
+	i--;
+	for (; i >= 0; i--) {
+		if (!cpu_possible(i))
+			continue;
+		kmem_cache_free(blklist->cachep, blkp->blkaddr[i]);
+	}
+	percpu_data_blk_free(blkp);
+out:
+	return 0;
+}
+
+/*
+ * Initialise the main percpu data control structure.
+ */
+static void __init
+percpu_data_blklist_init(struct percpu_data_blklist *blklist)
+{
+	blklist->cachep = kmem_cache_create(blklist->cachename,
+					    blklist->blksize, 0,
+					    SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!blklist->cachep)
+		BUG();
+}
+
+static struct percpu_data_blklist *
+percpu_data_get_blklist(size_t size, int flags)
+{
+	int i;
+	for (i = 0; i < data_blklist_count; i++) {
+		if (size > data_blklist[i].objsize)
+			continue;
+		return &data_blklist[i];
+	}
+	return NULL;
+}
+
+/*
+ * Initialize the percpu_data subsystem.
+ */
+void __init
+kmalloc_percpu_init(void)
+{
+	int i;
+	for (i = 0; i < data_blklist_count; i++) {
+		percpu_data_blklist_init(&data_blklist[i]);
+	}
+}
+
+/*
+ * Allocs an object from the block.  Returns back the object index.
+ */
+static int
+__percpu_interlaced_alloc_one(struct percpu_data_blklist *blklist,
+			      struct percpu_data_blk *blkp)
+{
+	unsigned int objidx;
+
+	objidx = blkp->freehead;
+	blkp->freehead = blkp->freearr[objidx];
+	blkp->usecount++;
+	if (blkp->freehead < 0) {
+		blklist->firstnotfull = blkp->linkage.next;
+	}
+	return objidx;
+}
+
+/* Take one object from @blklist's first not-full block (growing the list
+ * when none exists) and record it in @percpu.  Returns 1, or 0 on failure.
+ */
+static int
+__percpu_interlaced_alloc(struct percpu_data *percpu,
+			  struct percpu_data_blklist *blklist, int flags)
+{
+	struct percpu_data_blk *blkp;
+	unsigned long save_flags;
+	struct list_head *l;
+	int objidx;
+	int i;
+
+tryagain:
+
+	spin_lock_irqsave(&blklist->lock, save_flags);
+	l = blklist->firstnotfull;
+	if (l == &blklist->blks)
+		goto unlock_and_get_mem;
+	blkp = list_entry(l, struct percpu_data_blk, linkage);
+
+	objidx = __percpu_interlaced_alloc_one(blklist, blkp);
+	spin_unlock_irqrestore(&blklist->lock, save_flags);
+	/*
+	 * The index was taken while the lock was held and firstnotfull
+	 * was not the list head, so objidx names an object in blkp.
+	 * firstnotfull points to the head of the list when all the blks
+	 * are full or when there are no blocks left.
+	 */
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		percpu->ptrs[i] = blkp->blkaddr[i] + objidx * blklist->objsize;
+	}
+	percpu->blkp = (void *) blkp;
+	return 1;
+
+unlock_and_get_mem:
+
+	spin_unlock_irqrestore(&blklist->lock, save_flags);
+	if (percpu_data_mem_grow(blklist, flags))
+		goto tryagain;	/* added another block..try allocing obj .. */
+
+	return 0;
+}
+
+/*
+ * Pick the block list whose objsize fits @size and allocate one interlaced
+ * object from it into @pdata.  Returns 1 on success and 0 on failure.
+ */
+static int
+percpu_interlaced_alloc(struct percpu_data *pdata, size_t size, int flags)
+{
+	struct percpu_data_blklist *blklist;
+
+	blklist = percpu_data_get_blklist(size, flags);
+	if (blklist == NULL)
+		return 0;
+	return __percpu_interlaced_alloc(pdata, blklist, flags);
+}
+
+/*
+ * Frees memory associated with a percpu data block
+ */
+static void
+percpu_data_blk_destroy(struct percpu_data_blklist *blklist,
+			struct percpu_data_blk *blkp)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		kmem_cache_free(blklist->cachep, blkp->blkaddr[i]);
+	}
+	percpu_data_blk_free(blkp);
+}
+
+/*
+ * Frees an object from a block and fixes the freelist accdly.
+ * Frees the slab cache memory if a block gets empty during free.
+ */
+static void
+__percpu_interlaced_free(struct percpu_data_blklist *blklist,
+			 struct percpu_data *percpu)
+{
+	struct percpu_data_blk *blkp;
+	int objidx;
+	int objoffset;
+	struct list_head *t;
+	unsigned long save_flags;
+
+	spin_lock_irqsave(&blklist->lock, save_flags);
+	blkp = (struct percpu_data_blk *) percpu->blkp;
+	objoffset = percpu->ptrs[0] - blkp->blkaddr[0];
+	objidx = objoffset / blklist->objsize;
+
+	kfree(percpu);
+
+	blkp->freearr[objidx] = blkp->freehead;
+	blkp->freehead = objidx;
+	blkp->usecount--;
+
+	if (blkp->freearr[objidx] < 0) {
+		/* 
+		 * block was previously full and is now just partially full ..
+		 * so make firstnotfull pt to this block and fix list accdly 
+		 */
+		t = blklist->firstnotfull;
+		blklist->firstnotfull = &blkp->linkage;
+		if (blkp->linkage.next == t) {
+			spin_unlock_irqrestore(&blklist->lock, save_flags);
+			return;
+		}
+		list_del(&blkp->linkage);
+		list_add_tail(&blkp->linkage, t);
+
+		spin_unlock_irqrestore(&blklist->lock, save_flags);
+		return;
+	}
+
+	if (blkp->usecount == 0) {
+		t = blklist->firstnotfull->prev;
+
+		list_del(&blkp->linkage);
+		if (blklist->firstnotfull == &blkp->linkage)
+			blklist->firstnotfull = t->next;
+
+		spin_unlock_irqrestore(&blklist->lock, save_flags);
+		percpu_data_blk_destroy(blklist, blkp);
+		return;
+	}
+
+	spin_unlock_irqrestore(&blklist->lock, save_flags);
+	return;
+}
+
+/*
+ * Frees up a percpu data object
+ */
+static void
+percpu_interlaced_free(struct percpu_data *percpu)
+{
+	struct percpu_data_blk *blkp = percpu->blkp;
+	__percpu_interlaced_free(blkp->blklist, percpu);
+}
+
+/**
+ * kmalloc_percpu - allocate one copy of the object for every possible
+ * cpu in the system.
+ *
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ * The @flags argument may be one of:
+ * %GFP_USER - Allocate memory on behalf of user.  May sleep.
+ * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
+ * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
+ *
+ * Returns NULL on failure.  On success the result is the bitwise complement
+ * of the descriptor address: access it only through per_cpu_ptr().
+ */
+void *
+kmalloc_percpu(size_t size, int flags)
+{
+	int i;
+	struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
+
+	if (!pdata)
+		goto out_done;
+	pdata->blkp = NULL;
+	if (size <= (malloc_sizes[0].cs_size >> 1)) {
+		if (!percpu_interlaced_alloc(pdata, size, flags))
+			goto out;
+	} else {
+		for (i = 0; i < NR_CPUS; i++) {
+			if (!cpu_possible(i))
+				continue;
+			pdata->ptrs[i] = kmalloc(size, flags);
+			if (!pdata->ptrs[i])
+				goto unwind_oom;
+		}
+	}
+	/* Catch derefs w/o wrappers */
+	return (void *) (~(unsigned long) pdata);
+
+unwind_oom:
+	while (--i >= 0) {
+		if (!cpu_possible(i))
+			continue;
+		kfree(pdata->ptrs[i]);
+	}
+out:
+	kfree(pdata);
+out_done:
+	return NULL;
+}
+
+/**
+ * kfree_percpu - free previously allocated percpu memory
+ * @objp: pointer returned by kmalloc_percpu.
+ *
+ * Don't free memory not originally allocated by kmalloc_percpu()
+ * The complemented objp is to check for that.
+ */
+void
+kfree_percpu(const void *objp)
+{
+	int i;
+	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
+
+	if (p->blkp) {
+		percpu_interlaced_free(p);	/* also frees p itself */
+		return;
+	}
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_possible(i))
+			kfree(p->ptrs[i]);
+	}
+	kfree(p);	/* release the descriptor allocated by kmalloc_percpu() */
+}

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-04 12:15 ` [patch] kmalloc_percpu -- 2 " Ravikiran G Thirumalai
@ 2002-12-04 19:34   ` Andrew Morton
  2002-12-05  3:42     ` Dipankar Sarma
  0 siblings, 1 reply; 17+ messages in thread
From: Andrew Morton @ 2002-12-04 19:34 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: linux-kernel, Rusty Russell, dipankar

Ravikiran G Thirumalai wrote:
> 
> Here's a 2.5.50 version of kmalloc_percpu originally submitted by Dipankar.


> +/* Use these with kmalloc_percpu */
> +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> +#define put_cpu_ptr(ptr) put_cpu()

These names sound very much like get_cpu_var() and put_cpu_var(),
yet they are using a quite different subsystem.  It would be good
to choose something more distinct.  Not that I can think of anything
at present ;)

The commentary above these functions should clearly state that thou
shalt not sleep between them.

> ...
> --- linux-2.5.50/kernel/Makefile        Thu Nov 28 04:05:51 2002
> +++ kmalloc_percpu-2.5.50/kernel/Makefile       Sun Dec  1 11:54:49 2002
> @@ -4,7 +4,7 @@
> 
>  export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \
>                 printk.o platform.o suspend.o dma.o module.o cpufreq.o \
> -               profile.o rcupdate.o intermodule.o
> +               profile.o rcupdate.o intermodule.o percpu.o

I suppose so.  It _could_ be in mm/percpu.c


> ...
> +static int data_blklist_count =
> +    sizeof (data_blklist) / sizeof (struct percpu_data_blklist);

The ARRAY_SIZE macro is suitable here.

> +static struct percpu_data_blk *
> +percpu_data_blk_alloc(struct percpu_data_blklist *blklist, int flags)
> ...
> +static void
> +percpu_data_blk_free(struct percpu_data_blk *blkp)
> ...
> +static int
> +percpu_data_mem_grow(struct percpu_data_blklist *blklist, int flags)
> ...
> +static void __init
> +percpu_data_blklist_init(struct percpu_data_blklist *blklist)
> ...
> +static struct percpu_data_blklist *
> +percpu_data_get_blklist(size_t size, int flags)
> ...
> +static int
> +__percpu_interlaced_alloc_one(struct percpu_data_blklist *blklist,
> +                             struct percpu_data_blk *blkp)
> ...
> +static int
> +__percpu_interlaced_alloc(struct percpu_data *percpu,
> +                         struct percpu_data_blklist *blklist, int flags)
> ...
> +static int
> +percpu_interlaced_alloc(struct percpu_data *pdata, size_t size, int flags)
> ...
> +static void
> +percpu_data_blk_destroy(struct percpu_data_blklist *blklist,
> +                       struct percpu_data_blk *blkp)
> ...
> +static void
> +__percpu_interlaced_free(struct percpu_data_blklist *blklist,
> +                        struct percpu_data *percpu)
> ...
> +static void
> +percpu_interlaced_free(struct percpu_data *percpu)
> ...

ummm.  What on earth is all that stuff?

> +void *
> +kmalloc_percpu(size_t size, int flags)
> +{
> +       int i;
> +       struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
> +
> +       if (!pdata)
> +               goto out_done;
> +       pdata->blkp = NULL;
> +       if (size <= (malloc_sizes[0].cs_size >> 1)) {
> +               if (!percpu_interlaced_alloc(pdata, size, flags))
> +                       goto out;
> +       } else {
> +               for (i = 0; i < NR_CPUS; i++) {
> +                       if (!cpu_possible(i))
> +                               continue;
> +                       pdata->ptrs[i] = kmalloc(size, flags);
> +                       if (!pdata->ptrs[i])
> +                               goto unwind_oom;
> +               }
> +       }
> +       /* Catch derefs w/o wrappers */
> +       return (void *) (~(unsigned long) pdata);
> +
> +unwind_oom:
> +       while (--i >= 0) {
> +               if (!cpu_possible(i))
> +                       continue;
> +               kfree(pdata->ptrs[i]);
> +       }
> +out:
> +       kfree(pdata);
> +out_done:
> +       return NULL;
> +}

If we were to remove the percpu_interlaced_alloc() leg here, we'd
have a nice, simple per-cpu kmalloc implementation.

Could you please explain what all the other code is there for?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-04 19:34   ` Andrew Morton
@ 2002-12-05  3:42     ` Dipankar Sarma
  2002-12-05  4:32       ` Andrew Morton
  0 siblings, 1 reply; 17+ messages in thread
From: Dipankar Sarma @ 2002-12-05  3:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-kernel, Rusty Russell

On Wed, Dec 04, 2002 at 11:34:35AM -0800, Andrew Morton wrote:
> > +/* Use these with kmalloc_percpu */
> > +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> > +#define put_cpu_ptr(ptr) put_cpu()
> 
> These names sound very much like get_cpu_var() and put_cpu_var(),
> yet they are using a quite different subsystem.  It would be good
> to choose something more distinct.  Not that I can think of anything
> at present ;)

Well, they are similar, aren't they ? get_cpu_ptr() can just be thought
of as the dynamic twin of get_cpu_var(). So, in that sense it seems ok
to me.

> 
> If we were to remove the percpu_interlaced_alloc() leg here, we'd
> have a nice, simple per-cpu kmalloc implementation.
> 
> Could you please explain what all the other code is there for?

The interlaced allocator allows you to save space when kmalloc_percpu()
is used to allocate small objects. That is done by interlacing each
CPU's copy of the objects just like the static per-cpu data area.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05  3:42     ` Dipankar Sarma
@ 2002-12-05  4:32       ` Andrew Morton
  2002-12-05  4:47         ` William Lee Irwin III
  2002-12-05 10:53         ` Dipankar Sarma
  0 siblings, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2002-12-05  4:32 UTC (permalink / raw)
  To: dipankar; +Cc: Ravikiran G Thirumalai, linux-kernel, Rusty Russell

Dipankar Sarma wrote:
> 
> On Wed, Dec 04, 2002 at 11:34:35AM -0800, Andrew Morton wrote:
> > > +/* Use these with kmalloc_percpu */
> > > +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> > > +#define put_cpu_ptr(ptr) put_cpu()
> >
> > These names sound very much like get_cpu_var() and put_cpu_var(),
> > yet they are using a quite different subsystem.  It would be good
> > to choose something more distinct.  Not that I can think of anything
> > at present ;)
> 
> Well, they are similar, aren't they ? get_cpu_ptr() can just be thought
> of as the dynamic twin of get_cpu_var(). So, in that sense it seems ok
> to me.

hm.  spose so.

> >
> > If we were to remove the percpu_interlaced_alloc() leg here, we'd
> > have a nice, simple per-cpu kmalloc implementation.
> >
> > Could you please explain what all the other code is there for?
> 
> The interlaced allocator allows you to save space when kmalloc_percpu()
> is used to allocate small objects. That is done by interlacing each
> CPU's copy of the objects just like the static per-cpu data area.
> 

Where in the kernel is such a large number of 4-, 8- or 16-byte
objects being used?

The slab allocator will support caches right down to 1024 x 4-byte
objects per page.  Why is that not appropriate?

If it is for locality-of-reference between individual objects then
where in the kernel is this required, and are performance measurements
available?  It is very unusual to have objects which are so small,
and a better design would be to obtain the locality of reference
by aggregating the data into an array or structure.

Sorry, but you have what is basically a brand new allocator in
there, and we need a very good reason for including it.  I'd like
to know what that reason is, please.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05  4:32       ` Andrew Morton
@ 2002-12-05  4:47         ` William Lee Irwin III
  2002-12-05 10:53         ` Dipankar Sarma
  1 sibling, 0 replies; 17+ messages in thread
From: William Lee Irwin III @ 2002-12-05  4:47 UTC (permalink / raw)
  To: Andrew Morton
  Cc: dipankar, Ravikiran G Thirumalai, linux-kernel, Rusty Russell

On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
> Where in the kernel is such a large number of 4-, 8- or 16-byte
> objects being used?
> The slab allocator will support caches right down to 1024 x 4-byte
> objects per page.  Why is that not appropriate?
> If it is for locality-of-reference between individual objects then
> where in the kernel is this required, and are performance measurements
> available?  It is very unusual to have objects which are so small,
> and a better design would be to obtain the locality of reference
> by aggregating the data into an array or structure.

I will argue not on the frequency of calls but on the preciousness of
space; highmem feels very serious pain when internal fragmentation
of pinned pages occurs (which this is designed to prevent). I don't
have direct experience with this patch/API, but I can say that
fragmentation in ZONE_NORMAL is deadly (witness pagetable occupancy
vs. ZONE_NORMAL consumption, which motivated highpte, despite my
pagetable reclamation smoke blowing).


Bill

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05  4:32       ` Andrew Morton
  2002-12-05  4:47         ` William Lee Irwin III
@ 2002-12-05 10:53         ` Dipankar Sarma
  2002-12-05 11:23           ` yodaiken
  2002-12-05 20:02           ` Andrew Morton
  1 sibling, 2 replies; 17+ messages in thread
From: Dipankar Sarma @ 2002-12-05 10:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ravikiran G Thirumalai, linux-kernel, Rusty Russell

Hi Andrew,

On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
> Where in the kernel is such a large number of 4-, 8- or 16-byte
> objects being used?

Well, kernel objects may not be that small, but one would expect
the per-cpu parts of the kernel objects to be sometimes small, often down to
a couple of counters counting statistics.

> 
> The slab allocator will support caches right down to 1024 x 4-byte
> objects per page.  Why is that not appropriate?

Well, if you allocated 4-byte objects directly from the slab allocator,
you aren't guaranteed to *not* share a cache line with another object
modified by a different cpu.

> 
> Sorry, but you have what is basically a brand new allocator in
> there, and we need a very good reason for including it.  I'd like
> to know what that reason is, please.

The reason is concern about per-cpu allocation for small per-CPU
parts (typically counters) of objects. If a driver has two counters
counting reads and writes, you don't want to eat up a whole cacheline
for them for each CPU per instance of the device.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 10:53         ` Dipankar Sarma
@ 2002-12-05 11:23           ` yodaiken
  2002-12-05 11:28             ` William Lee Irwin III
  2002-12-05 12:41             ` Dipankar Sarma
  2002-12-05 20:02           ` Andrew Morton
  1 sibling, 2 replies; 17+ messages in thread
From: yodaiken @ 2002-12-05 11:23 UTC (permalink / raw)
  To: Dipankar Sarma
  Cc: Andrew Morton, Ravikiran G Thirumalai, linux-kernel,
	Rusty Russell



On Thu, Dec 05, 2002 at 04:23:29PM +0530, Dipankar Sarma wrote:
> Hi Andrew,
> 
> On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
> > Where in the kernel is such a large number of 4-, 8- or 16-byte
> > objects being used?
> 
> Well, kernel objects may not be that small, but one would expect
> the per-cpu parts of the kernel objects to be sometimes small, often down to
> a couple of counters counting statistics.


Doesn't your allocator increase chances of cache conflict on the same
cpu ?


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 11:23           ` yodaiken
@ 2002-12-05 11:28             ` William Lee Irwin III
  2002-12-05 12:41             ` Dipankar Sarma
  1 sibling, 0 replies; 17+ messages in thread
From: William Lee Irwin III @ 2002-12-05 11:28 UTC (permalink / raw)
  To: yodaiken
  Cc: Dipankar Sarma, Andrew Morton, Ravikiran G Thirumalai,
	linux-kernel, Rusty Russell

On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
>>> Where in the kernel is such a large number of 4-, 8- or 16-byte
>>> objects being used?

On Thu, Dec 05, 2002 at 04:23:29PM +0530, Dipankar Sarma wrote:
> > Well, kernel objects may not be that small, but one would expect
> > the per-cpu parts of the kernel objects to be sometimes small, often down to
> > a couple of counters counting statistics.

On Thu, Dec 05, 2002 at 04:23:12AM -0700, yodaiken@fsmlabs.com wrote:
> Doesn't your allocator increase chances of cache conflict on the same
> cpu ?

This is so; I'm personally far more concerned about ZONE_NORMAL space
consumption in the cacheline aligned case.


Bill

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 11:23           ` yodaiken
  2002-12-05 11:28             ` William Lee Irwin III
@ 2002-12-05 12:41             ` Dipankar Sarma
  2002-12-05 15:08               ` yodaiken
  1 sibling, 1 reply; 17+ messages in thread
From: Dipankar Sarma @ 2002-12-05 12:41 UTC (permalink / raw)
  To: yodaiken; +Cc: Andrew Morton, Ravikiran G Thirumalai, linux-kernel,
	Rusty Russell

On Thu, Dec 05, 2002 at 11:33:15AM +0000, yodaiken@fsmlabs.com wrote:
> 
> > 
> > Well, kernel objects may not be that small, but one would expect
> > the per-cpu parts of the kernel objects to be sometimes small, often down to
> > a couple of counters counting statistics.
> 
> 
> Doesn't your allocator increase chances of cache conflict on the same
> cpu ?
> 

You mean by increasing the footprint and the chance of eviction ? It
is a compromise. Or you would face NR_CPUS bloat and non-NUMA-node-local 
accesses for all CPUs outside the NUMA node where your NR_CPUS array
is located.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 12:41             ` Dipankar Sarma
@ 2002-12-05 15:08               ` yodaiken
  0 siblings, 0 replies; 17+ messages in thread
From: yodaiken @ 2002-12-05 15:08 UTC (permalink / raw)
  To: Dipankar Sarma
  Cc: yodaiken, Andrew Morton, Ravikiran G Thirumalai, linux-kernel,
	Rusty Russell

On Thu, Dec 05, 2002 at 06:11:53PM +0530, Dipankar Sarma wrote:
> > Doesn't your allocator increase chances of cache conflict on the same
> > cpu ?
> > 
> 
> You mean by increasing the footprint and the chance of eviction ? It
> is a compromise. Or you would face NR_CPUS bloat and non-NUMA-node-local 
> accesses for all CPUs outside the NUMA node where your NR_CPUS array
> is located.

What do you base the trade-off decision on?

> 
> Thanks
> -- 
> Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
> Linux Technology Center, IBM Software Lab, Bangalore, India.
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
---------------------------------------------------------
Victor Yodaiken 
Finite State Machine Labs: The RTLinux Company.
www.fsmlabs.com  www.rtlinux.com
1+ 505 838 9109


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 10:53         ` Dipankar Sarma
  2002-12-05 11:23           ` yodaiken
@ 2002-12-05 20:02           ` Andrew Morton
  2002-12-05 21:23             ` Dipankar Sarma
  2002-12-09  5:30             ` Ravikiran G Thirumalai
  1 sibling, 2 replies; 17+ messages in thread
From: Andrew Morton @ 2002-12-05 20:02 UTC (permalink / raw)
  To: dipankar; +Cc: Ravikiran G Thirumalai, linux-kernel, Rusty Russell

Dipankar Sarma wrote:
> 
> Hi Andrew,
> 
> On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
> > Where in the kernel is such a large number of 4-, 8- or 16-byte
> > objects being used?
> 
> Well, kernel objects may not be that small, but one would expect
> the per-cpu parts of the kernel objects to be sometimes small, often down to
> a couple of counters counting statistics.

Sorry, "one would expect" is not sufficient grounds for incorporation of
a new allocator.  As far as I can tell, all the proposed users are in
fact allocating decent-sized aggregates, and that will remain the usual
case.

The code exists, great.  We can pull it in when there is a demonstrated
need for it.  But until that need is shown, this is overdesign.

> >
> > The slab allocator will support caches right down to 1024 x 4-byte
> > objects per page.  Why is that not appropriate?
> 
> Well, if you allocated 4-byte objects directly from the slab allocator,
> > you aren't guaranteed to *not* share a cache line with another object
> modified by a different cpu.

If that's a problem it can be addressed in the slab head arrays - make
sure that they are always filled and emptied in multiple-of-cacheline-sized
units for objects which are smaller than a cacheline.  That benefits all
slab users.

> >
> > Sorry, but you have what is basically a brand new allocator in
> > there, and we need a very good reason for including it.  I'd like
> > to know what that reason is, please.
> 
> The reason is concern about per-cpu allocation for small per-CPU
> parts (typically counters) of objects. If a driver has two counters
> counting reads and writes, you don't want to eat up a whole cacheline
> for them for each CPU per instance of the device.
> 

I don't buy it.

- If the driver has two counters per device then the storage is
  infinitesimal.

- If it has multiple counters per device (always the case) then
  the driver will aggregate them anyway.

I am not aware of any situations in which a driver has a large
(or even medium) number of small, discrete counters of this nature.
Sufficiently large to justify a new allocator.

I'd suggest that you drop the new allocator until a compelling
need for it (in real, live 2.5/2.6 code) has been demonstrated.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 20:02           ` Andrew Morton
@ 2002-12-05 21:23             ` Dipankar Sarma
  2002-12-05 22:15               ` Andrew Morton
  2002-12-09  5:30             ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 17+ messages in thread
From: Dipankar Sarma @ 2002-12-05 21:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Rusty Russell, Ravikiran G Thirumalai

On Thu, Dec 05, 2002 at 09:10:16PM +0100, Andrew Morton wrote:
> 
> I'd suggest that you drop the new allocator until a compelling
> need for it (in real, live 2.5/2.6 code) has been demonstrated.

Fine with me since at least one workaround for fragmentation with small
allocations is known. I can't see anything in 2.5 timeframe 
requiring small per-cpu allocations.

Would you like me to resubmit a simple kmalloc-only version ?

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 21:23             ` Dipankar Sarma
@ 2002-12-05 22:15               ` Andrew Morton
  0 siblings, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-12-05 22:15 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel, Rusty Russell, Ravikiran G Thirumalai

Dipankar Sarma wrote:
> 
> On Thu, Dec 05, 2002 at 09:10:16PM +0100, Andrew Morton wrote:
> >
> > I'd suggest that you drop the new allocator until a compelling
> > need for it (in real, live 2.5/2.6 code) has been demonstrated.
> 
> > Fine with me since at least one workaround for fragmentation with small
> allocations is known. I can't see anything in 2.5 timeframe
> requiring small per-cpu allocations.
> 
> Would you like me to resubmit a simple kmalloc-only version ?
> 

I think that would be best.

BTW, looking at the snmp application of this work:

+#define ICMP_INC_STATS_USER_FIELD(offt)                                \
+       (*((unsigned long *) ((void *)                                  \
+                            per_cpu_ptr(icmp_statistics[1],            \
+                                        smp_processor_id())) + offt))++;

This guy is racy on preempt.  Just a little bit.  It is effectively:

	ptr = per_cpu_ptr(...);
	(*ptr)++;

On some architectures, `(*ptr)++' is not atomic wrt interrupts.  The
CPU could be preempted midway through the increment.

Surely it's not an issue for SNMP stats, but for some applications
such as struct page_state, such a permanent off-by-a-little-bit would
be a showstopper.

So some big loud comments which describe the worthiness of get_cpu_ptr(),
and the potential inaccuracy of per_cpu_ptr would be useful.

And as this is the first application of the kmalloc_percpu infrastructure,
it may be best to convert it to use get_cpu_ptr/put_cpu_ptr.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-05 20:02           ` Andrew Morton
  2002-12-05 21:23             ` Dipankar Sarma
@ 2002-12-09  5:30             ` Ravikiran G Thirumalai
  2002-12-09  5:57               ` Andrew Morton
  2002-12-09 19:28               ` Andrew Morton
  1 sibling, 2 replies; 17+ messages in thread
From: Ravikiran G Thirumalai @ 2002-12-09  5:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: dipankar, linux-kernel, Rusty Russell

Hi Andrew,
Sorry for the delayed response... I was out of station and couldn't
reply earlier ....

On Thu, Dec 05, 2002 at 12:02:51PM -0800, Andrew Morton wrote:
> Dipankar Sarma wrote:
> > 
> > Hi Andrew,
> > 
> > On Wed, Dec 04, 2002 at 08:32:58PM -0800, Andrew Morton wrote:
> > > Where in the kernel is such a large number of 4-, 8- or 16-byte
> > > objects being used?
> > 
> > Well, kernel objects may not be that small, but one would expect
> > the per-cpu parts of the kernel objects to be sometimes small, often down to
> > a couple of counters counting statistics.
> 
> Sorry, "one would expect" is not sufficient grounds for incorporation of
> a new allocator.  As far as I can tell, all the proposed users are in
> fact allocating decent-sized aggregates, and that will remain the usual
> case.

The main objective of the interlaced allocator was cacheline utilisation
more than main memory fragmentation (That has been my understanding at least).
Without the interlaced allocator, we'd just pad up data and lose
precious cacheline space.  If you have a general purpose object
allocator, one would want objects in different cachelines as kmalloc
does, but that is not the case for kmalloc_percpu users.  If obj A and
obj B exist on the same cacheline, at least objB does not take
another cacheline...If you hit objB after objA, you gain, but if
you don't, you don't lose.

As for the object sizes
1. We are assuming 32 bytes cachelines in this thread I suppose
But ppc64 has a 128 byte cacheline and s390 a 256 byte Jumbo cacheline.
I guess with larger cacheline sizes you have lesser no of cachelines --
makes cachelines all the more precious.  (Right now, I am speaking
in ignorance of the ppc64 and s390 cache architectures .. I
can just see L1_CACHE_SHIFT in the kernel sources).  So wouldn't
interlaced allocations help these archs .. even when you have 64
bytes big objects?

2. When we have a case for data structures to be per-cpued, not all
the members will be frequently modified or 'bouncy'... say if you take
netdevice stats, rx and tx counters are likely to be hot
and bouncy....and others not that hot... making the whole
structure per-cpu might not be good, but we did not have
a clean workaround until kmalloc_percpu.  So when you start
identifying hot objects in these data structures, and making
per-cpu objects only of hot objects, your object size
tends to go down .. making a case for the interlaced allocator .....
This capability is not possible without the interlaced allocator no?

Does this make a reasonable case for interlaced allocator now?
(Of course, blklist init in the patch has to be modified to create
blklists for objects of size 4, 8 .... SMP_CACHE_BYTES/2)

Thanks,
Kiran 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-09  5:30             ` Ravikiran G Thirumalai
@ 2002-12-09  5:57               ` Andrew Morton
  2002-12-09 19:28               ` Andrew Morton
  1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-12-09  5:57 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: dipankar, linux-kernel, Rusty Russell

Ravikiran G Thirumalai wrote:
> 
> ...
> As for the object sizes
> 1. We are assuming 32 bytes cachelines in this thread I suppose
> But ppc64 has a 128 byte cacheline and s390 a 256 byte Jumbo cacheline.
> I guess with larger cacheline sizes you have lesser no of cachelines --
> makes cachelines all the more precious.  (Right now, I am speaking
> in ignorance of the ppc64 and s390 cache architectures .. I
> can just see L1_CACHE_SHIFT in the kernel sources).  So wouldn't
> interlaced allocations help these archs .. even when you have 64
> bytes big objects?

You're assuming that the slab allocator always returns cachesize-padded
objects.  It does not have to do that.  It can return 4-byte-sized and
-aligned objects if you ask it to.

> ...
> Does this make a reasonable case for interlaced allocator now?
> (Of course, blklist init in the patch has to be modified to create
> blklists for objects of size 4, 8 .... SMP_CACHE_BYTES/2)

Oh I can see the benefits, but they appear to be rather theoretical.

I'm just applying some pressure here against adding another allocator
unless it is really needed.  On principle.

A slab cache of 4-byte objects will tend to give you what you want
anyway, due to the batch filling and freeing of the head arrays.
If that is proven to be insufficient then it would be better to
put development effort into strengthening slab, rather than completely
bypassing it.

(And a really simple solution would be to create a separate slab cache
per cpu...)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [patch] kmalloc_percpu  -- 2 of 2
  2002-12-09  5:30             ` Ravikiran G Thirumalai
  2002-12-09  5:57               ` Andrew Morton
@ 2002-12-09 19:28               ` Andrew Morton
  1 sibling, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2002-12-09 19:28 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: dipankar, linux-kernel, Rusty Russell

Ravikiran G Thirumalai wrote:
> 
> ...
> As for the object sizes
> 1. We are assuming 32 bytes cachelines in this thread I suppose
> But ppc64 has a 128 byte cacheline and s390 a 256 byte Jumbo cacheline.
> I guess with larger cacheline sizes you have lesser no of cachelines --
> makes cachelines all the more precious.  (Right now, I am speaking
> in ignorance of the ppc64 and s390 cache architectures .. I
> can just see L1_CACHE_SHIFT in the kernel sources).  So wouldn't
> interlaced allocations help these archs .. even when you have 64
> bytes big objects?

You're assuming that the slab allocator always returns cachesize-padded
objects.  It does not have to do that.  It can return 4-byte-sized and
-aligned objects if you ask it to.

> ...
> Does this make a reasonable case for interlaced allocator now?
> (Of course, blklist init in the patch has to be modified to create
> blklists for objects of size 4, 8 .... SMP_CACHE_BYTES/2)

Oh I can see the benefits, but they appear to be rather theoretical.

I'm just applying some pressure here against adding another allocator
unless it is really needed.  On principle.

A slab cache of 4-byte objects will tend to give you what you want
anyway, due to the batch filling and freeing of the head arrays.
If that is proven to be insufficient then it would be better to
put development effort into strengthening slab, rather than completely
bypassing it.

(And a really simple solution would be to create a separate slab cache
per cpu...)

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2002-12-09 22:23 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2002-12-04 12:12 [patch] kmalloc_percpu -- 1 of 2 Ravikiran G Thirumalai
2002-12-04 12:15 ` [patch] kmalloc_percpu -- 2 " Ravikiran G Thirumalai
2002-12-04 19:34   ` Andrew Morton
2002-12-05  3:42     ` Dipankar Sarma
2002-12-05  4:32       ` Andrew Morton
2002-12-05  4:47         ` William Lee Irwin III
2002-12-05 10:53         ` Dipankar Sarma
2002-12-05 11:23           ` yodaiken
2002-12-05 11:28             ` William Lee Irwin III
2002-12-05 12:41             ` Dipankar Sarma
2002-12-05 15:08               ` yodaiken
2002-12-05 20:02           ` Andrew Morton
2002-12-05 21:23             ` Dipankar Sarma
2002-12-05 22:15               ` Andrew Morton
2002-12-09  5:30             ` Ravikiran G Thirumalai
2002-12-09  5:57               ` Andrew Morton
2002-12-09 19:28               ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.