From: Christoph Lameter <cl@linux-foundation.org>
To: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Nick Piggin <npiggin@suse.de>,
David Rientjes <rientjes@google.com>
Subject: [S+Q3 15/23] slub: Allow resizing of per cpu queues
Date: Tue, 03 Aug 2010 21:45:29 -0500
Message-ID: <20100804024532.510006548@linux.com>
In-Reply-To: 20100804024514.139976032@linux.com
[-- Attachment #1: unified_resize --]
[-- Type: text/plain, Size: 13063 bytes --]
Allow resizing of the per cpu queue and batch size. This follows the
same basic steps that SLAB uses.

Careful: The ->cpu pointer is becoming volatile. References to the
->cpu pointer fall into one of the following cases:

A. Occur with interrupts disabled. This guarantees that nothing on the
   processor itself interferes. It only serializes access to a single
   processor-specific area.

B. Occur with slub_lock taken for operations on all per cpu areas.
   Taking slub_lock guarantees that no resizing operation will occur
   while the per cpu areas are accessed. The data in the per cpu areas
   is still volatile even with slub_lock held, since the alloc and free
   functions do not take slub_lock and operate on fields of
   kmem_cache_cpu concurrently.

C. Are racy. This is tolerable for statistics. The ->cpu pointer must
   always point to a valid kmem_cache_cpu area.
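
To make the two serialized cases concrete, here is a rough sketch
(illustration only, not part of the patch) of how they look in the
code; the primitives used are the ones that appear in the diff below:

	/* Case A: fast path access to this cpu's queue, interrupts off */
	local_irq_save(flags);
	c = __this_cpu_ptr(s->cpu);	/* resize cannot swap s->cpu under us */
	/* ... operate on c->q ... */
	local_irq_restore(flags);

	/* Case B: walk over all per cpu areas, resize excluded by slub_lock */
	down_read(&slub_lock);
	for_each_online_cpu(cpu) {
		struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
		/* c->q may still change concurrently; only resizing is excluded */
	}
	up_read(&slub_lock);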
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
---
include/linux/slub_def.h | 9 -
mm/slub.c | 218 +++++++++++++++++++++++++++++++++++++++++------
2 files changed, 197 insertions(+), 30 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2010-07-31 18:25:53.000000000 -0500
+++ linux-2.6/mm/slub.c 2010-07-31 19:02:05.003563067 -0500
@@ -195,10 +195,19 @@
#endif
+/*
+ * We allow stat calls while slub_lock is taken or while interrupts
+ * are enabled for simplicity's sake.
+ *
+ * This results in potential inaccuracies. If the platform does not
+ * support per cpu atomic operations vs. interrupts then the counters
+ * may be updated in a racy manner due to slab processing in
+ * interrupts.
+ */
static inline void stat(struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- __this_cpu_inc(s->cpu_slab->stat[si]);
+ __this_cpu_inc(s->cpu->stat[si]);
#endif
}
@@ -303,7 +312,7 @@
static inline int queue_full(struct kmem_cache_queue *q)
{
- return q->objects == QUEUE_SIZE;
+ return q->objects == q->max;
}
static inline int queue_empty(struct kmem_cache_queue *q)
@@ -1571,6 +1580,11 @@
stat(s, QUEUE_FLUSH);
}
+struct flush_control {
+ struct kmem_cache *s;
+ struct kmem_cache_cpu *c;
+};
+
/*
* Flush cpu objects.
*
@@ -1578,22 +1592,96 @@
*/
static void __flush_cpu_objects(void *d)
{
- struct kmem_cache *s = d;
- struct kmem_cache_cpu *c = __this_cpu_ptr(s->cpu_slab);
+ struct flush_control *f = d;
+ struct kmem_cache_cpu *c = __this_cpu_ptr(f->c);
if (c->q.objects)
- flush_cpu_objects(s, c);
+ flush_cpu_objects(f->s, c);
}
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu(__flush_cpu_objects, s, 1);
+ struct flush_control f = { s, s->cpu };
+
+ on_each_cpu(__flush_cpu_objects, &f, 1);
}
struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, int n)
{
- return __alloc_percpu(sizeof(struct kmem_cache_cpu),
- __alignof__(struct kmem_cache_cpu));
+ struct kmem_cache_cpu *k;
+ int cpu;
+ int size;
+ int max;
+
+ /* Size the queue and the allocation to cacheline sizes */
+ size = ALIGN(n * sizeof(void *) + sizeof(struct kmem_cache_cpu), cache_line_size());
+
+ k = __alloc_percpu(size, cache_line_size());
+ if (!k)
+ return NULL;
+
+ max = (size - sizeof(struct kmem_cache_cpu)) / sizeof(void *);
+
+ for_each_possible_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(k, cpu);
+
+ c->q.max = max;
+ }
+
+ s->cpu_queue = max;
+ return k;
+}
+
+
+static void resize_cpu_queue(struct kmem_cache *s, int queue)
+{
+ struct kmem_cache_cpu *n = alloc_kmem_cache_cpu(s, queue);
+ struct flush_control f;
+
+ /* Create the new cpu queue and then free the old one */
+ f.s = s;
+ f.c = s->cpu;
+
+ /* We can only shrink the queue here since the new
+ * queue size may be smaller and there may be concurrent
+ * slab operations. The update of the queue must be seen
+ * before the change of the location of the percpu queue.
+ *
+ * Note that the queue may contain more objects than the
+ * queue size after this operation.
+ */
+ if (queue < s->queue) {
+ s->queue = queue;
+ s->batch = (s->queue + 1) / 2;
+ barrier();
+ }
+
+ /* This is critical since allocation and free run
+ * concurrently without taking the slub_lock!
+ * We point the cpu pointer to a different per cpu
+ * segment to redirect current processing and then
+ * flush the cpu objects on the old cpu structure.
+ *
+ * The old percpu structure is no longer reachable
+ * since slab_alloc/free must have terminated in order
+ * to execute __flush_cpu_objects. Both require
+ * interrupts to be disabled.
+ */
+ s->cpu = n;
+ on_each_cpu(__flush_cpu_objects, &f, 1);
+
+ /*
+ * If the queue needs to be extended then we deferred
+ * the update until now when the larger sized queue
+ * has been allocated and is working.
+ */
+ if (queue > s->queue) {
+ s->queue = queue;
+ s->batch = (s->queue + 1) / 2;
+ }
+
+ if (slab_state > UP)
+ free_percpu(f.c);
}
/*
@@ -1706,7 +1794,7 @@
{
int d;
- d = min(BATCH_SIZE - q->objects, nr);
+ d = min(s->batch - q->objects, nr);
retrieve_objects(s, page, q->object + q->objects, d);
q->objects += d;
}
@@ -1747,7 +1835,7 @@
redo:
local_irq_save(flags);
- c = __this_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu);
q = &c->q;
if (unlikely(queue_empty(q) || !node_match(c, node))) {
@@ -1756,7 +1844,7 @@
c->node = node;
}
- while (q->objects < BATCH_SIZE) {
+ while (q->objects < s->batch) {
struct page *new;
new = get_partial(s, gfpflags & ~__GFP_ZERO, node);
@@ -1773,7 +1861,7 @@
local_irq_disable();
/* process may have moved to different cpu */
- c = __this_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu);
q = &c->q;
if (!new) {
@@ -1875,7 +1963,7 @@
slab_free_hook_irq(s, x);
- c = __this_cpu_ptr(s->cpu_slab);
+ c = __this_cpu_ptr(s->cpu);
if (NUMA_BUILD) {
int node = page_to_nid(page);
@@ -1891,7 +1979,7 @@
if (unlikely(queue_full(q))) {
- drain_queue(s, q, BATCH_SIZE);
+ drain_queue(s, q, s->batch);
stat(s, FREE_SLOWPATH);
} else
@@ -2093,9 +2181,9 @@
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache));
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ s->cpu = alloc_kmem_cache_cpu(s, s->queue);
- return s->cpu_slab != NULL;
+ return s->cpu != NULL;
}
#ifdef CONFIG_NUMA
@@ -2317,6 +2405,18 @@
}
+static int initial_queue_size(int size)
+{
+ if (size > PAGE_SIZE)
+ return 8;
+ else if (size > 1024)
+ return 24;
+ else if (size > 256)
+ return 54;
+ else
+ return 120;
+}
+
static int kmem_cache_open(struct kmem_cache *s,
const char *name, size_t size,
size_t align, unsigned long flags,
@@ -2355,6 +2455,9 @@
if (!init_kmem_cache_nodes(s))
goto error;
+ s->queue = initial_queue_size(s->size);
+ s->batch = (s->queue + 1) / 2;
+
if (alloc_kmem_cache_cpus(s))
return 1;
@@ -2465,8 +2568,9 @@
{
int node;
+ down_read(&slub_lock);
flush_all(s);
- free_percpu(s->cpu_slab);
+ free_percpu(s->cpu);
/* Attempt to free all objects */
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -2476,6 +2580,7 @@
return 1;
}
free_kmem_cache_nodes(s);
+ up_read(&slub_lock);
return 0;
}
@@ -3122,6 +3227,7 @@
caches++;
}
+ /* Now the kmalloc array is fully functional (*not* the dma array) */
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
@@ -3149,6 +3255,7 @@
#ifdef CONFIG_ZONE_DMA
int i;
+ /* Create the dma kmalloc array and make it operational */
for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
struct kmem_cache *s = kmalloc_caches[i];
@@ -3297,7 +3404,7 @@
down_read(&slub_lock);
list_for_each_entry(s, &slab_caches, list) {
local_irq_save(flags);
- flush_cpu_objects(s, per_cpu_ptr(s->cpu_slab ,cpu));
+ flush_cpu_objects(s, per_cpu_ptr(s->cpu, cpu));
local_irq_restore(flags);
}
up_read(&slub_lock);
@@ -3764,6 +3871,7 @@
return -ENOMEM;
per_cpu = nodes + nr_node_ids;
+ down_read(&slub_lock);
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3794,6 +3902,7 @@
nodes[node] += x;
}
}
+
x = sprintf(buf, "%lu", total);
#ifdef CONFIG_NUMA
for_each_node_state(node, N_NORMAL_MEMORY)
@@ -3801,6 +3910,7 @@
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
+ up_read(&slub_lock);
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -3904,6 +4014,57 @@
}
SLAB_ATTR(min_partial);
+static ssize_t cpu_queue_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->queue);
+}
+
+static ssize_t cpu_queue_size_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long queue;
+ int err;
+
+ err = strict_strtoul(buf, 10, &queue);
+ if (err)
+ return err;
+
+ if (queue > 10000 || queue < 4)
+ return -EINVAL;
+
+ if (s->batch > queue)
+ s->batch = queue;
+
+ down_write(&slub_lock);
+ resize_cpu_queue(s, queue);
+ up_write(&slub_lock);
+ return length;
+}
+SLAB_ATTR(cpu_queue_size);
+
+static ssize_t cpu_batch_size_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->batch);
+}
+
+static ssize_t cpu_batch_size_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ unsigned long batch;
+ int err;
+
+ err = strict_strtoul(buf, 10, &batch);
+ if (err)
+ return err;
+
+ if (batch > s->queue || batch < 4)
+ return -EINVAL;
+
+ s->batch = batch;
+ return length;
+}
+SLAB_ATTR(cpu_batch_size);
+
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (s->ctor) {
@@ -3944,8 +4105,9 @@
if (!cpus)
return -ENOMEM;
+ down_read(&slub_lock);
for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
total += c->q.objects;
}
@@ -3953,11 +4115,14 @@
x = sprintf(buf, "%lu", total);
for_each_online_cpu(cpu) {
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu, cpu);
+ struct kmem_cache_queue *q = &c->q;
- if (c->q.objects)
- x += sprintf(buf + x, " C%d=%u", cpu, c->q.objects);
+ if (!queue_empty(q))
+ x += sprintf(buf + x, " C%d=%u/%u",
+ cpu, q->objects, q->max);
}
+ up_read(&slub_lock);
kfree(cpus);
return x + sprintf(buf + x, "\n");
}
@@ -4209,12 +4374,14 @@
if (!data)
return -ENOMEM;
+ down_read(&slub_lock);
for_each_online_cpu(cpu) {
- unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
+ unsigned x = per_cpu_ptr(s->cpu, cpu)->stat[si];
data[cpu] = x;
sum += x;
}
+ up_read(&slub_lock);
len = sprintf(buf, "%lu", sum);
@@ -4232,8 +4399,10 @@
{
int cpu;
+ down_write(&slub_lock);
for_each_online_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
+ per_cpu_ptr(s->cpu, cpu)->stat[si] = 0;
+ up_write(&slub_lock);
}
#define STAT_ATTR(si, text) \
@@ -4270,6 +4439,8 @@
&objs_per_slab_attr.attr,
&order_attr.attr,
&min_partial_attr.attr,
+ &cpu_queue_size_attr.attr,
+ &cpu_batch_size_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
&total_objects_attr.attr,
@@ -4631,7 +4802,7 @@
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
nr_objs, s->size, oo_objects(s->oo),
(1 << oo_order(s->oo)));
- seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
+ seq_printf(m, " : tunables %4u %4u %4u", s->queue, s->batch, 0);
seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
0UL);
seq_putc(m, '\n');
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2010-07-31 18:25:28.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h 2010-07-31 19:00:58.738236361 -0500
@@ -29,14 +29,11 @@
ORDER_FALLBACK, /* Number of times fallback was necessary */
NR_SLUB_STAT_ITEMS };
-#define QUEUE_SIZE 50
-#define BATCH_SIZE 25
-
/* Queueing structure used for per cpu, l3 cache and alien queueing */
struct kmem_cache_queue {
int objects; /* Available objects */
int max; /* Queue capacity */
- void *object[QUEUE_SIZE];
+ void *object[];
};
struct kmem_cache_cpu {
@@ -71,7 +68,7 @@
* Slab cache management.
*/
struct kmem_cache {
- struct kmem_cache_cpu *cpu_slab;
+ struct kmem_cache_cpu *cpu;
/* Used for retriving partial slabs etc */
unsigned long flags;
int size; /* The size of an object including meta data */
@@ -87,6 +84,8 @@
void (*ctor)(void *);
int inuse; /* Offset to metadata */
int align; /* Alignment */
+ int queue; /* specified queue size */
+ int cpu_queue; /* cpu queue size */
unsigned long min_partial;
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
--
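
As an illustration of the new tunables (not part of the patch): the two
sysfs attributes added above can be exercised from user space. A minimal
sketch; the cache name kmalloc-64 is only an example, writing requires
root on a kernel with this patch applied, and the accepted range follows
cpu_queue_size_store() (4..10000):

#include <stdio.h>

int main(void)
{
	FILE *f;
	unsigned int queue, batch;

	/* Grow the per cpu queue of an example cache */
	f = fopen("/sys/kernel/slab/kmalloc-64/cpu_queue_size", "w");
	if (!f) {
		perror("cpu_queue_size");
		return 1;
	}
	fprintf(f, "200\n");
	fclose(f);

	/* Read back the resulting queue and batch sizes */
	f = fopen("/sys/kernel/slab/kmalloc-64/cpu_queue_size", "r");
	if (f && fscanf(f, "%u", &queue) == 1)
		printf("cpu_queue_size=%u\n", queue);
	if (f)
		fclose(f);

	f = fopen("/sys/kernel/slab/kmalloc-64/cpu_batch_size", "r");
	if (f && fscanf(f, "%u", &batch) == 1)
		printf("cpu_batch_size=%u\n", batch);
	if (f)
		fclose(f);

	return 0;
}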