* [patch 01/17] SLUB: Extend slabinfo to support -D and -F options
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 02/17] SLUB: Add defrag_ratio field and sysfs support Christoph Lameter
` (16 subsequent siblings)
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0047-SLUB-Extend-slabinfo-to-support-D-and-C-options.patch --]
[-- Type: text/plain, Size: 6201 bytes --]
-F lists caches that support defragmentation
-C lists caches that use a ctor.
Change field names for defrag_ratio and remote_node_defrag_ratio.
Add determination of the allocation ratio for a slab. The allocation ratio
is the percentage of the available object slots that are in use.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
Documentation/vm/slabinfo.c | 48 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 43 insertions(+), 5 deletions(-)
Index: linux-2.6/Documentation/vm/slabinfo.c
===================================================================
--- linux-2.6.orig/Documentation/vm/slabinfo.c 2008-02-14 15:18:49.077314846 -0800
+++ linux-2.6/Documentation/vm/slabinfo.c 2008-02-15 15:31:25.718359341 -0800
@@ -31,6 +31,8 @@ struct slabinfo {
int hwcache_align, object_size, objs_per_slab;
int sanity_checks, slab_size, store_user, trace;
int order, poison, reclaim_account, red_zone;
+ int defrag, ctor;
+ int defrag_ratio, remote_node_defrag_ratio;
unsigned long partial, objects, slabs;
unsigned long alloc_fastpath, alloc_slowpath;
unsigned long free_fastpath, free_slowpath;
@@ -64,6 +66,8 @@ int show_slab = 0;
int skip_zero = 1;
int show_numa = 0;
int show_track = 0;
+int show_defrag = 0;
+int show_ctor = 0;
int show_first_alias = 0;
int validate = 0;
int shrink = 0;
@@ -100,13 +104,15 @@ void fatal(const char *x, ...)
void usage(void)
{
printf("slabinfo 5/7/2007. (c) 2007 sgi. clameter@sgi.com\n\n"
- "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+ "slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n"
"-a|--aliases Show aliases\n"
"-A|--activity Most active slabs first\n"
+ "-C|--ctor Show slabs with ctors\n"
"-d<options>|--debug=<options> Set/Clear Debug options\n"
"-D|--display-active Switch line format to activity\n"
"-e|--empty Show empty slabs\n"
"-f|--first-alias Show first alias\n"
+ "-F|--defrag Show defragmentable caches\n"
"-h|--help Show usage information\n"
"-i|--inverted Inverted list\n"
"-l|--slabs Show slabs\n"
@@ -296,7 +302,7 @@ void first_line(void)
printf("Name Objects Alloc Free %%Fast\n");
else
printf("Name Objects Objsize Space "
- "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n");
+ "Slabs/Part/Cpu O/S O %%Ra %%Ef Flg\n");
}
/*
@@ -345,7 +351,7 @@ void slab_numa(struct slabinfo *s, int m
return;
if (!line) {
- printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+ printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab");
for(node = 0; node <= highest_node; node++)
printf(" %4d", node);
printf("\n----------------------");
@@ -354,6 +360,7 @@ void slab_numa(struct slabinfo *s, int m
printf("\n");
}
printf("%-21s ", mode ? "All slabs" : s->name);
+ printf("%3d ", s->remote_node_defrag_ratio);
for(node = 0; node <= highest_node; node++) {
char b[20];
@@ -492,6 +499,8 @@ void report(struct slabinfo *s)
printf("** Slabs are destroyed via RCU\n");
if (s->reclaim_account)
printf("** Reclaim accounting active\n");
+ if (s->defrag)
+ printf("** Defragmentation at %d%%\n", s->defrag_ratio);
printf("\nSizes (bytes) Slabs Debug Memory\n");
printf("------------------------------------------------------------------------\n");
@@ -539,6 +548,12 @@ void slabcache(struct slabinfo *s)
if (show_empty && s->slabs)
return;
+ if (show_defrag && !s->defrag)
+ return;
+
+ if (show_ctor && !s->ctor)
+ return;
+
store_size(size_str, slab_size(s));
snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs);
@@ -549,6 +564,10 @@ void slabcache(struct slabinfo *s)
*p++ = '*';
if (s->cache_dma)
*p++ = 'd';
+ if (s->defrag)
+ *p++ = 'F';
+ if (s->ctor)
+ *p++ = 'C';
if (s->hwcache_align)
*p++ = 'A';
if (s->poison)
@@ -582,7 +601,8 @@ void slabcache(struct slabinfo *s)
printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
s->name, s->objects, s->object_size, size_str, dist_str,
s->objs_per_slab, s->order,
- s->slabs ? (s->partial * 100) / s->slabs : 100,
+ s->slabs ? (s->partial * 100) /
+ (s->slabs * s->objs_per_slab) : 100,
s->slabs ? (s->objects * s->object_size * 100) /
(s->slabs * (page_size << s->order)) : 100,
flags);
@@ -1193,7 +1213,17 @@ void read_slab_dir(void)
slab->deactivate_to_head = get_obj("deactivate_to_head");
slab->deactivate_to_tail = get_obj("deactivate_to_tail");
slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
+ slab->defrag_ratio = get_obj("defrag_ratio");
+ slab->remote_node_defrag_ratio =
+ get_obj("remote_node_defrag_ratio");
chdir("..");
+ if (read_slab_obj(slab, "ops")) {
+ if (strstr(buffer, "ctor :"))
+ slab->ctor = 1;
+ if (strstr(buffer, "kick :"))
+ slab->defrag = 1;
+ }
+
if (slab->name[0] == ':')
alias_targets++;
slab++;
@@ -1244,10 +1274,12 @@ void output_slabs(void)
struct option opts[] = {
{ "aliases", 0, NULL, 'a' },
{ "activity", 0, NULL, 'A' },
+ { "ctor", 0, NULL, 'C' },
{ "debug", 2, NULL, 'd' },
{ "display-activity", 0, NULL, 'D' },
{ "empty", 0, NULL, 'e' },
{ "first-alias", 0, NULL, 'f' },
+ { "defrag", 0, NULL, 'F' },
{ "help", 0, NULL, 'h' },
{ "inverted", 0, NULL, 'i'},
{ "numa", 0, NULL, 'n' },
@@ -1270,7 +1302,7 @@ int main(int argc, char *argv[])
page_size = getpagesize();
- while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+ while ((c = getopt_long(argc, argv, "aACd::DefFhil1noprstvzTS",
opts, NULL)) != -1)
switch (c) {
case '1':
@@ -1326,6 +1358,12 @@ int main(int argc, char *argv[])
case 'z':
skip_zero = 0;
break;
+ case 'C':
+ show_ctor = 1;
+ break;
+ case 'F':
+ show_defrag = 1;
+ break;
case 'T':
show_totals = 1;
break;
* [patch 02/17] SLUB: Add defrag_ratio field and sysfs support.
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0048-SLUB-Add-defrag_ratio-field-and-sysfs-support.patch --]
[-- Type: text/plain, Size: 2835 bytes --]
The defrag_ratio is used to set the threshold at which defragmentation
should be run on a slab cache.
The allocation ratio is measured as the percentage of the available slots
that are allocated. The percentage will be lower for slabs that are more fragmented.
Add a defrag_ratio field and set it to 30% by default. A limit of 30% specifies
that fewer than 3 out of 10 available object slots are in use before
slab defragmentation runs.
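For illustration with made-up numbers: a cache whose slabs could hold 1000
objects in total but has only 250 of them allocated has an allocation ratio
of 250 * 100 / 1000 = 25%, which is below the default of 30%, so the cache
would be considered fragmented enough to be defragmented.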
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/slub_def.h | 7 +++++++
mm/slub.c | 18 ++++++++++++++++++
2 files changed, 25 insertions(+)
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2008-02-15 15:22:22.064550321 -0800
+++ linux-2.6/include/linux/slub_def.h 2008-02-15 15:31:53.954427883 -0800
@@ -79,6 +79,13 @@ struct kmem_cache {
void (*ctor)(struct kmem_cache *, void *);
int inuse; /* Offset to metadata */
int align; /* Alignment */
+ int defrag_ratio; /*
+ * objects/possible-objects limit. If we have
+ * less than the specified percentage of
+ * objects allocated then defrag passes
+ * will start to occur during reclaim.
+ */
+
const char *name; /* Name (only for display!) */
struct list_head list; /* List of slab caches */
#ifdef CONFIG_SLUB_DEBUG
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 15:22:22.884553646 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 15:31:53.958427919 -0800
@@ -2395,6 +2395,7 @@ static int kmem_cache_open(struct kmem_c
goto error;
s->refcount = 1;
+ s->defrag_ratio = 30;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 100;
#endif
@@ -4084,6 +4085,22 @@ static ssize_t free_calls_show(struct km
}
SLAB_ATTR_RO(free_calls);
+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int n = simple_strtoul(buf, NULL, 10);
+
+ if (n < 100)
+ s->defrag_ratio = n;
+ return length;
+}
+SLAB_ATTR(defrag_ratio);
+
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4184,6 +4201,7 @@ static struct attribute *slab_attrs[] =
&shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+ &defrag_ratio_attr.attr,
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
* [patch 03/17] SLUB: Replace ctor field with ops field in /sys/slab/*
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0049-SLUB-Replace-ctor-field-with-ops-field-in-sys-slab.patch --]
[-- Type: text/plain, Size: 1663 bytes --]
Create an ops field in /sys/slab/*/ops to contain all the operations defined
on a slab. This will be used to display the additional operations that will
be defined soon.
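As an illustration only (the exact symbol and offsets depend on the kernel
build), a cache set up with a constructor, such as the buffer head cache
whose ctor is init_buffer_head(), would then report something like:
	ctor : init_buffer_head+0x0/0x10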
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
mm/slub.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 15:31:53.958427919 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 15:32:00.654444198 -0800
@@ -3862,16 +3862,18 @@ static ssize_t order_show(struct kmem_ca
}
SLAB_ATTR(order);
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
+ int x = 0;
- return n + sprintf(buf + n, "\n");
+ if (s->ctor) {
+ x += sprintf(buf + x, "ctor : ");
+ x += sprint_symbol(buf + x, (unsigned long)s->ops->ctor);
+ x += sprintf(buf + x, "\n");
}
- return 0;
+ return x;
}
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
@@ -4186,7 +4188,7 @@ static struct attribute *slab_attrs[] =
&slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
- &ctor_attr.attr,
+ &ops_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
&sanity_checks_attr.attr,
* [patch 04/17] SLUB: Add get() and kick() methods
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0050-SLUB-Add-get-and-kick-methods.patch --]
[-- Type: text/plain, Size: 5416 bytes --]
Add the two methods needed for defragmentation and add the display of the
methods via the sysfs ops file.
Add documentation explaining the use of these methods.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/slab_def.h | 4 ++++
include/linux/slob_def.h | 4 ++++
include/linux/slub_def.h | 35 +++++++++++++++++++++++++++++++++++
mm/slub.c | 32 ++++++++++++++++++++++++++++++--
4 files changed, 73 insertions(+), 2 deletions(-)
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2008-02-15 16:38:21.753876823 -0800
+++ linux-2.6/include/linux/slub_def.h 2008-02-15 16:38:31.401917080 -0800
@@ -74,6 +74,37 @@ struct kmem_cache {
gfp_t allocflags; /* gfp flags to use on each alloc */
int refcount; /* Refcount for slab cache destroy */
void (*ctor)(struct kmem_cache *, void *);
+ /*
+ * Called with slab lock held and interrupts disabled.
+ * No slab operation may be performed in get().
+ *
+ * Parameters passed are the number of objects to process
+ * and an array of pointers to objects for which we
+ * need references.
+ *
+ * Returns a pointer that is passed to the kick function.
+ * If all objects cannot be moved then the pointer may
+ * indicate that this won't work and then kick can simply
+ * remove the references that were already obtained.
+ *
+ * The array passed to get() is also passed to kick(). The
+ * function may remove objects by setting array elements to NULL.
+ */
+ void *(*get)(struct kmem_cache *, int nr, void **);
+
+ /*
+ * Called with no locks held and interrupts enabled.
+ * Any operation may be performed in kick().
+ *
+ * Parameters passed are the number of objects in the array,
+ * the array of pointers to the objects and the pointer
+ * returned by get().
+ *
+ * Success is checked by examining the number of remaining
+ * objects in the slab.
+ */
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private);
+
int inuse; /* Offset to metadata */
int align; /* Alignment */
int defrag_ratio; /*
@@ -238,4 +269,8 @@ static __always_inline void *kmalloc_nod
}
#endif
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+ void *(*get)(struct kmem_cache *, int nr, void **),
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private));
+
#endif /* _LINUX_SLUB_DEF_H */
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 16:38:23.133882506 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 16:39:19.946123239 -0800
@@ -2791,6 +2791,20 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+ void *(*get)(struct kmem_cache *, int nr, void **),
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private))
+{
+ /*
+ * Defragmentable slabs must have a ctor otherwise objects may be
+ * in an undetermined state after they are allocated.
+ */
+ BUG_ON(!s->ctor);
+ s->get = get;
+ s->kick = kick;
+}
+EXPORT_SYMBOL(kmem_cache_setup_defrag);
+
static unsigned long count_partial(struct kmem_cache_node *n)
{
unsigned long flags;
@@ -3092,7 +3106,7 @@ static int slab_unmergeable(struct kmem_
if ((s->flags & __PAGE_ALLOC_FALLBACK))
return 1;
- if (s->ctor)
+ if (s->ctor || s->kick || s->get)
return 1;
/*
@@ -3827,7 +3841,21 @@ static ssize_t ops_show(struct kmem_cach
if (s->ctor) {
x += sprintf(buf + x, "ctor : ");
- x += sprint_symbol(buf + x, (unsigned long)s->ops->ctor);
+ x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+ x += sprintf(buf + x, "\n");
+ }
+
+ if (s->get) {
+ x += sprintf(buf + x, "get : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->get);
+ x += sprintf(buf + x, "\n");
+ }
+
+ if (s->kick) {
+ x += sprintf(buf + x, "kick : ");
+ x += sprint_symbol(buf + x,
+ (unsigned long)s->kick);
x += sprintf(buf + x, "\n");
}
return x;
Index: linux-2.6/include/linux/slab_def.h
===================================================================
--- linux-2.6.orig/include/linux/slab_def.h 2008-02-15 16:34:53.392938232 -0800
+++ linux-2.6/include/linux/slab_def.h 2008-02-15 16:38:31.405917002 -0800
@@ -95,4 +95,8 @@ found:
#endif /* CONFIG_NUMA */
+static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
+ void *(*get)(struct kmem_cache *, int nr, void **),
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private)) {}
+
#endif /* _LINUX_SLAB_DEF_H */
Index: linux-2.6/include/linux/slob_def.h
===================================================================
--- linux-2.6.orig/include/linux/slob_def.h 2008-02-15 16:34:53.392938232 -0800
+++ linux-2.6/include/linux/slob_def.h 2008-02-15 16:38:31.405917002 -0800
@@ -33,4 +33,8 @@ static inline void *__kmalloc(size_t siz
return kmalloc(size, flags);
}
+static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
+ void *(*get)(struct kmem_cache *, int nr, void **),
+ void (*kick)(struct kmem_cache *, int nr, void **, void *private)) {}
+
#endif /* __LINUX_SLOB_DEF_H */
* [patch 05/17] SLUB: Sort slab cache list and establish maximum objects for defrag slabs
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0051-SLUB-Sort-slab-cache-list-and-establish-maximum-obj.patch --]
[-- Type: text/plain, Size: 2621 bytes --]
When defragmenting slabs it is advantageous to have all
defragmentable slabs together at the beginning of the list so that there is
no need to scan the complete list. Put defragmentable caches first when adding
a slab cache and all others last.
Determine the maximum number of objects in defragmentable slabs. This allows
the allocation of the arrays holding refs to these objects to be sized later.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
mm/slub.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 16:39:19.946123239 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 16:39:24.198141417 -0800
@@ -236,6 +236,9 @@ static enum {
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects = 0;
+
/*
* Tracking user of a slab.
*/
@@ -2573,7 +2576,7 @@ static struct kmem_cache *create_kmalloc
flags | __KMALLOC_CACHE, NULL))
goto panic;
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto panic;
@@ -2791,6 +2794,13 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+static inline void *alloc_scratch(void)
+{
+ return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+ BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+ GFP_KERNEL);
+}
+
void kmem_cache_setup_defrag(struct kmem_cache *s,
void *(*get)(struct kmem_cache *, int nr, void **),
void (*kick)(struct kmem_cache *, int nr, void **, void *private))
@@ -2802,6 +2812,11 @@ void kmem_cache_setup_defrag(struct kmem
BUG_ON(!s->ctor);
s->get = get;
s->kick = kick;
+ down_write(&slub_lock);
+ list_move(&s->list, &slab_caches);
+ if (s->objects > max_defrag_slab_objects)
+ max_defrag_slab_objects = s->objects;
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_setup_defrag);
@@ -3193,7 +3208,7 @@ struct kmem_cache *kmem_cache_create(con
if (s) {
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
- list_add(&s->list, &slab_caches);
+ list_add_tail(&s->list, &slab_caches);
up_write(&slub_lock);
if (sysfs_slab_add(s))
goto err;
* [patch 06/17] SLUB: Slab defrag core
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0052-SLUB-Slab-defrag-core.patch --]
[-- Type: text/plain, Size: 14131 bytes --]
Slab defragmentation may occur:
1. Unconditionally when kmem_cache_shrink() is called on a slab cache by the
kernel.
2. Through use of the slabinfo command line tool to trigger slab shrinking.
3. Conditionally, per node, when kmem_cache_defrag(<node>) is called.
Defragmentation is only performed if the fragmentation of the slab
is lower than the specified percentage. Fragmentation ratios are measured
by calculating the percentage of objects in use compared to the total
number of objects that the slab cache could hold without extending it.
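A simplified sketch of the per-node check (variable and helper names here are
illustrative; the real logic is in __kmem_cache_defrag() below, which also
short-circuits with a worst-case estimate before walking the partial list):
static int node_needs_defrag(struct kmem_cache *s,
		unsigned long nr_slabs, unsigned long objects_in_use)
{
	/* Total objects the node could hold without adding slabs */
	unsigned long capacity = nr_slabs * s->objects;
	unsigned long usage_ratio = objects_in_use * 100 / capacity;

	/* Defragment only if usage is at or below defrag_ratio percent */
	return usage_ratio <= s->defrag_ratio;
}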
kmem_cache_defrag() takes a node parameter. This can either be -1 if
defragmentation should be performed on all nodes, or a node number.
If a node number was specified then defragmentation is only performed
on a specific node.
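Illustrative call sites (the actual caller added later in this series is the
memory reclaim path):
	int pages;

	/* Defragment slab caches on every node ... */
	pages = kmem_cache_defrag(-1);

	/* ... or only on the node that backs a particular zone. */
	pages = kmem_cache_defrag(zone_to_nid(zone));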
Slab defragmentation is a memory-intensive operation that can be
sped up on a NUMA system if mostly node-local memory is accessed. That is
why it is possible to run shrinking on a single node after execution of shrink_slab().
A couple of functions must be set up via a call to kmem_cache_setup_defrag()
in order for a slab cache to support defragmentation. These are:
void *get(struct kmem_cache *s, int nr, void **objects)
Must obtain a reference to the listed objects. SLUB guarantees that
the objects are still allocated. However, other threads may be blocked
in slab_free() attempting to free objects in the slab. These may succeed
as soon as get() returns to the slab allocator. The function must
be able to detect such situations and void the attempts to free such
objects (by for example voiding the corresponding entry in the objects
array).
No slab operations may be performed in get(). Interrupts
are disabled. What can be done is very limited. The slab lock
for the page that contains the object is taken. Any attempt to perform
a slab operation may lead to a deadlock.
get() returns a private pointer that is passed to kick(). Should we
be unable to obtain all references then that pointer may indicate
to the kick() function that it should not attempt any object removal
or move but simply drop the references that were already obtained.
void kick(struct kmem_cache *, int nr, void **objects, void *get_result)
After SLUB has established references to the objects in a
slab it will then drop all locks and use kick() to move objects out
of the slab. The existence of the object is guaranteed by virtue of
the earlier obtained references via get(). The callback may perform
any slab operation since no locks are held at the time of call.
The callback should remove the object from the slab in some way. This
may be accomplished by reclaiming the object and then running
kmem_cache_free() or reallocating it and then running
kmem_cache_free(). Reallocation is advantageous because the partial
slabs were just sorted to have the partial slabs with the most objects
first. Reallocation is likely to result in filling up a slab in
addition to freeing up one slab. A filled up slab can also be removed
from the partial list. So there could be a double effect.
Kick() does not return a result. SLUB will check the number of
remaining objects in the slab. If all objects were removed then
we know that the operation was successful.
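Putting this together, a minimal sketch of how a cache would register for
defragmentation (the cache, object type and helpers named my_* below are made
up; see the buffer head and inode patches later in this series for real
implementations):
static void *my_get(struct kmem_cache *s, int nr, void **v)
{
	int i;

	/* Pin every object; void entries whose objects cannot be pinned. */
	for (i = 0; i < nr; i++)
		if (!my_object_tryget(v[i]))
			v[i] = NULL;

	return NULL;		/* private data handed to my_kick() */
}

static void my_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
	int i;

	/* No locks held: evict each pinned object so the slab can be freed. */
	for (i = 0; i < nr; i++)
		if (v[i])
			my_object_evict(v[i]);	/* ends in kmem_cache_free() */
}

	/* At cache creation time, after kmem_cache_create(): */
	kmem_cache_setup_defrag(my_cachep, my_get, my_kick);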
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
include/linux/slab_def.h | 1
include/linux/slob_def.h | 1
include/linux/slub_def.h | 1
mm/slub.c | 274 +++++++++++++++++++++++++++++++++++++----------
4 files changed, 220 insertions(+), 57 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 16:39:24.198141417 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 16:41:07.606611300 -0800
@@ -183,10 +183,10 @@ static inline void ClearSlabDebug(struct
/*
* Maximum number of desirable partial slabs.
- * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * More slabs cause kmem_cache_shrink to sort the slabs by objects
+ * and triggers slab defragmentation.
*/
-#define MAX_PARTIAL 10
+#define MAX_PARTIAL 20
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
@@ -2834,76 +2834,236 @@ static unsigned long count_partial(struc
}
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * Vacate all objects in the given slab.
*
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
+ * The scratch area passed to this function is sufficient to hold
+ * struct list_head times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+static int kmem_cache_vacate(struct page *page, void *scratch)
{
- int node;
- int i;
- struct kmem_cache_node *n;
- struct page *page;
- struct page *t;
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
+ void **vector = scratch;
+ void *p;
+ void *addr = page_address(page);
+ struct kmem_cache *s;
+ unsigned long *map;
+ int leftover;
+ int count;
+ void *private;
unsigned long flags;
+ unsigned long objects;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ BUG_ON(!PageSlab(page));
+ local_irq_save(flags);
+ slab_lock(page);
+ BUG_ON(!SlabFrozen(page));
- flush_all(s);
- for_each_node_state(node, N_NORMAL_MEMORY) {
- n = get_node(s, node);
+ s = page->slab;
+ objects = s->objects;
+ map = scratch + max_defrag_slab_objects * sizeof(void **);
+ if (!page->inuse || !s->kick)
+ goto out;
- if (!n->nr_partial)
- continue;
+ /* Determine used objects */
+ bitmap_fill(map, objects);
+ for_each_free_object(p, s, page->freelist)
+ __clear_bit(slab_index(p, s, addr), map);
- for (i = 0; i < s->objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ count = 0;
+ memset(vector, 0, objects * sizeof(void **));
+ for_each_object(p, s, addr)
+ if (test_bit(slab_index(p, s, addr), map))
+ vector[count++] = p;
- spin_lock_irqsave(&n->list_lock, flags);
+ private = s->get(s, count, vector);
- /*
- * Build lists indexed by the items in use in each slab.
- *
- * Note that concurrent frees may occur while we hold the
- * list_lock. page->inuse here is the upper limit.
- */
- list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse && slab_trylock(page)) {
- /*
- * Must hold slab lock here because slab_free
- * may have freed the last object and be
- * waiting to release the slab.
- */
- list_del(&page->lru);
+ /*
+ * Got references. Now we can drop the slab lock. The slab
+ * is frozen so it cannot vanish from under us nor will
+ * allocations be performed on the slab. However, unlocking the
+ * slab will allow concurrent slab_frees to proceed.
+ */
+ slab_unlock(page);
+ local_irq_restore(flags);
+
+ /*
+ * Perform the KICK callbacks to remove the objects.
+ */
+ s->kick(s, count, vector, private);
+
+ local_irq_save(flags);
+ slab_lock(page);
+out:
+ /*
+ * Check the result and unfreeze the slab
+ */
+ leftover = page->inuse;
+ unfreeze_slab(s, page, leftover > 0);
+ local_irq_restore(flags);
+ return leftover;
+}
+
+/*
+ * Remove objects from a list of slab pages that have been gathered.
+ * Must be called with slabs that have been isolated before.
+ */
+int kmem_cache_reclaim(struct list_head *zaplist)
+{
+ int freed = 0;
+ void **scratch;
+ struct page *page;
+ struct page *page2;
+
+ if (list_empty(zaplist))
+ return 0;
+
+ scratch = alloc_scratch();
+ if (!scratch)
+ return 0;
+
+ list_for_each_entry_safe(page, page2, zaplist, lru) {
+ list_del(&page->lru);
+ if (kmem_cache_vacate(page, scratch) == 0)
+ freed++;
+ }
+ kfree(scratch);
+ return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ * by releasing slabs with zero objects and trying to reclaim
+ * slabs with less than a quarter of objects allocated.
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s,
+ struct kmem_cache_node *n)
+{
+ unsigned long flags;
+ struct page *page, *page2;
+ LIST_HEAD(zaplist);
+ int freed = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ if (page->inuse > s->objects / 4)
+ continue;
+ if (!slab_trylock(page))
+ continue;
+
+ if (page->inuse) {
+
+ list_move(&page->lru, &zaplist);
+ if (s->kick) {
n->nr_partial--;
- slab_unlock(page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
- }
+ SetSlabFrozen(page);
+ }
+ slab_unlock(page);
+
+ } else {
+ list_del(&page->lru);
+ n->nr_partial--;
+ slab_unlock(page);
+ discard_slab(s, page);
+ freed++;
}
+ }
- /*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
- */
- for (i = s->objects - 1; i >= 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ if (!s->kick)
+ /* Simply put the zaplist at the end */
+ list_splice(&zaplist, n->partial.prev);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (s->kick)
+ freed += kmem_cache_reclaim(&zaplist);
+ return freed;
+}
+
+static unsigned long __kmem_cache_defrag(struct kmem_cache *s, int node)
+{
+ unsigned long capacity;
+ unsigned long objects_in_full_slabs;
+ unsigned long ratio;
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * An insignificant number of partial slabs means that the
+ * slab cache does not need any defragmentation.
+ */
+ if (n->nr_partial <= MAX_PARTIAL)
+ return 0;
+
+ capacity = atomic_long_read(&n->nr_slabs) * s->objects;
+ objects_in_full_slabs =
+ (atomic_long_read(&n->nr_slabs) - n->nr_partial)
+ * s->objects;
+ /*
+ * Worst case calculation: If we would be over the ratio
+ * even if all partial slabs would only have one object
+ * then we can skip the next test that requires a scan
+ * through all the partial page structs to sum up the actual
+ * number of objects in the partial slabs.
+ */
+ ratio = (objects_in_full_slabs + 1 * n->nr_partial) * 100 / capacity;
+ if (ratio > s->defrag_ratio)
+ return 0;
+
+ /*
+ * Now for the real calculation. If usage ratio is more than required
+ * then no defragmentation is necessary.
+ */
+ ratio = (objects_in_full_slabs + count_partial(n)) * 100 / capacity;
+ if (ratio > s->defrag_ratio)
+ return 0;
+
+ return __kmem_cache_shrink(s, n) << s->order;
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation on each node.
+ */
+int kmem_cache_defrag(int node)
+{
+ struct kmem_cache *s;
+ unsigned long pages = 0;
+
+ /*
+ * kmem_cache_defrag may be called from the reclaim path which may be
+ * called for any page allocator alloc. So there is the danger that we
+ * get called in a situation where slub already acquired the slub_lock
+ * for other purposes.
+ */
+ if (!down_read_trylock(&slub_lock))
+ return 0;
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (node == -1) {
+ int nid;
+
+ for_each_node_state(nid, N_NORMAL_MEMORY)
+ pages += __kmem_cache_defrag(s, nid);
+ } else
+ pages += __kmem_cache_defrag(s, node);
}
+ up_read(&slub_lock);
+ return pages;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
- kfree(slabs_by_inuse);
- return 0;
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists.
+ * If the slab cache support defragmentation then objects are
+ * reclaimed.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+ int node;
+
+ flush_all(s);
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ __kmem_cache_shrink(s, get_node(s, node));
+
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
Index: linux-2.6/include/linux/slab_def.h
===================================================================
--- linux-2.6.orig/include/linux/slab_def.h 2008-02-15 16:38:31.405917002 -0800
+++ linux-2.6/include/linux/slab_def.h 2008-02-15 16:39:28.526160097 -0800
@@ -98,5 +98,6 @@ found:
static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
void *(*get)(struct kmem_cache *, int nr, void **),
void (*kick)(struct kmem_cache *, int nr, void **, void *private)) {}
+static inline int kmem_cache_defrag(int node) { return 0; }
#endif /* _LINUX_SLAB_DEF_H */
Index: linux-2.6/include/linux/slob_def.h
===================================================================
--- linux-2.6.orig/include/linux/slob_def.h 2008-02-15 16:38:31.405917002 -0800
+++ linux-2.6/include/linux/slob_def.h 2008-02-15 16:39:28.526160097 -0800
@@ -36,5 +36,6 @@ static inline void *__kmalloc(size_t siz
static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
void *(*get)(struct kmem_cache *, int nr, void **),
void (*kick)(struct kmem_cache *, int nr, void **, void *private)) {}
+static inline int kmem_cache_defrag(int node) { return 0; }
#endif /* __LINUX_SLOB_DEF_H */
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2008-02-15 16:38:31.401917080 -0800
+++ linux-2.6/include/linux/slub_def.h 2008-02-15 16:39:28.526160097 -0800
@@ -272,5 +272,6 @@ static __always_inline void *kmalloc_nod
void kmem_cache_setup_defrag(struct kmem_cache *s,
void *(*get)(struct kmem_cache *, int nr, void **),
void (*kick)(struct kmem_cache *, int nr, void **, void *private));
+int kmem_cache_defrag(int node);
#endif /* _LINUX_SLUB_DEF_H */
* [patch 07/17] SLUB: Add KICKABLE to avoid repeated kick() attempts
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0064-SLUB-Add-SlabReclaimable-to-avoid-repeated-reclai.patch --]
[-- Type: text/plain, Size: 3126 bytes --]
Add a flag KICKABLE to be set on slabs with a defragmentation method.
Clear the flag if a kick action is not successful in reducing the
number of objects in a slab.
The KICKABLE flag is set again when all objects of the slab have been
allocated and it is removed from the partial lists.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
mm/slub.c | 28 ++++++++++++++++++++++++++--
1 file changed, 26 insertions(+), 2 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2008-02-15 16:41:07.606611300 -0800
+++ linux-2.6/mm/slub.c 2008-02-15 16:41:42.806805718 -0800
@@ -101,6 +101,7 @@
*/
#define FROZEN (1 << PG_active)
+#define KICKABLE (1 << PG_dirty)
#ifdef CONFIG_SLUB_DEBUG
#define SLABDEBUG (1 << PG_error)
@@ -138,6 +139,21 @@ static inline void ClearSlabDebug(struct
page->flags &= ~SLABDEBUG;
}
+static inline int SlabKickable(struct page *page)
+{
+ return page->flags & KICKABLE;
+}
+
+static inline void SetSlabKickable(struct page *page)
+{
+ page->flags |= KICKABLE;
+}
+
+static inline void ClearSlabKickable(struct page *page)
+{
+ page->flags &= ~KICKABLE;
+}
+
/*
* Issues still to be resolved:
*
@@ -1132,6 +1148,8 @@ static struct page *new_slab(struct kmem
if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
SLAB_STORE_USER | SLAB_TRACE))
SetSlabDebug(page);
+ if (s->kick)
+ SetSlabKickable(page);
start = page_address(page);
page->end = start + 1;
@@ -1203,6 +1221,7 @@ static void discard_slab(struct kmem_cac
atomic_long_dec(&n->nr_slabs);
reset_page_mapcount(page);
+ ClearSlabKickable(page);
__ClearPageSlab(page);
free_slab(s, page);
}
@@ -1383,6 +1402,8 @@ static void unfreeze_slab(struct kmem_ca
stat(c, DEACTIVATE_FULL);
if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
+ if (s->kick)
+ SetSlabKickable(page);
}
slab_unlock(page);
} else {
@@ -2861,7 +2882,7 @@ static int kmem_cache_vacate(struct page
s = page->slab;
objects = s->objects;
map = scratch + max_defrag_slab_objects * sizeof(void **);
- if (!page->inuse || !s->kick)
+ if (!page->inuse || !s->kick || !SlabKickable(page))
goto out;
/* Determine used objects */
@@ -2898,6 +2919,8 @@ out:
* Check the result and unfreeze the slab
*/
leftover = page->inuse;
+ if (leftover)
+ ClearSlabKickable(page);
unfreeze_slab(s, page, leftover > 0);
local_irq_restore(flags);
return leftover;
@@ -2945,7 +2968,8 @@ static unsigned long __kmem_cache_shrink
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, page2, &n->partial, lru) {
- if (page->inuse > s->objects / 4)
+ if (page->inuse > s->objects / 4 ||
+ (s->kick && !SlabKickable(page)))
continue;
if (!slab_trylock(page))
continue;
* [patch 08/17] SLUB: Trigger defragmentation from memory reclaim
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0053-SLUB-Trigger-defragmentation-from-memory-reclaim.patch --]
[-- Type: text/plain, Size: 5589 bytes --]
This patch triggers slab defragmentation from memory reclaim.
The logical point for this is after slab shrinking was performed in
vmscan.c. At that point the fragmentation of the slab caches has increased
because objects were freed via the LRUs. So we call kmem_cache_defrag() from
there.
shrink_slab() from vmscan.c is called in some contexts to do
global shrinking of slabs and in others to do shrinking for
a particular zone. Pass the zone to shrink_slab() so that it
can call kmem_cache_defrag() and restrict the defragmentation to
the node that is under memory pressure.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/drop_caches.c | 2 +-
include/linux/mm.h | 2 +-
mm/vmscan.c | 26 +++++++++++++++++++-------
3 files changed, 21 insertions(+), 9 deletions(-)
Index: linux-2.6/fs/drop_caches.c
===================================================================
--- linux-2.6.orig/fs/drop_caches.c 2008-02-14 15:19:11.833503223 -0800
+++ linux-2.6/fs/drop_caches.c 2008-02-15 15:47:14.688851790 -0800
@@ -50,7 +50,7 @@ void drop_slab(void)
int nr_objects;
do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+ nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
} while (nr_objects > 10);
}
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2008-02-14 15:20:25.582017250 -0800
+++ linux-2.6/mm/vmscan.c 2008-02-15 15:47:14.724851829 -0800
@@ -173,10 +173,18 @@ EXPORT_SYMBOL(unregister_shrinker);
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure but the VFS infrastructure does not allow that at the present
+ * time.
+ *
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages)
+ unsigned long lru_pages, struct zone *zone)
{
struct shrinker *shrinker;
unsigned long ret = 0;
@@ -233,6 +241,8 @@ unsigned long shrink_slab(unsigned long
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
+ if (gfp_mask & __GFP_FS)
+ kmem_cache_defrag(zone ? zone_to_nid(zone) : -1);
return ret;
}
@@ -1352,7 +1362,7 @@ static unsigned long do_try_to_free_page
* over limit cgroups
*/
if (scan_global_lru(sc)) {
- shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
+ shrink_slab(sc->nr_scanned, gfp_mask, lru_pages, NULL);
if (reclaim_state) {
nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
@@ -1581,7 +1591,7 @@ loop_again:
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
- lru_pages);
+ lru_pages, zone);
nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
@@ -1822,7 +1832,7 @@ unsigned long shrink_all_memory(unsigned
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
if (!reclaim_state.reclaimed_slab)
break;
@@ -1860,7 +1870,7 @@ unsigned long shrink_all_memory(unsigned
reclaim_state.reclaimed_slab = 0;
shrink_slab(sc.nr_scanned, sc.gfp_mask,
- count_lru_pages());
+ count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
@@ -1877,7 +1887,8 @@ unsigned long shrink_all_memory(unsigned
if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+ shrink_slab(nr_pages, sc.gfp_mask,
+ count_lru_pages(), NULL);
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
}
@@ -2040,7 +2051,8 @@ static int __zone_reclaim(struct zone *z
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+ zone) &&
zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
slab_reclaimable - nr_pages)
;
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h 2008-02-14 15:20:04.897873207 -0800
+++ linux-2.6/include/linux/mm.h 2008-02-15 15:47:14.736851869 -0800
@@ -1191,7 +1191,7 @@ int in_gate_area_no_task(unsigned long a
int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
- unsigned long lru_pages);
+ unsigned long lru_pages, struct zone *z);
void drop_pagecache(void);
void drop_slab(void);
* [patch 09/17] Buffer heads: Support slab defrag
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0054-Buffer-heads-Support-slab-defrag.patch --]
[-- Type: text/plain, Size: 3491 bytes --]
Defragmentation support for buffer heads. We convert the references to
buffers to struct page references and try to remove the buffers from
those pages. If the pages are dirty then trigger writeout so that the
buffer heads can be removed later.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/buffer.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 101 insertions(+)
Index: linux-2.6/fs/buffer.c
===================================================================
--- linux-2.6.orig/fs/buffer.c 2008-02-14 15:19:11.157498510 -0800
+++ linux-2.6/fs/buffer.c 2008-02-15 15:49:05.537213877 -0800
@@ -3257,6 +3257,106 @@ int bh_submit_read(struct buffer_head *b
}
EXPORT_SYMBOL(bh_submit_read);
+/*
+ * Writeback a page to clean the dirty state
+ */
+static void trigger_write(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+ int rc;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = 1,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .nonblocking = 1,
+ .for_reclaim = 0
+ };
+
+ if (!mapping->a_ops->writepage)
+ /* No write method for the address space */
+ return;
+
+ if (!clear_page_dirty_for_io(page))
+ /* Someone else already triggered a write */
+ return;
+
+ rc = mapping->a_ops->writepage(page, &wbc);
+ if (rc < 0)
+ /* I/O Error writing */
+ return;
+
+ if (rc == AOP_WRITEPAGE_ACTIVATE)
+ unlock_page(page);
+}
+
+/*
+ * Get references on buffers.
+ *
+ * We obtain references on the page that uses the buffer. v[i] will point to
+ * the corresponding page after get_buffers() is through.
+ *
+ * We are safe from the underlying page being removed simply by doing
+ * a get_page_unless_zero. The buffer head removal may race at will.
+ * try_to_free_buffers will later take appropriate locks to remove the
+ * buffers if they are still there.
+ */
+static void *get_buffers(struct kmem_cache *s, int nr, void **v)
+{
+ struct page *page;
+ struct buffer_head *bh;
+ int i,j;
+ int n = 0;
+
+ for (i = 0; i < nr; i++) {
+ bh = v[i];
+ v[i] = NULL;
+
+ page = bh->b_page;
+
+ if (page && PagePrivate(page)) {
+ for (j = 0; j < n; j++)
+ if (page == v[j])
+ goto cont;
+ }
+
+ if (get_page_unless_zero(page))
+ v[n++] = page;
+cont: ;
+ }
+ return NULL;
+}
+
+/*
+ * Despite its name: kick_buffers operates on a list of pointers to
+ * page structs that was setup by get_buffer
+ */
+static void kick_buffers(struct kmem_cache *s, int nr, void **v,
+ void *private)
+{
+ struct page *page;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ page = v[i];
+
+ if (!page || PageWriteback(page))
+ continue;
+
+
+ if (!TestSetPageLocked(page)) {
+ if (PageDirty(page))
+ trigger_write(page);
+ else {
+ if (PagePrivate(page))
+ try_to_free_buffers(page);
+ unlock_page(page);
+ }
+ }
+ put_page(page);
+ }
+}
+
static void
init_buffer_head(struct kmem_cache *cachep, void *data)
{
@@ -3275,6 +3375,7 @@ void __init buffer_init(void)
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
SLAB_MEM_SPREAD),
init_buffer_head);
+ kmem_cache_setup_defrag(bh_cachep, get_buffers, kick_buffers);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
* [patch 10/17] inodes: Support generic defragmentation
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0055-inodes-Support-generic-defragmentation.patch --]
[-- Type: text/plain, Size: 4295 bytes --]
This implements the ability to remove inodes in a particular slab
from the inode cache. In order to remove an inode we may have to write out
the pages of the inode, write out the inode itself, and remove the dentries
referring to the inode.
Provide generic functionality that can be used by filesystems that have
their own inode caches to also tie into the defragmentation functions
that are made available here.
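For a filesystem that embeds struct inode into its own inode structure,
wiring this up looks roughly as follows (a sketch with a made-up filesystem;
the extX patches later in this series do exactly this):
struct myfs_inode_info {
	unsigned long	state;		/* fs private data (illustrative) */
	struct inode	vfs_inode;	/* embedded VFS inode */
};

static void *myfs_get_inodes(struct kmem_cache *s, int nr, void **v)
{
	/* Translate slab object pointers to the embedded struct inode */
	return fs_get_inodes(s, nr, v,
			offsetof(struct myfs_inode_info, vfs_inode));
}

	/* After creating myfs_inode_cachep: */
	kmem_cache_setup_defrag(myfs_inode_cachep, myfs_get_inodes, kick_inodes);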
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/inode.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 6 +++
2 files changed, 102 insertions(+)
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c 2008-02-14 15:19:12.457507589 -0800
+++ linux-2.6/fs/inode.c 2008-02-15 15:49:22.309268623 -0800
@@ -1370,6 +1370,101 @@ static int __init set_ihash_entries(char
}
__setup("ihash_entries=", set_ihash_entries);
+void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ int i;
+
+ spin_lock(&inode_lock);
+ for (i = 0; i < nr; i++) {
+ struct inode *inode = v[i];
+
+ if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+ v[i] = NULL;
+ else
+ __iget(inode);
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+EXPORT_SYMBOL(get_inodes);
+
+/*
+ * Function for filesystems that embed struct inode into their own
+ * structures. The offset is the offset of the struct inode in the fs inode.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v,
+ unsigned long offset)
+{
+ int i;
+
+ for (i = 0; i < nr; i++)
+ v[i] += offset;
+
+ return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+ struct inode *inode;
+ int i;
+ int abort = 0;
+ LIST_HEAD(freeable);
+ struct super_block *sb;
+
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ if (remove_inode_buffers(inode))
+ invalidate_mapping_pages(&inode->i_data,
+ 0, -1);
+ }
+
+ /* Invalidate children and dentry */
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *d = d_find_alias(inode);
+
+ if (d) {
+ d_invalidate(d);
+ dput(d);
+ }
+ }
+
+ if (inode->i_state & I_DIRTY)
+ write_inode_now(inode, 1);
+
+ d_prune_aliases(inode);
+ }
+
+ mutex_lock(&iprune_mutex);
+ for (i = 0; i < nr; i++) {
+ inode = v[i];
+ if (!inode)
+ continue;
+
+ sb = inode->i_sb;
+ iput(inode);
+ if (abort || !(sb->s_flags & MS_ACTIVE))
+ continue;
+
+ spin_lock(&inode_lock);
+ abort = !can_unuse(inode);
+
+ if (!abort) {
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ inodes_stat.nr_unused--;
+ }
+ spin_unlock(&inode_lock);
+ }
+ dispose_list(&freeable);
+ mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
/*
* Initialize the waitqueues and inode hash table.
*/
@@ -1409,6 +1504,7 @@ void __init inode_init(void)
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
+ kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h 2008-02-15 10:48:35.463846699 -0800
+++ linux-2.6/include/linux/fs.h 2008-02-15 15:49:22.309268623 -0800
@@ -1787,6 +1787,12 @@ static inline void insert_inode_hash(str
__insert_inode_hash(inode, inode->i_ino);
}
+/* Helper functions for inode defragmentation support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *get_inodes(struct kmem_cache *, int nr, void **);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+ unsigned long offset);
+
extern struct file * get_empty_filp(void);
extern void file_move(struct file *f, struct list_head *list);
extern void file_kill(struct file *f);
* [patch 11/17] FS: ExtX filesystem defrag
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0056-FS-ExtX-filesystem-defrag.patch --]
[-- Type: text/plain, Size: 3086 bytes --]
Support defragmentation for extX filesystem inodes
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/ext2/super.c | 9 +++++++++
fs/ext3/super.c | 8 ++++++++
fs/ext4/super.c | 8 ++++++++
3 files changed, 25 insertions(+)
Index: linux-2.6.24-rc6-mm1/fs/ext2/super.c
===================================================================
--- linux-2.6.24-rc6-mm1.orig/fs/ext2/super.c 2007-12-26 17:47:01.987405542 -0800
+++ linux-2.6.24-rc6-mm1/fs/ext2/super.c 2007-12-27 12:04:37.798315149 -0800
@@ -171,6 +171,12 @@ static void init_once(struct kmem_cache
inode_init_once(&ei->vfs_inode);
}
+static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext2_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
@@ -180,6 +186,9 @@ static int init_inodecache(void)
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
+
+ kmem_cache_setup_defrag(ext2_inode_cachep,
+ ext2_get_inodes, kick_inodes);
return 0;
}
Index: linux-2.6.24-rc6-mm1/fs/ext3/super.c
===================================================================
--- linux-2.6.24-rc6-mm1.orig/fs/ext3/super.c 2007-12-26 17:47:01.995405564 -0800
+++ linux-2.6.24-rc6-mm1/fs/ext3/super.c 2007-12-27 12:04:37.802315408 -0800
@@ -484,6 +484,12 @@ static void init_once(struct kmem_cache
inode_init_once(&ei->vfs_inode);
}
+static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext3_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
@@ -493,6 +499,8 @@ static int init_inodecache(void)
init_once);
if (ext3_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(ext3_inode_cachep,
+ ext3_get_inodes, kick_inodes);
return 0;
}
Index: linux-2.6.24-rc6-mm1/fs/ext4/super.c
===================================================================
--- linux-2.6.24-rc6-mm1.orig/fs/ext4/super.c 2007-12-26 17:47:02.011405842 -0800
+++ linux-2.6.24-rc6-mm1/fs/ext4/super.c 2007-12-27 12:04:37.814315317 -0800
@@ -600,6 +600,12 @@ static void init_once(struct kmem_cache
inode_init_once(&ei->vfs_inode);
}
+static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct ext4_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
@@ -609,6 +615,8 @@ static int init_inodecache(void)
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(ext4_inode_cachep,
+ ext4_get_inodes, kick_inodes);
return 0;
}
* [patch 12/17] FS: XFS slab defragmentation
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (10 preceding siblings ...)
2008-02-16 0:45 ` [patch 11/17] FS: ExtX filesystem defrag Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 13/17] FS: Proc filesystem support for slab defrag Christoph Lameter
` (5 subsequent siblings)
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0057-FS-XFS-slab-defragmentation.patch --]
[-- Type: text/plain, Size: 1046 bytes --]
Support inode defragmentation for xfs
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/xfs/linux-2.6/xfs_super.c | 1 +
1 file changed, 1 insertion(+)
Index: linux-2.6/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_super.c 2008-02-14 15:19:13.781516819 -0800
+++ linux-2.6/fs/xfs/linux-2.6/xfs_super.c 2008-02-15 15:49:28.377288588 -0800
@@ -862,6 +862,7 @@ xfs_init_zones(void)
xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
if (!xfs_ioend_zone)
goto out_destroy_vnode_zone;
+ kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
xfs_ioend_zone);
* [patch 13/17] FS: Proc filesystem support for slab defrag
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (11 preceding siblings ...)
2008-02-16 0:45 ` [patch 12/17] FS: XFS slab defragmentation Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 14/17] FS: Slab defrag: Reiserfs support Christoph Lameter
` (4 subsequent siblings)
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0058-FS-Proc-filesystem-support-for-slab-defrag.patch --]
[-- Type: text/plain, Size: 1319 bytes --]
Support procfs inode defragmentation
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/proc/inode.c | 8 ++++++++
1 file changed, 8 insertions(+)
Index: linux-2.6.24-rc6-mm1/fs/proc/inode.c
===================================================================
--- linux-2.6.24-rc6-mm1.orig/fs/proc/inode.c 2007-12-26 17:47:03.223412165 -0800
+++ linux-2.6.24-rc6-mm1/fs/proc/inode.c 2007-12-27 12:04:43.742341773 -0800
@@ -104,6 +104,12 @@ static void init_once(struct kmem_cache
inode_init_once(&ei->vfs_inode);
}
+static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct proc_inode, vfs_inode));
+}
+
int __init proc_init_inodecache(void)
{
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
@@ -111,6 +117,8 @@ int __init proc_init_inodecache(void)
0, (SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_PANIC),
init_once);
+ kmem_cache_setup_defrag(proc_inode_cachep,
+ proc_get_inodes, kick_inodes);
return 0;
}
* [patch 14/17] FS: Slab defrag: Reiserfs support
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (12 preceding siblings ...)
2008-02-16 0:45 ` [patch 13/17] FS: Proc filesystem support for slab defrag Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 15/17] FS: Socket inode defragmentation Christoph Lameter
` (3 subsequent siblings)
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0059-FS-Slab-defrag-Reiserfs-support.patch --]
[-- Type: text/plain, Size: 1328 bytes --]
Slab defragmentation: Support reiserfs inode defragmentation
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/reiserfs/super.c | 8 ++++++++
1 file changed, 8 insertions(+)
Index: linux-2.6.24-rc6-mm1/fs/reiserfs/super.c
===================================================================
--- linux-2.6.24-rc6-mm1.orig/fs/reiserfs/super.c 2007-12-26 17:47:05.407423958 -0800
+++ linux-2.6.24-rc6-mm1/fs/reiserfs/super.c 2007-12-27 12:04:46.718354502 -0800
@@ -532,6 +532,12 @@ static void init_once(struct kmem_cache
#endif
}
+static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct reiserfs_inode_info, vfs_inode));
+}
+
static int init_inodecache(void)
{
reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
@@ -542,6 +548,8 @@ static int init_inodecache(void)
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(reiserfs_inode_cachep,
+ reiserfs_get_inodes, kick_inodes);
return 0;
}
* [patch 15/17] FS: Socket inode defragmentation
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (13 preceding siblings ...)
2008-02-16 0:45 ` [patch 14/17] FS: Slab defrag: Reiserfs support Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 16/17] dentries: Add constructor Christoph Lameter
` (2 subsequent siblings)
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0060-FS-Socket-inode-defragmentation.patch --]
[-- Type: text/plain, Size: 1228 bytes --]
Support inode defragmentation for sockets
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
net/socket.c | 8 ++++++++
1 file changed, 8 insertions(+)
Index: mm/net/socket.c
===================================================================
--- mm.orig/net/socket.c 2007-11-28 12:28:01.311962427 -0800
+++ mm/net/socket.c 2007-11-28 12:31:46.383962876 -0800
@@ -269,6 +269,12 @@ static void init_once(struct kmem_cache
inode_init_once(&ei->vfs_inode);
}
+static void *sock_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+ return fs_get_inodes(s, nr, v,
+ offsetof(struct socket_alloc, vfs_inode));
+}
+
static int init_inodecache(void)
{
sock_inode_cachep = kmem_cache_create("sock_inode_cache",
@@ -280,6 +286,8 @@ static int init_inodecache(void)
init_once);
if (sock_inode_cachep == NULL)
return -ENOMEM;
+ kmem_cache_setup_defrag(sock_inode_cachep,
+ sock_get_inodes, kick_inodes);
return 0;
}
* [patch 16/17] dentries: Add constructor
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (14 preceding siblings ...)
2008-02-16 0:45 ` [patch 15/17] FS: Socket inode defragmentation Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-16 0:45 ` [patch 17/17] dentries: dentry defragmentation Christoph Lameter
2008-02-23 8:07 ` [patch 00/17] Slab Fragmentation Reduction V10 Andrew Morton
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0061-dentries-Add-constructor.patch --]
[-- Type: text/plain, Size: 2412 bytes --]
In order to support defragmentation of the dentry cache we need a determined
object state at all times. Without a constructor the object would have a
random state after allocation.
So provide a constructor.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/dcache.c | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c 2008-02-15 10:48:35.011844303 -0800
+++ linux-2.6/fs/dcache.c 2008-02-15 15:49:39.169323892 -0800
@@ -870,6 +870,16 @@ static struct shrinker dcache_shrinker =
.seeks = DEFAULT_SEEKS,
};
+void dcache_ctor(struct kmem_cache *s, void *p)
+{
+ struct dentry *dentry = p;
+
+ spin_lock_init(&dentry->d_lock);
+ dentry->d_inode = NULL;
+ INIT_LIST_HEAD(&dentry->d_lru);
+ INIT_LIST_HEAD(&dentry->d_alias);
+}
+
/**
* d_alloc - allocate a dcache entry
* @parent: parent of entry to allocate
@@ -907,8 +917,6 @@ struct dentry *d_alloc(struct dentry * p
atomic_set(&dentry->d_count, 1);
dentry->d_flags = DCACHE_UNHASHED;
- spin_lock_init(&dentry->d_lock);
- dentry->d_inode = NULL;
dentry->d_parent = NULL;
dentry->d_sb = NULL;
dentry->d_op = NULL;
@@ -918,9 +926,7 @@ struct dentry *d_alloc(struct dentry * p
dentry->d_cookie = NULL;
#endif
INIT_HLIST_NODE(&dentry->d_hash);
- INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
- INIT_LIST_HEAD(&dentry->d_alias);
if (parent) {
dentry->d_parent = dget(parent);
@@ -2098,14 +2104,10 @@ static void __init dcache_init(void)
{
int loop;
- /*
- * A constructor could be added for stable state like the lists,
- * but it is probably not worth it because of the cache nature
- * of the dcache.
- */
- dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-
+ dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry),
+ 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD,
+ dcache_ctor);
+
register_shrinker(&dcache_shrinker);
/* Hash may have been set up in dcache_init_early */
* [patch 17/17] dentries: dentry defragmentation
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (15 preceding siblings ...)
2008-02-16 0:45 ` [patch 16/17] dentries: Add constructor Christoph Lameter
@ 2008-02-16 0:45 ` Christoph Lameter
2008-02-23 8:07 ` [patch 00/17] Slab Fragmentation Reduction V10 Andrew Morton
17 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-16 0:45 UTC (permalink / raw)
To: akpm; +Cc: linux-mm, Mel Gorman, andi
[-- Attachment #1: 0062-dentries-dentry-defragmentation.patch --]
[-- Type: text/plain, Size: 4291 bytes --]
The dentry pruning for unused entries works in a straightforward way. It
could be made more aggressive if one actually moved dentries instead
of just reclaiming them.
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
---
fs/dcache.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 100 insertions(+), 1 deletion(-)
Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c 2008-02-15 15:49:39.169323892 -0800
+++ linux-2.6/fs/dcache.c 2008-02-15 15:49:46.301347228 -0800
@@ -31,6 +31,7 @@
#include <linux/seqlock.h>
#include <linux/swap.h>
#include <linux/bootmem.h>
+#include <linux/backing-dev.h>
#include "internal.h"
@@ -143,7 +144,10 @@ static struct dentry *d_kill(struct dent
list_del(&dentry->d_u.d_child);
dentry_stat.nr_dentry--; /* For d_free, below */
- /*drops the locks, at that point nobody can reach this dentry */
+ /*
+ * drops the locks, at that point nobody (aside from defrag)
+ * can reach this dentry
+ */
dentry_iput(dentry);
parent = dentry->d_parent;
d_free(dentry);
@@ -2100,6 +2104,100 @@ static void __init dcache_init_early(voi
INIT_HLIST_HEAD(&dentry_hashtable[loop]);
}
+/*
+ * The slab allocator is holding off frees. We can safely examine
+ * the object without the danger of it vanishing from under us.
+ */
+static void *get_dentries(struct kmem_cache *s, int nr, void **v)
+{
+ struct dentry *dentry;
+ int i;
+
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+
+ /*
+ * Three sorts of dentries cannot be reclaimed:
+ *
+ * 1. dentries that are in the process of being allocated
+ * or being freed. In that case the dentry is neither
+ * on the LRU nor hashed.
+ *
+ * 2. Fake hashed entries as used for anonymous dentries
+ * and pipe I/O. The fake hashed entries have d_flags
+ * set to indicate a hashed entry. However, the
+ * d_hash field indicates that the entry is not hashed.
+ *
+ * 3. dentries that have a backing store that is not
+ * writable. This is true for tmpfs and other in-memory
+ * filesystems. Removing dentries from them would lose
+ * dentries for good.
+ */
+ if ((d_unhashed(dentry) && list_empty(&dentry->d_lru)) ||
+ (!d_unhashed(dentry) && hlist_unhashed(&dentry->d_hash)) ||
+ (dentry->d_inode &&
+ !mapping_cap_writeback_dirty(dentry->d_inode->i_mapping)))
+ /* Ignore this dentry */
+ v[i] = NULL;
+ else
+ /* dget_locked will remove the dentry from the LRU */
+ dget_locked(dentry);
+ }
+ spin_unlock(&dcache_lock);
+ return NULL;
+}
+
+/*
+ * Slab has dropped all the locks. Get rid of the refcount obtained
+ * earlier and also free the object.
+ */
+static void kick_dentries(struct kmem_cache *s,
+ int nr, void **v, void *private)
+{
+ struct dentry *dentry;
+ int i;
+
+ /*
+ * First invalidate the dentries without holding the dcache lock
+ */
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+
+ if (dentry)
+ d_invalidate(dentry);
+ }
+
+ /*
+ * If we are the last one holding a reference then the dentries can
+ * be freed. We need the dcache_lock.
+ */
+ spin_lock(&dcache_lock);
+ for (i = 0; i < nr; i++) {
+ dentry = v[i];
+ if (!dentry)
+ continue;
+
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count) > 1) {
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&dcache_lock);
+ dput(dentry);
+ spin_lock(&dcache_lock);
+ continue;
+ }
+
+ prune_one_dentry(dentry);
+ }
+ spin_unlock(&dcache_lock);
+
+ /*
+ * dentries are freed using RCU so we need to wait until RCU
+ * operations are complete
+ */
+ synchronize_rcu();
+}
+
static void __init dcache_init(void)
{
int loop;
@@ -2109,6 +2207,7 @@ static void __init dcache_init(void)
dcache_ctor);
register_shrinker(&dcache_shrinker);
+ kmem_cache_setup_defrag(dentry_cache, get_dentries, kick_dentries);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
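For reference, the callback contract that kmem_cache_setup_defrag() relies on
throughout this series, reconstructed from the callers above; this is a sketch
of the shape only, the authoritative declaration lives in the SLUB patches
earlier in the series:
/*
 * get() runs while the slab allocator holds off frees: take a reference
 * on each object in v[0..nr-1], or set the slot to NULL if the object
 * cannot be reclaimed. The return value is handed to kick() as private
 * data.
 *
 * kick() runs after the slab locks have been dropped: drop the
 * references taken in get() and free whatever objects can be freed.
 */
void kmem_cache_setup_defrag(struct kmem_cache *s,
	void *(*get)(struct kmem_cache *s, int nr, void **v),
	void (*kick)(struct kmem_cache *s, int nr, void **v, void *private));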
* Re: [patch 00/17] Slab Fragmentation Reduction V10
2008-02-16 0:45 [patch 00/17] Slab Fragmentation Reduction V10 Christoph Lameter
` (16 preceding siblings ...)
2008-02-16 0:45 ` [patch 17/17] dentries: dentry defragmentation Christoph Lameter
@ 2008-02-23 8:07 ` Andrew Morton
2008-02-23 14:20 ` Andi Kleen
17 siblings, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2008-02-23 8:07 UTC (permalink / raw)
To: Christoph Lameter; +Cc: linux-mm, Mel Gorman, andi
On Fri, 15 Feb 2008 16:45:26 -0800 Christoph Lameter <clameter@sgi.com> wrote:
> Slab fragmentation is mainly an issue if Linux is used as a fileserver
> and large amounts of dentries, inodes and buffer heads accumulate. In some
> load situations the slabs become very sparsely populated so that a lot of
> memory is wasted by slabs that only contain one or a few objects. In
> extreme cases the performance of a machine will become sluggish since
> we are continually running reclaim. Slab defragmentation adds the
> capability to recover the memory that is wasted.
I'm somewhat reluctant to consider this because it is slub-only, and slub
doesn't appear to be doing so well on the performance front wrt slab.
We do need to make one of those implementations go away, and if it's slub
that goes, we have a lump of defrag code hanging around in core VFS which
isn't used by anything.
So I think the first thing we need to do is to establish that slub is
viable as our only slab allocator (ignoring slob here). And if that means
tweaking the heck out of slub until it's competitive, we would be
duty-bound to ask "how fast will slab be if we do that much tweaking to
it as well".
Another basis for comparison is "which one uses the lowest-order
allocations to achieve its performance".
Of course, current performance isn't the only thing - it could be that slub
enables features such as defrag which wouldn't be possible with slab. We
can discuss that.
But one of these implementations needs to go away, and that decision
shouldn't be driven by the fact that we happen to have already implemented
some additional features on top of one of them.
hm?
* Re: [patch 00/17] Slab Fragmentation Reduction V10
2008-02-23 8:07 ` [patch 00/17] Slab Fragmentation Reduction V10 Andrew Morton
@ 2008-02-23 14:20 ` Andi Kleen
2008-02-27 19:22 ` Christoph Lameter
0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2008-02-23 14:20 UTC (permalink / raw)
To: Andrew Morton; +Cc: Christoph Lameter, linux-mm, Mel Gorman, andi
I personally would really like to see d/icache defragmentation in
one form or another. The fragmentation there is a serious long-standing
Linux issue that would be really good to finally solve.
> So I think the first thing we need to do is to establish that slub is
> viable as our only slab allocator (ignoring slob here). And if that means
> tweaking the heck out of slub until it's competitive, we would be
> duty-bound to ask "how fast will slab be if we do that much tweaking to
> it as well".
There's another aspect: slab is quite unreadable and very hairy code.
slub is much cleaner. On the maintainability front slub wins easily.
> Another basis for comparison is "which one uses the lowest-order
> allocations to achieve its performance".
That's an important point, I agree. It directly translates into
reliability under load, and that is very important.
> But one of these implementations needs to go away, and that decision
I don't think slab is a good candidate to keep because it's so hard
to hack on. Especially since the slab NUMA changes, the code flow and
data structures are really hairy, and I doubt there are many people
left who understand them. E.g. I tracked down an RT bug in slab some
time ago and it was a really unpleasant experience.
In the end, even if it is slightly slower today, the code
that is easiest to improve will be faster/better in the long term.
I'm a little sceptical about the high-order allocations in slub too,
though. Christoph seems to think they're not a big deal, but that goes
against a lot of conventional Linux wisdom at least.
That is one area that probably needs to be explored more.
-Andi
* Re: [patch 00/17] Slab Fragmentation Reduction V10
2008-02-23 14:20 ` Andi Kleen
@ 2008-02-27 19:22 ` Christoph Lameter
0 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2008-02-27 19:22 UTC (permalink / raw)
To: Andi Kleen; +Cc: Andrew Morton, linux-mm, Mel Gorman
On Sat, 23 Feb 2008, Andi Kleen wrote:
> I'm a little sceptical about the high order allocations in slub too
> though. Christoph seems to think they're not a big deal, but that is
> against a lot of conventional Linux wisdom at least.
>
> That is one area that probably needs to be explored more.
Well, there is a patchset that I posted recently that allows any slub alloc
to fall back to an order-0 alloc. That is something slab cannot do.