[patch 09/19] slub: Trigger defragmentation from memory reclaim

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch 09/19] slub: Trigger defragmentation from memory reclaim
  2008-05-10  2:21 [patch 00/19] Slab Fragmentation Reduction V13 Christoph Lameter
@ 2008-05-10  2:21 ` Christoph Lameter
  0 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-05-10  2:21 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0009-SLUB-Trigger-defragmentation-from-memory-reclaim.patch --]
[-- Type: text/plain, Size: 10466 bytes --]

This patch triggers slab defragmentation from memory reclaim. The logical
point for this is after slab shrinking was performed in vmscan.c. At that point
the fragmentation ratio of a slab was increased because objects were freed via
the LRU lists maintained for various slab caches.
So we call kmem_cache_defrag() from there.

shrink_slab() is called in some contexts to do global shrinking
of slabs and in others to do shrinking for a particular zone. Pass the zone to
shrink_slab(), so that it can call kmem_cache_defrag() and restrict
the defragmentation to the node that is under memory pressure.

The callback frequency into slab reclaim can be controlled by a new field
/proc/sys/vm/slab_defrag_limit.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 Documentation/sysctl/vm.txt |   12 ++++++++
 fs/drop_caches.c            |    2 -
 include/linux/mm.h          |    3 --
 include/linux/mmzone.h      |    1 
 include/linux/swap.h        |    3 ++
 kernel/sysctl.c             |   20 +++++++++++++
 mm/vmscan.c                 |   65 +++++++++++++++++++++++++++++++++++++++-----
 mm/vmstat.c                 |    2 +
 8 files changed, 98 insertions(+), 10 deletions(-)

Index: linux-2.6/fs/drop_caches.c
===================================================================
--- linux-2.6.orig/fs/drop_caches.c	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/fs/drop_caches.c	2008-07-31 12:18:58.000000000 -0500
@@ -58,7 +58,7 @@
 	int nr_objects;
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
 	} while (nr_objects > 10);
 }
 
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/include/linux/mm.h	2008-07-31 12:18:58.000000000 -0500
@@ -1283,8 +1283,7 @@
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages);
-
+				unsigned long lru_pages, struct zone *z);
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
 #else
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/mm/vmscan.c	2008-07-31 12:18:58.000000000 -0500
@@ -150,6 +150,14 @@
 EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
+
+/*
+ * Trigger a call into slab defrag if the sum of the returns from
+ * shrinkers cross this value.
+ */
+int slab_defrag_limit = 1000;
+int slab_defrag_counter;
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -167,10 +175,18 @@
  * are eligible for the caller's allocation attempt.  It is used for balancing
  * slab reclaim versus page reclaim.
  *
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure but the VFS infrastructure does not allow that at the present
+ * time.
+ *
  * Returns the number of slab objects which we shrunk.
  */
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+			unsigned long lru_pages, struct zone *zone)
 {
 	struct shrinker *shrinker;
 	unsigned long ret = 0;
@@ -227,6 +243,39 @@
 		shrinker->nr += total_scan;
 	}
 	up_read(&shrinker_rwsem);
+
+
+	/* Avoid dirtying cachelines */
+	if (!ret)
+		return 0;
+
+	/*
+	 * "ret" doesnt really contain the freed object count. The shrinkers
+	 * fake it. Gotta go with what we are getting though.
+	 *
+	 * Handling of the defrag_counter is also racy. If we get the
+	 * wrong counts then we may unnecessarily do a defrag pass or defer
+	 * one. "ret" is already faked. So this is just increasing
+	 * the already existing fuzziness to get some notion as to when
+	 * to initiate slab defrag which will hopefully be okay.
+	 */
+	if (zone) {
+		/* balance_pgdat running on a zone so we only scan one node */
+		zone->slab_defrag_counter += ret;
+		if (zone->slab_defrag_counter > slab_defrag_limit &&
+						(gfp_mask & __GFP_FS)) {
+			zone->slab_defrag_counter = 0;
+			kmem_cache_defrag(zone_to_nid(zone));
+		}
+	} else {
+		/* Direct (and thus global) reclaim. Scan all nodes */
+		slab_defrag_counter += ret;
+		if (slab_defrag_counter > slab_defrag_limit &&
+						(gfp_mask & __GFP_FS)) {
+			slab_defrag_counter = 0;
+			kmem_cache_defrag(-1);
+		}
+	}
 	return ret;
 }
 
@@ -1379,7 +1428,7 @@
 		 * over limit cgroups
 		 */
 		if (scan_global_lru(sc)) {
-			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL);
 			if (reclaim_state) {
 				nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
@@ -1606,7 +1655,7 @@
 				nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-						lru_pages);
+						lru_pages, zone);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone_is_all_unreclaimable(zone))
@@ -1845,7 +1894,7 @@
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
-		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
@@ -1883,7 +1932,7 @@
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1900,7 +1949,7 @@
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
@@ -2062,7 +2111,8 @@
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+						zone) &&
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
 				slab_reclaimable - nr_pages)
 			;
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/include/linux/mmzone.h	2008-07-31 12:18:58.000000000 -0500
@@ -256,6 +256,7 @@
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		pages_scanned;	   /* since last reclaim */
+	unsigned long		slab_defrag_counter; /* since last defrag */
 	unsigned long		flags;		   /* zone flags, see below */
 
 	/* Zone statistics */
Index: linux-2.6/include/linux/swap.h
===================================================================
--- linux-2.6.orig/include/linux/swap.h	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/include/linux/swap.h	2008-07-31 12:18:58.000000000 -0500
@@ -188,6 +188,9 @@
 extern int __isolate_lru_page(struct page *page, int mode);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
+extern int slab_defrag_limit;
+extern int slab_defrag_counter;
+
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
 
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/kernel/sysctl.c	2008-07-31 12:18:58.000000000 -0500
@@ -1071,6 +1071,26 @@
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_limit",
+		.data		= &slab_defrag_limit,
+		.maxlen		= sizeof(slab_defrag_limit),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one_hundred,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_count",
+		.data		= &slab_defrag_counter,
+		.maxlen		= sizeof(slab_defrag_counter),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.ctl_name	= VM_LEGACY_VA_LAYOUT,
Index: linux-2.6/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.orig/Documentation/sysctl/vm.txt	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/Documentation/sysctl/vm.txt	2008-07-31 12:18:58.000000000 -0500
@@ -38,6 +38,7 @@
 - numa_zonelist_order
 - nr_hugepages
 - nr_overcommit_hugepages
+- slab_defrag_limit
 
 ==============================================================
 
@@ -347,3 +348,14 @@
 nr_hugepages + nr_overcommit_hugepages.
 
 See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+slab_defrag_limit
+
+Determines the frequency of calls from reclaim into slab defragmentation.
+Slab defrag reclaims objects from sparsely populated slab pages.
+The default is 1000. Increase if slab defragmentation occurs
+too frequently. Decrease if more slab defragmentation passes
+are needed. The slabinfo tool can report on the frequency of the callbacks.
+
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2008-07-31 12:18:56.000000000 -0500
+++ linux-2.6/mm/vmstat.c	2008-07-31 12:18:58.000000000 -0500
@@ -714,10 +714,12 @@
 #endif
 	}
 	seq_printf(m,
+		   "\n  slab_defrag_count: %lu"
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
 		   "\n  start_pfn:         %lu",
-			   zone_is_all_unreclaimable(zone),
+			zone->slab_defrag_counter,
+			zone_is_all_unreclaimable(zone),
 		   zone->prev_priority,
 		   zone->zone_start_pfn);
 	seq_putc(m, '\n');

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 00/19] Slab Fragmentation Reduction V14
@ 2008-08-11 15:06 Christoph Lameter
  2008-08-11 15:06 ` [patch 01/19] slub: Add defrag_ratio field and sysfs support Christoph Lameter
                   ` (19 more replies)
  0 siblings, 20 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, linux-kernel, linux-fsdevel, Mel Gorman, andi, Rik van Riel,
	mpm, Dave Chinner

V13->V14
- Rediff against linux-next on request of Andrew
- TestSetPageLocked -> trylock_page conversion.

Slab fragmentation is mainly an issue if Linux is used as a fileserver
and large amounts of dentries, inodes and buffer heads accumulate. In some
load situations the slabs become very sparsely populated so that a lot of
memory is wasted by slabs that only contain one or a few objects. In
extreme cases the performance of a machine will become sluggish since
we are continually running reclaim without much success.
Slab defragmentation adds the capability to recover the memory that
is wasted.

Memory reclaim for the following slab caches is possible:

1. dentry cache
2. inode cache (with a generic interface to allow easy setup of more
   filesystems than the currently supported ext2/3/4 reiserfs, XFS
   and proc)
3. buffer_heads

One typical mechanism that triggers slab defragmentation on my systems
is the daily run of

	updatedb

Updatedb scans all files on the system which causes a high inode and dentry
use. After updatedb is complete we need to go back to the regular use
patterns (typical on my machine: kernel compiles). Those need the memory now
for different purposes. The inodes and dentries used for updatedb will
gradually be aged by the dentry/inode reclaim algorithm which will free
up the dentries and inode entries randomly through the slabs that were
allocated. As a result the slabs will become sparsely populated. If they
become empty then they can be freed but a lot of them will remain sparsely
populated. That is where slab defrag comes in: It removes the objects from
the slabs with just a few entries reclaiming more memory for other uses.
In the simplest case (as provided here) this is done by simply reclaiming
the objects.

However, if the logic in the kick() function is made more
sophisticated then we will be able to move the objects out of the slabs.
Allocation of objects is possible if a slab is fragmented without the use of
the page allocator because a large number of free slots are available. Moving
an object will reduce fragmentation in the slab the object is moved to.

V12->v13:
- Rebase onto Linux 2.6.27-rc1 (deal with page flags conversion, ctor parameters etc)
- Fix uninitialized variable issue

V11->V12:
- Pekka and I fixed various minor issues pointed out by Andrew.
- Split ext2/3/4 defrag support patches.
- Add more documentation
- Revise the way that slab defrag is triggered from reclaim. No longer
  use a timeout but track the amount of slab reclaim done by the shrinkers.
  Add a field in /proc/sys/vm/slab_defrag_limit to control the threshold.
- Display current slab_defrag_counters in /proc/zoneinfo (for a zone) and
  /proc/sys/vm/slab_defrag_count (for global reclaim).
- Add new config value slab_defrag_limit to /proc/sys/vm/slab_defrag_limit
- Add a patch that obsoletes SLAB and explains why SLOB does not support
  defrag (Either of those could be theoretically equipped to support
  slab defrag in some way but it seems that Andrew/Linus want to reduce
  the number of slab allocators).

V10->V11
- Simplify determination when to reclaim: Just scan over all partials
  and check if they are sparsely populated.
- Add support for performance counters
- Rediff on top of current slab-mm.
- Reduce frequency of scanning. A look at the stats showed that we
  were calling into reclaim very frequently when the system was under
  memory pressure which slowed things down. Various measures to
  avoid scanning the partial list too frequently were added and the
  earlier (expensive) method of determining the defrag ratio of the slab
  cache as a whole was dropped. I think this addresses the issues that
  Mel saw with V10.

V9->V10
- Rediff against upstream

V8->V9
- Rediff against 2.6.24-rc6-mm1

V7->V8
- Rediff against 2.6.24-rc3-mm2

V6->V7
- Rediff against 2.6.24-rc2-mm1
- Remove lumpy reclaim support. No point anymore given that the antifrag
  handling in 2.6.24-rc2 puts reclaimable slabs into different sections.
  Targeted reclaim never triggers. This has to wait until we make
  slabs movable or we need to perform a special version of lumpy reclaim
  in SLUB while we scan the partial lists for slabs to kick out.
  Removal simplifies handling significantly since we
  get to slabs in a more controlled way via the partial lists.
  The patchset now provides pure reduction of fragmentation levels.
- SLAB/SLOB: Provide inlines that do nothing
- Fix various smaller issues that were brought up during review of V6.

V5->V6
- Rediff against 2.6.24-rc2 + mm slub patches.
- Add reviewed by lines.
- Take out the experimental code to make slab pages movable. That
  has to wait until this has been considered by Mel.

V4->V5:
- Support lumpy reclaim for slabs
- Support reclaim via slab_shrink()
- Add constructors to ensure a consistent object state at all times.

V3->V4:
- Optimize scan for slabs that need defragmentation
- Add /sys/slab/*/defrag_ratio to allow setting defrag limits
  per slab.
- Add support for buffer heads.
- Describe how the cleanup after the daily updatedb can be
  improved by slab defragmentation.

V2->V3
- Support directory reclaim
- Add infrastructure to trigger defragmentation after slab shrinking if we
  have slabs with a high degree of fragmentation.

V1->V2
- Clean up control flow using a state variable. Simplify API. Back to 2
  functions that now take arrays of objects.
- Inode defrag support for a set of filesystems
- Fix up dentry defrag support to work on negative dentries by adding
  a new dentry flag that indicates that a dentry is not in the process
  of being freed or allocated.

-- 
-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 01/19] slub: Add defrag_ratio field and sysfs support.
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-13  0:40   ` Greg KH
  2008-08-11 15:06 ` [patch 02/19] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
                   ` (18 subsequent siblings)
  19 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0001-SLUB-Add-defrag_ratio-field-and-sysfs-support.patch --]
[-- Type: text/plain, Size: 2709 bytes --]

The defrag_ratio is used to set the threshold at which defragmentation
should be attempted on a slab page.

The allocation ratio is measured by the percentage of the available slots
allocated.

Add a defrag ratio field and set it to 30% by default. A limit of 30% specifies
that fewer than 3 out of 10 available slots for objects must be in use before
slab defragmentation runs.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 include/linux/slub_def.h |    7 +++++++
 mm/slub.c                |   23 +++++++++++++++++++++++
 2 files changed, 30 insertions(+)

Index: linux-next/include/linux/slub_def.h
===================================================================
--- linux-next.orig/include/linux/slub_def.h	2008-08-11 07:42:32.301017715 -0700
+++ linux-next/include/linux/slub_def.h	2008-08-11 07:50:11.382348462 -0700
@@ -89,6 +89,13 @@ struct kmem_cache {
 	void (*ctor)(void *);
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
+	int defrag_ratio;	/*
+				 * Ratio used to check the percentage of
+				 * objects allocate in a slab page.
+				 * If less than this ratio is allocated
+				 * then reclaim attempts are made.
+				 */
+
 	const char *name;	/* Name (only for display!) */
 	struct list_head list;	/* List of slab caches */
 #ifdef CONFIG_SLUB_DEBUG
Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:42:34.392348172 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:11.802347432 -0700
@@ -2324,6 +2324,7 @@ static int kmem_cache_open(struct kmem_c
 		goto error;
 
 	s->refcount = 1;
+	s->defrag_ratio = 30;
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 100;
 #endif
@@ -4056,6 +4057,27 @@ static ssize_t free_calls_show(struct km
 }
 SLAB_ATTR_RO(free_calls);
 
+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+				const char *buf, size_t length)
+{
+	unsigned long ratio;
+	int err;
+
+	err = strict_strtoul(buf, 10, &ratio);
+	if (err)
+		return err;
+
+	if (ratio < 100)
+		s->defrag_ratio = ratio;
+	return length;
+}
+SLAB_ATTR(defrag_ratio);
+
 #ifdef CONFIG_NUMA
 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
 {
@@ -4163,6 +4185,7 @@ static struct attribute *slab_attrs[] = 
 	&shrink_attr.attr,
 	&alloc_calls_attr.attr,
 	&free_calls_attr.attr,
+	&defrag_ratio_attr.attr,
 #ifdef CONFIG_ZONE_DMA
 	&cache_dma_attr.attr,
 #endif

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 02/19] slub: Replace ctor field with ops field in /sys/slab/*
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
  2008-08-11 15:06 ` [patch 01/19] slub: Add defrag_ratio field and sysfs support Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 03/19] slub: Add get() and kick() methods Christoph Lameter
                   ` (17 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0002-SLUB-Replace-ctor-field-with-ops-field-in-sys-slab.patch --]
[-- Type: text/plain, Size: 1548 bytes --]

Create an ops field in /sys/slab/*/ops to contain all the operations defined
on a slab. This will be used to display the additional operations that will
be defined soon.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/slub.c |   16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.030203735 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:11.402353636 -0700
@@ -3828,16 +3828,18 @@ static ssize_t order_show(struct kmem_ca
 }
 SLAB_ATTR(order);
 
-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
 {
-	if (s->ctor) {
-		int n = sprint_symbol(buf, (unsigned long)s->ctor);
+	int x = 0;
 
-		return n + sprintf(buf + n, "\n");
+	if (s->ctor) {
+		x += sprintf(buf + x, "ctor : ");
+		x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+		x += sprintf(buf + x, "\n");
 	}
-	return 0;
+	return x;
 }
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);
 
 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 {
@@ -4170,7 +4172,7 @@ static struct attribute *slab_attrs[] = 
 	&slabs_attr.attr,
 	&partial_attr.attr,
 	&cpu_slabs_attr.attr,
-	&ctor_attr.attr,
+	&ops_attr.attr,
 	&aliases_attr.attr,
 	&align_attr.attr,
 	&sanity_checks_attr.attr,

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 03/19] slub: Add get() and kick() methods
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
  2008-08-11 15:06 ` [patch 01/19] slub: Add defrag_ratio field and sysfs support Christoph Lameter
  2008-08-11 15:06 ` [patch 02/19] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 04/19] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
                   ` (16 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0003-SLUB-Add-get-and-kick-methods.patch --]
[-- Type: text/plain, Size: 5621 bytes --]

Add the two methods needed for defragmentation and add the display of the
methods via the proc interface.

Add documentation explaining the use of these methods and the prototypes
for slab.h. Add functions to setup the defrag methods for a slab cache.

Add empty functions for SLAB/SLOB. The API is generic so it
could be theoretically implemented for either allocator.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 include/linux/slab.h     |   50 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/slub_def.h |    3 ++
 mm/slub.c                |   29 ++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 1 deletion(-)

Index: linux-next/include/linux/slub_def.h
===================================================================
--- linux-next.orig/include/linux/slub_def.h	2008-08-11 07:47:01.022359011 -0700
+++ linux-next/include/linux/slub_def.h	2008-08-11 07:50:09.262348432 -0700
@@ -87,6 +87,9 @@ struct kmem_cache {
 	gfp_t allocflags;	/* gfp flags to use on each alloc */
 	int refcount;		/* Refcount for slab cache destroy */
 	void (*ctor)(void *);
+	kmem_defrag_get_func *get;
+	kmem_defrag_kick_func *kick;
+
 	int inuse;		/* Offset to metadata */
 	int align;		/* Alignment */
 	int defrag_ratio;	/*
Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.328753936 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:11.002349094 -0700
@@ -2761,6 +2761,19 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+	kmem_defrag_get_func get, kmem_defrag_kick_func kick)
+{
+	/*
+	 * Defragmentable slabs must have a ctor otherwise objects may be
+	 * in an undetermined state after they are allocated.
+	 */
+	BUG_ON(!s->ctor);
+	s->get = get;
+	s->kick = kick;
+}
+EXPORT_SYMBOL(kmem_cache_setup_defrag);
+
 /*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
@@ -3054,7 +3067,7 @@ static int slab_unmergeable(struct kmem_
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
-	if (s->ctor)
+	if (s->ctor || s->kick || s->get)
 		return 1;
 
 	/*
@@ -3837,6 +3850,20 @@ static ssize_t ops_show(struct kmem_cach
 		x += sprint_symbol(buf + x, (unsigned long)s->ctor);
 		x += sprintf(buf + x, "\n");
 	}
+
+	if (s->get) {
+		x += sprintf(buf + x, "get : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->get);
+		x += sprintf(buf + x, "\n");
+	}
+
+	if (s->kick) {
+		x += sprintf(buf + x, "kick : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->kick);
+		x += sprintf(buf + x, "\n");
+	}
 	return x;
 }
 SLAB_ATTR_RO(ops);
Index: linux-next/include/linux/slab.h
===================================================================
--- linux-next.orig/include/linux/slab.h	2008-08-11 07:42:32.301017715 -0700
+++ linux-next/include/linux/slab.h	2008-08-11 07:50:10.542348059 -0700
@@ -109,6 +109,56 @@ void kfree(const void *);
 size_t ksize(const void *);
 
 /*
+ * Function prototypes passed to kmem_cache_defrag() to enable defragmentation
+ * and targeted reclaim in slab caches.
+ */
+
+/*
+ * kmem_cache_defrag_get_func() is called with locks held so that the slab
+ * objects cannot be freed. We are in an atomic context and no slab
+ * operations may be performed. The purpose of kmem_cache_defrag_get_func()
+ * is to obtain a stable refcount on the objects, so that they cannot be
+ * removed until kmem_cache_kick_func() has handled them.
+ *
+ * Parameters passed are the number of objects to process and an array of
+ * pointers to objects for which we need references.
+ *
+ * Returns a pointer that is passed to the kick function. If any objects
+ * cannot be moved then the pointer may indicate a failure and
+ * then kick can simply remove the references that were already obtained.
+ *
+ * The object pointer array passed is also passed to kmem_cache_defrag_kick().
+ * The function may remove objects from the array by setting pointers to
+ * NULL. This is useful if we can determine that an object is already about
+ * to be removed. In that case it is often impossible to obtain the necessary
+ * refcount.
+ */
+typedef void *kmem_defrag_get_func(struct kmem_cache *, int, void **);
+
+/*
+ * kmem_cache_defrag_kick_func is called with no locks held and interrupts
+ * enabled. Sleeping is possible. Any operation may be performed in kick().
+ * kmem_cache_defrag should free all the objects in the pointer array.
+ *
+ * Parameters passed are the number of objects in the array, the array of
+ * pointers to the objects and the pointer returned by kmem_cache_defrag_get().
+ *
+ * Success is checked by examining the number of remaining objects in the slab.
+ */
+typedef void kmem_defrag_kick_func(struct kmem_cache *, int, void **, void *);
+
+/*
+ * kmem_cache_setup_defrag() is used to setup callbacks for a slab cache.
+ */
+#ifdef CONFIG_SLUB
+void kmem_cache_setup_defrag(struct kmem_cache *, kmem_defrag_get_func,
+						kmem_defrag_kick_func);
+#else
+static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
+	kmem_defrag_get_func get, kmem_defrag_kick_func kiok) {}
+#endif
+
+/*
  * Allocator specific definitions. These are mainly used to establish optimized
  * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by
  * selecting the appropriate general cache at compile time.

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 04/19] slub: Sort slab cache list and establish maximum objects for defrag slabs
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (2 preceding siblings ...)
  2008-08-11 15:06 ` [patch 03/19] slub: Add get() and kick() methods Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 05/19] slub: Slab defrag core Christoph Lameter
                   ` (15 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0004-SLUB-Sort-slab-cache-list-and-establish-maximum-obj.patch --]
[-- Type: text/plain, Size: 2790 bytes --]

When defragmenting slabs it is advantageous to have all
defragmentable slabs together at the beginning of the list so that there is
no need to scan the complete list. Put defragmentable caches first when adding
a slab cache and others last.

Determine the maximum number of objects in defragmentable slabs. This allows
sizing the allocation of arrays holding refs to these objects later.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/slub.c |   26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.362349055 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:10.532348432 -0700
@@ -174,6 +174,9 @@ static enum {
 static DECLARE_RWSEM(slub_lock);
 static LIST_HEAD(slab_caches);
 
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects;
+
 /*
  * Tracking user of a slab.
  */
@@ -2531,7 +2534,7 @@ static struct kmem_cache *create_kmalloc
 								flags, NULL))
 		goto panic;
 
-	list_add(&s->list, &slab_caches);
+	list_add_tail(&s->list, &slab_caches);
 	up_write(&slub_lock);
 	if (sysfs_slab_add(s))
 		goto panic;
@@ -2761,9 +2764,23 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+/*
+ * Allocate a slab scratch space that is sufficient to keep at least
+ * max_defrag_slab_objects pointers to individual objects and also a bitmap
+ * for max_defrag_slab_objects.
+ */
+static inline void *alloc_scratch(void)
+{
+	return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+		BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+		GFP_KERNEL);
+}
+
 void kmem_cache_setup_defrag(struct kmem_cache *s,
 	kmem_defrag_get_func get, kmem_defrag_kick_func kick)
 {
+	int max_objects = oo_objects(s->max);
+
 	/*
 	 * Defragmentable slabs must have a ctor otherwise objects may be
 	 * in an undetermined state after they are allocated.
@@ -2771,6 +2788,11 @@ void kmem_cache_setup_defrag(struct kmem
 	BUG_ON(!s->ctor);
 	s->get = get;
 	s->kick = kick;
+	down_write(&slub_lock);
+	list_move(&s->list, &slab_caches);
+	if (max_objects > max_defrag_slab_objects)
+		max_defrag_slab_objects = max_objects;
+	up_write(&slub_lock);
 }
 EXPORT_SYMBOL(kmem_cache_setup_defrag);
 
@@ -3156,7 +3178,7 @@ struct kmem_cache *kmem_cache_create(con
 	if (s) {
 		if (kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor)) {
-			list_add(&s->list, &slab_caches);
+			list_add_tail(&s->list, &slab_caches);
 			up_write(&slub_lock);
 			if (sysfs_slab_add(s))
 				goto err;

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 05/19] slub: Slab defrag core
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (3 preceding siblings ...)
  2008-08-11 15:06 ` [patch 04/19] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 06/19] slub: Add KICKABLE to avoid repeated kick() attempts Christoph Lameter
                   ` (14 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0005-SLUB-Slab-defrag-core.patch --]
[-- Type: text/plain, Size: 13004 bytes --]

Slab defragmentation may occur:

1. Unconditionally when kmem_cache_shrink() is called on a slab cache by
   the kernel.

2. Through the use of the slabinfo command.

3. Per node defrag conditionally when kmem_cache_defrag(<node>) is called
   (can be called from reclaim code with a later patch).

   Defragmentation is only performed if the fragmentation of the slab
   is lower than the specified percentage. Fragmentation ratios are measured
   by calculating the percentage of objects in use compared to the total
   number of objects that the slab page can accommodate.

   The scanning of slab caches is optimized because the
   defragmentable slabs come first on the list. Thus we can terminate scans
   on the first slab encountered that does not support defragmentation.

   kmem_cache_defrag() takes a node parameter. This can either be -1 if
   defragmentation should be performed on all nodes, or a node number.

A couple of functions must be setup via a call to kmem_cache_setup_defrag()
in order for a slabcache to support defragmentation. These are

kmem_defrag_get_func (void *get(struct kmem_cache *s, int nr, void **objects))

	Must obtain a reference to the listed objects. SLUB guarantees that
	the objects are still allocated. However, other threads may be blocked
	in slab_free() attempting to free objects in the slab. These may succeed
	as soon as get() returns to the slab allocator. The function must
	be able to detect such situations and void the attempts to free such
	objects (by for example voiding the corresponding entry in the objects
	array).

	No slab operations may be performed in get(). Interrupts
	are disabled. What can be done is very limited. The slab lock
	for the page that contains the object is taken. Any attempt to perform
	a slab operation may lead to a deadlock.

	kmem_defrag_get_func returns a private pointer that is passed to
	kmem_defrag_kick_func(). Should we be unable to obtain all references
	then that pointer may indicate to the kick() function that it should
	not attempt any object removal or move but simply remove the
	reference counts.

kmem_defrag_kick_func (void kick(struct kmem_cache *, int nr, void **objects,
							void *get_result))

	After SLUB has established references to the objects in a
	slab it will then drop all locks and use kick() to move objects out
	of the slab. The existence of the object is guaranteed by virtue of
	the earlier obtained references via kmem_defrag_get_func(). The
	callback may perform any slab operation since no locks are held at
	the time of call.

	The callback should remove the object from the slab in some way. This
	may be accomplished by reclaiming the object and then running
	kmem_cache_free() or reallocating it and then running
	kmem_cache_free(). Reallocation is advantageous because the partial
	slabs were just sorted to have the partial slabs with the most objects
	first. Reallocation is likely to result in filling up a slab in
	addition to freeing up one slab. A filled up slab can also be removed
	from the partial list. So there could be a double effect.

	kmem_defrag_kick_func() does not return a result. SLUB will check
	the number of remaining objects in the slab. If all objects were
	removed then the operation was successful.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 include/linux/slab.h |    3 
 mm/slub.c            |  265 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 215 insertions(+), 53 deletions(-)

Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.472348849 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:10.102347910 -0700
@@ -128,10 +128,10 @@
 
 /*
  * Maximum number of desirable partial slabs.
- * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * More slabs cause kmem_cache_shrink to sort the slabs by objects
+ * and triggers slab defragmentation.
  */
-#define MAX_PARTIAL 10
+#define MAX_PARTIAL 20
 
 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 				SLAB_POISON | SLAB_STORE_USER)
@@ -2797,76 +2797,235 @@ void kmem_cache_setup_defrag(struct kmem
 EXPORT_SYMBOL(kmem_cache_setup_defrag);
 
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * Vacate all objects in the given slab.
  *
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
+ * The scratch aread passed to list function is sufficient to hold
+ * struct listhead times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
  */
-int kmem_cache_shrink(struct kmem_cache *s)
+static int kmem_cache_vacate(struct page *page, void *scratch)
 {
-	int node;
-	int i;
-	struct kmem_cache_node *n;
-	struct page *page;
-	struct page *t;
-	int objects = oo_objects(s->max);
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+	void **vector = scratch;
+	void *p;
+	void *addr = page_address(page);
+	struct kmem_cache *s;
+	unsigned long *map;
+	int leftover;
+	int count;
+	void *private;
 	unsigned long flags;
+	unsigned long objects;
 
-	if (!slabs_by_inuse)
-		return -ENOMEM;
+	local_irq_save(flags);
+	slab_lock(page);
 
-	flush_all(s);
-	for_each_node_state(node, N_NORMAL_MEMORY) {
-		n = get_node(s, node);
+	BUG_ON(!PageSlab(page));	/* Must be s slab page */
+	BUG_ON(!SlabFrozen(page));	/* Slab must have been frozen earlier */
+
+	s = page->slab;
+	objects = page->objects;
+	map = scratch + objects * sizeof(void **);
+	if (!page->inuse || !s->kick)
+		goto out;
+
+	/* Determine used objects */
+	bitmap_fill(map, objects);
+	for_each_free_object(p, s, page->freelist)
+		__clear_bit(slab_index(p, s, addr), map);
+
+	/* Build vector of pointers to objects */
+	count = 0;
+	memset(vector, 0, objects * sizeof(void **));
+	for_each_object(p, s, addr, objects)
+		if (test_bit(slab_index(p, s, addr), map))
+			vector[count++] = p;
+
+	private = s->get(s, count, vector);
+
+	/*
+	 * Got references. Now we can drop the slab lock. The slab
+	 * is frozen so it cannot vanish from under us nor will
+	 * allocations be performed on the slab. However, unlocking the
+	 * slab will allow concurrent slab_frees to proceed.
+	 */
+	slab_unlock(page);
+	local_irq_restore(flags);
+
+	/*
+	 * Perform the KICK callbacks to remove the objects.
+	 */
+	s->kick(s, count, vector, private);
+
+	local_irq_save(flags);
+	slab_lock(page);
+out:
+	/*
+	 * Check the result and unfreeze the slab
+	 */
+	leftover = page->inuse;
+	unfreeze_slab(s, page, leftover > 0);
+	local_irq_restore(flags);
+	return leftover;
+}
+
+/*
+ * Remove objects from a list of slab pages that have been gathered.
+ * Must be called with slabs that have been isolated before.
+ *
+ * kmem_cache_reclaim() is never called from an atomic context. It
+ * allocates memory for temporary storage. We are holding the
+ * slub_lock semaphore which prevents another call into
+ * the defrag logic.
+ */
+int kmem_cache_reclaim(struct list_head *zaplist)
+{
+	int freed = 0;
+	void **scratch;
+	struct page *page;
+	struct page *page2;
+
+	if (list_empty(zaplist))
+		return 0;
+
+	scratch = alloc_scratch();
+	if (!scratch)
+		return 0;
+
+	list_for_each_entry_safe(page, page2, zaplist, lru) {
+		list_del(&page->lru);
+		if (kmem_cache_vacate(page, scratch) == 0)
+			freed++;
+	}
+	kfree(scratch);
+	return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ * by releasing slabs with zero objects and trying to reclaim
+ * slabs with less than the configured percentage of objects allocated.
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node,
+							unsigned long limit)
+{
+	unsigned long flags;
+	struct page *page, *page2;
+	LIST_HEAD(zaplist);
+	int freed = 0;
+	struct kmem_cache_node *n = get_node(s, node);
 
-		if (!n->nr_partial)
+	if (n->nr_partial <= limit)
+		return 0;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry_safe(page, page2, &n->partial, lru) {
+		if (!slab_trylock(page))
+			/* Busy slab. Get out of the way */
 			continue;
 
-		for (i = 0; i < objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
+		if (page->inuse) {
+			if (page->inuse * 100 >=
+					s->defrag_ratio * page->objects) {
+				slab_unlock(page);
+				/* Slab contains enough objects */
+				continue;
+			}
 
-		spin_lock_irqsave(&n->list_lock, flags);
+			list_move(&page->lru, &zaplist);
+			if (s->kick) {
+				n->nr_partial--;
+				SetSlabFrozen(page);
+			}
+			slab_unlock(page);
+		} else {
+			/* Empty slab page */
+			list_del(&page->lru);
+			n->nr_partial--;
+			slab_unlock(page);
+			discard_slab(s, page);
+			freed++;
+		}
+	}
 
+	if (!s->kick)
 		/*
-		 * Build lists indexed by the items in use in each slab.
+		 * No defrag methods. By simply putting the zaplist at the
+		 * end of the partial list we can let them simmer longer
+		 * and thus increase the chance of all objects being
+		 * reclaimed.
 		 *
-		 * Note that concurrent frees may occur while we hold the
-		 * list_lock. page->inuse here is the upper limit.
+		 * We have effectively sorted the partial list and put
+		 * the slabs with more objects first. As soon as they
+		 * are allocated they are going to be removed from the
+		 * partial list.
 		 */
-		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			if (!page->inuse && slab_trylock(page)) {
-				/*
-				 * Must hold slab lock here because slab_free
-				 * may have freed the last object and be
-				 * waiting to release the slab.
-				 */
-				list_del(&page->lru);
-				n->nr_partial--;
-				slab_unlock(page);
-				discard_slab(s, page);
-			} else {
-				list_move(&page->lru,
-				slabs_by_inuse + page->inuse);
-			}
-		}
+		list_splice(&zaplist, n->partial.prev);
+
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (s->kick)
+		freed += kmem_cache_reclaim(&zaplist);
+
+	return freed;
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation in a page.
+ */
+int kmem_cache_defrag(int node)
+{
+	struct kmem_cache *s;
+	unsigned long slabs = 0;
+
+	/*
+	 * kmem_cache_defrag may be called from the reclaim path which may be
+	 * called for any page allocator alloc. So there is the danger that we
+	 * get called in a situation where slub already acquired the slub_lock
+	 * for other purposes.
+	 */
+	if (!down_read_trylock(&slub_lock))
+		return 0;
+
+	list_for_each_entry(s, &slab_caches, list) {
+		unsigned long reclaimed = 0;
 
 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * Defragmentable caches come first. If the slab cache is not
+		 * defragmentable then we can stop traversing the list.
 		 */
-		for (i = objects - 1; i >= 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		if (!s->kick)
+			break;
 
-		spin_unlock_irqrestore(&n->list_lock, flags);
+		if (node == -1) {
+			int nid;
+
+			for_each_node_state(nid, N_NORMAL_MEMORY)
+				reclaimed += __kmem_cache_shrink(s, nid,
+								MAX_PARTIAL);
+		} else
+			reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL);
+
+		slabs += reclaimed;
 	}
+	up_read(&slub_lock);
+	return slabs;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists.
+ * If the slab cache supports defragmentation then objects are
+ * reclaimed.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+	int node;
+
+	flush_all(s);
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		__kmem_cache_shrink(s, node, 0);
 
-	kfree(slabs_by_inuse);
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
Index: linux-next/include/linux/slab.h
===================================================================
--- linux-next.orig/include/linux/slab.h	2008-08-11 07:47:01.452357887 -0700
+++ linux-next/include/linux/slab.h	2008-08-11 07:47:01.522357705 -0700
@@ -149,13 +149,16 @@ typedef void kmem_defrag_kick_func(struc
 
 /*
  * kmem_cache_setup_defrag() is used to setup callbacks for a slab cache.
+ * kmem_cache_defrag() performs the actual defragmentation.
  */
 #ifdef CONFIG_SLUB
 void kmem_cache_setup_defrag(struct kmem_cache *, kmem_defrag_get_func,
 						kmem_defrag_kick_func);
+int kmem_cache_defrag(int node);
 #else
 static inline void kmem_cache_setup_defrag(struct kmem_cache *s,
 	kmem_defrag_get_func get, kmem_defrag_kick_func kiok) {}
+static inline int kmem_cache_defrag(int node) { return 0; }
 #endif
 
 /*

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 06/19] slub: Add KICKABLE to avoid repeated kick() attempts
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (4 preceding siblings ...)
  2008-08-11 15:06 ` [patch 05/19] slub: Slab defrag core Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 07/19] slub: Extend slabinfo to support -D and -F options Christoph Lameter
                   ` (13 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0006-SLUB-Add-KICKABLE-to-avoid-repeated-kick-attempts.patch --]
[-- Type: text/plain, Size: 3839 bytes --]

Add a flag KICKABLE to be set on slabs with a defragmentation method.

Clear the flag if a kick action is not successful in reducing the
number of objects in a slab. This will avoid future attempts to
kick objects out.

The KICKABLE flag is set again when all objects of the slab have been
allocated (Occurs during removal of a slab from the partial lists).

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 include/linux/page-flags.h |    2 ++
 mm/slub.c                  |   23 ++++++++++++++++++-----
 2 files changed, 20 insertions(+), 5 deletions(-)

Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.482348822 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:50:09.282347574 -0700
@@ -1138,6 +1138,9 @@ static struct page *new_slab(struct kmem
 			SLAB_STORE_USER | SLAB_TRACE))
 		__SetPageSlubDebug(page);
 
+	if (s->kick)
+		__SetPageSlubKickable(page);
+
 	start = page_address(page);
 
 	if (unlikely(s->flags & SLAB_POISON))
@@ -1181,6 +1184,7 @@ static void __free_slab(struct kmem_cach
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 		-pages);
 
+	__ClearPageSlubKickable(page);
 	__ClearPageSlab(page);
 	reset_page_mapcount(page);
 	__free_pages(page, order);
@@ -1391,6 +1395,8 @@ static void unfreeze_slab(struct kmem_ca
 			if (SLABDEBUG && PageSlubDebug(page) &&
 						(s->flags & SLAB_STORE_USER))
 				add_full(n, page);
+			if (s->kick)
+				__SetPageSlubKickable(page);
 		}
 		slab_unlock(page);
 	} else {
@@ -2820,12 +2826,12 @@ static int kmem_cache_vacate(struct page
 	slab_lock(page);
 
 	BUG_ON(!PageSlab(page));	/* Must be s slab page */
-	BUG_ON(!SlabFrozen(page));	/* Slab must have been frozen earlier */
+	BUG_ON(!PageSlubFrozen(page));	/* Slab must have been frozen earlier */
 
 	s = page->slab;
 	objects = page->objects;
 	map = scratch + objects * sizeof(void **);
-	if (!page->inuse || !s->kick)
+	if (!page->inuse || !s->kick || !PageSlubKickable(page))
 		goto out;
 
 	/* Determine used objects */
@@ -2863,6 +2869,9 @@ out:
 	 * Check the result and unfreeze the slab
 	 */
 	leftover = page->inuse;
+	if (leftover)
+		/* Unsuccessful reclaim. Avoid future reclaim attempts. */
+		__ClearPageSlubKickable(page);
 	unfreeze_slab(s, page, leftover > 0);
 	local_irq_restore(flags);
 	return leftover;
@@ -2924,17 +2933,21 @@ static unsigned long __kmem_cache_shrink
 			continue;
 
 		if (page->inuse) {
-			if (page->inuse * 100 >=
+			if (!PageSlubKickable(page) || page->inuse * 100 >=
 					s->defrag_ratio * page->objects) {
 				slab_unlock(page);
-				/* Slab contains enough objects */
+				/*
+				 * Slab contains enough objects
+				 * or we alrady tried reclaim before and
+				 * it failed. Skip this one.
+				 */
 				continue;
 			}
 
 			list_move(&page->lru, &zaplist);
 			if (s->kick) {
 				n->nr_partial--;
-				SetSlabFrozen(page);
+				__SetPageSlubFrozen(page);
 			}
 			slab_unlock(page);
 		} else {
Index: linux-next/include/linux/page-flags.h
===================================================================
--- linux-next.orig/include/linux/page-flags.h	2008-08-11 07:42:31.882358716 -0700
+++ linux-next/include/linux/page-flags.h	2008-08-11 07:47:01.612357581 -0700
@@ -112,6 +112,7 @@ enum pageflags {
 	/* SLUB */
 	PG_slub_frozen = PG_active,
 	PG_slub_debug = PG_error,
+	PG_slub_kickable = PG_dirty,
 };
 
 #ifndef __GENERATING_BOUNDS_H
@@ -182,6 +183,7 @@ __PAGEFLAG(SlobFree, slob_free)
 
 __PAGEFLAG(SlubFrozen, slub_frozen)
 __PAGEFLAG(SlubDebug, slub_debug)
+__PAGEFLAG(SlubKickable, slub_kickable)
 
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 07/19] slub: Extend slabinfo to support -D and -F options
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (5 preceding siblings ...)
  2008-08-11 15:06 ` [patch 06/19] slub: Add KICKABLE to avoid repeated kick() attempts Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 08/19] slub/slabinfo: add defrag statistics Christoph Lameter
                   ` (12 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0007-SLUB-Extend-slabinfo-to-support-D-and-F-options.patch --]
[-- Type: text/plain, Size: 6131 bytes --]

-F lists caches that support defragmentation

-C lists caches that use a ctor.

Change field names for defrag_ratio and remote_node_defrag_ratio.

Add determination of the allocation ratio for a slab. The allocation ratio
is the percentage of the available object slots that are in use.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 Documentation/vm/slabinfo.c |   48 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

Index: linux-next/Documentation/vm/slabinfo.c
===================================================================
--- linux-next.orig/Documentation/vm/slabinfo.c	2008-08-11 07:40:41.832348842 -0700
+++ linux-next/Documentation/vm/slabinfo.c	2008-08-11 07:50:09.252348163 -0700
@@ -31,6 +31,8 @@ struct slabinfo {
 	int hwcache_align, object_size, objs_per_slab;
 	int sanity_checks, slab_size, store_user, trace;
 	int order, poison, reclaim_account, red_zone;
+	int defrag, ctor;
+	int defrag_ratio, remote_node_defrag_ratio;
 	unsigned long partial, objects, slabs, objects_partial, objects_total;
 	unsigned long alloc_fastpath, alloc_slowpath;
 	unsigned long free_fastpath, free_slowpath;
@@ -64,6 +66,8 @@ int show_slab = 0;
 int skip_zero = 1;
 int show_numa = 0;
 int show_track = 0;
+int show_defrag = 0;
+int show_ctor = 0;
 int show_first_alias = 0;
 int validate = 0;
 int shrink = 0;
@@ -100,13 +104,15 @@ void fatal(const char *x, ...)
 void usage(void)
 {
 	printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n"
-		"slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n"
+		"slabinfo [-aCdDefFhnpvtsz] [-d debugopts] [slab-regexp]\n"
 		"-a|--aliases           Show aliases\n"
 		"-A|--activity          Most active slabs first\n"
 		"-d<options>|--debug=<options> Set/Clear Debug options\n"
+		"-C|--ctor              Show slabs with ctors\n"
 		"-D|--display-active    Switch line format to activity\n"
 		"-e|--empty             Show empty slabs\n"
 		"-f|--first-alias       Show first alias\n"
+		"-F|--defrag            Show defragmentable caches\n"
 		"-h|--help              Show usage information\n"
 		"-i|--inverted          Inverted list\n"
 		"-l|--slabs             Show slabs\n"
@@ -296,7 +302,7 @@ void first_line(void)
 		printf("Name                   Objects      Alloc       Free   %%Fast Fallb O\n");
 	else
 		printf("Name                   Objects Objsize    Space "
-			"Slabs/Part/Cpu  O/S O %%Fr %%Ef Flg\n");
+			"Slabs/Part/Cpu  O/S O %%Ra %%Ef Flg\n");
 }
 
 /*
@@ -345,7 +351,7 @@ void slab_numa(struct slabinfo *s, int m
 		return;
 
 	if (!line) {
-		printf("\n%-21s:", mode ? "NUMA nodes" : "Slab");
+		printf("\n%-21s: Rto ", mode ? "NUMA nodes" : "Slab");
 		for(node = 0; node <= highest_node; node++)
 			printf(" %4d", node);
 		printf("\n----------------------");
@@ -354,6 +360,7 @@ void slab_numa(struct slabinfo *s, int m
 		printf("\n");
 	}
 	printf("%-21s ", mode ? "All slabs" : s->name);
+	printf("%3d ", s->remote_node_defrag_ratio);
 	for(node = 0; node <= highest_node; node++) {
 		char b[20];
 
@@ -492,6 +499,8 @@ void report(struct slabinfo *s)
 		printf("** Slabs are destroyed via RCU\n");
 	if (s->reclaim_account)
 		printf("** Reclaim accounting active\n");
+	if (s->defrag)
+		printf("** Defragmentation at %d%%\n", s->defrag_ratio);
 
 	printf("\nSizes (bytes)     Slabs              Debug                Memory\n");
 	printf("------------------------------------------------------------------------\n");
@@ -539,6 +548,12 @@ void slabcache(struct slabinfo *s)
 	if (show_empty && s->slabs)
 		return;
 
+	if (show_defrag && !s->defrag)
+		return;
+
+	if (show_ctor && !s->ctor)
+		return;
+
 	store_size(size_str, slab_size(s));
 	snprintf(dist_str, 40, "%lu/%lu/%d", s->slabs - s->cpu_slabs,
 						s->partial, s->cpu_slabs);
@@ -550,6 +565,10 @@ void slabcache(struct slabinfo *s)
 		*p++ = '*';
 	if (s->cache_dma)
 		*p++ = 'd';
+	if (s->defrag)
+		*p++ = 'F';
+	if (s->ctor)
+		*p++ = 'C';
 	if (s->hwcache_align)
 		*p++ = 'A';
 	if (s->poison)
@@ -584,7 +603,8 @@ void slabcache(struct slabinfo *s)
 		printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
 			s->name, s->objects, s->object_size, size_str, dist_str,
 			s->objs_per_slab, s->order,
-			s->slabs ? (s->partial * 100) / s->slabs : 100,
+			s->slabs ? (s->partial * 100) /
+					(s->slabs * s->objs_per_slab) : 100,
 			s->slabs ? (s->objects * s->object_size * 100) /
 				(s->slabs * (page_size << s->order)) : 100,
 			flags);
@@ -1190,7 +1210,17 @@ void read_slab_dir(void)
 			slab->deactivate_to_tail = get_obj("deactivate_to_tail");
 			slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
 			slab->order_fallback = get_obj("order_fallback");
+			slab->defrag_ratio = get_obj("defrag_ratio");
+			slab->remote_node_defrag_ratio =
+					get_obj("remote_node_defrag_ratio");
 			chdir("..");
+			if (read_slab_obj(slab, "ops")) {
+				if (strstr(buffer, "ctor :"))
+					slab->ctor = 1;
+				if (strstr(buffer, "kick :"))
+					slab->defrag = 1;
+			}
+
 			if (slab->name[0] == ':')
 				alias_targets++;
 			slab++;
@@ -1241,10 +1271,12 @@ void output_slabs(void)
 struct option opts[] = {
 	{ "aliases", 0, NULL, 'a' },
 	{ "activity", 0, NULL, 'A' },
+	{ "ctor", 0, NULL, 'C' },
 	{ "debug", 2, NULL, 'd' },
 	{ "display-activity", 0, NULL, 'D' },
 	{ "empty", 0, NULL, 'e' },
 	{ "first-alias", 0, NULL, 'f' },
+	{ "defrag", 0, NULL, 'F' },
 	{ "help", 0, NULL, 'h' },
 	{ "inverted", 0, NULL, 'i'},
 	{ "numa", 0, NULL, 'n' },
@@ -1267,7 +1299,7 @@ int main(int argc, char *argv[])
 
 	page_size = getpagesize();
 
-	while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTS",
+	while ((c = getopt_long(argc, argv, "aACd::DefFhil1noprstvzTS",
 						opts, NULL)) != -1)
 		switch (c) {
 		case '1':
@@ -1323,6 +1355,12 @@ int main(int argc, char *argv[])
 		case 'z':
 			skip_zero = 0;
 			break;
+		case 'C':
+			show_ctor = 1;
+			break;
+		case 'F':
+			show_defrag = 1;
+			break;
 		case 'T':
 			show_totals = 1;
 			break;

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 08/19] slub/slabinfo: add defrag statistics
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (6 preceding siblings ...)
  2008-08-11 15:06 ` [patch 07/19] slub: Extend slabinfo to support -D and -F options Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 09/19] slub: Trigger defragmentation from memory reclaim Christoph Lameter
                   ` (11 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0008-slub-add-defrag-statistics.patch --]
[-- Type: text/plain, Size: 9338 bytes --]

Add statistics counters for slab defragmentation.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 Documentation/vm/slabinfo.c |   45 ++++++++++++++++++++++++++++++++++++--------
 include/linux/slub_def.h    |    6 +++++
 mm/slub.c                   |   24 ++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 9 deletions(-)

Index: linux-next/Documentation/vm/slabinfo.c
===================================================================
--- linux-next.orig/Documentation/vm/slabinfo.c	2008-08-11 07:47:01.749625123 -0700
+++ linux-next/Documentation/vm/slabinfo.c	2008-08-11 07:47:01.762349658 -0700
@@ -41,6 +41,9 @@ struct slabinfo {
 	unsigned long cpuslab_flush, deactivate_full, deactivate_empty;
 	unsigned long deactivate_to_head, deactivate_to_tail;
 	unsigned long deactivate_remote_frees, order_fallback;
+	unsigned long shrink_calls, shrink_attempt_defrag, shrink_empty_slab;
+	unsigned long shrink_slab_skipped, shrink_slab_reclaimed;
+	unsigned long shrink_object_reclaim_failed;
 	int numa[MAX_NODES];
 	int numa_partial[MAX_NODES];
 } slabinfo[MAX_SLABS];
@@ -79,6 +82,7 @@ int sort_active = 0;
 int set_debug = 0;
 int show_ops = 0;
 int show_activity = 0;
+int show_defragcount = 0;
 
 /* Debug options */
 int sanity = 0;
@@ -113,6 +117,7 @@ void usage(void)
 		"-e|--empty             Show empty slabs\n"
 		"-f|--first-alias       Show first alias\n"
 		"-F|--defrag            Show defragmentable caches\n"
+		"-G:--display-defrag    Display defrag counters\n"
 		"-h|--help              Show usage information\n"
 		"-i|--inverted          Inverted list\n"
 		"-l|--slabs             Show slabs\n"
@@ -300,6 +305,8 @@ void first_line(void)
 {
 	if (show_activity)
 		printf("Name                   Objects      Alloc       Free   %%Fast Fallb O\n");
+	else if (show_defragcount)
+		printf("Name                   Objects DefragRQ  Slabs Success   Empty Skipped  Failed\n");
 	else
 		printf("Name                   Objects Objsize    Space "
 			"Slabs/Part/Cpu  O/S O %%Ra %%Ef Flg\n");
@@ -466,22 +473,28 @@ void slab_stats(struct slabinfo *s)
 
 	printf("Total                %8lu %8lu\n\n", total_alloc, total_free);
 
-	if (s->cpuslab_flush)
-		printf("Flushes %8lu\n", s->cpuslab_flush);
-
-	if (s->alloc_refill)
-		printf("Refill %8lu\n", s->alloc_refill);
+	if (s->cpuslab_flush || s->alloc_refill)
+		printf("CPU Slab  : Flushes=%lu Refills=%lu\n",
+			s->cpuslab_flush, s->alloc_refill);
 
 	total = s->deactivate_full + s->deactivate_empty +
 			s->deactivate_to_head + s->deactivate_to_tail;
 
 	if (total)
-		printf("Deactivate Full=%lu(%lu%%) Empty=%lu(%lu%%) "
+		printf("Deactivate: Full=%lu(%lu%%) Empty=%lu(%lu%%) "
 			"ToHead=%lu(%lu%%) ToTail=%lu(%lu%%)\n",
 			s->deactivate_full, (s->deactivate_full * 100) / total,
 			s->deactivate_empty, (s->deactivate_empty * 100) / total,
 			s->deactivate_to_head, (s->deactivate_to_head * 100) / total,
 			s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total);
+
+	if (s->shrink_calls)
+		printf("Shrink    : Calls=%lu Attempts=%lu Empty=%lu Successful=%lu\n",
+			s->shrink_calls, s->shrink_attempt_defrag,
+			s->shrink_empty_slab, s->shrink_slab_reclaimed);
+	if (s->shrink_slab_skipped || s->shrink_object_reclaim_failed)
+		printf("Defrag    : Slabs skipped=%lu Object reclaim failed=%lu\n",
+		s->shrink_slab_skipped, s->shrink_object_reclaim_failed);
 }
 
 void report(struct slabinfo *s)
@@ -598,7 +611,12 @@ void slabcache(struct slabinfo *s)
 			total_alloc ? (s->alloc_fastpath * 100 / total_alloc) : 0,
 			total_free ? (s->free_fastpath * 100 / total_free) : 0,
 			s->order_fallback, s->order);
-	}
+	} else
+	if (show_defragcount)
+		printf("%-21s %8ld %7d %7d %7d %7d %7d %7d\n",
+			s->name, s->objects, s->shrink_calls, s->shrink_attempt_defrag,
+			s->shrink_slab_reclaimed, s->shrink_empty_slab,
+			s->shrink_slab_skipped, s->shrink_object_reclaim_failed);
 	else
 		printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n",
 			s->name, s->objects, s->object_size, size_str, dist_str,
@@ -1210,6 +1228,13 @@ void read_slab_dir(void)
 			slab->deactivate_to_tail = get_obj("deactivate_to_tail");
 			slab->deactivate_remote_frees = get_obj("deactivate_remote_frees");
 			slab->order_fallback = get_obj("order_fallback");
+			slab->shrink_calls = get_obj("shrink_calls");
+			slab->shrink_attempt_defrag = get_obj("shrink_attempt_defrag");
+			slab->shrink_empty_slab = get_obj("shrink_empty_slab");
+			slab->shrink_slab_skipped = get_obj("shrink_slab_skipped");
+			slab->shrink_slab_reclaimed = get_obj("shrink_slab_reclaimed");
+			slab->shrink_object_reclaim_failed =
+					get_obj("shrink_object_reclaim_failed");
 			slab->defrag_ratio = get_obj("defrag_ratio");
 			slab->remote_node_defrag_ratio =
 					get_obj("remote_node_defrag_ratio");
@@ -1274,6 +1299,7 @@ struct option opts[] = {
 	{ "ctor", 0, NULL, 'C' },
 	{ "debug", 2, NULL, 'd' },
 	{ "display-activity", 0, NULL, 'D' },
+	{ "display-defrag", 0, NULL, 'G' },
 	{ "empty", 0, NULL, 'e' },
 	{ "first-alias", 0, NULL, 'f' },
 	{ "defrag", 0, NULL, 'F' },
@@ -1299,7 +1325,7 @@ int main(int argc, char *argv[])
 
 	page_size = getpagesize();
 
-	while ((c = getopt_long(argc, argv, "aACd::DefFhil1noprstvzTS",
+	while ((c = getopt_long(argc, argv, "aACd::DefFGhil1noprstvzTS",
 						opts, NULL)) != -1)
 		switch (c) {
 		case '1':
@@ -1325,6 +1351,9 @@ int main(int argc, char *argv[])
 		case 'f':
 			show_first_alias = 1;
 			break;
+		case 'G':
+			show_defragcount = 1;
+			break;
 		case 'h':
 			usage();
 			return 0;
Index: linux-next/include/linux/slub_def.h
===================================================================
--- linux-next.orig/include/linux/slub_def.h	2008-08-11 07:47:01.362349055 -0700
+++ linux-next/include/linux/slub_def.h	2008-08-11 07:47:01.852357729 -0700
@@ -30,6 +30,12 @@ enum stat_item {
 	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
 	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
 	ORDER_FALLBACK,		/* Number of times fallback was necessary */
+	SHRINK_CALLS,		/* Number of invocations of kmem_cache_shrink */
+	SHRINK_ATTEMPT_DEFRAG,	/* Slabs that were attempted to be reclaimed */
+	SHRINK_EMPTY_SLAB,	/* Shrink encountered and freed empty slab */
+	SHRINK_SLAB_SKIPPED,	/* Slab reclaim skipped a slab (busy etc) */
+	SHRINK_SLAB_RECLAIMED,	/* Successfully reclaimed slabs */
+	SHRINK_OBJECT_RECLAIM_FAILED, /* Callbacks signaled busy objects */
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
Index: linux-next/mm/slub.c
===================================================================
--- linux-next.orig/mm/slub.c	2008-08-11 07:47:01.542349442 -0700
+++ linux-next/mm/slub.c	2008-08-11 07:47:01.932358747 -0700
@@ -2821,6 +2821,7 @@ static int kmem_cache_vacate(struct page
 	void *private;
 	unsigned long flags;
 	unsigned long objects;
+	struct kmem_cache_cpu *c;
 
 	local_irq_save(flags);
 	slab_lock(page);
@@ -2869,9 +2870,13 @@ out:
 	 * Check the result and unfreeze the slab
 	 */
 	leftover = page->inuse;
-	if (leftover)
+	c = get_cpu_slab(s, smp_processor_id());
+	if (leftover) {
 		/* Unsuccessful reclaim. Avoid future reclaim attempts. */
+		stat(c, SHRINK_OBJECT_RECLAIM_FAILED);
 		__ClearPageSlubKickable(page);
+	} else
+		stat(c, SHRINK_SLAB_RECLAIMED);
 	unfreeze_slab(s, page, leftover > 0);
 	local_irq_restore(flags);
 	return leftover;
@@ -2922,11 +2927,14 @@ static unsigned long __kmem_cache_shrink
 	LIST_HEAD(zaplist);
 	int freed = 0;
 	struct kmem_cache_node *n = get_node(s, node);
+	struct kmem_cache_cpu *c;
 
 	if (n->nr_partial <= limit)
 		return 0;
 
 	spin_lock_irqsave(&n->list_lock, flags);
+	c = get_cpu_slab(s, smp_processor_id());
+	stat(c, SHRINK_CALLS);
 	list_for_each_entry_safe(page, page2, &n->partial, lru) {
 		if (!slab_trylock(page))
 			/* Busy slab. Get out of the way */
@@ -2946,12 +2954,14 @@ static unsigned long __kmem_cache_shrink
 
 			list_move(&page->lru, &zaplist);
 			if (s->kick) {
+				stat(c, SHRINK_ATTEMPT_DEFRAG);
 				n->nr_partial--;
 				__SetPageSlubFrozen(page);
 			}
 			slab_unlock(page);
 		} else {
 			/* Empty slab page */
+			stat(c, SHRINK_EMPTY_SLAB);
 			list_del(&page->lru);
 			n->nr_partial--;
 			slab_unlock(page);
@@ -4380,6 +4390,12 @@ STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate
 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
 STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(SHRINK_CALLS, shrink_calls);
+STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag);
+STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab);
+STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped);
+STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed);
+STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed);
 #endif
 
 static struct attribute *slab_attrs[] = {
@@ -4434,6 +4450,12 @@ static struct attribute *slab_attrs[] = 
 	&deactivate_to_tail_attr.attr,
 	&deactivate_remote_frees_attr.attr,
 	&order_fallback_attr.attr,
+	&shrink_calls_attr.attr,
+	&shrink_attempt_defrag_attr.attr,
+	&shrink_empty_slab_attr.attr,
+	&shrink_slab_skipped_attr.attr,
+	&shrink_slab_reclaimed_attr.attr,
+	&shrink_object_reclaim_failed_attr.attr,
 #endif
 	NULL
 };

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 09/19] slub: Trigger defragmentation from memory reclaim
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (7 preceding siblings ...)
  2008-08-11 15:06 ` [patch 08/19] slub/slabinfo: add defrag statistics Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 10/19] buffer heads: Support slab defrag Christoph Lameter
                   ` (10 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0009-SLUB-Trigger-defragmentation-from-memory-reclaim.patch --]
[-- Type: text/plain, Size: 11112 bytes --]

This patch triggers slab defragmentation from memory reclaim. The logical
point for this is after slab shrinking was performed in vmscan.c. At that point
the fragmentation ratio of a slab was increased because objects were freed via
the LRU lists maintained for various slab caches.
So we call kmem_cache_defrag() from there.

shrink_slab() is called in some contexts to do global shrinking
of slabs and in others to do shrinking for a particular zone. Pass the zone to
shrink_slab(), so that it can call kmem_cache_defrag() and restrict
the defragmentation to the node that is under memory pressure.

The callback frequency into slab reclaim can be controlled by a new field
/proc/sys/vm/slab_defrag_limit.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 Documentation/sysctl/vm.txt |   12 ++++++++
 fs/drop_caches.c            |    2 -
 include/linux/mm.h          |    3 --
 include/linux/mmzone.h      |    1 
 include/linux/swap.h        |    3 ++
 kernel/sysctl.c             |   20 +++++++++++++
 mm/vmscan.c                 |   64 +++++++++++++++++++++++++++++++++++++++-----
 mm/vmstat.c                 |    4 ++
 8 files changed, 98 insertions(+), 11 deletions(-)

Index: linux-next/fs/drop_caches.c
===================================================================
--- linux-next.orig/fs/drop_caches.c	2008-08-11 07:42:10.122359386 -0700
+++ linux-next/fs/drop_caches.c	2008-08-11 07:47:01.952350081 -0700
@@ -58,7 +58,7 @@ static void drop_slab(void)
 	int nr_objects;
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000, NULL);
 	} while (nr_objects > 10);
 }
 
Index: linux-next/include/linux/mm.h
===================================================================
--- linux-next.orig/include/linux/mm.h	2008-08-11 07:42:31.392357922 -0700
+++ linux-next/include/linux/mm.h	2008-08-11 07:47:02.002358245 -0700
@@ -1263,8 +1263,7 @@ int in_gate_area_no_task(unsigned long a
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages);
-
+				unsigned long lru_pages, struct zone *z);
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
 #else
Index: linux-next/mm/vmscan.c
===================================================================
--- linux-next.orig/mm/vmscan.c	2008-08-11 07:42:34.492359354 -0700
+++ linux-next/mm/vmscan.c	2008-08-11 07:47:02.042358614 -0700
@@ -150,6 +150,14 @@ void unregister_shrinker(struct shrinker
 EXPORT_SYMBOL(unregister_shrinker);
 
 #define SHRINK_BATCH 128
+
+/*
+ * Trigger a call into slab defrag if the sum of the returns from
+ * shrinkers cross this value.
+ */
+int slab_defrag_limit = 1000;
+int slab_defrag_counter;
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -167,10 +175,18 @@ EXPORT_SYMBOL(unregister_shrinker);
  * are eligible for the caller's allocation attempt.  It is used for balancing
  * slab reclaim versus page reclaim.
  *
+ * zone is the zone for which we are shrinking the slabs. If the intent
+ * is to do a global shrink then zone may be NULL. Specification of a
+ * zone is currently only used to limit slab defragmentation to a NUMA node.
+ * The performance of shrink_slab would be better (in particular under NUMA)
+ * if it could be targeted as a whole to the zone that is under memory
+ * pressure but the VFS infrastructure does not allow that at the present
+ * time.
+ *
  * Returns the number of slab objects which we shrunk.
  */
 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
-			unsigned long lru_pages)
+			unsigned long lru_pages, struct zone *zone)
 {
 	struct shrinker *shrinker;
 	unsigned long ret = 0;
@@ -227,6 +243,39 @@ unsigned long shrink_slab(unsigned long 
 		shrinker->nr += total_scan;
 	}
 	up_read(&shrinker_rwsem);
+
+
+	/* Avoid dirtying cachelines */
+	if (!ret)
+		return 0;
+
+	/*
+	 * "ret" doesn't really contain the freed object count. The shrinkers
+	 * fake it. Gotta go with what we are getting though.
+	 *
+	 * Handling of the defrag_counter is also racy. If we get the
+	 * wrong counts then we may unnecessarily do a defrag pass or defer
+	 * one. "ret" is already faked. So this is just increasing
+	 * the already existing fuzziness to get some notion as to when
+	 * to initiate slab defrag which will hopefully be okay.
+	 */
+	if (zone) {
+		/* balance_pgdat running on a zone so we only scan one node */
+		zone->slab_defrag_counter += ret;
+		if (zone->slab_defrag_counter > slab_defrag_limit &&
+						(gfp_mask & __GFP_FS)) {
+			zone->slab_defrag_counter = 0;
+			kmem_cache_defrag(zone_to_nid(zone));
+		}
+	} else {
+		/* Direct (and thus global) reclaim. Scan all nodes */
+		slab_defrag_counter += ret;
+		if (slab_defrag_counter > slab_defrag_limit &&
+						(gfp_mask & __GFP_FS)) {
+			slab_defrag_counter = 0;
+			kmem_cache_defrag(-1);
+		}
+	}
 	return ret;
 }
 
@@ -1379,7 +1428,7 @@ static unsigned long do_try_to_free_page
 		 * over limit cgroups
 		 */
 		if (scan_global_lru(sc)) {
-			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages, NULL);
 			if (reclaim_state) {
 				nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
@@ -1606,7 +1655,7 @@ loop_again:
 				nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
-						lru_pages);
+						lru_pages, zone);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone_is_all_unreclaimable(zone))
@@ -1845,7 +1894,7 @@ unsigned long shrink_all_memory(unsigned
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
 		reclaim_state.reclaimed_slab = 0;
-		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages, NULL);
 		if (!reclaim_state.reclaimed_slab)
 			break;
 
@@ -1883,7 +1932,7 @@ unsigned long shrink_all_memory(unsigned
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1900,7 +1949,7 @@ unsigned long shrink_all_memory(unsigned
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages(), NULL);
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
@@ -2062,7 +2111,8 @@ static int __zone_reclaim(struct zone *z
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+		while (shrink_slab(sc.nr_scanned, gfp_mask, order,
+						zone) &&
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
 				slab_reclaimable - nr_pages)
 			;
Index: linux-next/include/linux/mmzone.h
===================================================================
--- linux-next.orig/include/linux/mmzone.h	2008-08-11 07:42:31.432357916 -0700
+++ linux-next/include/linux/mmzone.h	2008-08-11 07:47:02.122357983 -0700
@@ -256,6 +256,7 @@ struct zone {
 	unsigned long		nr_scan_active;
 	unsigned long		nr_scan_inactive;
 	unsigned long		pages_scanned;	   /* since last reclaim */
+	unsigned long		slab_defrag_counter; /* since last defrag */
 	unsigned long		flags;		   /* zone flags, see below */
 
 	/* Zone statistics */
Index: linux-next/include/linux/swap.h
===================================================================
--- linux-next.orig/include/linux/swap.h	2008-08-11 07:42:32.602094688 -0700
+++ linux-next/include/linux/swap.h	2008-08-11 07:47:02.182346970 -0700
@@ -188,6 +188,9 @@ extern unsigned long try_to_free_mem_cgr
 extern int __isolate_lru_page(struct page *page, int mode);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
+extern int slab_defrag_limit;
+extern int slab_defrag_counter;
+
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
 
Index: linux-next/kernel/sysctl.c
===================================================================
--- linux-next.orig/kernel/sysctl.c	2008-08-11 07:42:34.022358796 -0700
+++ linux-next/kernel/sysctl.c	2008-08-11 07:47:02.242347675 -0700
@@ -1073,6 +1073,26 @@ static struct ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_limit",
+		.data		= &slab_defrag_limit,
+		.maxlen		= sizeof(slab_defrag_limit),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one_hundred,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slab_defrag_count",
+		.data		= &slab_defrag_counter,
+		.maxlen		= sizeof(slab_defrag_counter),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.ctl_name	= VM_LEGACY_VA_LAYOUT,
Index: linux-next/Documentation/sysctl/vm.txt
===================================================================
--- linux-next.orig/Documentation/sysctl/vm.txt	2008-08-11 07:40:41.632346812 -0700
+++ linux-next/Documentation/sysctl/vm.txt	2008-08-11 07:47:02.302358085 -0700
@@ -38,6 +38,7 @@ Currently, these files are in /proc/sys/
 - numa_zonelist_order
 - nr_hugepages
 - nr_overcommit_hugepages
+- slab_defrag_limit
 
 ==============================================================
 
@@ -347,3 +348,14 @@ Change the maximum size of the hugepage 
 nr_hugepages + nr_overcommit_hugepages.
 
 See Documentation/vm/hugetlbpage.txt
+
+==============================================================
+
+slab_defrag_limit
+
+Determines the frequency of calls from reclaim into slab defragmentation.
+Slab defrag reclaims objects from sparsely populated slab pages.
+The default is 1000. Increase if slab defragmentation occurs
+too frequently. Decrease if more slab defragmentation passes
+are needed. The slabinfo tool can report on the frequency of the callbacks.
+
Index: linux-next/mm/vmstat.c
===================================================================
--- linux-next.orig/mm/vmstat.c	2008-08-11 07:42:34.492359354 -0700
+++ linux-next/mm/vmstat.c	2008-08-11 07:47:04.222357891 -0700
@@ -714,10 +714,12 @@ static void zoneinfo_show_print(struct s
 #endif
 	}
 	seq_printf(m,
+		   "\n  slab_defrag_count: %lu"
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
 		   "\n  start_pfn:         %lu",
-			   zone_is_all_unreclaimable(zone),
+			zone->slab_defrag_counter,
+			zone_is_all_unreclaimable(zone),
 		   zone->prev_priority,
 		   zone->zone_start_pfn);
 	seq_putc(m, '\n');

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 10/19] buffer heads: Support slab defrag
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (8 preceding siblings ...)
  2008-08-11 15:06 ` [patch 09/19] slub: Trigger defragmentation from memory reclaim Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 11/19] inodes: Support generic defragmentation Christoph Lameter
                   ` (9 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0024-Buffer-heads-Support-slab-defrag.patch --]
[-- Type: text/plain, Size: 3287 bytes --]

Defragmentation support for buffer heads. We convert the references to
buffers to struct page references and try to remove the buffers from
those pages. If the pages are dirty then trigger writeout so that the
buffer heads can be removed later.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/buffer.c |   99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

Index: linux-next/fs/buffer.c
===================================================================
--- linux-next.orig/fs/buffer.c	2008-08-11 07:50:08.322348091 -0700
+++ linux-next/fs/buffer.c	2008-08-11 07:54:49.222357893 -0700
@@ -3316,6 +3316,104 @@ int bh_submit_read(struct buffer_head *b
 }
 EXPORT_SYMBOL(bh_submit_read);
 
+/*
+ * Writeback a page to clean the dirty state. Called with the page locked;
+ * the lock is dropped here unless ->writepage() has already dropped it.
+ */
+static void trigger_write(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int rc;
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = 1,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+		.nonblocking = 1,
+		.for_reclaim = 0
+	};
+
+	if (!mapping || !mapping->a_ops->writepage)
+		/* No mapping or no write method for the address space */
+		goto unlock;
+
+	if (!clear_page_dirty_for_io(page))
+		/* Someone else already triggered a write */
+		goto unlock;
+
+	rc = mapping->a_ops->writepage(page, &wbc);
+	if (rc != AOP_WRITEPAGE_ACTIVATE)
+		/* ->writepage() unlocked the page, on success and on error */
+		return;
+unlock:
+	unlock_page(page);
+}
+
+/*
+ * Get references on buffers.
+ *
+ * We obtain references on the pages that use the buffers. Afterwards v[]
+ * holds each referenced page once, packed at the front; the rest is NULL.
+ *
+ * We are safe from the underlying page being removed simply by doing
+ * a get_page_unless_zero. The buffer head removal may race at will.
+ * try_to_free_buffers will later take appropriate locks to remove the
+ * buffers if they are still there.
+ */
+static void *get_buffers(struct kmem_cache *s, int nr, void **v)
+{
+	struct page *page;
+	struct buffer_head *bh;
+	int i, j;
+	int n = 0;
+
+	for (i = 0; i < nr; i++) {
+		bh = v[i];
+		v[i] = NULL;
+
+		page = bh->b_page;
+		if (!page)
+			continue;
+
+		/* Buffers may share a page: take one reference per page */
+		for (j = 0; j < n; j++)
+			if (page == v[j])
+				break;
+		if (j == n && get_page_unless_zero(page))
+			v[n++] = page;
+	}
+	return NULL;
+}
+
+/*
+ * Despite its name: kick_buffers operates on a list of pointers to
+ * page structs that were set up by get_buffers().
+ */
+static void kick_buffers(struct kmem_cache *s, int nr, void **v,
+							void *private)
+{
+	struct page *page;
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		page = v[i];
+
+		if (!page || PageWriteback(page))
+			continue;
+
+		if (trylock_page(page)) {
+			if (PageDirty(page))
+				trigger_write(page);
+			else {
+				if (PagePrivate(page))
+					try_to_free_buffers(page);
+				unlock_page(page);
+			}
+		}
+		put_page(page);
+	}
+}
+
 static void
 init_buffer_head(void *data)
 {
@@ -3334,6 +3432,7 @@ void __init buffer_init(void)
 				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
 				SLAB_MEM_SPREAD),
 				init_buffer_head);
+	kmem_cache_setup_defrag(bh_cachep, get_buffers, kick_buffers);
 
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 11/19] inodes: Support generic defragmentation
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (9 preceding siblings ...)
  2008-08-11 15:06 ` [patch 10/19] buffer heads: Support slab defrag Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 12/19] Filesystem: Ext2 filesystem defrag Christoph Lameter
                   ` (8 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Alexander Viro, Christoph Hellwig, Christoph Lameter,
	Christoph Lameter, linux-kernel, linux-fsdevel, Mel Gorman, andi,
	Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0025-inodes-Support-generic-defragmentation.patch --]
[-- Type: text/plain, Size: 5241 bytes --]

This implements the ability to remove inodes in a particular slab
from inode caches. In order to remove an inode we may have to write out
the pages of an inode, the inode itself and remove the dentries referring
to the inode.

Provide generic functionality that can be used by filesystems that have
their own inode caches to also tie into the defragmentation functions
that are made available here.

Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/inode.c         |  123 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |    6 ++
 2 files changed, 129 insertions(+)

Index: linux-next/fs/inode.c
===================================================================
--- linux-next.orig/fs/inode.c	2008-08-11 07:42:10.738607937 -0700
+++ linux-next/fs/inode.c	2008-08-11 07:47:04.342348902 -0700
@@ -1363,6 +1363,128 @@ static int __init set_ihash_entries(char
 __setup("ihash_entries=", set_ihash_entries);
 
 /*
+ * Obtain a refcount on a list of struct inodes pointed to by v. If the
+ * inode is in the process of being freed then zap the v[] entry so that
+ * we skip the freeing attempts later.
+ *
+ * This is a generic function for the ->get slab defrag callback.
+ */
+void *get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	int i;
+
+	spin_lock(&inode_lock);
+	for (i = 0; i < nr; i++) {
+		struct inode *inode = v[i];
+
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE))
+			v[i] = NULL;
+		else
+			__iget(inode);
+	}
+	spin_unlock(&inode_lock);
+	return NULL;
+}
+EXPORT_SYMBOL(get_inodes);
+
+/*
+ * Function for filesystems that embed struct inode into their own
+ * fs inode. The offset is the offset of the struct inode in the fs inode.
+ *
+ * The function adds to the pointers in v[] in order to make them point to
+ * struct inode. Then get_inodes() is used to get the refcount.
+ * The converted v[] pointers can then also be passed to the kick() callback
+ * without further processing.
+ */
+void *fs_get_inodes(struct kmem_cache *s, int nr, void **v,
+						unsigned long offset)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		v[i] += offset;
+
+	return get_inodes(s, nr, v);
+}
+EXPORT_SYMBOL(fs_get_inodes);
+
+/*
+ * Generic callback function for slab defrag ->kick methods. Takes the
+ * array with inodes where we obtained refcounts using fs_get_inodes()
+ * or get_inodes() and tries to free them.
+ */
+void kick_inodes(struct kmem_cache *s, int nr, void **v, void *private)
+{
+	struct inode *inode;
+	int i;
+	int abort = 0;
+	LIST_HEAD(freeable);
+	int active;
+
+	for (i = 0; i < nr; i++) {
+		inode = v[i];
+		if (!inode)
+			continue;
+
+		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+			if (remove_inode_buffers(inode))
+				/*
+				 * Should we really be doing this? Or
+				 * limit the writeback here to only a few pages?
+				 *
+				 * Possibly an expensive operation but we
+				 * cannot reclaim the inode if the pages
+				 * are still present.
+				 */
+				invalidate_mapping_pages(&inode->i_data,
+								0, -1);
+		}
+
+		/* Invalidate children and dentry */
+		if (S_ISDIR(inode->i_mode)) {
+			struct dentry *d = d_find_alias(inode);
+
+			if (d) {
+				d_invalidate(d);
+				dput(d);
+			}
+		}
+
+		if (inode->i_state & I_DIRTY)
+			write_inode_now(inode, 1);
+
+		d_prune_aliases(inode);
+	}
+
+	mutex_lock(&iprune_mutex);
+	for (i = 0; i < nr; i++) {
+		inode = v[i];
+
+		if (!inode)
+			/* inode is already being freed */
+			continue;
+
+		active = inode->i_sb->s_flags & MS_ACTIVE;
+		iput(inode);
+		if (abort || !active)
+			continue;
+
+		spin_lock(&inode_lock);
+		abort =  !can_unuse(inode);
+
+		if (!abort) {
+			list_move(&inode->i_list, &freeable);
+			inode->i_state |= I_FREEING;
+			inodes_stat.nr_unused--;
+		}
+		spin_unlock(&inode_lock);
+	}
+	dispose_list(&freeable);
+	mutex_unlock(&iprune_mutex);
+}
+EXPORT_SYMBOL(kick_inodes);
+
+/*
  * Initialize the waitqueues and inode hash table.
  */
 void __init inode_init_early(void)
@@ -1401,6 +1523,7 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
+	kmem_cache_setup_defrag(inode_cachep, get_inodes, kick_inodes);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
Index: linux-next/include/linux/fs.h
===================================================================
--- linux-next.orig/include/linux/fs.h	2008-08-11 07:42:30.598607988 -0700
+++ linux-next/include/linux/fs.h	2008-08-11 07:47:05.012377598 -0700
@@ -1846,6 +1846,12 @@ static inline void insert_inode_hash(str
 	__insert_inode_hash(inode, inode->i_ino);
 }
 
+/* Helper functions for inode defragmentation support in filesystems */
+extern void kick_inodes(struct kmem_cache *, int, void **, void *);
+extern void *get_inodes(struct kmem_cache *, int nr, void **);
+extern void *fs_get_inodes(struct kmem_cache *, int nr, void **,
+						unsigned long offset);
+
 extern struct file * get_empty_filp(void);
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 12/19] Filesystem: Ext2 filesystem defrag
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (10 preceding siblings ...)
  2008-08-11 15:06 ` [patch 11/19] inodes: Support generic defragmentation Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 13/19] Filesystem: Ext3 " Christoph Lameter
                   ` (7 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: ext2-defrag --]
[-- Type: text/plain, Size: 1104 bytes --]

Support defragmentation for ext2 filesystem inodes

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/ext2/super.c |    9 +++++++++
 1 file changed, 9 insertions(+)

Index: linux-next/fs/ext2/super.c
===================================================================
--- linux-next.orig/fs/ext2/super.c	2008-08-11 07:42:10.212350007 -0700
+++ linux-next/fs/ext2/super.c	2008-08-11 07:47:05.022348226 -0700
@@ -171,6 +171,12 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *ext2_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext2_inode_info, vfs_inode));
+}
+
 static int init_inodecache(void)
 {
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
@@ -180,6 +186,9 @@ static int init_inodecache(void)
 					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
+
+	kmem_cache_setup_defrag(ext2_inode_cachep,
+			ext2_get_inodes, kick_inodes);
 	return 0;
 }
 

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 13/19] Filesystem: Ext3 filesystem defrag
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (11 preceding siblings ...)
  2008-08-11 15:06 ` [patch 12/19] Filesystem: Ext2 filesystem defrag Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 14/19] Filesystem: Ext4 " Christoph Lameter
                   ` (6 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: ext3-defrag --]
[-- Type: text/plain, Size: 1101 bytes --]

Support defragmentation for ext3 filesystem inodes

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/ext3/super.c |    8 ++++++++
 1 file changed, 8 insertions(+)

Index: linux-next/fs/ext3/super.c
===================================================================
--- linux-next.orig/fs/ext3/super.c	2008-08-11 07:42:10.348607875 -0700
+++ linux-next/fs/ext3/super.c	2008-08-11 07:47:05.042348829 -0700
@@ -484,6 +484,12 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *ext3_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext3_inode_info, vfs_inode));
+}
+
 static int init_inodecache(void)
 {
 	ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
@@ -493,6 +499,8 @@ static int init_inodecache(void)
 					     init_once);
 	if (ext3_inode_cachep == NULL)
 		return -ENOMEM;
+	kmem_cache_setup_defrag(ext3_inode_cachep,
+			ext3_get_inodes, kick_inodes);
 	return 0;
 }
 

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 14/19] Filesystem: Ext4 filesystem defrag
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (12 preceding siblings ...)
  2008-08-11 15:06 ` [patch 13/19] Filesystem: Ext3 " Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 15/19] Filesystem: XFS slab defragmentation Christoph Lameter
                   ` (5 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: ext4-defrag --]
[-- Type: text/plain, Size: 1101 bytes --]

Support defragmentation for ext4 filesystem inodes

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/ext4/super.c |    8 ++++++++
 1 file changed, 8 insertions(+)

Index: linux-next/fs/ext4/super.c
===================================================================
--- linux-next.orig/fs/ext4/super.c	2008-08-11 07:42:10.412359410 -0700
+++ linux-next/fs/ext4/super.c	2008-08-11 07:47:05.062348866 -0700
@@ -605,6 +605,12 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *ext4_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct ext4_inode_info, vfs_inode));
+}
+
 static int init_inodecache(void)
 {
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
@@ -614,6 +620,8 @@ static int init_inodecache(void)
 					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
+	kmem_cache_setup_defrag(ext4_inode_cachep,
+			ext4_get_inodes, kick_inodes);
 	return 0;
 }
 

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 15/19] Filesystem: XFS slab defragmentation
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (13 preceding siblings ...)
  2008-08-11 15:06 ` [patch 14/19] Filesystem: Ext4 " Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-12  0:20   ` Dave Chinner
  2008-08-11 15:06 ` [patch 16/19] Filesystem: /proc filesystem support for slab defrag Christoph Lameter
                   ` (4 subsequent siblings)
  19 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0027-FS-XFS-slab-defragmentation.patch --]
[-- Type: text/plain, Size: 885 bytes --]

Support inode defragmentation for xfs

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/xfs/linux-2.6/xfs_super.c |    1 +
 1 file changed, 1 insertion(+)

Index: linux-next/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- linux-next.orig/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:42:11.892359719 -0700
+++ linux-next/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:47:05.072348463 -0700
@@ -1947,6 +1947,7 @@ xfs_init_zones(void)
 	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
 	if (!xfs_ioend_zone)
 		goto out_destroy_vnode_zone;
+	kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
 
 	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
 						  xfs_ioend_zone);

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 16/19] Filesystem: /proc filesystem support for slab defrag
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (14 preceding siblings ...)
  2008-08-11 15:06 ` [patch 15/19] Filesystem: XFS slab defragmentation Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 17/19] Filesystem: Slab defrag: Reiserfs support Christoph Lameter
                   ` (3 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Alexey Dobriyan, Christoph Lameter, Christoph Lameter,
	linux-kernel, linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm,
	Dave Chinner

[-- Attachment #1: 0028-FS-Proc-filesystem-support-for-slab-defrag.patch --]
[-- Type: text/plain, Size: 1153 bytes --]

Support procfs inode defragmentation

Cc: Alexey Dobriyan <adobriyan@sw.ru>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/inode.c |    8 ++++++++
 1 file changed, 8 insertions(+)

Index: linux-next/fs/proc/inode.c
===================================================================
--- linux-next.orig/fs/proc/inode.c	2008-08-11 07:42:11.542357951 -0700
+++ linux-next/fs/proc/inode.c	2008-08-11 07:47:05.102359374 -0700
@@ -106,6 +106,12 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
+static void *proc_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct proc_inode, vfs_inode));
+};
+
 int __init proc_init_inodecache(void)
 {
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
@@ -113,6 +119,8 @@ int __init proc_init_inodecache(void)
 					     0, (SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD|SLAB_PANIC),
 					     init_once);
+	kmem_cache_setup_defrag(proc_inode_cachep,
+				proc_get_inodes, kick_inodes);
 	return 0;
 }
 

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 17/19] Filesystem: Slab defrag: Reiserfs support
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (15 preceding siblings ...)
  2008-08-11 15:06 ` [patch 16/19] Filesystem: /proc filesystem support for slab defrag Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-11 15:06 ` [patch 18/19] dentries: Add constructor Christoph Lameter
                   ` (2 subsequent siblings)
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0029-FS-Slab-defrag-Reiserfs-support.patch --]
[-- Type: text/plain, Size: 1125 bytes --]

Slab defragmentation: Support reiserfs inode defragmentation.

Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/reiserfs/super.c |    8 ++++++++
 1 file changed, 8 insertions(+)

Index: linux-next/fs/reiserfs/super.c
===================================================================
--- linux-next.orig/fs/reiserfs/super.c	2008-08-11 07:42:11.712359126 -0700
+++ linux-next/fs/reiserfs/super.c	2008-08-11 07:47:05.122348709 -0700
@@ -533,6 +533,12 @@ static void init_once(void *foo)
 #endif
 }
 
+static void *reiserfs_get_inodes(struct kmem_cache *s, int nr, void **v)
+{
+	return fs_get_inodes(s, nr, v,
+		offsetof(struct reiserfs_inode_info, vfs_inode));
+}
+
 static int init_inodecache(void)
 {
 	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
@@ -543,6 +549,8 @@ static int init_inodecache(void)
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
+	kmem_cache_setup_defrag(reiserfs_inode_cachep,
+			reiserfs_get_inodes, kick_inodes);
 	return 0;
 }
 

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 18/19] dentries: Add constructor
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (16 preceding siblings ...)
  2008-08-11 15:06 ` [patch 17/19] Filesystem: Slab defrag: Reiserfs support Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-13  7:12   ` Pekka Enberg
  2008-08-11 15:06 ` [patch 19/19] dentries: dentry defragmentation Christoph Lameter
  2008-08-12  6:50 ` [patch 00/19] Slab Fragmentation Reduction V14 Pekka Enberg
  19 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Alexander Viro, Christoph Hellwig, Christoph Lameter,
	Christoph Lameter, linux-kernel, linux-fsdevel, Mel Gorman, andi,
	Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0031-dentries-Add-constructor.patch --]
[-- Type: text/plain, Size: 2319 bytes --]

In order to support defragmentation on the dentry cache we need to have
a determined object state at all times. Without a constructor the object
would have a random state after allocation.

So provide a constructor.

Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/dcache.c |   26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

Index: linux-next/fs/dcache.c
===================================================================
--- linux-next.orig/fs/dcache.c	2008-08-11 07:42:10.072351650 -0700
+++ linux-next/fs/dcache.c	2008-08-11 07:50:02.361383352 -0700
@@ -899,6 +899,16 @@ static struct shrinker dcache_shrinker =
 	.seeks = DEFAULT_SEEKS,
 };
 
+static void dcache_ctor(void *p)
+{
+	struct dentry *dentry = p;
+
+	spin_lock_init(&dentry->d_lock);
+	dentry->d_inode = NULL;
+	INIT_LIST_HEAD(&dentry->d_lru);
+	INIT_LIST_HEAD(&dentry->d_alias);
+}
+
 /**
  * d_alloc	-	allocate a dcache entry
  * @parent: parent of entry to allocate
@@ -936,8 +946,6 @@ struct dentry *d_alloc(struct dentry * p
 
 	atomic_set(&dentry->d_count, 1);
 	dentry->d_flags = DCACHE_UNHASHED;
-	spin_lock_init(&dentry->d_lock);
-	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
 	dentry->d_sb = NULL;
 	dentry->d_op = NULL;
@@ -947,9 +955,7 @@ struct dentry *d_alloc(struct dentry * p
 	dentry->d_cookie = NULL;
 #endif
 	INIT_HLIST_NODE(&dentry->d_hash);
-	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
-	INIT_LIST_HEAD(&dentry->d_alias);
 
 	if (parent) {
 		dentry->d_parent = dget(parent);
@@ -2275,14 +2281,10 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	/* 
-	 * A constructor could be added for stable state like the lists,
-	 * but it is probably not worth it because of the cache nature
-	 * of the dcache. 
-	 */
-	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
-	
+	dentry_cache = kmem_cache_create("dentry_cache", sizeof(struct dentry),
+		0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD,
+		dcache_ctor);
+
 	register_shrinker(&dcache_shrinker);
 
 	/* Hash may have been set up in dcache_init_early */

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [patch 19/19] dentries: dentry defragmentation
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (17 preceding siblings ...)
  2008-08-11 15:06 ` [patch 18/19] dentries: Add constructor Christoph Lameter
@ 2008-08-11 15:06 ` Christoph Lameter
  2008-08-12  6:50 ` [patch 00/19] Slab Fragmentation Reduction V14 Pekka Enberg
  19 siblings, 0 replies; 28+ messages in thread
From: Christoph Lameter @ 2008-08-11 15:06 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Alexander Viro, Christoph Hellwig, Christoph Lameter,
	Christoph Lameter, linux-kernel, linux-fsdevel, Mel Gorman, andi,
	Rik van Riel, mpm, Dave Chinner

[-- Attachment #1: 0032-dentries-dentry-defragmentation.patch --]
[-- Type: text/plain, Size: 4214 bytes --]

The dentry pruning for unused entries works in a straightforward way. It
could be made more aggressive if one would actually move dentries instead
of just reclaiming them.

Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/dcache.c |  101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 1 deletion(-)

Index: linux-next/fs/dcache.c
===================================================================
--- linux-next.orig/fs/dcache.c	2008-08-11 07:47:05.132349135 -0700
+++ linux-next/fs/dcache.c	2008-08-11 07:47:05.152349011 -0700
@@ -32,6 +32,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 
@@ -172,7 +173,10 @@ static struct dentry *d_kill(struct dent
 
 	list_del(&dentry->d_u.d_child);
 	dentry_stat.nr_dentry--;	/* For d_free, below */
-	/*drops the locks, at that point nobody can reach this dentry */
+	/*
+	 * drops the locks, at that point nobody (aside from defrag)
+	 * can reach this dentry
+	 */
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
 	d_free(dentry);
@@ -2277,6 +2281,100 @@ static void __init dcache_init_early(voi
 		INIT_HLIST_HEAD(&dentry_hashtable[loop]);
 }
 
+/*
+ * The slab allocator is holding off frees. We can safely examine
+ * the object without the danger of it vanishing from under us.
+ */
+static void *get_dentries(struct kmem_cache *s, int nr, void **v)
+{
+	struct dentry *dentry;
+	int i;
+
+	spin_lock(&dcache_lock);
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+
+		/*
+		 * Three sorts of dentries cannot be reclaimed:
+		 *
+		 * 1. dentries that are in the process of being allocated
+		 *    or being freed. In that case the dentry is neither
+		 *    on the LRU nor hashed.
+		 *
+		 * 2. Fake hashed entries as used for anonymous dentries
+		 *    and pipe I/O. The fake hashed entries have d_flags
+		 *    set to indicate a hashed entry. However, the
+		 *    d_hash field indicates that the entry is not hashed.
+		 *
+		 * 3. dentries that have a backing store that is not
+		 *    writable. This is true for tmpfs and other
+		 *    in-memory filesystems. Removing dentries from them
+		 *    would lose those dentries for good.
+		 */
+		if ((d_unhashed(dentry) && list_empty(&dentry->d_lru)) ||
+		   (!d_unhashed(dentry) && hlist_unhashed(&dentry->d_hash)) ||
+		   (dentry->d_inode &&
+		   !mapping_cap_writeback_dirty(dentry->d_inode->i_mapping)))
+			/* Ignore this dentry */
+			v[i] = NULL;
+		else
+			/* dget_locked will remove the dentry from the LRU */
+			dget_locked(dentry);
+	}
+	spin_unlock(&dcache_lock);
+	return NULL;
+}
+
+/*
+ * Slab has dropped all the locks. Get rid of the refcount obtained
+ * earlier and also free the object.
+ */
+static void kick_dentries(struct kmem_cache *s,
+				int nr, void **v, void *private)
+{
+	struct dentry *dentry;
+	int i;
+
+	/*
+	 * First invalidate the dentries without holding the dcache lock
+	 */
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+
+		if (dentry)
+			d_invalidate(dentry);
+	}
+
+	/*
+	 * If we are the last one holding a reference then the dentries can
+	 * be freed. We need the dcache_lock.
+	 */
+	spin_lock(&dcache_lock);
+	for (i = 0; i < nr; i++) {
+		dentry = v[i];
+		if (!dentry)
+			continue;
+
+		spin_lock(&dentry->d_lock);
+		if (atomic_read(&dentry->d_count) > 1) {
+			spin_unlock(&dentry->d_lock);
+			spin_unlock(&dcache_lock);
+			dput(dentry);
+			spin_lock(&dcache_lock);
+			continue;
+		}
+
+		prune_one_dentry(dentry);
+	}
+	spin_unlock(&dcache_lock);
+
+	/*
+	 * dentries are freed using RCU so we need to wait until RCU
+	 * operations are complete.
+	 */
+	synchronize_rcu();
+}
+
 static void __init dcache_init(void)
 {
 	int loop;
@@ -2286,6 +2384,7 @@ static void __init dcache_init(void)
 		dcache_ctor);
 
 	register_shrinker(&dcache_shrinker);
+	kmem_cache_setup_defrag(dentry_cache, get_dentries, kick_dentries);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)

-- 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 15/19] Filesystem: XFS slab defragmentation
  2008-08-11 15:06 ` [patch 15/19] Filesystem: XFS slab defragmentation Christoph Lameter
@ 2008-08-12  0:20   ` Dave Chinner
  2008-08-12  7:08     ` Pekka Enberg
  0 siblings, 1 reply; 28+ messages in thread
From: Dave Chinner @ 2008-08-12  0:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, akpm, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm

On Mon, Aug 11, 2008 at 08:06:31AM -0700, Christoph Lameter wrote:
> Support inode defragmentation for xfs
> 
> Reviewed-by: Rik van Riel <riel@redhat.com>
> Signed-off-by: Christoph Lameter <clameter@sgi.com>
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 
> ---
>  fs/xfs/linux-2.6/xfs_super.c |    1 +
>  1 file changed, 1 insertion(+)
> 
> Index: linux-next/fs/xfs/linux-2.6/xfs_super.c
> ===================================================================
> --- linux-next.orig/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:42:11.892359719 -0700
> +++ linux-next/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:47:05.072348463 -0700
> @@ -1947,6 +1947,7 @@ xfs_init_zones(void)
>  	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
>  	if (!xfs_ioend_zone)
>  		goto out_destroy_vnode_zone;
> +	kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
>  
>  	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
>  						  xfs_ioend_zone);

Please update this patch as per my last comment.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 00/19] Slab Fragmentation Reduction V14
  2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
                   ` (18 preceding siblings ...)
  2008-08-11 15:06 ` [patch 19/19] dentries: dentry defragmentation Christoph Lameter
@ 2008-08-12  6:50 ` Pekka Enberg
  19 siblings, 0 replies; 28+ messages in thread
From: Pekka Enberg @ 2008-08-12  6:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, linux-kernel, linux-fsdevel, Mel Gorman, andi, Rik van Riel,
	mpm, Dave Chinner

Christoph Lameter wrote:
> V13->V14
> - Rediff against linux-next on request of Andrew
> - TestSetPageLocked -> trylock_page conversion.
> 
> Slab fragmentation is mainly an issue if Linux is used as a fileserver
> and large amounts of dentries, inodes and buffer heads accumulate. In some
> load situations the slabs become very sparsely populated so that a lot of
> memory is wasted by slabs that only contain one or a few objects. In
> extreme cases the performance of a machine will become sluggish since
> we are continually running reclaim without much success.
> Slab defragmentation adds the capability to recover the memory that
> is wasted.

Applied, thanks!

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 15/19] Filesystem: XFS slab defragmentation
  2008-08-12  0:20   ` Dave Chinner
@ 2008-08-12  7:08     ` Pekka Enberg
  2008-08-12 18:21       ` Christoph Lameter
  0 siblings, 1 reply; 28+ messages in thread
From: Pekka Enberg @ 2008-08-12  7:08 UTC (permalink / raw)
  To: Christoph Lameter, Pekka Enberg, akpm, Christoph Lameter,
	linux-kernel, linu

Dave Chinner wrote:
> On Mon, Aug 11, 2008 at 08:06:31AM -0700, Christoph Lameter wrote:
>> Support inode defragmentation for xfs
>>
>> Reviewed-by: Rik van Riel <riel@redhat.com>
>> Signed-off-by: Christoph Lameter <clameter@sgi.com>
>> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
>>
>> ---
>>  fs/xfs/linux-2.6/xfs_super.c |    1 +
>>  1 file changed, 1 insertion(+)
>>
>> Index: linux-next/fs/xfs/linux-2.6/xfs_super.c
>> ===================================================================
>> --- linux-next.orig/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:42:11.892359719 -0700
>> +++ linux-next/fs/xfs/linux-2.6/xfs_super.c	2008-08-11 07:47:05.072348463 -0700
>> @@ -1947,6 +1947,7 @@ xfs_init_zones(void)
>>  	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
>>  	if (!xfs_ioend_zone)
>>  		goto out_destroy_vnode_zone;
>> +	kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
>>  
>>  	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
>>  						  xfs_ioend_zone);
> 
> Please update this patch as per my last comment.

Christoph, I'm dropping this patch from my tree.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 15/19] Filesystem: XFS slab defragmentation
  2008-08-12  7:08     ` Pekka Enberg
@ 2008-08-12 18:21       ` Christoph Lameter
  2008-08-13  6:38         ` Pekka Enberg
  0 siblings, 1 reply; 28+ messages in thread
From: Christoph Lameter @ 2008-08-12 18:21 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: akpm, Christoph Lameter, linux-kernel, linux-fsdevel, Mel Gorman,
	andi, Rik van Riel, mpm

Pekka Enberg wrote:
>
>>
>> Please update this patch as per my last comment.
> 
> Christoph, I'm dropping this patch from my tree.

Just apply this patch:

Subject: defrag/xfs: Move defrag setup directly after xfs_vnode_zone kmem cache creation

Move the defrag setup so that it directly follows the creation of the
xfs_vnode_zone cache, instead of sitting after the unrelated xfs_ioend_zone
allocation.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

Index: linux-2.6/fs/xfs/linux-2.6/xfs_super.c
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_super.c	2008-08-04 08:27:09.000000000 -0500
+++ linux-2.6/fs/xfs/linux-2.6/xfs_super.c	2008-08-04 08:27:25.000000000 -0500
@@ -2021,11 +2021,11 @@
 	if (!xfs_vnode_zone)
 		goto out;

+	kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
+
 	xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
 	if (!xfs_ioend_zone)
 		goto out_destroy_vnode_zone;
-	kmem_cache_setup_defrag(xfs_vnode_zone, get_inodes, kick_inodes);
-
 	xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
 						  xfs_ioend_zone);
 	if (!xfs_ioend_pool)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 01/19] slub: Add defrag_ratio field and sysfs support.
  2008-08-11 15:06 ` [patch 01/19] slub: Add defrag_ratio field and sysfs support Christoph Lameter
@ 2008-08-13  0:40   ` Greg KH
  0 siblings, 0 replies; 28+ messages in thread
From: Greg KH @ 2008-08-13  0:40 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, akpm, Christoph Lameter, linux-kernel,
	linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm, Dave Chinner

On Mon, Aug 11, 2008 at 08:06:17AM -0700, Christoph Lameter wrote:
> The defrag_ratio is used to set the threshold at which defragmentation
> should be attempted on a slab page.
> 
> The allocation ratio is measured by the percentage of the available slots
> allocated.
> 
> Add a defrag ratio field and set it to 30% by default. A limit of 30% specifies
> that fewer than 3 out of 10 available slots for objects must be in use before
> slab defragmentation runs.

Any new sysfs file should also add documentation in Documentation/ABI.

Please add that to this patch.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 15/19] Filesystem: XFS slab defragmentation
  2008-08-12 18:21       ` Christoph Lameter
@ 2008-08-13  6:38         ` Pekka Enberg
  0 siblings, 0 replies; 28+ messages in thread
From: Pekka Enberg @ 2008-08-13  6:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Christoph Lameter, linux-kernel, linux-fsdevel, Mel Gorman,
	andi, Rik van Riel, mpm

Christoph Lameter wrote:
> Pekka Enberg wrote:
>>> Please update this patch as per my last comment.
>> Christoph, I'm dropping this patch from my tree.
> 
> Just apply this patch:
> 
> Subject: defrag/xfs: Move defrag setup directly after xfs_vnode_zone kmem
> cache creation

Applied, thanks!

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [patch 18/19] dentries: Add constructor
  2008-08-11 15:06 ` [patch 18/19] dentries: Add constructor Christoph Lameter
@ 2008-08-13  7:12   ` Pekka Enberg
  0 siblings, 0 replies; 28+ messages in thread
From: Pekka Enberg @ 2008-08-13  7:12 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: akpm, Alexander Viro, Christoph Hellwig, Christoph Lameter,
	linux-kernel, linux-fsdevel, Mel Gorman, andi, Rik van Riel, mpm,
	Dave Chinner

Christoph Lameter wrote:
> In order to support defragmentation on the dentry cache we need to have
> a determined object state at all times. Without a constructor the object
> would have a random state after allocation.
> 
> So provide a constructor.

I'm dropping these dentry patches for now because of boot-time oopses.

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2008-08-13  7:15 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-08-11 15:06 [patch 00/19] Slab Fragmentation Reduction V14 Christoph Lameter
2008-08-11 15:06 ` [patch 01/19] slub: Add defrag_ratio field and sysfs support Christoph Lameter
2008-08-13  0:40   ` Greg KH
2008-08-11 15:06 ` [patch 02/19] slub: Replace ctor field with ops field in /sys/slab/* Christoph Lameter
2008-08-11 15:06 ` [patch 03/19] slub: Add get() and kick() methods Christoph Lameter
2008-08-11 15:06 ` [patch 04/19] slub: Sort slab cache list and establish maximum objects for defrag slabs Christoph Lameter
2008-08-11 15:06 ` [patch 05/19] slub: Slab defrag core Christoph Lameter
2008-08-11 15:06 ` [patch 06/19] slub: Add KICKABLE to avoid repeated kick() attempts Christoph Lameter
2008-08-11 15:06 ` [patch 07/19] slub: Extend slabinfo to support -D and -F options Christoph Lameter
2008-08-11 15:06 ` [patch 08/19] slub/slabinfo: add defrag statistics Christoph Lameter
2008-08-11 15:06 ` [patch 09/19] slub: Trigger defragmentation from memory reclaim Christoph Lameter
2008-08-11 15:06 ` [patch 10/19] buffer heads: Support slab defrag Christoph Lameter
2008-08-11 15:06 ` [patch 11/19] inodes: Support generic defragmentation Christoph Lameter
2008-08-11 15:06 ` [patch 12/19] Filesystem: Ext2 filesystem defrag Christoph Lameter
2008-08-11 15:06 ` [patch 13/19] Filesystem: Ext3 " Christoph Lameter
2008-08-11 15:06 ` [patch 14/19] Filesystem: Ext4 " Christoph Lameter
2008-08-11 15:06 ` [patch 15/19] Filesystem: XFS slab defragmentation Christoph Lameter
2008-08-12  0:20   ` Dave Chinner
2008-08-12  7:08     ` Pekka Enberg
2008-08-12 18:21       ` Christoph Lameter
2008-08-13  6:38         ` Pekka Enberg
2008-08-11 15:06 ` [patch 16/19] Filesystem: /proc filesystem support for slab defrag Christoph Lameter
2008-08-11 15:06 ` [patch 17/19] Filesystem: Slab defrag: Reiserfs support Christoph Lameter
2008-08-11 15:06 ` [patch 18/19] dentries: Add constructor Christoph Lameter
2008-08-13  7:12   ` Pekka Enberg
2008-08-11 15:06 ` [patch 19/19] dentries: dentry defragmentation Christoph Lameter
2008-08-12  6:50 ` [patch 00/19] Slab Fragmentation Reduction V14 Pekka Enberg
  -- strict thread matches above, loose matches on Subject: below --
2008-05-10  2:21 [patch 00/19] Slab Fragmentation Reduction V13 Christoph Lameter
2008-05-10  2:21 ` [patch 09/19] slub: Trigger defragmentation from memory reclaim Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).