[PATCH] VM: Implements the swap-out page-clustering technique

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] VM: Implements the swap-out page-clustering technique
@ 2008-09-04 12:04 Hamid R. Jahanjou
  2008-09-04 23:14 ` Rik van Riel
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Hamid R. Jahanjou @ 2008-09-04 12:04 UTC (permalink / raw)
  To: linux-kernel

From: Hamid R. Jahanjou

Implements the idea of swap-out page clustering from *BSD for
Linux. Each time a candidate page is to be swapped out,
virtually-nearby pages are scanned to find eligible pages to be
swapped out too as a cluster. This technique increases the likelihood of
bringing in related data on a page fault and decreases swap space
fragmentation in the long run. Currently, Linux searches only
physically-nearby pages which is not optimal since, over time, physically-
adjacent pages may become unrelated.

The code can be statically tuned. No benchmarks. I'm not sure whether
the added complexity is acceptable.

Signed-off-by: Hamid R. Jahanjou <hamid.jahanjou@gmail.com>
---

diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h linux-2.6.26.1HJ/include/linux/pageout_clustering.h
--- linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h	1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/pageout_clustering.h	2008-08-28 20:19:52.000000000 +0330
@@ -0,0 +1,12 @@
+#ifndef PAGEOUT_CLUSTERING_H_
+#define PAGEOUT_CLUSTERING_H_
+
+#include<linux/list.h>
+#define INITIAL_CLUSTER_SCAN_RANGE 4 
+#define HALF_PAGEOUT_CLUSTER_SIZE 8      /* default number of pages to be clustered 
+					  * on either side of a given page */
+
+int try_to_cluster(struct page *, struct list_head *, 
+		   struct list_head *, int, int);
+
+#endif 
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/rmap.h linux-2.6.26.1HJ/include/linux/rmap.h
--- linux-2.6.26.1-vanilla/include/linux/rmap.h	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/rmap.h	2008-08-28 20:19:52.000000000 +0330
@@ -102,6 +102,12 @@ pte_t *page_check_address(struct page *,
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
 /*
+ * Used by pageout clustering 
+ */
+struct anon_vma *page_lock_anon_vma(struct page *);
+void page_unlock_anon_vma(struct anon_vma *);
+
+/*
  * Cleans the PTEs of shared mappings.
  * (and since clean PTEs should also be readonly, write protects them too)
  *
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/Makefile linux-2.6.26.1HJ/mm/Makefile
--- linux-2.6.26.1-vanilla/mm/Makefile	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/Makefile	2008-08-28 20:19:52.000000000 +0330
@@ -9,9 +9,9 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   readahead.o swap.o truncate.o pageout_clustering.o \
+			   vmscan.o prio_tree.o util.o mmzone.o vmstat.o \
+                           backing-dev.o page_isolation.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/pageout_clustering.c linux-2.6.26.1HJ/mm/pageout_clustering.c
--- linux-2.6.26.1-vanilla/mm/pageout_clustering.c	1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/mm/pageout_clustering.c	2008-09-04 09:26:40.000000000 +0330
@@ -0,0 +1,278 @@
+#include<linux/pageout_clustering.h>
+#include<linux/mm.h>
+#include<linux/pagemap.h>
+#include<linux/swap.h>
+#include<linux/swapops.h>
+#include<linux/slab.h>
+#include<linux/init.h>
+#include<linux/rmap.h>
+#include<linux/module.h>
+#include<linux/memcontrol.h>
+
+
+/*
+ * contains data related to pageout clustering
+ */
+struct clustering_info 
+{
+
+    struct page *page;                 /* page descriptor of a page used for sampling cluster pages */ 
+    struct vm_area_struct *vma;        /* VMA under consideration */
+    struct list_head *src;             /* source page list used by shrink_list and friends */
+    
+    union {
+	struct anon_vma *anon_vma;     /* anon_vma object we're using for reverse mapping */
+	struct address_space *mapping; /* address_space object we're using for reverse mapping */
+};
+    
+    struct page **cluster;             /* our page cluster */
+    unsigned int cluster_size;         /* cluster size */
+    unsigned int page_index;           /* index of the sampling page within the cluster */
+    unsigned int index;                /* slot in the cluster for the next page */
+    unsigned int range;                /* maximum number of VMA's to scan */
+    unsigned int nr_collected;         /* number of pages collected in the cluster */
+    unsigned int nr_sc;                /* current number of VMA's scanned */
+    int mode;                          /* page-isolation mode */
+    int anonym;                        /* indicates whether we are to collect mapped pages (0)
+					* or anonymous pages (1) */
+    int (*continue_search)
+	(struct clustering_info *);    /* returns true if the search can go on */
+};
+
+
+static inline int check_conditions(struct clustering_info *ci)
+{
+    return (ci->nr_sc < ci->range) && (ci->nr_collected < ci->cluster_size);
+}
+
+    
+/* embodies page-selection-for-clustering policy */
+static inline int page_allowed_in_cluster(struct page *page, struct clustering_info *ci)
+{
+    int zone_id = page_zone_id(ci->page);
+
+
+    /* scanning the original page ? */
+    if (unlikely(page == ci->page))
+	return 0;
+
+    /* skip pages belonging to other zones */
+    if (unlikely(page_zone_id(page) != zone_id))
+	return 0;
+
+    /* skip pages not in LRU lists */
+    if (!PageLRU(page))
+	return 0;
+
+    /* skip active pages */
+    if (page->flags & PG_active)
+	return 0;
+
+    /* skip non-dirty pages */
+    if (!(page->flags & PG_dirty))
+	return 0;
+        
+    return 1;
+}
+
+
+
+/* embodies vma-selection-for-clustering policy */ 
+static inline int vma_allowed_in_cluster(struct vm_area_struct *vma, struct clustering_info *ci)
+{
+    if ( (vma->vm_flags & VM_RESERVED) ||
+	 (vma->vm_flags & VM_LOCKED) ||
+	 (vma->vm_flags & VM_IO) ||
+	 (vma->vm_flags & VM_NONLINEAR) )
+	return 0;
+
+    /* we ignore mapped VMA's when doing anonym clustering; */
+    /* similarly, we ignore anonym VMA's when doing mapped clustering */
+    if ( ((ci->anonym) && (vma->vm_file != NULL)) ||
+	 (!(ci->anonym) && (vma->vm_file == NULL)) )
+	return 0;
+    
+    return 1;
+}
+
+
+
+/*
+ * scans a process's vma structures, to find pages eligible for being added to cluster
+ */
+static void try_to_cluster_vma(struct clustering_info *ci)
+{
+    struct vm_area_struct *cursor_vma = ci->vma;
+    struct page *cursor_page;
+    struct mm_struct *cursor_mm;
+    unsigned long vm_address;
+
+
+    cursor_mm = cursor_vma->vm_mm;
+    if (!down_read_trylock(&cursor_mm->mmap_sem))
+	return;
+
+    do
+    {
+	if (!vma_allowed_in_cluster(cursor_vma, ci))
+	    goto next_vma;
+
+	ci->nr_sc++;
+
+	for(vm_address = cursor_vma->vm_start; 
+	    vm_address < cursor_vma->vm_end && ci->nr_collected < ci->cluster_size; 
+	    vm_address += PAGE_SIZE)
+	{
+	    cursor_page = virt_to_page(vm_address);
+	    if (!page_allowed_in_cluster(cursor_page, ci))
+		continue;
+
+	    switch (__isolate_lru_page(cursor_page, ci->mode)) 
+	    {
+	    case 0:
+		ci->cluster[ci->index] = cursor_page;
+		ci->nr_collected++;
+		ci->index = (++ci->index) % (ci->cluster_size);
+		break;
+
+	    case -EBUSY:
+		/* it is being freed else where */
+		list_move(&cursor_page->lru, ci->src);
+		break;
+
+	    default:
+		break;
+	    };   
+	}
+
+next_vma: 
+	cursor_vma = cursor_vma->vm_next;
+	/* begin from the first VMA if the end of list is reached */
+	if (cursor_vma == NULL)
+	    cursor_vma = cursor_mm->mmap;
+
+    } while (cursor_vma != ci->vma && ci->continue_search(ci));
+
+    up_read(&cursor_mm->mmap_sem);
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles mapped pages
+ */
+static inline void try_to_cluster_mapped(struct clustering_info *ci)
+{
+    struct vm_area_struct *vma;
+    struct prio_tree_iter pst_itr;
+    const pgoff_t pgoff = ci->page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+    
+    vma_prio_tree_foreach(vma, &pst_itr, &ci->mapping->i_mmap, pgoff, pgoff) {
+	if (!ci->continue_search(ci))
+	    break;
+
+	ci->vma = vma;
+	try_to_cluster_vma(ci);
+    }
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles anonymous pages
+ */
+static inline void try_to_cluster_anon(struct clustering_info *ci)
+{
+    struct list_head *list_item;
+    struct vm_area_struct *vma;
+
+    
+    for (list_item = ci->anon_vma->head.next; 
+	 (list_item != &ci->anon_vma->head) && ci->continue_search(ci); 
+	 list_item = list_item->next)
+    {
+	vma = list_entry(list_item, struct vm_area_struct, anon_vma_node);
+	ci->vma = vma;
+	try_to_cluster_vma(ci);
+    }
+
+    return;
+}
+
+
+
+/*
+ * Searches in the virtual address space of the process possessing the page 
+ * to find other suitable pages to be swapped out as well. This is known as
+ * page clustering. Not to be confused with other usages of the word in Linux.
+ * 
+ * We adjust the range of our search by the value of the allocation order.
+ *
+ * @page: an originally-selected page
+ * @half_cluster_size: maximum number of pages to cluster on each side of the page
+ * @src: the list containing the page
+ * @dst: the list to insert pages to
+ * @order: allocation order
+ * @mode: page-isolation mode
+ *
+ * returns the number of pages added to the list, including the passed page.
+ */
+int try_to_cluster(struct page *page, struct list_head *src, 
+		   struct list_head *dst, int order, int mode)
+{
+    struct clustering_info cluster_info;
+    struct address_space *mapping;
+    struct anon_vma *anon_vma;
+    unsigned int range = INITIAL_CLUSTER_SCAN_RANGE << order;
+#define def_cluster_size (2 * HALF_PAGEOUT_CLUSTER_SIZE + 1)
+    struct page *cluster[def_cluster_size] = {NULL};
+    int i;
+
+
+    cluster[HALF_PAGEOUT_CLUSTER_SIZE] = page;
+    /* initialize commom fields only; other fields are
+     * set as we go on */
+    cluster_info.page = page;
+    cluster_info.vma = NULL;
+    cluster_info.src = src;
+    cluster_info.cluster = cluster;
+    cluster_info.cluster_size = def_cluster_size;
+    cluster_info.page_index = HALF_PAGEOUT_CLUSTER_SIZE;
+    cluster_info.index = HALF_PAGEOUT_CLUSTER_SIZE + 1;
+    cluster_info.range = range;
+    cluster_info.nr_collected = 1;
+    cluster_info.nr_sc = 0;
+    cluster_info.mode = mode;
+    cluster_info.continue_search = check_conditions;
+    
+    /* try to cluster anonymous pages ? */
+    anon_vma = page_lock_anon_vma(page);
+    if (anon_vma != NULL)
+    {
+	cluster_info.anon_vma = anon_vma;
+	cluster_info.anonym = 1;
+	try_to_cluster_anon(&cluster_info);
+	page_unlock_anon_vma(anon_vma);
+    }
+
+    /* try to cluster mapped pages ? */
+    else if (page_mapped(page))
+    {
+	mapping = page->mapping;
+	spin_lock(&mapping->i_mmap_lock);
+	cluster_info.mapping = mapping;
+	cluster_info.anonym = 0;
+	try_to_cluster_mapped(&cluster_info);
+	spin_unlock(&mapping->i_mmap_lock);
+    }
+
+    /* add to destination list */
+    for (i = 0; i < def_cluster_size; i++)
+	if (cluster[i] != NULL)
+	    list_move(&cluster[i]->lru, dst);
+
+    return cluster_info.nr_collected;
+}
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/rmap.c linux-2.6.26.1HJ/mm/rmap.c
--- linux-2.6.26.1-vanilla/mm/rmap.c	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/rmap.c	2008-08-28 20:19:52.000000000 +0330
@@ -156,7 +156,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -176,7 +176,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/vmscan.c linux-2.6.26.1HJ/mm/vmscan.c
--- linux-2.6.26.1-vanilla/mm/vmscan.c	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/vmscan.c	2008-08-28 20:19:52.000000000 +0330
@@ -38,6 +38,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/pageout_clustering.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -700,10 +701,7 @@ static unsigned long isolate_lru_pages(u
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct page *page;
-		unsigned long pfn;
-		unsigned long end_pfn;
-		unsigned long page_pfn;
-		int zone_id;
+		int nr_clustered;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
@@ -712,8 +710,7 @@ static unsigned long isolate_lru_pages(u
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			list_move(&page->lru, dst);
-			nr_taken++;
+		        /* counted and added to dst in try_to_cluster() */
 			break;
 
 		case -EBUSY:
@@ -725,51 +722,13 @@ static unsigned long isolate_lru_pages(u
 			BUG();
 		}
 
-		if (!order)
-			continue;
-
-		/*
-		 * Attempt to take all pages in the order aligned region
-		 * surrounding the tag page.  Only take those pages of
-		 * the same active state as that tag page.  We may safely
-		 * round the target page pfn down to the requested order
-		 * as the mem_map is guarenteed valid out to MAX_ORDER,
-		 * where that page is in a different zone we will detect
-		 * it from its zone id and abort this block scan.
-		 */
-		zone_id = page_zone_id(page);
-		page_pfn = page_to_pfn(page);
-		pfn = page_pfn & ~((1 << order) - 1);
-		end_pfn = pfn + (1 << order);
-		for (; pfn < end_pfn; pfn++) {
-			struct page *cursor_page;
-
-			/* The target page is in the block, ignore it. */
-			if (unlikely(pfn == page_pfn))
-				continue;
-
-			/* Avoid holes within the zone. */
-			if (unlikely(!pfn_valid_within(pfn)))
-				break;
-
-			cursor_page = pfn_to_page(pfn);
-			/* Check that we have not crossed a zone boundary. */
-			if (unlikely(page_zone_id(cursor_page) != zone_id))
-				continue;
-			switch (__isolate_lru_page(cursor_page, mode)) {
-			case 0:
-				list_move(&cursor_page->lru, dst);
-				nr_taken++;
-				scan++;
-				break;
-
-			case -EBUSY:
-				/* else it is being freed elsewhere */
-				list_move(&cursor_page->lru, src);
-			default:
-				break;
-			}
-		}
+		/* 
+		 * scan virtually-nearby pages in process address space
+		 * to find eligible pages to be swapped out as well
+		 */
+		nr_clustered = try_to_cluster(page, src, dst, order, mode);
+		nr_taken += nr_clustered;
+		scan += nr_clustered - 1;
 	}
 
 	*scanned = scan;



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
@ 2008-09-04 23:14 ` Rik van Riel
  2008-09-04 23:14 ` Andrew Morton
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 18+ messages in thread
From: Rik van Riel @ 2008-09-04 23:14 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

On Thu, 04 Sep 2008 15:34:30 +0330
"Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> wrote:

> From: Hamid R. Jahanjou
> 
> Implements the idea of swap-out page clustering from *BSD for
> Linux. Each time a candidate page is to be swapped out,
> virtually-nearby pages are scanned to find eligible pages to be
> swapped out too as a cluster. This technique increases the likelihood of
> bringing in related data on a page fault and decreases swap space
> fragmentation in the long run. Currently, Linux searches only
> physically-nearby pages which is not optimal since, over time, physically-
> adjacent pages may become unrelated.
> 
> The code can be statically tuned. No benchmarks. I'm not sure whether
> the added complexity is acceptable.

It would be good to get some idea of what testing you have done
with this patch.

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
  2008-09-04 23:14 ` Rik van Riel
@ 2008-09-04 23:14 ` Andrew Morton
  2008-09-05  7:45   ` Hamid R. Jahanjou
  2008-09-05  9:19 ` Andi Kleen
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 18+ messages in thread
From: Andrew Morton @ 2008-09-04 23:14 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

On Thu, 04 Sep 2008 15:34:30 +0330
"Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> wrote:

> Implements the idea of swap-out page clustering from *BSD for
> Linux. Each time a candidate page is to be swapped out,
> virtually-nearby pages are scanned to find eligible pages to be
> swapped out too as a cluster. This technique increases the likelihood of
> bringing in related data on a page fault and decreases swap space
> fragmentation in the long run. Currently, Linux searches only
> physically-nearby pages which is not optimal since, over time, physically-
> adjacent pages may become unrelated.

I tried that once.  The code all worked as-designed but didn't seem to
improve performance much across a spread of workloads.

> The code can be statically tuned. No benchmarks. I'm not sure whether
> the added complexity is acceptable.

Benchmarks are essential, please.  Good ones.

The whole point of the patch is to improve performance.  If we don't
know whether it improves performance, we cannot proceed in any way.

Secondly, please don't just dump a pile of new code in our laps and
expect us to pick through it and work out what it does and how it does
it.  Please at least provide a carefully-written english-language
description of the design and implementation.

Thirdly, I'd suggest that this code be converted into vaguely standard
kernel coding style sooner rather than later.  Convert it to use
eight-column hard tabs then check it with scripts/checkpatch.pl,
thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 23:14 ` Andrew Morton
@ 2008-09-05  7:45   ` Hamid R. Jahanjou
  2008-09-05  6:58     ` Andrew Morton
  0 siblings, 1 reply; 18+ messages in thread
From: Hamid R. Jahanjou @ 2008-09-05  7:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Andrew Morton wrote:
> I tried that once.  The code all worked as-designed but didn't seem to
> improve performance much across a spread of workloads.
>
> Benchmarks are essential, please.  Good ones.
Do you recommend any specific benchmarks for memory-related projects ?
Which
one did you use for your own implementation ?

Thank You

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-05  7:45   ` Hamid R. Jahanjou
@ 2008-09-05  6:58     ` Andrew Morton
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2008-09-05  6:58 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

On Fri, 05 Sep 2008 11:15:07 +0330 "Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> wrote:

> Andrew Morton wrote:
> > I tried that once.  The code all worked as-designed but didn't seem to
> > improve performance much across a spread of workloads.
> >
> > Benchmarks are essential, please.  Good ones.
> Do you recommend any specific benchmarks for memory-related projects ?
> Which
> one did you use for your own implementation ?
> 

Pretty much any and all benchmarks, when there's not enough memory
(boot with mem=).  Kernel compiles, database benchmarks, you name it. 
Basically anything which has a measurable execution time is relevant.

Designing and running these things is a lot of work and thought.  And
it's an integral part of the development process, because some things
are going to get slower and others will show large inter-run variations
and then one needs to get in there and find explanations and hopefully
fixes.

For example, start with small and simple microbenchmarks: run two
processes which concurrently allocate 1*mem, then modify it all
linearly, then read it all linearly.  Build up from there.  Do not
dive into complex benchmarks on day 1, because you'll have no hope of
explaining the results which they produce.

Beware that behaviour on SMP can be very different from behaviour on
uniprocessor because of the relatively large duration of a timeslice. 
Both must be tested.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
  2008-09-04 23:14 ` Rik van Riel
  2008-09-04 23:14 ` Andrew Morton
@ 2008-09-05  9:19 ` Andi Kleen
  2008-09-05 20:27   ` Hamid R. Jahanjou
  2008-09-08  3:50 ` Li Yu
  2008-09-24 13:56 ` Luiz Fernando N. Capitulino
  4 siblings, 1 reply; 18+ messages in thread
From: Andi Kleen @ 2008-09-05  9:19 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

"Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> writes:

> From: Hamid R. Jahanjou
>
> Implements the idea of swap-out page clustering from *BSD for
> Linux. Each time a candidate page is to be swapped out,
> virtually-nearby pages are scanned to find eligible pages to be
> swapped out too as a cluster. This technique increases the likelihood of
> bringing in related data on a page fault and decreases swap space
> fragmentation in the long run. Currently, Linux searches only
> physically-nearby pages which is not optimal since, over time, physically-
> adjacent pages may become unrelated.
>
> The code can be statically tuned. No benchmarks. I'm not sure whether
> the added complexity is acceptable.

Just some general comments:

First I think virtual swap clustering is a great idea in theory and
long overdue.  Hopefully the numbers will agree.

In general the code would be much nicer if you didn't pass around 
all that much in a structure (which is just a fancy way to have
a function with lots of arguments) Perhaps try to restructure
it a bit to make this smaller? Ideally clustering_info should disappear
or at least get much smaller.

Then continue_search seems to be only set to one value so it 
can be eliminated? Perhaps there is more like this.

I didn't quite understand the "adjust the value of our search by
the allocation order". The allocation order should be normally 0.
I think having a tunable for the cluster sizes would be a good idea.
At some point they might be even device dependent (e.g. on a flash
device you would like to have them roughly erase block sized) 

-Andi

-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-05  9:19 ` Andi Kleen
@ 2008-09-05 20:27   ` Hamid R. Jahanjou
  2008-09-05 19:45     ` Andi Kleen
  0 siblings, 1 reply; 18+ messages in thread
From: Hamid R. Jahanjou @ 2008-09-05 20:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

Andi Kleen wrote:
> "Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> writes:
>
>   
>> From: Hamid R. Jahanjou
>>
>> Implements the idea of swap-out page clustering from *BSD for
>> Linux. Each time a candidate page is to be swapped out,
>> virtually-nearby pages are scanned to find eligible pages to be
>> swapped out too as a cluster. This technique increases the likelihood of
>> bringing in related data on a page fault and decreases swap space
>> fragmentation in the long run. Currently, Linux searches only
>> physically-nearby pages which is not optimal since, over time, physically-
>> adjacent pages may become unrelated.
>>
>> The code can be statically tuned. No benchmarks. I'm not sure whether
>> the added complexity is acceptable.
>>     
>
> Just some general comments:
>
> First I think virtual swap clustering is a great idea in theory and
> long overdue.  Hopefully the numbers will agree.
>
> In general the code would be much nicer if you didn't pass around 
> all that much in a structure (which is just a fancy way to have
> a function with lots of arguments) Perhaps try to restructure
> it a bit to make this smaller? Ideally clustering_info should disappear
> or at least get much smaller.
>   

Thanks for the review and the comments.
Do you consider the clustering_info struct to hurt the readability
of the code or there is some other technical reasons ?

> I didn't quite understand the "adjust the value of our search by
> the allocation order". The allocation order should be normally 0.
> I think having a tunable for the cluster sizes would be a good idea.
> At some point they might be even device dependent (e.g. on a flash
> device you would like to have them roughly erase block sized)
>   

The allocation order value is initially passed to the try_to_free_pages()
in __alloc_pages_internal() and its value can be well more than zero.
The clustering code adjusts the cluster size to the value of the allocation
order (in a non-linear fashion of course). In this sense the cluster
size is
not determined at compile time, but the parameters affecting it are.
Perhaps the term "statically-tunable" is appropriate here.

-- 
Hamid R. Jahanjou
(hamid.jahanjou@gmail.com) 


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-05 20:27   ` Hamid R. Jahanjou
@ 2008-09-05 19:45     ` Andi Kleen
  2008-09-06  5:42       ` Rik van Riel
  0 siblings, 1 reply; 18+ messages in thread
From: Andi Kleen @ 2008-09-05 19:45 UTC (permalink / raw)
  To: Hamid R. Jahanjou; +Cc: Andi Kleen, linux-kernel

On Fri, Sep 05, 2008 at 11:57:52PM +0330, Hamid R. Jahanjou wrote:
> > In general the code would be much nicer if you didn't pass around 
> > all that much in a structure (which is just a fancy way to have
> > a function with lots of arguments) Perhaps try to restructure
> > it a bit to make this smaller? Ideally clustering_info should disappear
> > or at least get much smaller.
> >   
> 
> Thanks for the review and the comments.
> Do you consider the clustering_info struct to hurt the readability
> of the code or there is some other technical reasons ?

It's a fancy way to have a function with lots of arguments
and having lots of arguments is typically a sign for a poor code
structure, with functions not being properly separated.

I know the standard scanner does it too, but it's also somewhat
poor there and especially there's no excuse for doing it in much
simpler code/

> 
> > I didn't quite understand the "adjust the value of our search by
> > the allocation order". The allocation order should be normally 0.
> > I think having a tunable for the cluster sizes would be a good idea.
> > At some point they might be even device dependent (e.g. on a flash
> > device you would like to have them roughly erase block sized)
> >   
> 
> The allocation order value is initially passed to the try_to_free_pages()

Yes, but that has nothing to do in how much you should swap cluster.

AFAIK the main motivation for swap clustering is to avoid the extreme
seekiness that happens during swap in (that is the main reason
why swap on Linux often takes so long). And also to use the IO
device well. IO devices today typically have a "minimum useful 
unit of IO" which is much larger than a page. Typically it's at least a 
MB, sometimes even more e.g. on flash or on some RAID controllers.  

Making it even larger is good to avoid seeks on read which are very 
costly on spinning disks.

Also for user application swapping the order will be near always 0
(except for large page users, but that's really a exotic corner case today)
And clustering for one page doesn't make much sense.

So I don't think you should care about that order, but make it a 
tunable and experiment what the best values are on different IO
devices and then possibly later make it dependent on the device.

-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-05 19:45     ` Andi Kleen
@ 2008-09-06  5:42       ` Rik van Riel
  2008-09-08  0:28         ` Zan Lynx
  0 siblings, 1 reply; 18+ messages in thread
From: Rik van Riel @ 2008-09-06  5:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Hamid R. Jahanjou, Andi Kleen, linux-kernel

On Fri, 5 Sep 2008 21:45:01 +0200
Andi Kleen <andi@firstfloor.org> wrote:

> Yes, but that has nothing to do in how much you should swap cluster.
> 
> AFAIK the main motivation for swap clustering is to avoid the extreme
> seekiness that happens during swap in (that is the main reason
> why swap on Linux often takes so long). And also to use the IO
> device well. IO devices today typically have a "minimum useful 
> unit of IO" which is much larger than a page. Typically it's at least
> a MB, sometimes even more e.g. on flash or on some RAID controllers.  

That is indeed the reason.

Basically, the swap IO size should be large enough that
swap throughput is not incredibly slow when swapping in
lots of related data in a process.

On the other hand, you do want to avoid evicting data 
that the process is still using, just because you are 
swapping out a not recently referenced page on a
nearby virtual address.

I suspect that the swap IO cluster could contain nearby
and virtually contiguous pages that are (1) not recently
referenced and (2) on the inactive list.

Of course, my guess may be wrong - this needs testing :)

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-06  5:42       ` Rik van Riel
@ 2008-09-08  0:28         ` Zan Lynx
  2008-09-08  0:55           ` Rik van Riel
  0 siblings, 1 reply; 18+ messages in thread
From: Zan Lynx @ 2008-09-08  0:28 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Andi Kleen, Hamid R. Jahanjou, linux-kernel

Rik van Riel wrote:
[cut]
> On the other hand, you do want to avoid evicting data 
> that the process is still using, just because you are 
> swapping out a not recently referenced page on a
> nearby virtual address.

How about writing the nearby pages to swap anyway and mark the still 
in-use pages as SwapCache.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-08  0:28         ` Zan Lynx
@ 2008-09-08  0:55           ` Rik van Riel
  2008-09-10  8:16             ` Hamid R. Jahanjou
  0 siblings, 1 reply; 18+ messages in thread
From: Rik van Riel @ 2008-09-08  0:55 UTC (permalink / raw)
  To: Zan Lynx; +Cc: Andi Kleen, Hamid R. Jahanjou, linux-kernel

On Sun, 07 Sep 2008 18:28:30 -0600
Zan Lynx <zlynx@acm.org> wrote:
> Rik van Riel wrote:
> [cut]
> > On the other hand, you do want to avoid evicting data 
> > that the process is still using, just because you are 
> > swapping out a not recently referenced page on a
> > nearby virtual address.
> 
> How about writing the nearby pages to swap anyway and mark the still 
> in-use pages as SwapCache.

That is what will happen pretty much automatically.

Another thing the swap out page clustering code could
do is move pages that were recently referenced back
to the active list, so the VM will not have to scan
those pages again.

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-08  0:55           ` Rik van Riel
@ 2008-09-10  8:16             ` Hamid R. Jahanjou
  2008-09-10 17:08               ` Ray Lee
  0 siblings, 1 reply; 18+ messages in thread
From: Hamid R. Jahanjou @ 2008-09-10  8:16 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Zan Lynx, Andi Kleen, linux-kernel

Rik van Riel wrote:
> On Sun, 07 Sep 2008 18:28:30 -0600
> Zan Lynx <zlynx@acm.org> wrote:
>   
>> Rik van Riel wrote:
>> [cut]
>>     
>>> On the other hand, you do want to avoid evicting data 
>>> that the process is still using, just because you are 
>>> swapping out a not recently referenced page on a
>>> nearby virtual address.
>>>       
>> How about writing the nearby pages to swap anyway and mark the still 
>> in-use pages as SwapCache.
>>     
>
> That is what will happen pretty much automatically.
>   

I agree. As you said, we have two conflicting requirements here: on the
one hand one likes to swap as many pages as to satisfy the backing
storage "proper block IO size," on the other hand, one should not make a
process lose too many pages.  The code needs tuning really, I'm going to
rewrite it making it more flexible according to the suggestion given on
the mailing list.

> Another thing the swap out page clustering code could
> do is move pages that were recently referenced back
> to the active list, so the VM will not have to scan
> those pages again.
>
>   

I think this is a very good idea, actually the code does this already.

Thanks.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-10  8:16             ` Hamid R. Jahanjou
@ 2008-09-10 17:08               ` Ray Lee
  2008-09-10 17:39                 ` Rik van Riel
  0 siblings, 1 reply; 18+ messages in thread
From: Ray Lee @ 2008-09-10 17:08 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: Rik van Riel, Zan Lynx, Andi Kleen, linux-kernel

On Wed, Sep 10, 2008 at 1:16 AM, Hamid R. Jahanjou
<hamid.jahanjou@gmail.com> wrote:
> I agree. As you said, we have two conflicting requirements here: on the
> one hand one likes to swap as many pages as to satisfy the backing
> storage "proper block IO size," on the other hand, one should not make a
> process lose too many pages.

I'd much rather that one process lose a lot of pages than many
processes lose a few. When a system is swap thrashing, each process
has some of its active working set tossed to swap, which means *any*
application you try to switch to is then sluggish. If it were only one
application, then only that one takes the penalty hit when the user
switches focus and context.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-10 17:08               ` Ray Lee
@ 2008-09-10 17:39                 ` Rik van Riel
  0 siblings, 0 replies; 18+ messages in thread
From: Rik van Riel @ 2008-09-10 17:39 UTC (permalink / raw)
  To: Ray Lee; +Cc: hamid.jahanjou, Zan Lynx, Andi Kleen, linux-kernel

On Wed, 10 Sep 2008 10:08:07 -0700
"Ray Lee" <ray-lk@madrabbit.org> wrote:
> On Wed, Sep 10, 2008 at 1:16 AM, Hamid R. Jahanjou
> <hamid.jahanjou@gmail.com> wrote:
> > I agree. As you said, we have two conflicting requirements here: on the
> > one hand one likes to swap as many pages as to satisfy the backing
> > storage "proper block IO size," on the other hand, one should not make a
> > process lose too many pages.
> 
> I'd much rather that one process lose a lot of pages than many
> processes lose a few. When a system is swap thrashing, each process
> has some of its active working set tossed to swap, which means *any*
> application you try to switch to is then sluggish. If it were only one
> application, then only that one takes the penalty hit when the user
> switches focus and context.

Better still, with proper IO clustering that one process can get
its pages back into memory quite quickly.

Paging of anonymous pages is limited by disk seeks, so minimizing
those is a top goal.

-- 
All rights reversed.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
                   ` (2 preceding siblings ...)
  2008-09-05  9:19 ` Andi Kleen
@ 2008-09-08  3:50 ` Li Yu
  2008-09-08  9:51   ` hamidreza jahanjou
  2008-09-24 13:56 ` Luiz Fernando N. Capitulino
  4 siblings, 1 reply; 18+ messages in thread
From: Li Yu @ 2008-09-08  3:50 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

Hamid R. Jahanjou 写道:
> From: Hamid R. Jahanjou
>
> Implements the idea of swap-out page clustering from *BSD for
> Linux. Each time a candidate page is to be swapped out,
> virtually-nearby pages are scanned to find eligible pages to be
> swapped out too as a cluster. This technique increases the likelihood of
> bringing in related data on a page fault and decreases swap space
> fragmentation in the long run. Currently, Linux searches only
> physically-nearby pages which is not optimal since, over time, physically-
> adjacent pages may become unrelated.
>
> The code can be statically tuned. No benchmarks. I'm not sure whether
> the added complexity is acceptable.
>
> Signed-off-by: Hamid R. Jahanjou <hamid.jahanjou@gmail.com>
> [snip a lot of code here]
> +
> +	for(vm_address = cursor_vma->vm_start; 
> +	    vm_address < cursor_vma->vm_end && ci->nr_collected < ci->cluster_size; 
> +	    vm_address += PAGE_SIZE)
> +	{
> +	    cursor_page = virt_to_page(vm_address);
>   

I do not think that the virt_to_page() can work well on userland virtual 
address space.
And the linear searching for whole address space of a vma is not good 
idea too, really.

> +	    if (!page_allowed_in_cluster(cursor_page, ci))
> +		continue;
>
>   


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-08  3:50 ` Li Yu
@ 2008-09-08  9:51   ` hamidreza jahanjou
       [not found]     ` <48C4FECF.2000708@gmail.com>
  0 siblings, 1 reply; 18+ messages in thread
From: hamidreza jahanjou @ 2008-09-08  9:51 UTC (permalink / raw)
  To: Li Yu; +Cc: linux-kernel

2008/9/8, Li Yu <raise.sail@gmail.com>:
> And the linear searching for whole address space of a vma is not good
> idea too, really.
>
>> +	    if (!page_allowed_in_cluster(cursor_page, ci))
>> +		continue;
>>

Thank you for the review and comments. You are right in that searching
the whole process address space is not a good idea. The idea is to
make the scan range flexible, thus normally, assuming that the code
has been well-tuned, only a very limited number of VMA's are scanned.

In general, i think that the cluster size and the scan range should be
tuned depending on the backing storage characteristics and the
low-on-memory severity. Like any VM code, this one needs tuning.

BTW, the code could be implemented much more elegantly if the VMA's
were connected using a doubly-linked list. I wonder if there are
enought other codes with the same situation to justify making it
doubly-linked.

^ permalink raw reply	[flat|nested] 18+ messages in thread

[parent not found: <48C4FECF.2000708@gmail.com>]

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
       [not found]     ` <48C4FECF.2000708@gmail.com>
@ 2008-09-08 10:31       ` Li Yu
  0 siblings, 0 replies; 18+ messages in thread
From: Li Yu @ 2008-09-08 10:31 UTC (permalink / raw)
  Cc: linux-kernel

Li Yu 写道:
> hamidreza jahanjou 写道:
>> 2008/9/8, Li Yu <raise.sail@gmail.com>:
>>   
>>> And the linear searching for whole address space of a vma is not good
>>> idea too, really.
>>>
>>>     
>>>> +	    if (!page_allowed_in_cluster(cursor_page, ci))
>>>> +		continue;
>>>>
>>>>       
>>
>> Thank you for the review and comments. You are right in that searching
>> the whole process address space is not a good idea. The idea is to
>> make the scan range flexible, thus normally, assuming that the code
>> has been well-tuned, only a very limited number of VMA's are scanned.
>>
>> In general, i think that the cluster size and the scan range should be
>> tuned depending on the backing storage characteristics and the
>> low-on-memory severity. Like any VM code, this one needs tuning.
>>
>> BTW, the code could be implemented much more elegantly if the VMA's
>> were connected using a doubly-linked list. I wonder if there are
>> enought other codes with the same situation to justify making it
>> doubly-linked.
>>   
>
> You are welcome!
>
> I think that scanning one vma each task is enough.
>
> The goal of swapoutclustering is to reduce disk heads movement while 
> swapping in, it is based on space/time locality. I think that 
> different vmas implied different using pattern, which also means they 
> have different in  program locality. Therefore, I think scanning multi 
> vmas per task is not a good idea.
>
> Maybe, we need a page cache searching like for anonymous mapping 
> instead of *linear*  searching in each vma.
>
> I also have an uncompleted implementation for this feature ^_^
>
> Good luck.
>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] VM: Implements the swap-out page-clustering technique
  2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
                   ` (3 preceding siblings ...)
  2008-09-08  3:50 ` Li Yu
@ 2008-09-24 13:56 ` Luiz Fernando N. Capitulino
  4 siblings, 0 replies; 18+ messages in thread
From: Luiz Fernando N. Capitulino @ 2008-09-24 13:56 UTC (permalink / raw)
  To: hamid.jahanjou; +Cc: linux-kernel

Em Thu, 04 Sep 2008 15:34:30 +0330
"Hamid R. Jahanjou" <hamid.jahanjou@gmail.com> escreveu:

| From: Hamid R. Jahanjou
| 
| Implements the idea of swap-out page clustering from *BSD for
| Linux. Each time a candidate page is to be swapped out,
| virtually-nearby pages are scanned to find eligible pages to be
| swapped out too as a cluster. This technique increases the likelihood of
| bringing in related data on a page fault and decreases swap space
| fragmentation in the long run. Currently, Linux searches only
| physically-nearby pages which is not optimal since, over time, physically-
| adjacent pages may become unrelated.
| 
| The code can be statically tuned. No benchmarks. I'm not sure whether
| the added complexity is acceptable.

 Sorry for this (very) late response, but I ran some very simple tests
this week and thought you might be interested in the results.

 The test machine has 512MB of RAM. For the tests I booted with 64MB
and setup a 512MB swap file.

 For the first test I let an allyesconfig kernel building for several
hours with six jobs (make -j6). I did not have any problems, so
considering that your code was used I think it is stable enough.

 In the second test I have built an allnoconfig kernel with and without
your patches, still with six jobs. Results:

	2.6.27-rc7-vanilla   44:12 minutes
	2.6.27-rc7-pgc       52:09 minutes

 (pgc means page-clustering patches).

 For both builds I did not change any swap paramenter and I could
observe that about 80MB of swap were used most of the time.

 But please, note that these tests were quite far from being 'scientific'
eg. I should have built the kernel more times and should have tried other
setups...

 I will wait for the next version of the patches to run more serious
tests.

-- 
Luiz Fernando N. Capitulino

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2008-09-24 13:56 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
2008-09-04 23:14 ` Rik van Riel
2008-09-04 23:14 ` Andrew Morton
2008-09-05  7:45   ` Hamid R. Jahanjou
2008-09-05  6:58     ` Andrew Morton
2008-09-05  9:19 ` Andi Kleen
2008-09-05 20:27   ` Hamid R. Jahanjou
2008-09-05 19:45     ` Andi Kleen
2008-09-06  5:42       ` Rik van Riel
2008-09-08  0:28         ` Zan Lynx
2008-09-08  0:55           ` Rik van Riel
2008-09-10  8:16             ` Hamid R. Jahanjou
2008-09-10 17:08               ` Ray Lee
2008-09-10 17:39                 ` Rik van Riel
2008-09-08  3:50 ` Li Yu
2008-09-08  9:51   ` hamidreza jahanjou
     [not found]     ` <48C4FECF.2000708@gmail.com>
2008-09-08 10:31       ` Li Yu
2008-09-24 13:56 ` Luiz Fernando N. Capitulino

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox