All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] VM: Implements the swap-out page-clustering technique
@ 2008-09-04 12:04 Hamid R. Jahanjou
  2008-09-04 23:14 ` Rik van Riel
                   ` (4 more replies)
  0 siblings, 5 replies; 18+ messages in thread
From: Hamid R. Jahanjou @ 2008-09-04 12:04 UTC (permalink / raw)
  To: linux-kernel

From: Hamid R. Jahanjou

Implements the idea of swap-out page clustering from *BSD for
Linux. Each time a candidate page is to be swapped out,
virtually-nearby pages are scanned to find eligible pages to be
swapped out too as a cluster. This technique increases the likelihood of
bringing in related data on a page fault and decreases swap space
fragmentation in the long run. Currently, Linux searches only
physically-nearby pages which is not optimal since, over time, physically-
adjacent pages may become unrelated.

The code can be statically tuned. No benchmarks. I'm not sure whether
the added complexity is acceptable.

Signed-off-by: Hamid R. Jahanjou <hamid.jahanjou@gmail.com>
---

diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h linux-2.6.26.1HJ/include/linux/pageout_clustering.h
--- linux-2.6.26.1-vanilla/include/linux/pageout_clustering.h	1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/pageout_clustering.h	2008-08-28 20:19:52.000000000 +0330
@@ -0,0 +1,12 @@
+#ifndef PAGEOUT_CLUSTERING_H_
+#define PAGEOUT_CLUSTERING_H_
+
+#include<linux/list.h>
+#define INITIAL_CLUSTER_SCAN_RANGE 4 
+#define HALF_PAGEOUT_CLUSTER_SIZE 8      /* default number of pages to be clustered 
+					  * on either side of a given page */
+
+int try_to_cluster(struct page *, struct list_head *, 
+		   struct list_head *, int, int);
+
+#endif 
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/include/linux/rmap.h linux-2.6.26.1HJ/include/linux/rmap.h
--- linux-2.6.26.1-vanilla/include/linux/rmap.h	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/include/linux/rmap.h	2008-08-28 20:19:52.000000000 +0330
@@ -102,6 +102,12 @@ pte_t *page_check_address(struct page *,
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
 /*
+ * Used by pageout clustering 
+ */
+struct anon_vma *page_lock_anon_vma(struct page *);
+void page_unlock_anon_vma(struct anon_vma *);
+
+/*
  * Cleans the PTEs of shared mappings.
  * (and since clean PTEs should also be readonly, write protects them too)
  *
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/Makefile linux-2.6.26.1HJ/mm/Makefile
--- linux-2.6.26.1-vanilla/mm/Makefile	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/Makefile	2008-08-28 20:19:52.000000000 +0330
@@ -9,9 +9,9 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o 
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   readahead.o swap.o truncate.o pageout_clustering.o \
+			   vmscan.o prio_tree.o util.o mmzone.o vmstat.o \
+                           backing-dev.o page_isolation.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/pageout_clustering.c linux-2.6.26.1HJ/mm/pageout_clustering.c
--- linux-2.6.26.1-vanilla/mm/pageout_clustering.c	1970-01-01 03:30:00.000000000 +0330
+++ linux-2.6.26.1HJ/mm/pageout_clustering.c	2008-09-04 09:26:40.000000000 +0330
@@ -0,0 +1,278 @@
+#include<linux/pageout_clustering.h>
+#include<linux/mm.h>
+#include<linux/pagemap.h>
+#include<linux/swap.h>
+#include<linux/swapops.h>
+#include<linux/slab.h>
+#include<linux/init.h>
+#include<linux/rmap.h>
+#include<linux/module.h>
+#include<linux/memcontrol.h>
+
+
+/*
+ * contains data related to pageout clustering
+ */
+struct clustering_info 
+{
+
+    struct page *page;                 /* page descriptor of a page used for sampling cluster pages */ 
+    struct vm_area_struct *vma;        /* VMA under consideration */
+    struct list_head *src;             /* source page list used by shrink_list and friends */
+    
+    union {
+	struct anon_vma *anon_vma;     /* anon_vma object we're using for reverse mapping */
+	struct address_space *mapping; /* address_space object we're using for reverse mapping */
+};
+    
+    struct page **cluster;             /* our page cluster */
+    unsigned int cluster_size;         /* cluster size */
+    unsigned int page_index;           /* index of the sampling page within the cluster */
+    unsigned int index;                /* slot in the cluster for the next page */
+    unsigned int range;                /* maximum number of VMA's to scan */
+    unsigned int nr_collected;         /* number of pages collected in the cluster */
+    unsigned int nr_sc;                /* current number of VMA's scanned */
+    int mode;                          /* page-isolation mode */
+    int anonym;                        /* indicates whether we are to collect mapped pages (0)
+					* or anonymous pages (1) */
+    int (*continue_search)
+	(struct clustering_info *);    /* returns true if the search can go on */
+};
+
+
+static inline int check_conditions(struct clustering_info *ci)
+{
+    return (ci->nr_sc < ci->range) && (ci->nr_collected < ci->cluster_size);
+}
+
+    
+/* embodies page-selection-for-clustering policy */
+static inline int page_allowed_in_cluster(struct page *page, struct clustering_info *ci)
+{
+    int zone_id = page_zone_id(ci->page);
+
+
+    /* scanning the original page ? */
+    if (unlikely(page == ci->page))
+	return 0;
+
+    /* skip pages belonging to other zones */
+    if (unlikely(page_zone_id(page) != zone_id))
+	return 0;
+
+    /* skip pages not in LRU lists */
+    if (!PageLRU(page))
+	return 0;
+
+    /* skip active pages */
+    if (page->flags & PG_active)
+	return 0;
+
+    /* skip non-dirty pages */
+    if (!(page->flags & PG_dirty))
+	return 0;
+        
+    return 1;
+}
+
+
+
+/* embodies vma-selection-for-clustering policy */ 
+static inline int vma_allowed_in_cluster(struct vm_area_struct *vma, struct clustering_info *ci)
+{
+    if ( (vma->vm_flags & VM_RESERVED) ||
+	 (vma->vm_flags & VM_LOCKED) ||
+	 (vma->vm_flags & VM_IO) ||
+	 (vma->vm_flags & VM_NONLINEAR) )
+	return 0;
+
+    /* we ignore mapped VMA's when doing anonym clustering; */
+    /* similarly, we ignore anonym VMA's when doing mapped clustering */
+    if ( ((ci->anonym) && (vma->vm_file != NULL)) ||
+	 (!(ci->anonym) && (vma->vm_file == NULL)) )
+	return 0;
+    
+    return 1;
+}
+
+
+
+/*
+ * scans a process's vma structures, to find pages eligible for being added to cluster
+ */
+static void try_to_cluster_vma(struct clustering_info *ci)
+{
+    struct vm_area_struct *cursor_vma = ci->vma;
+    struct page *cursor_page;
+    struct mm_struct *cursor_mm;
+    unsigned long vm_address;
+
+
+    cursor_mm = cursor_vma->vm_mm;
+    if (!down_read_trylock(&cursor_mm->mmap_sem))
+	return;
+
+    do
+    {
+	if (!vma_allowed_in_cluster(cursor_vma, ci))
+	    goto next_vma;
+
+	ci->nr_sc++;
+
+	for(vm_address = cursor_vma->vm_start; 
+	    vm_address < cursor_vma->vm_end && ci->nr_collected < ci->cluster_size; 
+	    vm_address += PAGE_SIZE)
+	{
+	    cursor_page = virt_to_page(vm_address);
+	    if (!page_allowed_in_cluster(cursor_page, ci))
+		continue;
+
+	    switch (__isolate_lru_page(cursor_page, ci->mode)) 
+	    {
+	    case 0:
+		ci->cluster[ci->index] = cursor_page;
+		ci->nr_collected++;
+		ci->index = (++ci->index) % (ci->cluster_size);
+		break;
+
+	    case -EBUSY:
+		/* it is being freed else where */
+		list_move(&cursor_page->lru, ci->src);
+		break;
+
+	    default:
+		break;
+	    };   
+	}
+
+next_vma: 
+	cursor_vma = cursor_vma->vm_next;
+	/* begin from the first VMA if the end of list is reached */
+	if (cursor_vma == NULL)
+	    cursor_vma = cursor_mm->mmap;
+
+    } while (cursor_vma != ci->vma && ci->continue_search(ci));
+
+    up_read(&cursor_mm->mmap_sem);
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles mapped pages
+ */
+static inline void try_to_cluster_mapped(struct clustering_info *ci)
+{
+    struct vm_area_struct *vma;
+    struct prio_tree_iter pst_itr;
+    const pgoff_t pgoff = ci->page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+    
+    vma_prio_tree_foreach(vma, &pst_itr, &ci->mapping->i_mmap, pgoff, pgoff) {
+	if (!ci->continue_search(ci))
+	    break;
+
+	ci->vma = vma;
+	try_to_cluster_vma(ci);
+    }
+    return;
+}
+
+
+
+/*
+ * helper function for try_to_cluster(); handles anonymous pages
+ */
+static inline void try_to_cluster_anon(struct clustering_info *ci)
+{
+    struct list_head *list_item;
+    struct vm_area_struct *vma;
+
+    
+    for (list_item = ci->anon_vma->head.next; 
+	 (list_item != &ci->anon_vma->head) && ci->continue_search(ci); 
+	 list_item = list_item->next)
+    {
+	vma = list_entry(list_item, struct vm_area_struct, anon_vma_node);
+	ci->vma = vma;
+	try_to_cluster_vma(ci);
+    }
+
+    return;
+}
+
+
+
+/*
+ * Searches in the virtual address space of the process possessing the page 
+ * to find other suitable pages to be swapped out as well. This is known as
+ * page clustering. Not to be confused with other usages of the word in Linux.
+ * 
+ * We adjust the range of our search by the value of the allocation order.
+ *
+ * @page: an originally-selected page
+ * @half_cluster_size: maximum number of pages to cluster on each side of the page
+ * @src: the list containing the page
+ * @dst: the list to insert pages to
+ * @order: allocation order
+ * @mode: page-isolation mode
+ *
+ * returns the number of pages added to the list, including the passed page.
+ */
+int try_to_cluster(struct page *page, struct list_head *src, 
+		   struct list_head *dst, int order, int mode)
+{
+    struct clustering_info cluster_info;
+    struct address_space *mapping;
+    struct anon_vma *anon_vma;
+    unsigned int range = INITIAL_CLUSTER_SCAN_RANGE << order;
+#define def_cluster_size (2 * HALF_PAGEOUT_CLUSTER_SIZE + 1)
+    struct page *cluster[def_cluster_size] = {NULL};
+    int i;
+
+
+    cluster[HALF_PAGEOUT_CLUSTER_SIZE] = page;
+    /* initialize commom fields only; other fields are
+     * set as we go on */
+    cluster_info.page = page;
+    cluster_info.vma = NULL;
+    cluster_info.src = src;
+    cluster_info.cluster = cluster;
+    cluster_info.cluster_size = def_cluster_size;
+    cluster_info.page_index = HALF_PAGEOUT_CLUSTER_SIZE;
+    cluster_info.index = HALF_PAGEOUT_CLUSTER_SIZE + 1;
+    cluster_info.range = range;
+    cluster_info.nr_collected = 1;
+    cluster_info.nr_sc = 0;
+    cluster_info.mode = mode;
+    cluster_info.continue_search = check_conditions;
+    
+    /* try to cluster anonymous pages ? */
+    anon_vma = page_lock_anon_vma(page);
+    if (anon_vma != NULL)
+    {
+	cluster_info.anon_vma = anon_vma;
+	cluster_info.anonym = 1;
+	try_to_cluster_anon(&cluster_info);
+	page_unlock_anon_vma(anon_vma);
+    }
+
+    /* try to cluster mapped pages ? */
+    else if (page_mapped(page))
+    {
+	mapping = page->mapping;
+	spin_lock(&mapping->i_mmap_lock);
+	cluster_info.mapping = mapping;
+	cluster_info.anonym = 0;
+	try_to_cluster_mapped(&cluster_info);
+	spin_unlock(&mapping->i_mmap_lock);
+    }
+
+    /* add to destination list */
+    for (i = 0; i < def_cluster_size; i++)
+	if (cluster[i] != NULL)
+	    list_move(&cluster[i]->lru, dst);
+
+    return cluster_info.nr_collected;
+}
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/rmap.c linux-2.6.26.1HJ/mm/rmap.c
--- linux-2.6.26.1-vanilla/mm/rmap.c	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/rmap.c	2008-08-28 20:19:52.000000000 +0330
@@ -156,7 +156,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -176,7 +176,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
diff -uprN -X linux-2.6.26.1-vanilla/Documentation/dontdiff linux-2.6.26.1-vanilla/mm/vmscan.c linux-2.6.26.1HJ/mm/vmscan.c
--- linux-2.6.26.1-vanilla/mm/vmscan.c	2008-08-02 02:28:24.000000000 +0330
+++ linux-2.6.26.1HJ/mm/vmscan.c	2008-08-28 20:19:52.000000000 +0330
@@ -38,6 +38,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/pageout_clustering.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -700,10 +701,7 @@ static unsigned long isolate_lru_pages(u
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct page *page;
-		unsigned long pfn;
-		unsigned long end_pfn;
-		unsigned long page_pfn;
-		int zone_id;
+		int nr_clustered;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
@@ -712,8 +710,7 @@ static unsigned long isolate_lru_pages(u
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			list_move(&page->lru, dst);
-			nr_taken++;
+		        /* counted and added to dst in try_to_cluster() */
 			break;
 
 		case -EBUSY:
@@ -725,51 +722,13 @@ static unsigned long isolate_lru_pages(u
 			BUG();
 		}
 
-		if (!order)
-			continue;
-
-		/*
-		 * Attempt to take all pages in the order aligned region
-		 * surrounding the tag page.  Only take those pages of
-		 * the same active state as that tag page.  We may safely
-		 * round the target page pfn down to the requested order
-		 * as the mem_map is guarenteed valid out to MAX_ORDER,
-		 * where that page is in a different zone we will detect
-		 * it from its zone id and abort this block scan.
-		 */
-		zone_id = page_zone_id(page);
-		page_pfn = page_to_pfn(page);
-		pfn = page_pfn & ~((1 << order) - 1);
-		end_pfn = pfn + (1 << order);
-		for (; pfn < end_pfn; pfn++) {
-			struct page *cursor_page;
-
-			/* The target page is in the block, ignore it. */
-			if (unlikely(pfn == page_pfn))
-				continue;
-
-			/* Avoid holes within the zone. */
-			if (unlikely(!pfn_valid_within(pfn)))
-				break;
-
-			cursor_page = pfn_to_page(pfn);
-			/* Check that we have not crossed a zone boundary. */
-			if (unlikely(page_zone_id(cursor_page) != zone_id))
-				continue;
-			switch (__isolate_lru_page(cursor_page, mode)) {
-			case 0:
-				list_move(&cursor_page->lru, dst);
-				nr_taken++;
-				scan++;
-				break;
-
-			case -EBUSY:
-				/* else it is being freed elsewhere */
-				list_move(&cursor_page->lru, src);
-			default:
-				break;
-			}
-		}
+		/* 
+		 * scan virtually-nearby pages in process address space
+		 * to find eligible pages to be swapped out as well
+		 */
+		nr_clustered = try_to_cluster(page, src, dst, order, mode);
+		nr_taken += nr_clustered;
+		scan += nr_clustered - 1;
 	}
 
 	*scanned = scan;



^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2008-09-24 13:56 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-04 12:04 [PATCH] VM: Implements the swap-out page-clustering technique Hamid R. Jahanjou
2008-09-04 23:14 ` Rik van Riel
2008-09-04 23:14 ` Andrew Morton
2008-09-05  7:45   ` Hamid R. Jahanjou
2008-09-05  6:58     ` Andrew Morton
2008-09-05  9:19 ` Andi Kleen
2008-09-05 20:27   ` Hamid R. Jahanjou
2008-09-05 19:45     ` Andi Kleen
2008-09-06  5:42       ` Rik van Riel
2008-09-08  0:28         ` Zan Lynx
2008-09-08  0:55           ` Rik van Riel
2008-09-10  8:16             ` Hamid R. Jahanjou
2008-09-10 17:08               ` Ray Lee
2008-09-10 17:39                 ` Rik van Riel
2008-09-08  3:50 ` Li Yu
2008-09-08  9:51   ` hamidreza jahanjou
     [not found]     ` <48C4FECF.2000708@gmail.com>
2008-09-08 10:31       ` Li Yu
2008-09-24 13:56 ` Luiz Fernando N. Capitulino

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.