From: Christoph Lameter <clameter@sgi.com>
To: Andi Kleen <ak@suse.de>
Cc: Nick Piggin <npiggin@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mel@csn.ul.ie>,
	linux-mm@kvack.org, Pekka J Enberg <penberg@cs.helsinki.fi>
Subject: Re: Fastpath prototype?
Date: Tue, 12 Feb 2008 14:31:20 -0800 (PST)
Message-ID: <Pine.LNX.4.64.0802121426060.9829@schroedinger.engr.sgi.com>
In-Reply-To: <Pine.LNX.4.64.0802121208150.2120@schroedinger.engr.sgi.com>

Here is a patch to remove the pcp lists (just in case someone wants to toy 
around with these things too). It hits tbench/SLUB badly because SLUB 
relies heavily on effective caching by the page allocator.

tbench/SLUB:  726.25 MB/sec

Even adding the fast path prototype (which only covers slab allocations 
>= 4K) yields only 1825.68 MB/sec.

I guess these results indicate that tbench would improve even more if we 
had a better fastpath.
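
Just to illustrate the direction (this is a made-up sketch, not the actual 
prototype; pcp_fast, fast_alloc_page etc. are invented names): keep a single 
per-cpu free page in front of the buddy allocator so that the common order-0 
allocation avoids zone->lock entirely, roughly like

/*
 * Rough sketch only. One cached free page per cpu; the hot path touches
 * only per-cpu data and takes no lock. Falls back to the locked buddy
 * allocator when the per-cpu slot is empty.
 */
struct pcp_fast {
	struct page *page;	/* one cached free page, or NULL */
};

static DEFINE_PER_CPU(struct pcp_fast, pcp_fast);

static struct page *fast_alloc_page(struct zone *zone, gfp_t gfp_flags)
{
	struct pcp_fast *f;
	struct page *page;
	unsigned long flags;

	/* Fast path: grab the per-cpu cached page without zone->lock. */
	local_irq_save(flags);
	f = &__get_cpu_var(pcp_fast);
	page = f->page;
	if (page) {
		f->page = NULL;
		local_irq_restore(flags);
		return page;
	}
	local_irq_restore(flags);

	/* Slow path: fall back to the locked buddy allocator. */
	spin_lock_irqsave(&zone->lock, flags);
	page = __rmqueue(zone, 0, allocflags_to_migratetype(gfp_flags));
	spin_unlock_irqrestore(&zone->lock, flags);
	return page;
}

A real version would of course need the matching free side, migratetype 
handling and more than one cached page per cpu.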

A lot of the ugly NUMA stuff (page draining etc.) would go away along with 
the pcps. We could also likely simplify the NUMA bootstrap.


---
 include/linux/gfp.h    |    5 
 include/linux/mmzone.h |    8 -
 kernel/sysctl.c        |   12 --
 mm/page_alloc.c        |  272 +------------------------------------------------
 mm/vmstat.c            |   39 -------
 5 files changed, 12 insertions(+), 324 deletions(-)

Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2008-02-12 14:06:48.883814096 -0800
+++ linux-2.6/include/linux/gfp.h	2008-02-12 14:25:11.185781673 -0800
@@ -227,8 +227,7 @@ extern void FASTCALL(free_cold_page(stru
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
-void drain_all_pages(void);
-void drain_local_pages(void *dummy);
+static inline void drain_all_pages(void) {}
+static inline void drain_local_pages(void *dummy) {}
 
 #endif /* __LINUX_GFP_H */
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2008-02-07 23:28:11.328553973 -0800
+++ linux-2.6/include/linux/mmzone.h	2008-02-12 14:06:53.599840561 -0800
@@ -105,15 +105,7 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
-struct per_cpu_pages {
-	int count;		/* number of pages in the list */
-	int high;		/* high watermark, emptying needed */
-	int batch;		/* chunk size for buddy add/remove */
-	struct list_head list;	/* the list of pages */
-};
-
 struct per_cpu_pageset {
-	struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
 	s8 expire;
 #endif
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2008-02-12 14:06:48.895814167 -0800
+++ linux-2.6/mm/page_alloc.c	2008-02-12 14:25:11.213781917 -0800
@@ -475,35 +475,6 @@ static inline int free_pages_check(struc
 	return PageReserved(page);
 }
 
-/*
- * Frees a list of pages. 
- * Assumes all pages on list are in same zone, and of same order.
- * count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
- */
-static void free_pages_bulk(struct zone *zone, int count,
-					struct list_head *list, int order)
-{
-	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
-	zone->pages_scanned = 0;
-	while (count--) {
-		struct page *page;
-
-		VM_BUG_ON(list_empty(list));
-		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_one_page list manipulates */
-		list_del(&page->lru);
-		__free_one_page(page, zone, order);
-	}
-	spin_unlock(&zone->lock);
-}
-
 static void free_one_page(struct zone *zone, struct page *page, int order)
 {
 	spin_lock(&zone->lock);
@@ -832,110 +803,6 @@ static struct page *__rmqueue(struct zon
 	return page;
 }
 
-/* 
- * Obtain a specified number of elements from the buddy allocator, all under
- * a single hold of the lock, for efficiency.  Add them to the supplied list.
- * Returns the number of new pages which were placed at *list.
- */
-static int rmqueue_bulk(struct zone *zone, unsigned int order, 
-			unsigned long count, struct list_head *list,
-			int migratetype)
-{
-	int i;
-	
-	spin_lock(&zone->lock);
-	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype);
-		if (unlikely(page == NULL))
-			break;
-
-		/*
-		 * Split buddy pages returned by expand() are received here
-		 * in physical page order. The page is added to the callers and
-		 * list and the list head then moves forward. From the callers
-		 * perspective, the linked list is ordered by page number in
-		 * some conditions. This is useful for IO devices that can
-		 * merge IO requests if the physical pages are ordered
-		 * properly.
-		 */
-		list_add(&page->lru, list);
-		set_page_private(page, migratetype);
-		list = &page->lru;
-	}
-	spin_unlock(&zone->lock);
-	return i;
-}
-
-#ifdef CONFIG_NUMA
-/*
- * Called from the vmstat counter updater to drain pagesets of this
- * currently executing processor on remote nodes after they have
- * expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
- */
-void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
-{
-	unsigned long flags;
-	int to_drain;
-
-	local_irq_save(flags);
-	if (pcp->count >= pcp->batch)
-		to_drain = pcp->batch;
-	else
-		to_drain = pcp->count;
-	free_pages_bulk(zone, to_drain, &pcp->list, 0);
-	pcp->count -= to_drain;
-	local_irq_restore(flags);
-}
-#endif
-
-/*
- * Drain pages of the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
- */
-static void drain_pages(unsigned int cpu)
-{
-	unsigned long flags;
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, cpu);
-
-		pcp = &pset->pcp;
-		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-		pcp->count = 0;
-		local_irq_restore(flags);
-	}
-}
-
-/*
- * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- */
-void drain_local_pages(void *arg)
-{
-	drain_pages(smp_processor_id());
-}
-
-/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
- */
-void drain_all_pages(void)
-{
-	on_each_cpu(drain_local_pages, NULL, 0, 1);
-}
-
 #ifdef CONFIG_HIBERNATION
 
 void mark_free_pages(struct zone *zone)
@@ -978,7 +845,6 @@ void mark_free_pages(struct zone *zone)
 static void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
 	unsigned long flags;
 
 	if (PageAnon(page))
@@ -992,21 +858,11 @@ static void free_hot_cold_page(struct pa
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
 	local_irq_save(flags);
 	__count_vm_event(PGFREE);
-	if (cold)
-		list_add_tail(&page->lru, &pcp->list);
-	else
-		list_add(&page->lru, &pcp->list);
 	set_page_private(page, get_pageblock_migratetype(page));
-	pcp->count++;
-	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
-		pcp->count -= pcp->batch;
-	}
+	free_one_page(zone, page, 0);
 	local_irq_restore(flags);
-	put_cpu();
 }
 
 void free_hot_page(struct page *page)
@@ -1047,56 +903,18 @@ static struct page *buffered_rmqueue(str
 {
 	unsigned long flags;
 	struct page *page;
-	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
 again:
-	cpu  = get_cpu();
-	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
-
-		pcp = &zone_pcp(zone, cpu)->pcp;
-		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			if (unlikely(!pcp->count))
-				goto failed;
-		}
-
-		/* Find a page of the appropriate migrate type */
-		if (cold) {
-			list_for_each_entry_reverse(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		} else {
-			list_for_each_entry(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		}
-
-		/* Allocate more to the pcp list if necessary */
-		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			page = list_entry(pcp->list.next, struct page, lru);
-		}
-
-		list_del(&page->lru);
-		pcp->count--;
-	} else {
-		spin_lock_irqsave(&zone->lock, flags);
-		page = __rmqueue(zone, order, migratetype);
-		spin_unlock(&zone->lock);
-		if (!page)
-			goto failed;
-	}
+	spin_lock_irqsave(&zone->lock, flags);
+	page = __rmqueue(zone, order, migratetype);
+	spin_unlock(&zone->lock);
+	if (!page)
+		goto failed;
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1786,7 +1604,6 @@ void si_meminfo_node(struct sysinfo *val
  */
 void show_free_areas(void)
 {
-	int cpu;
 	struct zone *zone;
 
 	for_each_zone(zone) {
@@ -1794,17 +1611,6 @@ void show_free_areas(void)
 			continue;
 
 		show_node(zone);
-		printk("%s per-cpu:\n", zone->name);
-
-		for_each_online_cpu(cpu) {
-			struct per_cpu_pageset *pageset;
-
-			pageset = zone_pcp(zone, cpu);
-
-			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
-			       cpu, pageset->pcp.high,
-			       pageset->pcp.batch, pageset->pcp.count);
-		}
 	}
 
 	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
@@ -2597,37 +2403,11 @@ static int zone_batchsize(struct zone *z
 	return batch;
 }
 
-inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+inline void setup_pageset(struct per_cpu_pageset *p)
 {
-	struct per_cpu_pages *pcp;
-
 	memset(p, 0, sizeof(*p));
-
-	pcp = &p->pcp;
-	pcp->count = 0;
-	pcp->high = 6 * batch;
-	pcp->batch = max(1UL, 1 * batch);
-	INIT_LIST_HEAD(&pcp->list);
 }
 
-/*
- * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
- */
-
-static void setup_pagelist_highmark(struct per_cpu_pageset *p,
-				unsigned long high)
-{
-	struct per_cpu_pages *pcp;
-
-	pcp = &p->pcp;
-	pcp->high = high;
-	pcp->batch = max(1UL, high/4);
-	if ((high/4) > (PAGE_SHIFT * 8))
-		pcp->batch = PAGE_SHIFT * 8;
-}
-
-
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -2669,11 +2449,7 @@ static int __cpuinit process_zones(int c
 		if (!zone_pcp(zone, cpu))
 			goto bad;
 
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
-		if (percpu_pagelist_fraction)
-			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+		setup_pageset(zone_pcp(zone, cpu));
 	}
 
 	return 0;
@@ -2798,9 +2574,9 @@ static __meminit void zone_pcp_init(stru
 #ifdef CONFIG_NUMA
 		/* Early boot. Slab allocator not functional yet */
 		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
+		setup_pageset(&boot_pageset[cpu]);
 #else
-		setup_pageset(zone_pcp(zone,cpu), batch);
+		setup_pageset(zone_pcp(zone,cpu));
 #endif
 	}
 	if (zone->present_pages)
@@ -3971,8 +3747,6 @@ static int page_alloc_cpu_notify(struct 
 	int cpu = (unsigned long)hcpu;
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		drain_pages(cpu);
-
 		/*
 		 * Spill the event counters of the dead processor
 		 * into the current processors event counters.
@@ -4236,32 +4010,6 @@ int lowmem_reserve_ratio_sysctl_handler(
 	return 0;
 }
 
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
- */
-
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
-	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
-{
-	struct zone *zone;
-	unsigned int cpu;
-	int ret;
-
-	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
-	if (!write || (ret == -EINVAL))
-		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
-			unsigned long  high;
-			high = zone->present_pages / percpu_pagelist_fraction;
-			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
-		}
-	}
-	return 0;
-}
-
 int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c	2008-02-07 23:28:12.596577762 -0800
+++ linux-2.6/mm/vmstat.c	2008-02-12 14:06:53.619840675 -0800
@@ -317,37 +317,7 @@ void refresh_cpu_vm_stats(int cpu)
 				local_irq_restore(flags);
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_diff[i] += v;
-#ifdef CONFIG_NUMA
-				/* 3 seconds idle till flush */
-				p->expire = 3;
-#endif
 			}
-#ifdef CONFIG_NUMA
-		/*
-		 * Deal with draining the remote pageset of this
-		 * processor
-		 *
-		 * Check if there are pages remaining in this pageset
-		 * if not then there is nothing to expire.
-		 */
-		if (!p->expire || !p->pcp.count)
-			continue;
-
-		/*
-		 * We never drain zones local to this processor.
-		 */
-		if (zone_to_nid(zone) == numa_node_id()) {
-			p->expire = 0;
-			continue;
-		}
-
-		p->expire--;
-		if (p->expire)
-			continue;
-
-		if (p->pcp.count)
-			drain_zone_pages(zone, &p->pcp);
-#endif
 	}
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -685,15 +655,6 @@ static void zoneinfo_show_print(struct s
 		struct per_cpu_pageset *pageset;
 
 		pageset = zone_pcp(zone, i);
-		seq_printf(m,
-			   "\n    cpu: %i"
-			   "\n              count: %i"
-			   "\n              high:  %i"
-			   "\n              batch: %i",
-			   i,
-			   pageset->pcp.count,
-			   pageset->pcp.high,
-			   pageset->pcp.batch);
 #ifdef CONFIG_SMP
 		seq_printf(m, "\n  vm stats threshold: %d",
 				pageset->stat_threshold);
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c	2008-02-12 14:11:38.553441536 -0800
+++ linux-2.6/kernel/sysctl.c	2008-02-12 14:11:56.161540376 -0800
@@ -75,7 +75,6 @@ extern int pid_max;
 extern int min_free_kbytes;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
-extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
@@ -100,7 +99,6 @@ static int one_hundred = 100;
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
-static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
@@ -1012,16 +1010,6 @@ static struct ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
-	{
-		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
-		.procname	= "percpu_pagelist_fraction",
-		.data		= &percpu_pagelist_fraction,
-		.maxlen		= sizeof(percpu_pagelist_fraction),
-		.mode		= 0644,
-		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &min_percpu_pagelist_fract,
-	},
 #ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,
