From: Mel Gorman <mel@csn.ul.ie>
To: cl@linux-foundation.org
Cc: Tejun Heo <tj@kernel.org>,
linux-kernel@vger.kernel.org,
Pekka Enberg <penberg@cs.helsinki.fi>,
Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Subject: Re: [this_cpu_xx V6 1/7] this_cpu_ops: page allocator conversion
Date: Thu, 8 Oct 2009 11:53:58 +0100 [thread overview]
Message-ID: <20091008105357.GC10004@csn.ul.ie> (raw)
In-Reply-To: <20091007211052.229772418@gentwo.org>
On Wed, Oct 07, 2009 at 05:10:25PM -0400, cl@linux-foundation.org wrote:
> Use the per cpu allocator functionality to avoid per cpu arrays in struct zone.
>
> This drastically reduces the size of struct zone for systems with large
> numbers of processors and allows placement of critical variables of struct
> zone in one cacheline even on very large systems.
>
> Another effect is that the pagesets of one processor are placed near one
> another. If multiple pagesets from different zones fit into one cacheline
> then additional cacheline fetches can be avoided on the hot paths when
> allocating memory from multiple zones.
>
> Bootstrap becomes simpler if we use the same scheme for UP, SMP, NUMA. #ifdefs
> are reduced and we can drop the zone_pcp macro.
>
> Hotplug handling is also simplified since cpu alloc can bring up and
> shut down cpu areas for a specific cpu as a whole. So there is no need to
> allocate or free individual pagesets.
>
> V4-V5:
> - Fix up cases where per_cpu_ptr is called before irq disable
> - Integrate the bootstrap logic that was separate before.
>
> Cc: Mel Gorman <mel@csn.ul.ie>
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
>
I haven't tested the patch series but it now looks good to my eyes at
least. Thanks
Acked-by: Mel Gorman <mel@csn.ul.ie>
> ---
> include/linux/mm.h | 4 -
> include/linux/mmzone.h | 12 ---
> mm/page_alloc.c | 187 ++++++++++++++++++-------------------------------
> mm/vmstat.c | 14 ++-
> 4 files changed, 81 insertions(+), 136 deletions(-)
>
> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h 2009-10-07 14:34:25.000000000 -0500
> +++ linux-2.6/include/linux/mm.h 2009-10-07 14:48:09.000000000 -0500
> @@ -1061,11 +1061,7 @@ extern void si_meminfo(struct sysinfo *
> extern void si_meminfo_node(struct sysinfo *val, int nid);
> extern int after_bootmem;
>
> -#ifdef CONFIG_NUMA
> extern void setup_per_cpu_pageset(void);
> -#else
> -static inline void setup_per_cpu_pageset(void) {}
> -#endif
>
> extern void zone_pcp_update(struct zone *zone);
>
> Index: linux-2.6/include/linux/mmzone.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mmzone.h 2009-10-07 14:34:25.000000000 -0500
> +++ linux-2.6/include/linux/mmzone.h 2009-10-07 14:48:09.000000000 -0500
> @@ -184,13 +184,7 @@ struct per_cpu_pageset {
> s8 stat_threshold;
> s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
> #endif
> -} ____cacheline_aligned_in_smp;
> -
> -#ifdef CONFIG_NUMA
> -#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
> -#else
> -#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
> -#endif
> +};
>
> #endif /* !__GENERATING_BOUNDS.H */
>
> @@ -306,10 +300,8 @@ struct zone {
> */
> unsigned long min_unmapped_pages;
> unsigned long min_slab_pages;
> - struct per_cpu_pageset *pageset[NR_CPUS];
> -#else
> - struct per_cpu_pageset pageset[NR_CPUS];
> #endif
> + struct per_cpu_pageset *pageset;
> /*
> * free areas of different sizes
> */
> Index: linux-2.6/mm/page_alloc.c
> ===================================================================
> --- linux-2.6.orig/mm/page_alloc.c 2009-10-07 14:34:25.000000000 -0500
> +++ linux-2.6/mm/page_alloc.c 2009-10-07 14:48:09.000000000 -0500
> @@ -1011,10 +1011,10 @@ static void drain_pages(unsigned int cpu
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + local_irq_save(flags);
> + pset = per_cpu_ptr(zone->pageset, cpu);
>
> pcp = &pset->pcp;
> - local_irq_save(flags);
> free_pcppages_bulk(zone, pcp->count, pcp);
> pcp->count = 0;
> local_irq_restore(flags);
> @@ -1098,7 +1098,6 @@ static void free_hot_cold_page(struct pa
> arch_free_page(page, 0);
> kernel_map_pages(page, 1, 0);
>
> - pcp = &zone_pcp(zone, get_cpu())->pcp;
> migratetype = get_pageblock_migratetype(page);
> set_page_private(page, migratetype);
> local_irq_save(flags);
> @@ -1121,6 +1120,7 @@ static void free_hot_cold_page(struct pa
> migratetype = MIGRATE_MOVABLE;
> }
>
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> if (cold)
> list_add_tail(&page->lru, &pcp->lists[migratetype]);
> else
> @@ -1133,7 +1133,6 @@ static void free_hot_cold_page(struct pa
>
> out:
> local_irq_restore(flags);
> - put_cpu();
> }
>
> void free_hot_page(struct page *page)
> @@ -1183,17 +1182,15 @@ struct page *buffered_rmqueue(struct zon
> unsigned long flags;
> struct page *page;
> int cold = !!(gfp_flags & __GFP_COLD);
> - int cpu;
>
> again:
> - cpu = get_cpu();
> if (likely(order == 0)) {
> struct per_cpu_pages *pcp;
> struct list_head *list;
>
> - pcp = &zone_pcp(zone, cpu)->pcp;
> - list = &pcp->lists[migratetype];
> local_irq_save(flags);
> + pcp = &this_cpu_ptr(zone->pageset)->pcp;
> + list = &pcp->lists[migratetype];
> if (list_empty(list)) {
> pcp->count += rmqueue_bulk(zone, 0,
> pcp->batch, list,
> @@ -1234,7 +1231,6 @@ again:
> __count_zone_vm_events(PGALLOC, zone, 1 << order);
> zone_statistics(preferred_zone, zone);
> local_irq_restore(flags);
> - put_cpu();
>
> VM_BUG_ON(bad_range(zone, page));
> if (prep_new_page(page, order, gfp_flags))
> @@ -1243,7 +1239,6 @@ again:
>
> failed:
> local_irq_restore(flags);
> - put_cpu();
> return NULL;
> }
>
> @@ -2172,7 +2167,7 @@ void show_free_areas(void)
> for_each_online_cpu(cpu) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, cpu);
> + pageset = per_cpu_ptr(zone->pageset, cpu);
>
> printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
> cpu, pageset->pcp.high,
> @@ -2735,10 +2730,29 @@ static void build_zonelist_cache(pg_data
>
> #endif /* CONFIG_NUMA */
>
> +/*
> + * Boot pageset table. One per cpu which is going to be used for all
> + * zones and all nodes. The parameters will be set in such a way
> + * that an item put on a list will immediately be handed over to
> + * the buddy list. This is safe since pageset manipulation is done
> + * with interrupts disabled.
> + *
> + * The boot_pagesets must be kept even after bootup is complete for
> + * unused processors and/or zones. They do play a role for bootstrapping
> + * hotplugged processors.
> + *
> + * zoneinfo_show() and maybe other functions do
> + * not check if the processor is online before following the pageset pointer.
> + * Other parts of the kernel may not check if the zone is available.
> + */
> +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
> +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
> +
> /* return values int ....just for stop_machine() */
> static int __build_all_zonelists(void *dummy)
> {
> int nid;
> + int cpu;
>
> #ifdef CONFIG_NUMA
> memset(node_load, 0, sizeof(node_load));
> @@ -2749,6 +2763,14 @@ static int __build_all_zonelists(void *d
> build_zonelists(pgdat);
> build_zonelist_cache(pgdat);
> }
> +
> + /*
> + * Initialize the boot_pagesets that are going to be used
> + * for bootstrapping processors.
> + */
> + for_each_possible_cpu(cpu)
> + setup_pageset(&per_cpu(boot_pageset, cpu), 0);
> +
> return 0;
> }
>
> @@ -3087,120 +3109,60 @@ static void setup_pagelist_highmark(stru
> }
>
>
> -#ifdef CONFIG_NUMA
> -/*
> - * Boot pageset table. One per cpu which is going to be used for all
> - * zones and all nodes. The parameters will be set in such a way
> - * that an item put on a list will immediately be handed over to
> - * the buddy list. This is safe since pageset manipulation is done
> - * with interrupts disabled.
> - *
> - * Some NUMA counter updates may also be caught by the boot pagesets.
> - *
> - * The boot_pagesets must be kept even after bootup is complete for
> - * unused processors and/or zones. They do play a role for bootstrapping
> - * hotplugged processors.
> - *
> - * zoneinfo_show() and maybe other functions do
> - * not check if the processor is online before following the pageset pointer.
> - * Other parts of the kernel may not check if the zone is available.
> - */
> -static struct per_cpu_pageset boot_pageset[NR_CPUS];
> -
> -/*
> - * Dynamically allocate memory for the
> - * per cpu pageset array in struct zone.
> - */
> -static int __cpuinit process_zones(int cpu)
> -{
> - struct zone *zone, *dzone;
> - int node = cpu_to_node(cpu);
> -
> - node_set_state(node, N_CPU); /* this node has a cpu */
> -
> - for_each_populated_zone(zone) {
> - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
> - GFP_KERNEL, node);
> - if (!zone_pcp(zone, cpu))
> - goto bad;
> -
> - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
> -
> - if (percpu_pagelist_fraction)
> - setup_pagelist_highmark(zone_pcp(zone, cpu),
> - (zone->present_pages / percpu_pagelist_fraction));
> - }
> -
> - return 0;
> -bad:
> - for_each_zone(dzone) {
> - if (!populated_zone(dzone))
> - continue;
> - if (dzone == zone)
> - break;
> - kfree(zone_pcp(dzone, cpu));
> - zone_pcp(dzone, cpu) = &boot_pageset[cpu];
> - }
> - return -ENOMEM;
> -}
> -
> -static inline void free_zone_pagesets(int cpu)
> -{
> - struct zone *zone;
> -
> - for_each_zone(zone) {
> - struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
> -
> - /* Free per_cpu_pageset if it is slab allocated */
> - if (pset != &boot_pageset[cpu])
> - kfree(pset);
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - }
> -}
> -
> static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
> unsigned long action,
> void *hcpu)
> {
> int cpu = (long)hcpu;
> - int ret = NOTIFY_OK;
>
> switch (action) {
> case CPU_UP_PREPARE:
> case CPU_UP_PREPARE_FROZEN:
> - if (process_zones(cpu))
> - ret = NOTIFY_BAD;
> - break;
> - case CPU_UP_CANCELED:
> - case CPU_UP_CANCELED_FROZEN:
> - case CPU_DEAD:
> - case CPU_DEAD_FROZEN:
> - free_zone_pagesets(cpu);
> + node_set_state(cpu_to_node(cpu), N_CPU);
> break;
> default:
> break;
> }
> - return ret;
> + return NOTIFY_OK;
> }
>
> static struct notifier_block __cpuinitdata pageset_notifier =
> { &pageset_cpuup_callback, NULL, 0 };
>
> +/*
> + * Allocate per cpu pagesets and initialize them.
> + * Before this call only boot pagesets were available.
> + * Boot pagesets will no longer be used by this processor
> + * after setup_per_cpu_pageset().
> + */
> void __init setup_per_cpu_pageset(void)
> {
> - int err;
> + struct zone *zone;
> + int cpu;
> +
> + for_each_populated_zone(zone) {
> + zone->pageset = alloc_percpu(struct per_cpu_pageset);
> +
> + for_each_possible_cpu(cpu) {
> + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
> +
> + setup_pageset(pcp, zone_batchsize(zone));
> +
> + if (percpu_pagelist_fraction)
> + setup_pagelist_highmark(pcp,
> + (zone->present_pages /
> + percpu_pagelist_fraction));
> + }
> + }
>
> - /* Initialize per_cpu_pageset for cpu 0.
> - * A cpuup callback will do this for every cpu
> - * as it comes online
> + /*
> + * The boot cpu is always the first active.
> + * The boot node has a processor
> */
> - err = process_zones(smp_processor_id());
> - BUG_ON(err);
> + node_set_state(cpu_to_node(smp_processor_id()), N_CPU);
> register_cpu_notifier(&pageset_notifier);
> }
>
> -#endif
> -
> static noinline __init_refok
> int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
> {
> @@ -3254,7 +3216,7 @@ static int __zone_pcp_update(void *data)
> struct per_cpu_pageset *pset;
> struct per_cpu_pages *pcp;
>
> - pset = zone_pcp(zone, cpu);
> + pset = per_cpu_ptr(zone->pageset, cpu);
> pcp = &pset->pcp;
>
> local_irq_save(flags);
> @@ -3272,21 +3234,13 @@ void zone_pcp_update(struct zone *zone)
>
> static __meminit void zone_pcp_init(struct zone *zone)
> {
> - int cpu;
> - unsigned long batch = zone_batchsize(zone);
> + /* Use boot pagesets until we have the per cpu allocator up */
> + zone->pageset = &per_cpu_var(boot_pageset);
>
> - for (cpu = 0; cpu < NR_CPUS; cpu++) {
> -#ifdef CONFIG_NUMA
> - /* Early boot. Slab allocator not functional yet */
> - zone_pcp(zone, cpu) = &boot_pageset[cpu];
> - setup_pageset(&boot_pageset[cpu],0);
> -#else
> - setup_pageset(zone_pcp(zone,cpu), batch);
> -#endif
> - }
> if (zone->present_pages)
> - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
> - zone->name, zone->present_pages, batch);
> + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
> + zone->name, zone->present_pages,
> + zone_batchsize(zone));
> }
>
> __meminit int init_currently_empty_zone(struct zone *zone,
> @@ -4800,10 +4754,11 @@ int percpu_pagelist_fraction_sysctl_hand
> if (!write || (ret == -EINVAL))
> return ret;
> for_each_populated_zone(zone) {
> - for_each_online_cpu(cpu) {
> + for_each_possible_cpu(cpu) {
> unsigned long high;
> high = zone->present_pages / percpu_pagelist_fraction;
> - setup_pagelist_highmark(zone_pcp(zone, cpu), high);
> + setup_pagelist_highmark(
> + per_cpu_ptr(zone->pageset, cpu), high);
> }
> }
> return 0;
> Index: linux-2.6/mm/vmstat.c
> ===================================================================
> --- linux-2.6.orig/mm/vmstat.c 2009-10-07 14:34:25.000000000 -0500
> +++ linux-2.6/mm/vmstat.c 2009-10-07 14:48:09.000000000 -0500
> @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds
> threshold = calculate_threshold(zone);
>
> for_each_online_cpu(cpu)
> - zone_pcp(zone, cpu)->stat_threshold = threshold;
> + per_cpu_ptr(zone->pageset, cpu)->stat_threshold
> + = threshold;
> }
> }
>
> @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds
> void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
> int delta)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> +
> s8 *p = pcp->vm_stat_diff + item;
> long x;
>
> @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
> */
> void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)++;
> @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
>
> void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
> {
> - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
> + struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
> s8 *p = pcp->vm_stat_diff + item;
>
> (*p)--;
> @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu)
> for_each_populated_zone(zone) {
> struct per_cpu_pageset *p;
>
> - p = zone_pcp(zone, cpu);
> + p = per_cpu_ptr(zone->pageset, cpu);
>
> for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
> if (p->vm_stat_diff[i]) {
> @@ -738,7 +740,7 @@ static void zoneinfo_show_print(struct s
> for_each_online_cpu(i) {
> struct per_cpu_pageset *pageset;
>
> - pageset = zone_pcp(zone, i);
> + pageset = per_cpu_ptr(zone->pageset, i);
> seq_printf(m,
> "\n cpu: %i"
> "\n count: %i"
>
> --
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
next prev parent reply other threads:[~2009-10-08 10:54 UTC|newest]
Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-10-07 21:10 [this_cpu_xx V6 0/7] Introduce per cpu atomic operations and avoid per cpu address arithmetic cl
2009-10-07 21:10 ` [this_cpu_xx V6 1/7] this_cpu_ops: page allocator conversion cl
2009-10-08 10:38 ` Tejun Heo
2009-10-08 10:40 ` Tejun Heo
2009-10-08 16:15 ` Christoph Lameter
2009-10-08 10:53 ` Mel Gorman [this message]
2009-10-07 21:10 ` [this_cpu_xx V6 2/7] this_cpu ops: Remove pageset_notifier cl
2009-10-07 21:10 ` [this_cpu_xx V6 3/7] Use this_cpu operations in slub cl
2009-10-12 10:19 ` Tejun Heo
2009-10-12 10:21 ` Tejun Heo
2009-10-12 14:54 ` Christoph Lameter
2009-10-13 2:13 ` Tejun Heo
2009-10-13 14:41 ` Christoph Lameter
2009-10-13 14:56 ` Tejun Heo
2009-10-13 15:20 ` Christoph Lameter
2009-10-14 1:57 ` Tejun Heo
2009-10-14 14:14 ` Christoph Lameter
2009-10-15 7:47 ` Tejun Heo
2009-10-16 16:44 ` Christoph Lameter
2009-10-18 3:11 ` Tejun Heo
2009-10-07 21:10 ` [this_cpu_xx V6 4/7] SLUB: Get rid of dynamic DMA kmalloc cache allocation cl
2009-10-13 18:48 ` [FIX] patch "SLUB: Get rid of dynamic DMA kmalloc cache allocation" Christoph Lameter
2009-10-07 21:10 ` [this_cpu_xx V6 5/7] this_cpu: Remove slub kmem_cache fields cl
2009-10-07 23:10 ` Christoph Lameter
2009-10-07 21:10 ` [this_cpu_xx V6 6/7] Make slub statistics use this_cpu_inc cl
2009-10-07 21:10 ` [this_cpu_xx V6 7/7] this_cpu: slub aggressive use of this_cpu operations in the hotpaths cl
2009-10-12 10:40 ` Tejun Heo
2009-10-12 13:14 ` Pekka Enberg
2009-10-12 14:55 ` Christoph Lameter
2009-10-13 9:45 ` David Rientjes
2009-10-13 14:43 ` Christoph Lameter
2009-10-13 19:14 ` Christoph Lameter
2009-10-13 19:44 ` Pekka Enberg
2009-10-13 19:48 ` Christoph Lameter
2009-10-13 20:15 ` David Rientjes
2009-10-13 20:28 ` Christoph Lameter
2009-10-13 22:53 ` David Rientjes
2009-10-14 13:34 ` Mel Gorman
2009-10-14 14:08 ` Christoph Lameter
2009-10-14 15:49 ` Mel Gorman
2009-10-14 15:53 ` Pekka Enberg
2009-10-14 15:56 ` Christoph Lameter
2009-10-14 16:14 ` Pekka Enberg
2009-10-14 18:19 ` Christoph Lameter
2009-10-16 10:50 ` Mel Gorman
2009-10-16 18:40 ` David Rientjes
2009-10-15 9:03 ` David Rientjes
2009-10-16 16:45 ` Christoph Lameter
2009-10-16 18:43 ` David Rientjes
2009-10-16 18:50 ` Christoph Lameter
2009-10-13 20:25 ` Christoph Lameter
2009-10-14 1:33 ` David Rientjes
2009-10-13 15:40 ` [this_cpu_xx V6 0/7] Introduce per cpu atomic operations and avoid per cpu address arithmetic Mel Gorman
2009-10-13 15:45 ` Christoph Lameter
2009-10-13 16:09 ` Mel Gorman
2009-10-13 17:17 ` Christoph Lameter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20091008105357.GC10004@csn.ul.ie \
--to=mel@csn.ul.ie \
--cc=cl@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@polymtl.ca \
--cc=penberg@cs.helsinki.fi \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox