From mboxrd@z Thu Jan 1 00:00:00 1970 From: Pekka Enberg Subject: Re: Mainline kernel OLTP performance update Date: Thu, 22 Jan 2009 11:47:52 +0200 Message-ID: <1232617672.14549.25.camel@penberg-laptop> References: <200901161503.13730.nickpiggin@yahoo.com.au> <20090115201210.ca1a9542.akpm@linux-foundation.org> <200901161746.25205.nickpiggin@yahoo.com.au> <20090116065546.GJ31013@parisc-linux.org> <1232092430.11429.52.camel@ymzhang> <87sknjeemn.fsf@basil.nowhere.org> <1232428583.11429.83.camel@ymzhang> <1232613395.11429.122.camel@ymzhang> <1232615707.14549.6.camel@penberg-laptop> <1232616517.11429.129.camel@ymzhang> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: Christoph Lameter , Andi Kleen , Matthew Wilcox , Nick Piggin , Andrew Morton , netdev@vger.kernel.org, sfr@canb.auug.org.au, matthew.r.wilcox@intel.com, chinang.ma@intel.com, linux-kernel@vger.kernel.org, sharad.c.tripathi@intel.com, arjan@linux.intel.com, suresh.b.siddha@intel.com, harita.chilukuri@intel.com, douglas.w.styner@intel.com, peter.xihong.wang@intel.com, hubert.nueckel@intel.com, chris.mason@oracle.com, srostedt@redhat.com, linux-scsi@vger.kernel.org, andrew.vasquez@qlogic.com, anirban.chakraborty@qlogic.com To: "Zhang, Yanmin" Return-path: Received: from courier.cs.helsinki.fi ([128.214.9.1]:35187 "EHLO mail.cs.helsinki.fi" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753016AbZAVJry (ORCPT ); Thu, 22 Jan 2009 04:47:54 -0500 In-Reply-To: <1232616517.11429.129.camel@ymzhang> Sender: netdev-owner@vger.kernel.org List-ID: On Thu, 2009-01-22 at 17:28 +0800, Zhang, Yanmin wrote: > On Thu, 2009-01-22 at 11:15 +0200, Pekka Enberg wrote: > > On Thu, 2009-01-22 at 16:36 +0800, Zhang, Yanmin wrote: > > > On Wed, 2009-01-21 at 18:58 -0500, Christoph Lameter wrote: > > > > On Tue, 20 Jan 2009, Zhang, Yanmin wrote: > > > >=20 > > > > > kmem_cache =EF=BB=BFskbuff_head_cache's object size is just 2= 56, so it shares the kmem_cache > > > > 
> with :0000256. Their order is 1 which means every slab consists of 2 physical pages. > > > >=20 > > > > That order can be changed. Try specifying slub_max_order=3D0 on the kernel > > > > command line to force an order 0 alloc. > > > I tried slub_max_order=3D0 and there is no improvement on this UDP-U-4k issue. > > > Both get_page_from_freelist and __free_pages_ok's cpu time are still very high. > > >=20 > > > I checked my instrumentation in kernel and found it's caused by large object allocation/free > > > whose size is more than PAGE_SIZE. Here its order is 1. > > >=20 > > > The right free callchain is __kfree_skb =3D> skb_release_all =3D> skb_release_data. > > >=20 > > > So this case isn't the issue that batch of allocation/free might erase partial page > > > functionality. > >=20 > > So is this the kfree(skb->head) in skb_release_data() or the put_page() > > calls in the same function in a loop? > It's kfree(skb->head). >=20 > >=20 > > If it's the former, with big enough size passed to __alloc_skb(), the > > networking code might be taking a hit from the SLUB page allocator > > pass-through. Do we know what kind of size is being passed to __alloc_skb() in this case? Maybe we want to do something like this. Pekka SLUB: revert page allocator pass-through This is a revert of commit aadb4bc4a1f9108c1d0fbd121827c936c2ed4217 ("SLUB: direct pass through of page size or higher kmalloc requests"). --- diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b..3bd3662 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -124,7 +124,7 @@ struct kmem_cache { * We keep the general caches in an array of slab caches that are used= for * 2^x bytes of allocations.
*/ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; =20 /* * Sorry that the following has to be that ugly but some versions of G= CC @@ -135,6 +135,9 @@ static __always_inline int kmalloc_index(size_t siz= e) if (!size) return 0; =20 + if (size > KMALLOC_MAX_SIZE) + return -1; + if (size <=3D KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; =20 @@ -154,10 +157,6 @@ static __always_inline int kmalloc_index(size_t si= ze) if (size <=3D 1024) return 10; if (size <=3D 2 * 1024) return 11; if (size <=3D 4 * 1024) return 12; -/* - * The following is only needed to support architectures with a larger= page - * size than 4k. - */ if (size <=3D 8 * 1024) return 13; if (size <=3D 16 * 1024) return 14; if (size <=3D 32 * 1024) return 15; @@ -167,6 +166,10 @@ static __always_inline int kmalloc_index(size_t si= ze) if (size <=3D 512 * 1024) return 19; if (size <=3D 1024 * 1024) return 20; if (size <=3D 2 * 1024 * 1024) return 21; + if (size <=3D 4 * 1024 * 1024) return 22; + if (size <=3D 8 * 1024 * 1024) return 23; + if (size <=3D 16 * 1024 * 1024) return 24; + if (size <=3D 32 * 1024 * 1024) return 25; return -1; =20 /* @@ -191,6 +194,19 @@ static __always_inline struct kmem_cache *kmalloc_= slab(size_t size) if (index =3D=3D 0) return NULL; =20 + /* + * This function only gets expanded if __builtin_constant_p(size), so + * testing it here shouldn't be needed. But some versions of gcc nee= d + * help. + */ + if (__builtin_constant_p(size) && index < 0) { + /* + * Generate a link failure. Would be great if we could + * do something to stop the compile here. 
+ */ + extern void __kmalloc_size_too_large(void); + __kmalloc_size_too_large(); + } return &kmalloc_caches[index]; } =20 @@ -204,17 +220,9 @@ static __always_inline struct kmem_cache *kmalloc_= slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); =20 -static __always_inline void *kmalloc_large(size_t size, gfp_t flags) -{ - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); -} - static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) - return kmalloc_large(size, flags); - if (!(flags & SLUB_DMA)) { struct kmem_cache *s =3D kmalloc_slab(size); =20 diff --git a/mm/slub.c b/mm/slub.c index 6392ae5..8fad23f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ =20 -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_a= ligned; EXPORT_SYMBOL(kmalloc_caches); =20 static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } =20 #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; =20 static void sysfs_add_func(struct work_struct *w) { @@ -2643,8 +2643,12 @@ static struct kmem_cache *get_slab(size_t size, = gfp_t flags) return ZERO_SIZE_PTR; =20 index =3D size_index[(size - 1) / 8]; - } else + } else { + if (size > KMALLOC_MAX_SIZE) + return NULL; + index =3D fls(size - 1); + } =20 #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) @@ -2658,9 +2662,6 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; =20 - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large(size, flags); - s =3D get_slab(size, flags); =20 if (unlikely(ZERO_OR_NULL_PTR(s))) @@ -2670,25 +2671,11 @@ 
void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); =20 -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page =3D alloc_pages_node(node, flags | __GFP_COMP, - get_order(size)); - - if (page) - return page_address(page); - else - return NULL; -} - #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; =20 - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); - s =3D get_slab(size, flags); =20 if (unlikely(ZERO_OR_NULL_PTR(s))) @@ -2746,11 +2733,8 @@ void kfree(const void *x) return; =20 page =3D virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - BUG_ON(!PageCompound(page)); - put_page(page); + if (unlikely(WARN_ON(!PageSlab(page)))) /* XXX */ return; - } slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -2985,7 +2969,7 @@ void __init kmem_cache_init(void) caches++; } =20 - for (i =3D KMALLOC_SHIFT_LOW; i <=3D PAGE_SHIFT; i++) { + for (i =3D KMALLOC_SHIFT_LOW; i <=3D KMALLOC_SHIFT_HIGH; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3006,7 @@ void __init kmem_cache_init(void) slab_state =3D UP; =20 /* Provide the correct kmalloc names now that the caches are up */ - for (i =3D KMALLOC_SHIFT_LOW; i <=3D PAGE_SHIFT; i++) + for (i =3D KMALLOC_SHIFT_LOW; i <=3D KMALLOC_SHIFT_HIGH; i++) kmalloc_caches[i]. 
name =3D kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); =20 @@ -3222,9 +3206,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t g= fpflags, unsigned long caller) { struct kmem_cache *s; =20 - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large(size, gfpflags); - s =3D get_slab(size, gfpflags); =20 if (unlikely(ZERO_OR_NULL_PTR(s))) @@ -3238,9 +3219,6 @@ void *__kmalloc_node_track_caller(size_t size, gf= p_t gfpflags, { struct kmem_cache *s; =20 - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, gfpflags, node); - s =3D get_slab(size, gfpflags); =20 if (unlikely(ZERO_OR_NULL_PTR(s)))