* [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Vladimir Davydov @ 2014-03-26 15:28 UTC
To: akpm
Cc: hannes, mhocko, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
We don't track any random page allocation, so we shouldn't track kmalloc
that falls back to the page allocator.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
---
include/linux/slab.h | 2 +-
mm/memcontrol.c | 27 +--------------------------
mm/slub.c | 4 ++--
3 files changed, 4 insertions(+), 29 deletions(-)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3dd389aa91c7..8a928ff71d93 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -363,7 +363,7 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
void *ret;
- flags |= (__GFP_COMP | __GFP_KMEMCG);
+ flags |= __GFP_COMP;
ret = (void *) __get_free_pages(flags, order);
kmemleak_alloc(ret, size, 1, flags);
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b4b6aef562fa..81a162d01d4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3528,35 +3528,10 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
*_memcg = NULL;
- /*
- * Disabling accounting is only relevant for some specific memcg
- * internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are marked
- * with GFP_KMEMCG only happen outside memcg core. We are mostly
- * concerned with cache allocations, and by having this test at
- * memcg_kmem_get_cache, we are already able to relay the allocation to
- * the root cache and bypass the memcg cache altogether.
- *
- * There is one exception, though: the SLUB allocator does not create
- * large order caches, but rather service large kmallocs directly from
- * the page allocator. Therefore, the following sequence when backed by
- * the SLUB allocator:
- *
- * memcg_stop_kmem_account();
- * kmalloc(<large_number>)
- * memcg_resume_kmem_account();
- *
- * would effectively ignore the fact that we should skip accounting,
- * since it will drive us directly to this function without passing
- * through the cache selector memcg_kmem_get_cache. Such large
- * allocations are extremely rare but can happen, for instance, for the
- * cache arrays. We bring this test here.
- */
- if (!current->mm || current->memcg_kmem_skip_account)
+ if (!current->mm)
return true;
memcg = get_mem_cgroup_from_mm(current->mm);
-
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
return true;
diff --git a/mm/slub.c b/mm/slub.c
index 5e234f1f8853..c2e58a787443 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3325,7 +3325,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
+ flags |= __GFP_COMP | __GFP_NOTRACK;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3395,7 +3395,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(x);
- __free_memcg_kmem_pages(page, compound_order(page));
+ __free_pages(page, compound_order(page));
return;
}
slab_free(page->slab_cache, page, object, _RET_IP_);
--
1.7.10.4
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Michal Hocko @ 2014-03-26 21:53 UTC
To: Vladimir Davydov
Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Wed 26-03-14 19:28:04, Vladimir Davydov wrote:
> We don't track any random page allocation, so we shouldn't track kmalloc
> that falls back to the page allocator.
Why did we do that in the first place? d79923fad95b (sl[au]b: allocate
objects from memcg cache) didn't tell me much.
How is memcg_kmem_skip_account removal related?
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux-foundation.org>
> Cc: Pekka Enberg <penberg@kernel.org>
> ---
> include/linux/slab.h | 2 +-
> mm/memcontrol.c | 27 +--------------------------
> mm/slub.c | 4 ++--
> 3 files changed, 4 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 3dd389aa91c7..8a928ff71d93 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -363,7 +363,7 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order)
> {
> void *ret;
>
> - flags |= (__GFP_COMP | __GFP_KMEMCG);
> + flags |= __GFP_COMP;
> ret = (void *) __get_free_pages(flags, order);
> kmemleak_alloc(ret, size, 1, flags);
> return ret;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b4b6aef562fa..81a162d01d4d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3528,35 +3528,10 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
>
> *_memcg = NULL;
>
> - /*
> - * Disabling accounting is only relevant for some specific memcg
> - * internal allocations. Therefore we would initially not have such
> - * check here, since direct calls to the page allocator that are marked
> - * with GFP_KMEMCG only happen outside memcg core. We are mostly
> - * concerned with cache allocations, and by having this test at
> - * memcg_kmem_get_cache, we are already able to relay the allocation to
> - * the root cache and bypass the memcg cache altogether.
> - *
> - * There is one exception, though: the SLUB allocator does not create
> - * large order caches, but rather service large kmallocs directly from
> - * the page allocator. Therefore, the following sequence when backed by
> - * the SLUB allocator:
> - *
> - * memcg_stop_kmem_account();
> - * kmalloc(<large_number>)
> - * memcg_resume_kmem_account();
> - *
> - * would effectively ignore the fact that we should skip accounting,
> - * since it will drive us directly to this function without passing
> - * through the cache selector memcg_kmem_get_cache. Such large
> - * allocations are extremely rare but can happen, for instance, for the
> - * cache arrays. We bring this test here.
> - */
> - if (!current->mm || current->memcg_kmem_skip_account)
> + if (!current->mm)
> return true;
>
> memcg = get_mem_cgroup_from_mm(current->mm);
> -
> if (!memcg_can_account_kmem(memcg)) {
> css_put(&memcg->css);
> return true;
> diff --git a/mm/slub.c b/mm/slub.c
> index 5e234f1f8853..c2e58a787443 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3325,7 +3325,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
> struct page *page;
> void *ptr = NULL;
>
> - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
> + flags |= __GFP_COMP | __GFP_NOTRACK;
> page = alloc_pages_node(node, flags, get_order(size));
> if (page)
> ptr = page_address(page);
> @@ -3395,7 +3395,7 @@ void kfree(const void *x)
> if (unlikely(!PageSlab(page))) {
> BUG_ON(!PageCompound(page));
> kfree_hook(x);
> - __free_memcg_kmem_pages(page, compound_order(page));
> + __free_pages(page, compound_order(page));
> return;
> }
> slab_free(page->slab_cache, page, object, _RET_IP_);
> --
> 1.7.10.4
>
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Vladimir Davydov @ 2014-03-27 7:34 UTC
To: Michal Hocko, glommer
Cc: akpm, hannes, linux-kernel, linux-mm, devel, Christoph Lameter,
Pekka Enberg
Hi Michal,
On 03/27/2014 01:53 AM, Michal Hocko wrote:
> On Wed 26-03-14 19:28:04, Vladimir Davydov wrote:
>> We don't track any random page allocation, so we shouldn't track kmalloc
>> that falls back to the page allocator.
> Why did we do that in the first place? d79923fad95b (sl[au]b: allocate
> objects from memcg cache) didn't tell me much.
I don't know, we'd better ask Glauber about that.
> How is memcg_kmem_skip_account removal related?
The comment this patch removes along with the memcg_kmem_skip_account
check explains that pretty well, IMO. In short, we only use
memcg_kmem_skip_account to prevent kmallocs from being charged, which is
crucial for recursion avoidance in memcg_kmem_get_cache. Since we don't
charge pages allocated from a root (not per-memcg) cache, at first
glance it would be enough to check memcg_kmem_skip_account only in
memcg_kmem_get_cache and return the root cache if it's set. However,
since we can also kmalloc w/o going through memcg_kmem_get_cache
(kmalloc_large), we also need this check in memcg_kmem_newpage_charge.
This patch removes kmalloc_large accounting, so we don't need this check
anymore.
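To illustrate the recursion problem, here is a minimal sketch of the
pattern in question (memcg_stop/resume_kmem_account are the existing
helpers that toggle current->memcg_kmem_skip_account; the caller below
is made up purely for illustration):

/* Sketch: memcg-internal code disables accounting around its own allocs. */
static void *alloc_internal_array(size_t nr_entries)
{
        void *arr;

        memcg_stop_kmem_account();      /* sets current->memcg_kmem_skip_account */
        /*
         * With SLUB, a large enough size turns this into a kmalloc_large()
         * page allocation that never passes through memcg_kmem_get_cache(),
         * which is why memcg_kmem_newpage_charge() had to repeat the
         * skip_account check.
         */
        arr = kmalloc(nr_entries * sizeof(void *), GFP_KERNEL);
        memcg_resume_kmem_account();
        return arr;
}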
Thanks.
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Michal Hocko @ 2014-03-27 20:40 UTC
To: Vladimir Davydov
Cc: glommer, akpm, hannes, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Thu 27-03-14 11:34:10, Vladimir Davydov wrote:
> Hi Michal,
>
> On 03/27/2014 01:53 AM, Michal Hocko wrote:
> > On Wed 26-03-14 19:28:04, Vladimir Davydov wrote:
> >> We don't track any random page allocation, so we shouldn't track kmalloc
> >> that falls back to the page allocator.
> > Why did we do that in the first place? d79923fad95b (sl[au]b: allocate
> > objects from memcg cache) didn't tell me much.
>
> I don't know, we'd better ask Glauber about that.
>
> > How is memcg_kmem_skip_account removal related?
>
> The comment this patch removes along with the memcg_kmem_skip_account
> check explains that pretty well, IMO. In short, we only use
> memcg_kmem_skip_account to prevent kmallocs from being charged, which is
> crucial for recursion avoidance in memcg_kmem_get_cache. Since we don't
> charge pages allocated from a root (not per-memcg) cache, at first
> glance it would be enough to check memcg_kmem_skip_account only in
> memcg_kmem_get_cache and return the root cache if it's set. However,
> since we can also kmalloc w/o going through memcg_kmem_get_cache
> (kmalloc_large), we also need this check in memcg_kmem_newpage_charge.
> This patch removes kmalloc_large accounting, so we don't need this check
> anymore.
Document that in the changelog please.
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Greg Thelen @ 2014-03-27 4:31 UTC
To: Vladimir Davydov
Cc: akpm, hannes, mhocko, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Wed, Mar 26 2014, Vladimir Davydov <vdavydov@parallels.com> wrote:
> We don't track any random page allocation, so we shouldn't track kmalloc
> that falls back to the page allocator.
This seems like a change which will lead to confusing (and arguably
improper) kernel behavior. I prefer the behavior prior to this patch.
Before this change both of the following allocations are charged to
memcg (assuming kmem accounting is enabled):
a = kmalloc(KMALLOC_MAX_CACHE_SIZE, GFP_KERNEL)
b = kmalloc(KMALLOC_MAX_CACHE_SIZE + 1, GFP_KERNEL)
After this change only 'a' is charged; 'b' goes directly to page
allocator which no longer does accounting.
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux-foundation.org>
> Cc: Pekka Enberg <penberg@kernel.org>
> ---
> include/linux/slab.h | 2 +-
> mm/memcontrol.c | 27 +--------------------------
> mm/slub.c | 4 ++--
> 3 files changed, 4 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 3dd389aa91c7..8a928ff71d93 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -363,7 +363,7 @@ kmalloc_order(size_t size, gfp_t flags, unsigned int order)
> {
> void *ret;
>
> - flags |= (__GFP_COMP | __GFP_KMEMCG);
> + flags |= __GFP_COMP;
> ret = (void *) __get_free_pages(flags, order);
> kmemleak_alloc(ret, size, 1, flags);
> return ret;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b4b6aef562fa..81a162d01d4d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3528,35 +3528,10 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
>
> *_memcg = NULL;
>
> - /*
> - * Disabling accounting is only relevant for some specific memcg
> - * internal allocations. Therefore we would initially not have such
> - * check here, since direct calls to the page allocator that are marked
> - * with GFP_KMEMCG only happen outside memcg core. We are mostly
> - * concerned with cache allocations, and by having this test at
> - * memcg_kmem_get_cache, we are already able to relay the allocation to
> - * the root cache and bypass the memcg cache altogether.
> - *
> - * There is one exception, though: the SLUB allocator does not create
> - * large order caches, but rather service large kmallocs directly from
> - * the page allocator. Therefore, the following sequence when backed by
> - * the SLUB allocator:
> - *
> - * memcg_stop_kmem_account();
> - * kmalloc(<large_number>)
> - * memcg_resume_kmem_account();
> - *
> - * would effectively ignore the fact that we should skip accounting,
> - * since it will drive us directly to this function without passing
> - * through the cache selector memcg_kmem_get_cache. Such large
> - * allocations are extremely rare but can happen, for instance, for the
> - * cache arrays. We bring this test here.
> - */
> - if (!current->mm || current->memcg_kmem_skip_account)
> + if (!current->mm)
> return true;
>
> memcg = get_mem_cgroup_from_mm(current->mm);
> -
> if (!memcg_can_account_kmem(memcg)) {
> css_put(&memcg->css);
> return true;
> diff --git a/mm/slub.c b/mm/slub.c
> index 5e234f1f8853..c2e58a787443 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3325,7 +3325,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
> struct page *page;
> void *ptr = NULL;
>
> - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
> + flags |= __GFP_COMP | __GFP_NOTRACK;
> page = alloc_pages_node(node, flags, get_order(size));
> if (page)
> ptr = page_address(page);
> @@ -3395,7 +3395,7 @@ void kfree(const void *x)
> if (unlikely(!PageSlab(page))) {
> BUG_ON(!PageCompound(page));
> kfree_hook(x);
> - __free_memcg_kmem_pages(page, compound_order(page));
> + __free_pages(page, compound_order(page));
> return;
> }
> slab_free(page->slab_cache, page, object, _RET_IP_);
> --
> 1.7.10.4
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Vladimir Davydov @ 2014-03-27 7:37 UTC
To: Greg Thelen
Cc: akpm, hannes, mhocko, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
Hi Greg,
On 03/27/2014 08:31 AM, Greg Thelen wrote:
> On Wed, Mar 26 2014, Vladimir Davydov <vdavydov@parallels.com> wrote:
>
>> We don't track any random page allocation, so we shouldn't track kmalloc
>> that falls back to the page allocator.
> This seems like a change which will lead to confusing (and arguably
> improper) kernel behavior. I prefer the behavior prior to this patch.
>
> Before this change both of the following allocations are charged to
> memcg (assuming kmem accounting is enabled):
> a = kmalloc(KMALLOC_MAX_CACHE_SIZE, GFP_KERNEL)
> b = kmalloc(KMALLOC_MAX_CACHE_SIZE + 1, GFP_KERNEL)
>
> After this change only 'a' is charged; 'b' goes directly to page
> allocator which no longer does accounting.
Why do we need to charge 'b' in the first place? Can the userspace
trigger such allocations massively? If there can only be one or two such
allocations from a cgroup, is there any point in charging them?
In fact, do we actually need to charge every random kmem allocation? I
guess not. For instance, filesystems often allocate data shared among
all the FS users. It's wrong to charge such allocations to a particular
memcg, IMO. That said the next step is going to be adding a per kmem
cache flag specifying if allocations from this cache should be charged
so that accounting will work only for those caches that are marked so
explicitly.
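To make that concrete, such an opt-in check could look like this (a
sketch only: the flag name and bit value are hypothetical, nothing like
this exists yet):

/* Hypothetical opt-in flag; the name and the bit are illustrative only. */
#define SLAB_MEMCG_ACCOUNT      0x04000000UL

static inline bool memcg_should_account(struct kmem_cache *s)
{
        /* Charge only allocations from caches explicitly marked for it. */
        return !is_root_cache(s) && (s->flags & SLAB_MEMCG_ACCOUNT);
}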
There is one more argument for removing kmalloc_large accounting - we
don't have an easy way to track such allocations, which prevents us from
reparenting kmemcg charges on css offline. Of course, we could link
kmalloc_large pages in some sort of per-memcg list which would allow us
to find them on css offline, but I don't think such a complication is
justified.
Thanks.
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Greg Thelen @ 2014-03-27 20:42 UTC
To: Vladimir Davydov
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Glauber Costa,
linux-kernel@vger.kernel.org, linux-mm@kvack.org, devel,
Christoph Lameter, Pekka Enberg
On Thu, Mar 27, 2014 at 12:37 AM, Vladimir Davydov
<vdavydov@parallels.com> wrote:
> Hi Greg,
>
> On 03/27/2014 08:31 AM, Greg Thelen wrote:
>> On Wed, Mar 26 2014, Vladimir Davydov <vdavydov@parallels.com> wrote:
>>
>>> We don't track any random page allocation, so we shouldn't track kmalloc
>>> that falls back to the page allocator.
>> This seems like a change which will lead to confusing (and arguably
>> improper) kernel behavior. I prefer the behavior prior to this patch.
>>
>> Before this change both of the following allocations are charged to
>> memcg (assuming kmem accounting is enabled):
>> a = kmalloc(KMALLOC_MAX_CACHE_SIZE, GFP_KERNEL)
>> b = kmalloc(KMALLOC_MAX_CACHE_SIZE + 1, GFP_KERNEL)
>>
>> After this change only 'a' is charged; 'b' goes directly to page
>> allocator which no longer does accounting.
>
> Why do we need to charge 'b' in the first place? Can the userspace
> trigger such allocations massively? If there can only be one or two such
> allocations from a cgroup, is there any point in charging them?
Off the top of my head I don't know of any >8 KiB kmalloc()s, so I can't
say if they're directly triggerable by user space en masse. But we
recently ran into some order:3 allocations in networking. The
networking allocations used a non-generic kmem_cache (rather than
kmalloc, which started this discussion). For details, see ed98df3361f0
("net: use __GFP_NORETRY for high order allocations"). I can't say if
such allocations exist in device drivers, but given the networking
example, it's conceivable that they may (or will) exist.
With slab this isn't a problem because slab has kmalloc kmem_caches for
all supported allocation sizes. However, slub shows this issue for
any kmalloc() allocations larger than 8 KiB (at least on x86_64). It
seems like a strange direction to take kmem accounting to say that
kmalloc allocations are kmem-limited, but only if they are either
smaller than a threshold size or done with slab. Simply increasing the
size of a data structure doesn't seem like it should automatically
cause the memory to become exempt from kmem limits.
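For reference, this is roughly how SLUB routes a kmalloc() by size (a
condensed sketch of the include/linux/slab.h logic of this era; the
constant-size fast path and tracing variants are omitted):

static __always_inline void *kmalloc_sketch(size_t size, gfp_t flags)
{
        /*
         * KMALLOC_MAX_CACHE_SIZE is 8 KiB on x86_64 with SLUB: anything
         * larger bypasses the kmalloc kmem_caches and goes straight to
         * the page allocator via kmalloc_large().
         */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return kmalloc_large(size, flags);
        return __kmalloc(size, flags);  /* served from a kmalloc kmem_cache */
}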
> In fact, do we actually need to charge every random kmem allocation? I
> guess not. For instance, filesystems often allocate data shared among
> all the FS users. It's wrong to charge such allocations to a particular
> memcg, IMO. That said the next step is going to be adding a per kmem
> cache flag specifying if allocations from this cache should be charged
> so that accounting will work only for those caches that are marked so
> explicitly.
It's a question of what direction to approach kmem slab accounting
from: either opt-out (as the code currently is), or opt-in (with per
kmem_cache flags as you suggest). I agree that some structures end up
being shared (e.g. filesystem block bit map structures). In an
opt-out system these are charged to a memcg initially and remain
charged there until the memcg is deleted at which point the shared
objects are reparented to a shared location. While this isn't
perfect, it's unclear if it's better or worse than analyzing each
class of allocation and deciding if they should be opt'd-in. One
could (though I'm not) make the case that even dentries are easily
shareable between containers and thus shouldn't be accounted to a
single memcg. But given user space's ability to DoS a machine with
dentries, they should be accounted.
> There is one more argument for removing kmalloc_large accounting - we
> don't have an easy way to track such allocations, which prevents us from
> reparenting kmemcg charges on css offline. Of course, we could link
> kmalloc_large pages in some sort of per-memcg list which would allow us
> to find them on css offline, but I don't think such a complication is
> justified.
I assume that reparenting of such non-kmem_cache allocations (e.g.
large kmalloc) is difficult because such pages refer to the memcg we're
trying to delete, and the memcg has no index of such pages.
If such zombie memcgs are undesirable, then an alternative to indexing
the pages is to define a kmem context object which such large pages
point to. The kmem context would be reparented without needing to
adjust the individual large pages. But there are plenty of options.
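Roughly what I have in mind (struct kmem_context and both functions are
hypothetical, just to illustrate the indirection):

/* Hypothetical: charged pages would point at a context, not at the memcg. */
struct kmem_context {
        struct mem_cgroup *memcg;       /* current owner of the charges */
        atomic_long_t nr_pages;         /* pages still referencing this context */
};

static void kmem_context_reparent(struct kmem_context *ctx,
                                  struct mem_cgroup *parent)
{
        /*
         * One pointer update moves every page referencing ctx to the
         * parent; no per-page walk is needed on css offline.
         */
        ctx->memcg = parent;
}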
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Vladimir Davydov @ 2014-03-28 7:56 UTC
To: Greg Thelen
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Glauber Costa,
linux-kernel@vger.kernel.org, linux-mm@kvack.org, devel,
Christoph Lameter, Pekka Enberg
On 03/28/2014 12:42 AM, Greg Thelen wrote:
> On Thu, Mar 27, 2014 at 12:37 AM, Vladimir Davydov
> <vdavydov@parallels.com> wrote:
>> On 03/27/2014 08:31 AM, Greg Thelen wrote:
>>> Before this change both of the following allocations are charged to
>>> memcg (assuming kmem accounting is enabled):
>>> a = kmalloc(KMALLOC_MAX_CACHE_SIZE, GFP_KERNEL)
>>> b = kmalloc(KMALLOC_MAX_CACHE_SIZE + 1, GFP_KERNEL)
>>>
>>> After this change only 'a' is charged; 'b' goes directly to page
>>> allocator which no longer does accounting.
>>
>> Why do we need to charge 'b' in the first place? Can the userspace
>> trigger such allocations massively? If there can only be one or two such
>> allocations from a cgroup, is there any point in charging them?
>
> Off the top of my head I don't know of any >8 KiB kmalloc()s, so I can't
> say if they're directly triggerable by user space en masse. But we
> recently ran into some order:3 allocations in networking. The
> networking allocations used a non-generic kmem_cache (rather than
> kmalloc, which started this discussion). For details, see ed98df3361f0
> ("net: use __GFP_NORETRY for high order allocations"). I can't say if
> such allocations exist in device drivers, but given the networking
> example, it's conceivable that they may (or will) exist.
Hmm, also not sure about device drivers, but the sock frag pages you
mentioned are worth charging I guess.
For such non-generic kmem allocations, we have two options: either go
with __GFP_KMEMCG, or introduce special alloc/free_kmem_pages methods,
which would be used instead of kmalloc for large allocations (e.g.
threadinfo, sock frags). I vote for the second, because I dislike having
kmemcg charging in the general allocation path.
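Something like this is what I mean (a sketch built on the existing
__memcg_kmem_* entry points; the helper names and exact form are an
assumption at this point, not code that exists yet):

static inline struct page *alloc_kmem_pages(gfp_t gfp, unsigned int order)
{
        struct mem_cgroup *memcg = NULL;
        struct page *page;

        if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
                return NULL;
        page = alloc_pages(gfp, order);
        if (memcg)
                /* Commits the charge to the page, or reverts it on failure. */
                __memcg_kmem_commit_charge(page, memcg, order);
        return page;
}

static inline void free_kmem_pages(struct page *page, unsigned int order)
{
        __memcg_kmem_uncharge_pages(page, order);
        __free_pages(page, order);
}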
Anyway, that brings us back to the necessity to reliably track arbitrary
pages in kmemcg to allow reparenting.
> With slab this isn't a problem because slab has kmalloc kmem_caches for
> all supported allocation sizes. However, slub shows this issue for
> any kmalloc() allocations larger than 8 KiB (at least on x86_64). It
> seems like a strange direction to take kmem accounting to say that
> kmalloc allocations are kmem-limited, but only if they are either
> smaller than a threshold size or done with slab. Simply increasing the
> size of a data structure doesn't seem like it should automatically
> cause the memory to become exempt from kmem limits.
Sounds fair.
>> In fact, do we actually need to charge every random kmem allocation? I
>> guess not. For instance, filesystems often allocate data shared among
>> all the FS users. It's wrong to charge such allocations to a particular
>> memcg, IMO. That said the next step is going to be adding a per kmem
>> cache flag specifying if allocations from this cache should be charged
>> so that accounting will work only for those caches that are marked so
>> explicitly.
>
> It's a question of what direction to approach kmem slab accounting
> from: either opt-out (as the code currently is), or opt-in (with per
> kmem_cache flags as you suggest). I agree that some structures end up
> being shared (e.g. filesystem block bit map structures). In an
> opt-out system these are charged to a memcg initially and remain
> charged there until the memcg is deleted at which point the shared
> objects are reparented to a shared location. While this isn't
> perfect, it's unclear if it's better or worse than analyzing each
> class of allocation and deciding if they should be opt'd-in. One
> could (though I'm not) make the case that even dentries are easily
> shareable between containers and thus shouldn't be accounted to a
> single memcg. But given user space's ability to DoS a machine with
> dentries, they should be accounted.
Again, you must be right. After a bit of thinking I agree that deciding
which caches should be accounted and which shouldn't would be
cumbersome. Opt-out would be clearer, I guess.
>> There is one more argument for removing kmalloc_large accounting - we
>> don't have an easy way to track such allocations, which prevents us from
>> reparenting kmemcg charges on css offline. Of course, we could link
>> kmalloc_large pages in some sort of per-memcg list which would allow us
>> to find them on css offline, but I don't think such a complication is
>> justified.
>
> I assume that reparenting of such non-kmem_cache allocations (e.g.
> large kmalloc) is difficult because such pages refer to the memcg we're
> trying to delete, and the memcg has no index of such pages.
> If such zombie memcgs are undesirable, then an alternative to indexing
> the pages is to define a kmem context object which such large pages
> point to. The kmem context would be reparented without needing to
> adjust the individual large pages. But there are plenty of options.
I like the idea of the context object. For usual kmalloc'ed data, we
already have one - the kmem_cache itself. For non-generic kmem (e.g.
threadinfo pages), we could easily introduce a separate one with a
pointer to the owning memcg in it. Reparenting wouldn't be a problem at
all then.
I guess I'll give it a try in the next iteration. Thank you!
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Michal Hocko @ 2014-03-27 20:43 UTC
To: Vladimir Davydov
Cc: Greg Thelen, akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Thu 27-03-14 11:37:11, Vladimir Davydov wrote:
[...]
> In fact, do we actually need to charge every random kmem allocation? I
> guess not. For instance, filesystems often allocate data shared among
> all the FS users. It's wrong to charge such allocations to a particular
> memcg, IMO. That said the next step is going to be adding a per kmem
> cache flag specifying if allocations from this cache should be charged
> so that accounting will work only for those caches that are marked so
> explicitly.
How do you select which caches to track?
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg
From: Vladimir Davydov @ 2014-03-28 7:58 UTC
To: Michal Hocko
Cc: Greg Thelen, akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On 03/28/2014 12:43 AM, Michal Hocko wrote:
> On Thu 27-03-14 11:37:11, Vladimir Davydov wrote:
> [...]
>> In fact, do we actually need to charge every random kmem allocation? I
>> guess not. For instance, filesystems often allocate data shared among
>> all the FS users. It's wrong to charge such allocations to a particular
>> memcg, IMO. That said the next step is going to be adding a per kmem
>> cache flag specifying if allocations from this cache should be charged
>> so that accounting will work only for those caches that are marked so
>> explicitly.
>
> How do you select which caches to track?
I thought we should pick some objects that are definitely used by most
processes, e.g. mm_struct, task_struct, inodes, dentries, as a first
step, and then add new objects to the set upon request.
Now, after Greg's explanation, I admit the idea is rather unjustified,
because charging all objects by default and providing a way to
explicitly exclude some caches from accounting requires much less
effort and fewer changes to the code.
Thanks.
* [PATCH -mm 2/4] sl[au]b: charge slabs to memcg explicitly
From: Vladimir Davydov @ 2014-03-26 15:28 UTC
To: akpm
Cc: hannes, mhocko, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
We have only a few places where we actually want to charge kmem, so
instead of intruding into the general page allocation path with
__GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
charges will be easier to follow that way.
This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG
from memcg caches' allocflags. Instead, it makes the slab allocation
path call memcg_charge_kmem directly, getting the memcg to charge from
the cache's memcg params.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
---
include/linux/memcontrol.h | 24 +++++++++++++-----------
mm/memcontrol.c | 15 +++++++++++++++
mm/slab.c | 7 ++++++-
mm/slab_common.c | 6 +-----
mm/slub.c | 24 +++++++++++++++++-------
5 files changed, 52 insertions(+), 24 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e9dfcdad24c5..b8aaecc25cbf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -512,6 +512,9 @@ void memcg_update_array_size(int num_groups);
struct kmem_cache *
__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order);
+void memcg_uncharge_slab(struct kmem_cache *s, int order);
+
void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
@@ -589,17 +592,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
* @cachep: the original global kmem cache
* @gfp: allocation flags.
*
- * This function assumes that the task allocating, which determines the memcg
- * in the page allocator, belongs to the same cgroup throughout the whole
- * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
- * while belonging to a cgroup, and later on changes. This is considered
- * acceptable, and should only happen upon task migration.
- *
- * Before the cache is created by the memcg core, there is also a possible
- * imbalance: the task belongs to a memcg, but the cache being allocated from
- * is the global cache, since the child cache is not yet guaranteed to be
- * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
- * passed and the page allocator will not attempt any cgroup accounting.
+ * All memory allocated from a per-memcg cache is charged to the owner memcg.
*/
static __always_inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
@@ -667,6 +660,15 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
return cachep;
}
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
#endif /* CONFIG_MEMCG_KMEM */
#endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 81a162d01d4d..9bbc088e3107 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3506,6 +3506,21 @@ out:
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
+int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+ if (is_root_cache(s))
+ return 0;
+ return memcg_charge_kmem(s->memcg_params->memcg, gfp,
+ PAGE_SIZE << order);
+}
+
+void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+ if (is_root_cache(s))
+ return;
+ memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order);
+}
+
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
diff --git a/mm/slab.c b/mm/slab.c
index eebc619ae33c..af126a37dafd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1664,8 +1664,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
+ if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+ return NULL;
+
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
+ memcg_uncharge_slab(cachep, cachep->gfporder);
if (!(flags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
@@ -1724,7 +1728,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_memcg_kmem_pages(page, cachep->gfporder);
+ __free_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(cachep, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f3cfccf76dda..6673597ac967 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
root_cache->size, root_cache->align,
root_cache->flags, root_cache->ctor,
memcg, root_cache);
- if (IS_ERR(s)) {
+ if (IS_ERR(s))
kfree(cache_name);
- goto out_unlock;
- }
-
- s->allocflags |= __GFP_KMEMCG;
out_unlock:
mutex_unlock(&slab_mutex);
diff --git a/mm/slub.c b/mm/slub.c
index c2e58a787443..6fefe3b33ce0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1317,17 +1317,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(gfp_t flags, int node,
- struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+ gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
+ struct page *page;
int order = oo_order(oo);
flags |= __GFP_NOTRACK;
+ if (memcg_charge_slab(s, flags, order))
+ return NULL;
+
if (node == NUMA_NO_NODE)
- return alloc_pages(flags, order);
+ page = alloc_pages(flags, order);
else
- return alloc_pages_exact_node(node, flags, order);
+ page = alloc_pages_exact_node(node, flags, order);
+
+ if (!page)
+ memcg_uncharge_slab(s, order);
+
+ return page;
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1349,7 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- page = alloc_slab_page(alloc_gfp, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
oo = s->min;
alloc_gfp = flags;
@@ -1357,7 +1366,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(alloc_gfp, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (page)
stat(s, ORDER_FALLBACK);
@@ -1473,7 +1482,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_memcg_kmem_pages(page, order);
+ __free_pages(page, order);
+ memcg_uncharge_slab(s, order);
}
#define need_reserve_slab_rcu \
--
1.7.10.4
* Re: [PATCH -mm 2/4] sl[au]b: charge slabs to memcg explicitly
From: Michal Hocko @ 2014-03-26 21:58 UTC
To: Vladimir Davydov
Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Wed 26-03-14 19:28:05, Vladimir Davydov wrote:
> We have only a few places where we actually want to charge kmem, so
> instead of intruding into the general page allocation path with
> __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
> charges will be easier to follow that way.
>
> This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG
> from memcg caches' allocflags. Instead, it makes the slab allocation
> path call memcg_charge_kmem directly, getting the memcg to charge from
> the cache's memcg params.
Yes, removing __GFP_KMEMCG is definitely a good step. I am currently at
a conference and do not have much time to review this properly (even
worse, I will be on vacation for the next 2 weeks), but where did all
the static_key optimization go? What am I missing?
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Glauber Costa <glommer@gmail.com>
> Cc: Christoph Lameter <cl@linux-foundation.org>
> Cc: Pekka Enberg <penberg@kernel.org>
> ---
> include/linux/memcontrol.h | 24 +++++++++++++-----------
> mm/memcontrol.c | 15 +++++++++++++++
> mm/slab.c | 7 ++++++-
> mm/slab_common.c | 6 +-----
> mm/slub.c | 24 +++++++++++++++++-------
> 5 files changed, 52 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e9dfcdad24c5..b8aaecc25cbf 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -512,6 +512,9 @@ void memcg_update_array_size(int num_groups);
> struct kmem_cache *
> __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
>
> +int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order);
> +void memcg_uncharge_slab(struct kmem_cache *s, int order);
> +
> void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
> int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
>
> @@ -589,17 +592,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
> * @cachep: the original global kmem cache
> * @gfp: allocation flags.
> *
> - * This function assumes that the task allocating, which determines the memcg
> - * in the page allocator, belongs to the same cgroup throughout the whole
> - * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
> - * while belonging to a cgroup, and later on changes. This is considered
> - * acceptable, and should only happen upon task migration.
> - *
> - * Before the cache is created by the memcg core, there is also a possible
> - * imbalance: the task belongs to a memcg, but the cache being allocated from
> - * is the global cache, since the child cache is not yet guaranteed to be
> - * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
> - * passed and the page allocator will not attempt any cgroup accounting.
> + * All memory allocated from a per-memcg cache is charged to the owner memcg.
> */
> static __always_inline struct kmem_cache *
> memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
> @@ -667,6 +660,15 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
> {
> return cachep;
> }
> +
> +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
> +{
> + return 0;
> +}
> +
> +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
> +{
> +}
> #endif /* CONFIG_MEMCG_KMEM */
> #endif /* _LINUX_MEMCONTROL_H */
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 81a162d01d4d..9bbc088e3107 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3506,6 +3506,21 @@ out:
> }
> EXPORT_SYMBOL(__memcg_kmem_get_cache);
>
> +int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
> +{
> + if (is_root_cache(s))
> + return 0;
> + return memcg_charge_kmem(s->memcg_params->memcg, gfp,
> + PAGE_SIZE << order);
> +}
> +
> +void memcg_uncharge_slab(struct kmem_cache *s, int order)
> +{
> + if (is_root_cache(s))
> + return;
> + memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order);
> +}
> +
> /*
> * We need to verify if the allocation against current->mm->owner's memcg is
> * possible for the given order. But the page is not allocated yet, so we'll
> diff --git a/mm/slab.c b/mm/slab.c
> index eebc619ae33c..af126a37dafd 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -1664,8 +1664,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
> if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
> flags |= __GFP_RECLAIMABLE;
>
> + if (memcg_charge_slab(cachep, flags, cachep->gfporder))
> + return NULL;
> +
> page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
> if (!page) {
> + memcg_uncharge_slab(cachep, cachep->gfporder);
> if (!(flags & __GFP_NOWARN) && printk_ratelimit())
> slab_out_of_memory(cachep, flags, nodeid);
> return NULL;
> @@ -1724,7 +1728,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
> memcg_release_pages(cachep, cachep->gfporder);
> if (current->reclaim_state)
> current->reclaim_state->reclaimed_slab += nr_freed;
> - __free_memcg_kmem_pages(page, cachep->gfporder);
> + __free_pages(page, cachep->gfporder);
> + memcg_uncharge_slab(cachep, cachep->gfporder);
> }
>
> static void kmem_rcu_free(struct rcu_head *head)
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index f3cfccf76dda..6673597ac967 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
> root_cache->size, root_cache->align,
> root_cache->flags, root_cache->ctor,
> memcg, root_cache);
> - if (IS_ERR(s)) {
> + if (IS_ERR(s))
> kfree(cache_name);
> - goto out_unlock;
> - }
> -
> - s->allocflags |= __GFP_KMEMCG;
>
> out_unlock:
> mutex_unlock(&slab_mutex);
> diff --git a/mm/slub.c b/mm/slub.c
> index c2e58a787443..6fefe3b33ce0 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1317,17 +1317,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
> /*
> * Slab allocation and freeing
> */
> -static inline struct page *alloc_slab_page(gfp_t flags, int node,
> - struct kmem_cache_order_objects oo)
> +static inline struct page *alloc_slab_page(struct kmem_cache *s,
> + gfp_t flags, int node, struct kmem_cache_order_objects oo)
> {
> + struct page *page;
> int order = oo_order(oo);
>
> flags |= __GFP_NOTRACK;
>
> + if (memcg_charge_slab(s, flags, order))
> + return NULL;
> +
> if (node == NUMA_NO_NODE)
> - return alloc_pages(flags, order);
> + page = alloc_pages(flags, order);
> else
> - return alloc_pages_exact_node(node, flags, order);
> + page = alloc_pages_exact_node(node, flags, order);
> +
> + if (!page)
> + memcg_uncharge_slab(s, order);
> +
> + return page;
> }
>
> static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> @@ -1349,7 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> */
> alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
>
> - page = alloc_slab_page(alloc_gfp, node, oo);
> + page = alloc_slab_page(s, alloc_gfp, node, oo);
> if (unlikely(!page)) {
> oo = s->min;
> alloc_gfp = flags;
> @@ -1357,7 +1366,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> * Allocation may have failed due to fragmentation.
> * Try a lower order alloc if possible
> */
> - page = alloc_slab_page(alloc_gfp, node, oo);
> + page = alloc_slab_page(s, alloc_gfp, node, oo);
>
> if (page)
> stat(s, ORDER_FALLBACK);
> @@ -1473,7 +1482,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
> page_mapcount_reset(page);
> if (current->reclaim_state)
> current->reclaim_state->reclaimed_slab += pages;
> - __free_memcg_kmem_pages(page, order);
> + __free_pages(page, order);
> + memcg_uncharge_slab(s, order);
> }
>
> #define need_reserve_slab_rcu \
> --
> 1.7.10.4
>
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm 2/4] sl[au]b: charge slabs to memcg explicitly
From: Vladimir Davydov @ 2014-03-27 7:38 UTC
To: Michal Hocko
Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On 03/27/2014 01:58 AM, Michal Hocko wrote:
> On Wed 26-03-14 19:28:05, Vladimir Davydov wrote:
>> We have only a few places where we actually want to charge kmem, so
>> instead of intruding into the general page allocation path with
>> __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
>> charges will be easier to follow that way.
>>
>> This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG
>> from memcg caches' allocflags. Instead, it makes the slab allocation
>> path call memcg_charge_kmem directly, getting the memcg to charge from
>> the cache's memcg params.
> Yes, removing __GFP_KMEMCG is definitely a good step. I am currently at
> a conference and do not have much time to review this properly (even
> worse, I will be on vacation for the next 2 weeks), but where did all
> the static_key optimization go? What am I missing?
I expected this question, because I want somebody to confirm whether we
really need this kind of optimization in the slab allocation path. From
my POV, since we thrash cpu caches there anyway by calling alloc_pages,
wrapping memcg_charge_slab in a static branch wouldn't result in any
noticeable performance boost.
I do admit we benefit from static branching in memcg_kmem_get_cache,
because that one is called on every kmem object allocation, but slab
allocations happen much more rarely.
I don't insist on that though, so if you say "no", I'll just add
__memcg_charge_slab and make memcg_charge_slab call it if the static key
is on, but maybe we can avoid such code bloat?
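For reference, the static branch variant would look something like this
(a sketch only; memcg_kmem_enabled() is the existing static_key test,
and __memcg_charge_slab is the out-of-line helper mentioned above):

static __always_inline int memcg_charge_slab(struct kmem_cache *s,
                                             gfp_t gfp, int order)
{
        /* Compiles down to a patched no-op when kmemcg is not in use. */
        if (!memcg_kmem_enabled())
                return 0;
        return __memcg_charge_slab(s, gfp, order);
}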
Thanks.
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Glauber Costa <glommer@gmail.com>
>> Cc: Christoph Lameter <cl@linux-foundation.org>
>> Cc: Pekka Enberg <penberg@kernel.org>
>> ---
>> include/linux/memcontrol.h | 24 +++++++++++++-----------
>> mm/memcontrol.c | 15 +++++++++++++++
>> mm/slab.c | 7 ++++++-
>> mm/slab_common.c | 6 +-----
>> mm/slub.c | 24 +++++++++++++++++-------
>> 5 files changed, 52 insertions(+), 24 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index e9dfcdad24c5..b8aaecc25cbf 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -512,6 +512,9 @@ void memcg_update_array_size(int num_groups);
>> struct kmem_cache *
>> __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
>>
>> +int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order);
>> +void memcg_uncharge_slab(struct kmem_cache *s, int order);
>> +
>> void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
>> int __kmem_cache_destroy_memcg_children(struct kmem_cache *s);
>>
>> @@ -589,17 +592,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
>> * @cachep: the original global kmem cache
>> * @gfp: allocation flags.
>> *
>> - * This function assumes that the task allocating, which determines the memcg
>> - * in the page allocator, belongs to the same cgroup throughout the whole
>> - * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
>> - * while belonging to a cgroup, and later on changes. This is considered
>> - * acceptable, and should only happen upon task migration.
>> - *
>> - * Before the cache is created by the memcg core, there is also a possible
>> - * imbalance: the task belongs to a memcg, but the cache being allocated from
>> - * is the global cache, since the child cache is not yet guaranteed to be
>> - * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
>> - * passed and the page allocator will not attempt any cgroup accounting.
>> + * All memory allocated from a per-memcg cache is charged to the owner memcg.
>> */
>> static __always_inline struct kmem_cache *
>> memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
>> @@ -667,6 +660,15 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
>> {
>> return cachep;
>> }
>> +
>> +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
>> +{
>> + return 0;
>> +}
>> +
>> +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
>> +{
>> +}
>> #endif /* CONFIG_MEMCG_KMEM */
>> #endif /* _LINUX_MEMCONTROL_H */
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 81a162d01d4d..9bbc088e3107 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3506,6 +3506,21 @@ out:
>> }
>> EXPORT_SYMBOL(__memcg_kmem_get_cache);
>>
>> +int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
>> +{
>> + if (is_root_cache(s))
>> + return 0;
>> + return memcg_charge_kmem(s->memcg_params->memcg, gfp,
>> + PAGE_SIZE << order);
>> +}
>> +
>> +void memcg_uncharge_slab(struct kmem_cache *s, int order)
>> +{
>> + if (is_root_cache(s))
>> + return;
>> + memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order);
>> +}
>> +
>> /*
>> * We need to verify if the allocation against current->mm->owner's memcg is
>> * possible for the given order. But the page is not allocated yet, so we'll
>> diff --git a/mm/slab.c b/mm/slab.c
>> index eebc619ae33c..af126a37dafd 100644
>> --- a/mm/slab.c
>> +++ b/mm/slab.c
>> @@ -1664,8 +1664,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
>> if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
>> flags |= __GFP_RECLAIMABLE;
>>
>> + if (memcg_charge_slab(cachep, flags, cachep->gfporder))
>> + return NULL;
>> +
>> page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
>> if (!page) {
>> + memcg_uncharge_slab(cachep, cachep->gfporder);
>> if (!(flags & __GFP_NOWARN) && printk_ratelimit())
>> slab_out_of_memory(cachep, flags, nodeid);
>> return NULL;
>> @@ -1724,7 +1728,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
>> memcg_release_pages(cachep, cachep->gfporder);
>> if (current->reclaim_state)
>> current->reclaim_state->reclaimed_slab += nr_freed;
>> - __free_memcg_kmem_pages(page, cachep->gfporder);
>> + __free_pages(page, cachep->gfporder);
>> + memcg_uncharge_slab(cachep, cachep->gfporder);
>> }
>>
>> static void kmem_rcu_free(struct rcu_head *head)
>> diff --git a/mm/slab_common.c b/mm/slab_common.c
>> index f3cfccf76dda..6673597ac967 100644
>> --- a/mm/slab_common.c
>> +++ b/mm/slab_common.c
>> @@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
>> root_cache->size, root_cache->align,
>> root_cache->flags, root_cache->ctor,
>> memcg, root_cache);
>> - if (IS_ERR(s)) {
>> + if (IS_ERR(s))
>> kfree(cache_name);
>> - goto out_unlock;
>> - }
>> -
>> - s->allocflags |= __GFP_KMEMCG;
>>
>> out_unlock:
>> mutex_unlock(&slab_mutex);
>> diff --git a/mm/slub.c b/mm/slub.c
>> index c2e58a787443..6fefe3b33ce0 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1317,17 +1317,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
>> /*
>> * Slab allocation and freeing
>> */
>> -static inline struct page *alloc_slab_page(gfp_t flags, int node,
>> - struct kmem_cache_order_objects oo)
>> +static inline struct page *alloc_slab_page(struct kmem_cache *s,
>> + gfp_t flags, int node, struct kmem_cache_order_objects oo)
>> {
>> + struct page *page;
>> int order = oo_order(oo);
>>
>> flags |= __GFP_NOTRACK;
>>
>> + if (memcg_charge_slab(s, flags, order))
>> + return NULL;
>> +
>> if (node == NUMA_NO_NODE)
>> - return alloc_pages(flags, order);
>> + page = alloc_pages(flags, order);
>> else
>> - return alloc_pages_exact_node(node, flags, order);
>> + page = alloc_pages_exact_node(node, flags, order);
>> +
>> + if (!page)
>> + memcg_uncharge_slab(s, order);
>> +
>> + return page;
>> }
>>
>> static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> @@ -1349,7 +1358,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> */
>> alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
>>
>> - page = alloc_slab_page(alloc_gfp, node, oo);
>> + page = alloc_slab_page(s, alloc_gfp, node, oo);
>> if (unlikely(!page)) {
>> oo = s->min;
>> alloc_gfp = flags;
>> @@ -1357,7 +1366,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> * Allocation may have failed due to fragmentation.
>> * Try a lower order alloc if possible
>> */
>> - page = alloc_slab_page(alloc_gfp, node, oo);
>> + page = alloc_slab_page(s, alloc_gfp, node, oo);
>>
>> if (page)
>> stat(s, ORDER_FALLBACK);
>> @@ -1473,7 +1482,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
>> page_mapcount_reset(page);
>> if (current->reclaim_state)
>> current->reclaim_state->reclaimed_slab += pages;
>> - __free_memcg_kmem_pages(page, order);
>> + __free_pages(page, order);
>> + memcg_uncharge_slab(s, order);
>> }
>>
>> #define need_reserve_slab_rcu \
>> --
>> 1.7.10.4
>>
* Re: [PATCH -mm 2/4] sl[au]b: charge slabs to memcg explicitly
2014-03-27 7:38 ` Vladimir Davydov
@ 2014-03-27 20:38 ` Michal Hocko
0 siblings, 0 replies; 19+ messages in thread
From: Michal Hocko @ 2014-03-27 20:38 UTC (permalink / raw)
To: Vladimir Davydov
Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel,
Christoph Lameter, Pekka Enberg
On Thu 27-03-14 11:38:30, Vladimir Davydov wrote:
> On 03/27/2014 01:58 AM, Michal Hocko wrote:
> > On Wed 26-03-14 19:28:05, Vladimir Davydov wrote:
> >> We have only a few places where we actually want to charge kmem so
> >> instead of intruding into the general page allocation path with
>> __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
> >> charges will be easier to follow that way.
> >>
> >> This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG
>> from memcg caches' allocflags. Instead it makes the slab allocation
>> path call memcg_charge_kmem directly, getting the memcg to charge
>> from the cache's memcg params.
> > Yes, removing __GFP_KMEMCG is definitely a good step. I am currently at
> > a conference and do not have much time to review this properly (even
> > worse, I will be on vacation for the next 2 weeks), but where did all
> > the static_key optimization go? What am I missing?
>
> I expected this question, because I want somebody to confirm whether we
> really need this kind of optimization in the slab allocation path. From
> my POV, since we thrash cpu caches there anyway by calling alloc_pages,
> wrapping memcg_charge_slab in a static branch wouldn't result in any
> noticeable performance boost.
>
> I do admit we benefit from static branching in memcg_kmem_get_cache,
> because this one is called on every kmem object allocation, but slab
> allocations happen far less often.
>
> I don't insist on that, though, so if you say "no", I'll just add
> __memcg_charge_slab and make memcg_charge_slab call it if the static key
> is on, but maybe we can avoid such code bloat?
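For reference, a minimal sketch of that fallback, assuming the existing
memcg_kmem_enabled() static-key test; __memcg_charge_slab is the
hypothetical out-of-line helper named above, not a function from the
posted series:

static __always_inline int memcg_charge_slab(struct kmem_cache *s,
					     gfp_t gfp, int order)
{
	/* static branch: patched to a no-op while kmem accounting is off */
	if (!memcg_kmem_enabled())
		return 0;
	/* hypothetical out-of-line slow path doing the actual charge */
	return __memcg_charge_slab(s, gfp, order);
}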
I definitely do not insist on static branching at places where it
doesn't help much. The less tricky code we have, the better. Please
document this in the changelog and drop a comment in memcg_charge_slab
explaining why we do not have to check whether kmem accounting is enabled.
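Such a comment could look roughly like this (only a sketch, not taken
from the posted series; the body is memcg_charge_slab as added by this
patch):

/*
 * No memcg_kmem_enabled() static-key check is needed here: slab pages
 * are allocated far more rarely than individual objects, and the charge
 * is skipped for root caches anyway, which are the only caches that
 * exist while kmem accounting is disabled.
 */
int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
{
	if (is_root_cache(s))
		return 0;
	return memcg_charge_kmem(s->memcg_params->memcg, gfp,
				 PAGE_SIZE << order);
}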
Thanks!
--
Michal Hocko
SUSE Labs
* [PATCH -mm 3/4] fork: charge threadinfo to memcg explicitly
2014-03-26 15:28 [PATCH -mm 0/4] kmemcg: get rid of __GFP_KMEMCG Vladimir Davydov
2014-03-26 15:28 ` [PATCH -mm 1/4] sl[au]b: do not charge large allocations to memcg Vladimir Davydov
2014-03-26 15:28 ` [PATCH -mm 2/4] sl[au]b: charge slabs to memcg explicitly Vladimir Davydov
@ 2014-03-26 15:28 ` Vladimir Davydov
2014-03-26 22:00 ` Michal Hocko
2014-03-26 15:28 ` [PATCH -mm 4/4] mm: kill __GFP_KMEMCG Vladimir Davydov
3 siblings, 1 reply; 19+ messages in thread
From: Vladimir Davydov @ 2014-03-26 15:28 UTC (permalink / raw)
To: akpm; +Cc: hannes, mhocko, glommer, linux-kernel, linux-mm, devel
We have only a few places where we actually want to charge kmem so
instead of intruding into the general page allocation path with
__GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
charges will be easier to follow that way.
This is a step toward removing __GFP_KMEMCG. It makes fork charge task
threadinfo pages explicitly instead of passing __GFP_KMEMCG to
alloc_pages.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
---
kernel/fork.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index f4b09bc15f3a..8209780cf732 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,22 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
- THREAD_SIZE_ORDER);
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+ if (!memcg_kmem_newpage_charge(THREADINFO_GFP_ACCOUNTED, &memcg,
+ THREAD_SIZE_ORDER))
+ return NULL;
+ page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
+ memcg_kmem_commit_charge(page, memcg, THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ if (ti)
+ memcg_kmem_uncharge_pages(virt_to_page(ti), THREAD_SIZE_ORDER);
+ free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
--
1.7.10.4
* Re: [PATCH -mm 3/4] fork: charge threadinfo to memcg explicitly
2014-03-26 15:28 ` [PATCH -mm 3/4] fork: charge threadinfo " Vladimir Davydov
@ 2014-03-26 22:00 ` Michal Hocko
2014-03-27 7:39 ` Vladimir Davydov
0 siblings, 1 reply; 19+ messages in thread
From: Michal Hocko @ 2014-03-26 22:00 UTC (permalink / raw)
To: Vladimir Davydov; +Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel
On Wed 26-03-14 19:28:06, Vladimir Davydov wrote:
> We have only a few places where we actually want to charge kmem so
> instead of intruding into the general page allocation path with
> __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
> charges will be easier to follow that way.
>
> This is a step toward removing __GFP_KMEMCG. It makes fork charge task
> threadinfo pages explicitly instead of passing __GFP_KMEMCG to
> alloc_pages.
Looks good from a quick glance. I would also remove
THREADINFO_GFP_ACCOUNTED in this patch.
> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Michal Hocko <mhocko@suse.cz>
> Cc: Glauber Costa <glommer@gmail.com>
> ---
> kernel/fork.c | 13 ++++++++++---
> 1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f4b09bc15f3a..8209780cf732 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -150,15 +150,22 @@ void __weak arch_release_thread_info(struct thread_info *ti)
> static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
> int node)
> {
> - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
> - THREAD_SIZE_ORDER);
> + struct page *page;
> + struct mem_cgroup *memcg = NULL;
>
> + if (!memcg_kmem_newpage_charge(THREADINFO_GFP_ACCOUNTED, &memcg,
> + THREAD_SIZE_ORDER))
> + return NULL;
> + page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
> + memcg_kmem_commit_charge(page, memcg, THREAD_SIZE_ORDER);
> return page ? page_address(page) : NULL;
> }
>
> static inline void free_thread_info(struct thread_info *ti)
> {
> - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
> + if (ti)
> + memcg_kmem_uncharge_pages(virt_to_page(ti), THREAD_SIZE_ORDER);
> + free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
> }
> # else
> static struct kmem_cache *thread_info_cache;
> --
> 1.7.10.4
>
--
Michal Hocko
SUSE Labs
* Re: [PATCH -mm 3/4] fork: charge threadinfo to memcg explicitly
2014-03-26 22:00 ` Michal Hocko
@ 2014-03-27 7:39 ` Vladimir Davydov
0 siblings, 0 replies; 19+ messages in thread
From: Vladimir Davydov @ 2014-03-27 7:39 UTC (permalink / raw)
To: Michal Hocko; +Cc: akpm, hannes, glommer, linux-kernel, linux-mm, devel
On 03/27/2014 02:00 AM, Michal Hocko wrote:
> On Wed 26-03-14 19:28:06, Vladimir Davydov wrote:
>> We have only a few places where we actually want to charge kmem so
>> instead of intruding into the general page allocation path with
>> __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem
>> charges will be easier to follow that way.
>>
>> This is a step toward removing __GFP_KMEMCG. It makes fork charge task
>> threadinfo pages explicitly instead of passing __GFP_KMEMCG to
>> alloc_pages.
> Looks good from a quick glance. I would also remove
> THREADINFO_GFP_ACCOUNTED in this patch.
To do so, I'd have to remove the __GFP_KMEMCG check from
memcg_kmem_newpage_charge, which is better done in the next patch,
where __GFP_KMEMCG is removed everywhere, IMO.
Thanks.
>> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
>> Cc: Johannes Weiner <hannes@cmpxchg.org>
>> Cc: Michal Hocko <mhocko@suse.cz>
>> Cc: Glauber Costa <glommer@gmail.com>
>> ---
>> kernel/fork.c | 13 ++++++++++---
>> 1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index f4b09bc15f3a..8209780cf732 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -150,15 +150,22 @@ void __weak arch_release_thread_info(struct thread_info *ti)
>> static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
>> int node)
>> {
>> - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
>> - THREAD_SIZE_ORDER);
>> + struct page *page;
>> + struct mem_cgroup *memcg = NULL;
>>
>> + if (!memcg_kmem_newpage_charge(THREADINFO_GFP_ACCOUNTED, &memcg,
>> + THREAD_SIZE_ORDER))
>> + return NULL;
>> + page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
>> + memcg_kmem_commit_charge(page, memcg, THREAD_SIZE_ORDER);
>> return page ? page_address(page) : NULL;
>> }
>>
>> static inline void free_thread_info(struct thread_info *ti)
>> {
>> - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
>> + if (ti)
>> + memcg_kmem_uncharge_pages(virt_to_page(ti), THREAD_SIZE_ORDER);
>> + free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
>> }
>> # else
>> static struct kmem_cache *thread_info_cache;
>> --
>> 1.7.10.4
>>
* [PATCH -mm 4/4] mm: kill __GFP_KMEMCG
2014-03-26 15:28 [PATCH -mm 0/4] kmemcg: get rid of __GFP_KMEMCG Vladimir Davydov
` (2 preceding siblings ...)
2014-03-26 15:28 ` [PATCH -mm 3/4] fork: charge threadinfo " Vladimir Davydov
@ 2014-03-26 15:28 ` Vladimir Davydov
3 siblings, 0 replies; 19+ messages in thread
From: Vladimir Davydov @ 2014-03-26 15:28 UTC (permalink / raw)
To: akpm; +Cc: hannes, mhocko, glommer, linux-kernel, linux-mm, devel
All kmem is now charged to memcg explicitly, and __GFP_KMEMCG is not
used anywhere, so just get rid of it.
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Glauber Costa <glommer@gmail.com>
---
include/linux/gfp.h | 5 -----
include/linux/memcontrol.h | 2 +-
include/linux/thread_info.h | 2 --
include/trace/events/gfpflags.h | 1 -
kernel/fork.c | 2 +-
mm/page_alloc.c | 35 -----------------------------------
6 files changed, 2 insertions(+), 45 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 39b81dc7d01a..e37b662cd869 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,7 +31,6 @@ struct vm_area_struct;
#define ___GFP_HARDWALL 0x20000u
#define ___GFP_THISNODE 0x40000u
#define ___GFP_RECLAIMABLE 0x80000u
-#define ___GFP_KMEMCG 0x100000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_NO_KSWAPD 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
@@ -91,7 +90,6 @@ struct vm_area_struct;
#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */
/*
@@ -372,9 +370,6 @@ extern void free_pages(unsigned long addr, unsigned int order);
extern void free_hot_cold_page(struct page *page, int cold);
extern void free_hot_cold_page_list(struct list_head *list, int cold);
-extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
-extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
-
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b8aaecc25cbf..c709f1d30bd5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -543,7 +543,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
* res_counter_charge_nofail, but we hope those allocations are rare,
* and won't be worth the trouble.
*/
- if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+ if (gfp & __GFP_NOFAIL)
return true;
if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
return true;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index fddbe2023a5d..1807bb194816 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -61,8 +61,6 @@ extern long do_no_restart_syscall(struct restart_block *parm);
# define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK)
#endif
-#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
-
/*
* flag set/clear/test wrappers
* - pass TIF_xxxx constants to these functions
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 1eddbf1557f2..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
{(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \
{(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \
{(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \
- {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \
{(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \
{(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \
{(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \
diff --git a/kernel/fork.c b/kernel/fork.c
index 8209780cf732..043658430e04 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -153,7 +153,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
struct page *page;
struct mem_cgroup *memcg = NULL;
- if (!memcg_kmem_newpage_charge(THREADINFO_GFP_ACCOUNTED, &memcg,
+ if (!memcg_kmem_newpage_charge(THREADINFO_GFP, &memcg,
THREAD_SIZE_ORDER))
return NULL;
page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0327f9d5a8c0..80cce64d30d3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2723,7 +2723,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2742,13 +2741,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- /*
- * Will only have any effect when __GFP_KMEMCG is set. This is
- * verified in the (always inline) callee
- */
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
-
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2810,8 +2802,6 @@ out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
- memcg_kmem_commit_charge(page, memcg, order);
-
if (page)
set_page_owner(page, order, gfp_mask);
@@ -2867,31 +2857,6 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
-/*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
- *
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
- *
- * The caller knows better which flags it relies on.
- */
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
-{
- memcg_kmem_uncharge_pages(page, order);
- __free_pages(page, order);
-}
-
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
-{
- if (addr != 0) {
- VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
- }
-}
-
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
if (addr) {
--
1.7.10.4