From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail143.messagelabs.com (mail143.messagelabs.com [216.82.254.35]) by kanga.kvack.org (Postfix) with SMTP id C9B1D6B007D for ; Thu, 21 Jan 2010 11:09:07 -0500 (EST) Date: Thu, 21 Jan 2010 17:08:07 +0100 From: Andrea Arcangeli Subject: Re: [PATCH 28 of 30] memcg huge memory Message-ID: <20100121160807.GB5598@random.random> References: <4c405faf58cfe5d1aa6e.1264054852@v2.random> <20100121161601.6612fd79.kamezawa.hiroyu@jp.fujitsu.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20100121161601.6612fd79.kamezawa.hiroyu@jp.fujitsu.com> Sender: owner-linux-mm@kvack.org To: KAMEZAWA Hiroyuki Cc: linux-mm@kvack.org, Marcelo Tosatti , Adam Litke , Avi Kivity , Izik Eidus , Hugh Dickins , Nick Piggin , Rik van Riel , Mel Gorman , Andi Kleen , Dave Hansen , Benjamin Herrenschmidt , Ingo Molnar , Mike Travis , Christoph Lameter , Chris Wright , Andrew Morton List-ID: > > @@ -228,6 +229,7 @@ static int __do_huge_pmd_anonymous_page( > > > > spin_lock(&mm->page_table_lock); > > if (unlikely(!pmd_none(*pmd))) { > > + mem_cgroup_uncharge_page(page); > > put_page(page); > > pte_free(mm, pgtable); > On Thu, Jan 21, 2010 at 04:16:01PM +0900, KAMEZAWA Hiroyuki wrote: > Can't we do this put_page() and uncharge() outside of page table lock ? Yes we can, but it's only a microoptimization because this only triggers during a controlled race condition across different threads. But no problem to optimize it... diff --git a/mm/huge_memory.c b/mm/huge_memory.c --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -228,6 +228,7 @@ static int __do_huge_pmd_anonymous_page( spin_lock(&mm->page_table_lock); if (unlikely(!pmd_none(*pmd))) { + spin_unlock(&mm->page_table_lock); put_page(page); pte_free(mm, pgtable); } else { @@ -238,8 +239,8 @@ static int __do_huge_pmd_anonymous_page( page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); prepare_pmd_huge_pte(pgtable, mm); + spin_unlock(&mm->page_table_lock); } - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -230,6 +230,7 @@ static int __do_huge_pmd_anonymous_page( spin_lock(&mm->page_table_lock); if (unlikely(!pmd_none(*pmd))) { spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); } else { Also below I appended the update memcg_compound to stop using the batch system. Also note your "likely" I removed it because for KVM most of the time it'll be TransHugePages to be charged. I prefer likely/unlikely when it's always a slow path no matter what workload (assuming useful/optimized workloads only ;). Like said in earlier email I guess the below may be wasted time because of the rework coming on the file. Also note the TransHugePage check here it's used instead of page_size == PAGE_SIZE to eliminate that additional branch at compile time if TRANSPARENT_HUGEPAGE=n. Now the only real pain remains in the LRU list accounting, I tried to solve it but found no clean way that didn't require mess all over vmscan.c. So for now hugepages in lru are accounted as 4k pages ;). Nothing breaks just stats won't be as useful to the admin... Subject: memcg compound From: Andrea Arcangeli Teach memcg to charge/uncharge compound pages. Signed-off-by: Andrea Arcangeli --- diff --git a/mm/memcontrol.c b/mm/memcontrol.c --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1401,8 +1401,8 @@ static int __cpuinit memcg_stock_cpu_cal * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom, struct page *page) + gfp_t gfp_mask, struct mem_cgroup **memcg, + bool oom, struct page *page, int page_size) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -1415,6 +1415,9 @@ static int __mem_cgroup_try_charge(struc return 0; } + if (PageTransHuge(page)) + csize = page_size; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the @@ -1439,8 +1442,9 @@ static int __mem_cgroup_try_charge(struc int ret = 0; unsigned long flags = 0; - if (consume_stock(mem)) - goto charged; + if (!PageTransHuge(page)) + if (consume_stock(mem)) + goto charged; ret = res_counter_charge(&mem->res, csize, &fail_res); if (likely(!ret)) { @@ -1460,7 +1464,7 @@ static int __mem_cgroup_try_charge(struc res); /* reduce request size and retry */ - if (csize > PAGE_SIZE) { + if (csize > page_size) { csize = PAGE_SIZE; continue; } @@ -1491,7 +1495,7 @@ static int __mem_cgroup_try_charge(struc goto nomem; } } - if (csize > PAGE_SIZE) + if (csize > page_size) refill_stock(mem, csize - PAGE_SIZE); charged: /* @@ -1512,12 +1516,12 @@ nomem: * This function is for that and do uncharge, put css's refcnt. * gotten by try_charge(). */ -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, int page_size) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); } css_put(&mem->css); } @@ -1575,8 +1579,9 @@ struct mem_cgroup *try_get_mem_cgroup_fr */ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { /* try_charge() can return NULL to *memcg, taking care of it. */ if (!mem) @@ -1585,7 +1590,7 @@ static void __mem_cgroup_commit_charge(s lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } @@ -1722,7 +1727,8 @@ static int mem_cgroup_move_parent(struct goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page, + PAGE_SIZE); if (ret || !parent) goto put_back; @@ -1730,7 +1736,7 @@ static int mem_cgroup_move_parent(struct if (!ret) css_put(&parent->css); /* drop extra refcnt by try_charge() */ else - mem_cgroup_cancel_charge(parent); /* does css_put */ + mem_cgroup_cancel_charge(parent, PAGE_SIZE); /* does css_put */ put_back: putback_lru_page(page); put: @@ -1752,6 +1758,10 @@ static int mem_cgroup_charge_common(stru struct mem_cgroup *mem; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -1760,11 +1770,12 @@ static int mem_cgroup_charge_common(stru prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page, + page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -1773,8 +1784,6 @@ int mem_cgroup_newpage_charge(struct pag { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -1787,7 +1796,7 @@ int mem_cgroup_newpage_charge(struct pag if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } static void @@ -1880,14 +1889,14 @@ int mem_cgroup_try_charge_swapin(struct if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page, PAGE_SIZE); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true, page); + return __mem_cgroup_try_charge(mm, mask, ptr, true, page, PAGE_SIZE); } static void @@ -1903,7 +1912,7 @@ __mem_cgroup_commit_charge_swapin(struct cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -1952,11 +1961,12 @@ void mem_cgroup_cancel_charge_swapin(str return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -1989,14 +1999,14 @@ __do_uncharge(struct mem_cgroup *mem, co if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->bytes += page_size; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_bytes += page_size; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); return; } @@ -2009,6 +2019,10 @@ __mem_cgroup_uncharge_common(struct page struct page_cgroup *pc; struct mem_cgroup *mem = NULL; struct mem_cgroup_per_zone *mz; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2048,7 +2062,7 @@ __mem_cgroup_uncharge_common(struct page } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); @@ -2217,7 +2231,7 @@ int mem_cgroup_prepare_migration(struct if (mem) { ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, - page); + page, PAGE_SIZE); css_put(&mem->css); } *ptr = mem; @@ -2260,7 +2274,7 @@ void mem_cgroup_end_migration(struct mem * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. * So, double-counting is effectively avoided. */ - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); /* * Both of oldpage and newpage are still under lock_page(). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org