From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from foss.arm.com (foss.arm.com [217.140.110.172]) by smtp.subspace.kernel.org (Postfix) with ESMTP id 7610219AD8D for ; Fri, 10 Jan 2025 09:06:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=217.140.110.172 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736499965; cv=none; b=DYgDHAyEHw798Z8fzRJZIaIvA+PFNx8QY0dT0zaXVhxGR5b9NqNdV+SolTfYgHgc9OnGxJZuSrNL87Kn1RASa7tKk7cK7V8WBnjx20JxrIfw0UbhMEcq26dpBw3A8wESnrLD5eKhVxuQx+Ow9x0FlQUukFZtp0J+RHLUz6qEowU= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1736499965; c=relaxed/simple; bh=LHzk6cjnKsDkOoRDc9k0YN32UANj1KFOjdgWs22h9HY=; h=Message-ID:Date:MIME-Version:Subject:To:Cc:References:From: In-Reply-To:Content-Type; b=h9ybGncFcuHsrifQV8LtXtRTWqp+beMreOeE/dnMznOWkJHhmIDxPw5euHAWHBBNR8GuDpF2x+aXlZHQPoza5iK1Hy63vkwTSgTV5PgBOirmvgxlkVm7Ln14BMDZIA8gtCYvtG/MaSqi9/OKwb7Fk2ztNd5JorqNGvuVUyIQx8c= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com; spf=pass smtp.mailfrom=arm.com; arc=none smtp.client-ip=217.140.110.172 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=arm.com Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=arm.com Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14]) by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id B30E312FC; Fri, 10 Jan 2025 01:06:30 -0800 (PST) Received: from [10.162.42.21] (K4MQJ0H1H2.blr.arm.com [10.162.42.21]) by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id A4B1A3F673; Fri, 10 Jan 2025 01:05:50 -0800 (PST) Message-ID: Date: Fri, 10 Jan 2025 14:35:47 +0530 Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 User-Agent: Mozilla Thunderbird Subject: Re: [RFC 08/11] khugepaged: introduce khugepaged_scan_bitmap for mTHP support To: Nico Pache , linux-kernel@vger.kernel.org, linux-mm@kvack.org Cc: ryan.roberts@arm.com, anshuman.khandual@arm.com, catalin.marinas@arm.com, cl@gentwo.org, vbabka@suse.cz, mhocko@suse.com, apopple@nvidia.com, dave.hansen@linux.intel.com, will@kernel.org, baohua@kernel.org, jack@suse.cz, srivatsa@csail.mit.edu, haowenchao22@gmail.com, hughd@google.com, aneesh.kumar@kernel.org, yang@os.amperecomputing.com, peterx@redhat.com, ioworker0@gmail.com, wangkefeng.wang@huawei.com, ziy@nvidia.com, jglisse@google.com, surenb@google.com, vishal.moola@gmail.com, zokeefe@google.com, zhengqi.arch@bytedance.com, jhubbard@nvidia.com, 21cnbao@gmail.com, willy@infradead.org, kirill.shutemov@linux.intel.com, david@redhat.com, aarcange@redhat.com, raquini@redhat.com, sunnanyong@huawei.com, usamaarif642@gmail.com, audra@redhat.com, akpm@linux-foundation.org References: <20250108233128.14484-1-npache@redhat.com> <20250108233128.14484-9-npache@redhat.com> Content-Language: en-US From: Dev Jain In-Reply-To: <20250108233128.14484-9-npache@redhat.com> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: 7bit On 09/01/25 5:01 am, Nico Pache wrote: > khugepaged scans PMD ranges for potential collapse to a hugepage. To add > mTHP support we use this scan to instead record chunks of fully utilized > sections of the PMD. > > create a bitmap to represent a PMD in order MTHP_MIN_ORDER chunks. > by default we will set this to order 3. The reasoning is that for 4K 512 > PMD size this results in a 64 bit bitmap which has some optimizations. > For other arches like ARM64 64K, we can set a larger order if needed. > > khugepaged_scan_bitmap uses a stack struct to recursively scan a bitmap > that represents chunks of fully utilized regions. We can then determine > what mTHP size fits best and in the following patch, we set this bitmap > while scanning the PMD. > > max_ptes_none is used as a scale to determine how "full" an order must > be before being considered for collapse. > > Signed-off-by: Nico Pache > --- > include/linux/khugepaged.h | 4 +- > mm/khugepaged.c | 129 +++++++++++++++++++++++++++++++++++-- > 2 files changed, 126 insertions(+), 7 deletions(-) > > diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h > index 1f46046080f5..31cff8aeec4a 100644 > --- a/include/linux/khugepaged.h > +++ b/include/linux/khugepaged.h > @@ -1,7 +1,9 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > #ifndef _LINUX_KHUGEPAGED_H > #define _LINUX_KHUGEPAGED_H > - Nit: I don't think this line needs to be deleted. > +#define MIN_MTHP_ORDER 3 > +#define MIN_MTHP_NR (1< +#define MTHP_BITMAP_SIZE (1<<(HPAGE_PMD_ORDER - MIN_MTHP_ORDER)) > extern unsigned int khugepaged_max_ptes_none __read_mostly; > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > extern struct attribute_group khugepaged_attr_group; > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 9eb161b04ee4..de1dc6ea3c71 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -94,6 +94,11 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); > > static struct kmem_cache *mm_slot_cache __ro_after_init; > > +struct scan_bit_state { > + u8 order; > + u8 offset; > +}; > + > struct collapse_control { > bool is_khugepaged; > > @@ -102,6 +107,15 @@ struct collapse_control { > > /* nodemask for allocation fallback */ > nodemask_t alloc_nmask; > + > + /* bitmap used to collapse mTHP sizes. 1bit = order MIN_MTHP_ORDER mTHP */ > + unsigned long *mthp_bitmap; > + unsigned long *mthp_bitmap_temp; > + struct scan_bit_state *mthp_bitmap_stack; > +}; > + > +struct collapse_control khugepaged_collapse_control = { > + .is_khugepaged = true, > }; > > /** > @@ -389,6 +403,25 @@ int __init khugepaged_init(void) > if (!mm_slot_cache) > return -ENOMEM; > > + /* > + * allocate the bitmaps dynamically since MTHP_BITMAP_SIZE is not known at > + * compile time for some architectures. > + */ > + khugepaged_collapse_control.mthp_bitmap = kmalloc_array( > + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL); > + if (!khugepaged_collapse_control.mthp_bitmap) > + return -ENOMEM; > + > + khugepaged_collapse_control.mthp_bitmap_temp = kmalloc_array( > + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL); > + if (!khugepaged_collapse_control.mthp_bitmap_temp) > + return -ENOMEM; > + > + khugepaged_collapse_control.mthp_bitmap_stack = kmalloc_array( > + MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL); > + if (!khugepaged_collapse_control.mthp_bitmap_stack) > + return -ENOMEM; > + > khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; > khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; > khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; > @@ -400,6 +433,9 @@ int __init khugepaged_init(void) > void __init khugepaged_destroy(void) > { > kmem_cache_destroy(mm_slot_cache); > + kfree(khugepaged_collapse_control.mthp_bitmap); > + kfree(khugepaged_collapse_control.mthp_bitmap_temp); > + kfree(khugepaged_collapse_control.mthp_bitmap_stack); > } > > static inline int khugepaged_test_exit(struct mm_struct *mm) > @@ -850,10 +886,6 @@ static void khugepaged_alloc_sleep(void) > remove_wait_queue(&khugepaged_wait, &wait); > } > > -struct collapse_control khugepaged_collapse_control = { > - .is_khugepaged = true, > -}; > - > static bool khugepaged_scan_abort(int nid, struct collapse_control *cc) > { > int i; > @@ -1102,7 +1134,8 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, > > static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > int referenced, int unmapped, > - struct collapse_control *cc) > + struct collapse_control *cc, bool *mmap_locked, > + int order, int offset) > { > LIST_HEAD(compound_pagelist); > pmd_t *pmd, _pmd; > @@ -1115,6 +1148,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > struct mmu_notifier_range range; > VM_BUG_ON(address & ~HPAGE_PMD_MASK); > > + /* if collapsing mTHPs we may have already released the read_lock, and > + * need to reaquire it to keep the proper locking order. > + */ > + if (!*mmap_locked) > + mmap_read_lock(mm); There is no need to take the read lock again, because we drop it just after this. > /* > * Before allocating the hugepage, release the mmap_lock read lock. > * The allocation can take potentially a long time if it involves > @@ -1122,6 +1160,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > * that. We will recheck the vma after taking it again in write mode. > */ > mmap_read_unlock(mm); > + *mmap_locked = false; > > result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER); > if (result != SCAN_SUCCEED) > @@ -1256,12 +1295,71 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > out_up_write: > mmap_write_unlock(mm); > out_nolock: > + *mmap_locked = false; > if (folio) > folio_put(folio); > trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); > return result; > } > > +// Recursive function to consume the bitmap > +static int khugepaged_scan_bitmap(struct mm_struct *mm, unsigned long address, > + int referenced, int unmapped, struct collapse_control *cc, > + bool *mmap_locked, unsigned long enabled_orders) > +{ > + u8 order, offset; > + int num_chunks; > + int bits_set, max_percent, threshold_bits; > + int next_order, mid_offset; > + int top = -1; > + int collapsed = 0; > + int ret; > + struct scan_bit_state state; > + > + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state) > + { HPAGE_PMD_ORDER - MIN_MTHP_ORDER, 0 }; > + > + while (top >= 0) { > + state = cc->mthp_bitmap_stack[top--]; > + order = state.order; > + offset = state.offset; > + num_chunks = 1 << order; > + // Skip mTHP orders that are not enabled > + if (!(enabled_orders >> (order + MIN_MTHP_ORDER)) & 1) > + goto next; > + > + // copy the relavant section to a new bitmap > + bitmap_shift_right(cc->mthp_bitmap_temp, cc->mthp_bitmap, offset, > + MTHP_BITMAP_SIZE); > + > + bits_set = bitmap_weight(cc->mthp_bitmap_temp, num_chunks); > + > + // Check if the region is "almost full" based on the threshold > + max_percent = ((HPAGE_PMD_NR - khugepaged_max_ptes_none - 1) * 100) > + / (HPAGE_PMD_NR - 1); > + threshold_bits = (max_percent * num_chunks) / 100; > + > + if (bits_set >= threshold_bits) { > + ret = collapse_huge_page(mm, address, referenced, unmapped, cc, > + mmap_locked, order + MIN_MTHP_ORDER, offset * MIN_MTHP_NR); > + if (ret == SCAN_SUCCEED) > + collapsed += (1 << (order + MIN_MTHP_ORDER)); > + continue; > + } > + > +next: > + if (order > 0) { > + next_order = order - 1; > + mid_offset = offset + (num_chunks / 2); > + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state) > + { next_order, mid_offset }; > + cc->mthp_bitmap_stack[++top] = (struct scan_bit_state) > + { next_order, offset }; > + } > + } > + return collapsed; > +} > + > static int khugepaged_scan_pmd(struct mm_struct *mm, > struct vm_area_struct *vma, > unsigned long address, bool *mmap_locked, > @@ -1430,7 +1528,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, > pte_unmap_unlock(pte, ptl); > if (result == SCAN_SUCCEED) { > result = collapse_huge_page(mm, address, referenced, > - unmapped, cc); > + unmapped, cc, mmap_locked, HPAGE_PMD_ORDER, 0); > /* collapse_huge_page will return with the mmap_lock released */ > *mmap_locked = false; > } > @@ -2767,6 +2865,21 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, > return -ENOMEM; > cc->is_khugepaged = false; > > + cc->mthp_bitmap = kmalloc_array( > + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL); > + if (!cc->mthp_bitmap) > + return -ENOMEM; > + > + cc->mthp_bitmap_temp = kmalloc_array( > + BITS_TO_LONGS(MTHP_BITMAP_SIZE), sizeof(unsigned long), GFP_KERNEL); > + if (!cc->mthp_bitmap_temp) > + return -ENOMEM; > + > + cc->mthp_bitmap_stack = kmalloc_array( > + MTHP_BITMAP_SIZE, sizeof(struct scan_bit_state), GFP_KERNEL); > + if (!cc->mthp_bitmap_stack) > + return -ENOMEM; > + > mmgrab(mm); > lru_add_drain_all(); > > @@ -2831,8 +2944,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, > out_nolock: > mmap_assert_locked(mm); > mmdrop(mm); > + kfree(cc->mthp_bitmap); > + kfree(cc->mthp_bitmap_temp); > + kfree(cc->mthp_bitmap_stack); > kfree(cc); > > + > return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 > : madvise_collapse_errno(last_fail); > }