* [PATCH v4 1/6] mm: free zapped tail pages when splitting isolated thp
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
2024-08-19 2:30 ` [PATCH v4 2/6] mm: remap unused subpages to shared zeropage " Usama Arif
` (4 subsequent siblings)
5 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Shuang Zhai, Usama Arif
From: Yu Zhao <yuzhao@google.com>
If a tail page has only two references left, one inherited from the
isolation of its head and the other from lru_add_page_tail() which we
are about to drop, it means this tail page was concurrently zapped.
Then we can safely free it and save page reclaim or migration the
trouble of trying it.
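As a minimal sketch of the pattern applied below (illustrative only; the
helper name is made up, the real code lives in __split_huge_page()):

	/*
	 * Freeze a tail folio whose only remaining references are the two
	 * described above, then queue it in a folio_batch so memcg
	 * uncharging and freeing happen in bulk once the batch fills up.
	 */
	static void maybe_free_zapped_tail(struct folio *folio,
					   struct folio_batch *fbatch)
	{
		/* 2 == ref from head isolation + ref from lru_add_page_tail() */
		if (!folio_ref_freeze(folio, 2))
			return;	/* still referenced elsewhere, leave it alone */

		list_del(&folio->lru);
		if (!folio_batch_add(fbatch, folio)) {
			/* batch is full: uncharge and free what has been queued */
			mem_cgroup_uncharge_folios(fbatch);
			free_unref_folios(fbatch);
		}
	}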
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/huge_memory.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 04ee8abd6475..147655821f09 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3059,7 +3059,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
unsigned int new_nr = 1 << new_order;
int order = folio_order(folio);
unsigned int nr = 1 << order;
+ struct folio_batch free_folios;
+ folio_batch_init(&free_folios);
/* complete memcg works before add pages to LRU */
split_page_memcg(head, order, new_order);
@@ -3143,6 +3145,27 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (subpage == page)
continue;
folio_unlock(new_folio);
+ /*
+ * If a folio has only two references left, one inherited
+ * from the isolation of its head and the other from
+ * lru_add_page_tail() which we are about to drop, it means this
+ * folio was concurrently zapped. Then we can safely free it
+ * and save page reclaim or migration the trouble of trying it.
+ */
+ if (list && folio_ref_freeze(new_folio, 2)) {
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(new_folio), new_folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_large(new_folio), new_folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_mapped(new_folio), new_folio);
+
+ folio_clear_active(new_folio);
+ folio_clear_unevictable(new_folio);
+ list_del(&new_folio->lru);
+ if (!folio_batch_add(&free_folios, new_folio)) {
+ mem_cgroup_uncharge_folios(&free_folios);
+ free_unref_folios(&free_folios);
+ }
+ continue;
+ }
/*
* Subpages may be freed if there wasn't any mapping
@@ -3153,6 +3176,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
*/
free_page_and_swap_cache(subpage);
}
+
+ if (free_folios.nr) {
+ mem_cgroup_uncharge_folios(&free_folios);
+ free_unref_folios(&free_folios);
+ }
}
/* Racy check whether the huge page can be split */
--
2.43.5
* [PATCH v4 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
2024-08-19 2:30 ` [PATCH v4 1/6] mm: free zapped tail pages when splitting isolated thp Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
2024-08-19 2:30 ` [PATCH v4 3/6] mm: selftest to verify zero-filled pages are mapped to zeropage Usama Arif
` (3 subsequent siblings)
5 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Shuang Zhai, Usama Arif
From: Yu Zhao <yuzhao@google.com>
Here, "unused" means containing only zeros and being inaccessible to
userspace. When splitting an isolated thp under reclaim or migration,
the unused subpages can be mapped to the shared zeropage, hence saving
memory. This is particularly helpful when the internal
fragmentation of a thp is high, i.e. it has many untouched subpages.
This is also a prerequisite for the THP low-utilization shrinker
introduced in later patches, where underutilized THPs are split and
their zero-filled pages are freed, saving memory.
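As a minimal sketch (not the patch code; the helper name is invented for
illustration), the zero-detection step that try_to_map_unused_to_zeropage()
performs below boils down to:

	static bool subpage_is_all_zero(struct page *page)
	{
		void *addr = kmap_local_page(page);
		bool all_zero = !memchr_inv(addr, 0, PAGE_SIZE);

		kunmap_local(addr);
		return all_zero;
	}

Only when the subpage is all zeroes is its pte rewritten to point at the
shared zeropage via pte_mkspecial(pfn_pte(my_zero_pfn(...), ...)).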
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
include/linux/rmap.h | 7 ++++-
mm/huge_memory.c | 8 ++---
mm/migrate.c | 72 ++++++++++++++++++++++++++++++++++++++------
mm/migrate_device.c | 4 +--
4 files changed, 75 insertions(+), 16 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0978c64f49d8..07854d1f9ad6 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+ RMP_LOCKED = 1 << 0,
+ RMP_USE_SHARED_ZEROPAGE = 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
/*
* rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 147655821f09..2d77b5d2291e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2911,7 +2911,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
int i = 0;
@@ -2919,7 +2919,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, true);
+ remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -3129,7 +3129,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (nr_dropped)
shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr);
+ remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
/*
* set page to its compound_head when split to non order-0 pages, so
@@ -3425,7 +3425,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(folio, folio_nr_pages(folio));
+ remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 66a5f73ebfdf..2d2e65d69427 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -178,13 +178,57 @@ void putback_movable_pages(struct list_head *l)
}
}
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+ struct folio *folio,
+ unsigned long idx)
+{
+ struct page *page = folio_page(folio, idx);
+ bool contains_data;
+ pte_t newpte;
+ void *addr;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+ if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+ mm_forbids_zeropage(pvmw->vma->vm_mm))
+ return false;
+
+ /*
+ * The pmd entry mapping the old thp was flushed and the pte mapping
+ * this subpage has been made non-present. If the subpage is only zero-filled
+ * then map it to the shared zeropage.
+ */
+ addr = kmap_local_page(page);
+ contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+ kunmap_local(addr);
+
+ if (contains_data)
+ return false;
+
+ newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+ pvmw->vma->vm_page_prot));
+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+ return true;
+}
+
+struct rmap_walk_arg {
+ struct folio *folio;
+ bool map_unused_to_zeropage;
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct rmap_walk_arg *rmap_walk_arg = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
@@ -208,6 +252,9 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ if (rmap_walk_arg->map_unused_to_zeropage &&
+ try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+ continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -286,14 +333,21 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
{
+ struct rmap_walk_arg rmap_walk_arg = {
+ .folio = src,
+ .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = src,
+ .arg = &rmap_walk_arg,
};
- if (locked)
+ VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+ if (flags & RMP_LOCKED)
rmap_walk_locked(dst, &rwc);
else
rmap_walk(dst, &rwc);
@@ -903,7 +957,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1067,7 +1121,7 @@ static void migrate_folio_undo_src(struct folio *src,
struct list_head *ret)
{
if (page_was_mapped)
- remove_migration_ptes(src, src, false);
+ remove_migration_ptes(src, src, 0);
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
@@ -1305,7 +1359,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
lru_add_drain();
if (old_page_state & PAGE_WAS_MAPPED)
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
out_unlock_both:
folio_unlock(dst);
@@ -1443,7 +1497,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6d66dc1c6ffa..8f875636b35b 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue;
folio = page_folio(page);
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
folio_unlock(folio);
@@ -837,7 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
src = page_folio(page);
dst = page_folio(newpage);
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
folio_unlock(src);
if (is_zone_device_page(page))
--
2.43.5
* [PATCH v4 3/6] mm: selftest to verify zero-filled pages are mapped to zeropage
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
2024-08-19 2:30 ` [PATCH v4 1/6] mm: free zapped tail pages when splitting isolated thp Usama Arif
2024-08-19 2:30 ` [PATCH v4 2/6] mm: remap unused subpages to shared zeropage " Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
2024-08-21 19:09 ` Usama Arif
2024-08-19 2:30 ` [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios Usama Arif
` (2 subsequent siblings)
5 siblings, 1 reply; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Alexander Zhu, Usama Arif
From: Alexander Zhu <alexlzhu@fb.com>
When a THP is split, any subpage that is zero-filled will be mapped
to the shared zeropage, hence saving memory. Add selftest to verify
this by allocating zero-filled THP and comparing RssAnon before and
after split.
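(For context, a standalone userspace sketch of how RssAnon can be read,
assuming the usual /proc/self/status format; the selftest itself uses the
rss_anon() helper added to vm_util.c below.)

	#include <stdio.h>

	static long read_rss_anon_kb(void)
	{
		char line[256];
		long kb = -1;
		FILE *fp = fopen("/proc/self/status", "r");

		if (!fp)
			return -1;
		while (fgets(line, sizeof(line), fp))
			if (sscanf(line, "RssAnon: %ld kB", &kb) == 1)
				break;
		fclose(fp);
		return kb;
	}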
Signed-off-by: Alexander Zhu <alexlzhu@fb.com>
Acked-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
.../selftests/mm/split_huge_page_test.c | 71 +++++++++++++++++++
tools/testing/selftests/mm/vm_util.c | 22 ++++++
tools/testing/selftests/mm/vm_util.h | 1 +
3 files changed, 94 insertions(+)
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index e5e8dafc9d94..eb6d1b9fc362 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...)
write_file(SPLIT_DEBUGFS, input, ret + 1);
}
+static char *allocate_zero_filled_hugepage(size_t len)
+{
+ char *result;
+ size_t i;
+
+ result = memalign(pmd_pagesize, len);
+ if (!result) {
+ printf("Fail to allocate memory\n");
+ exit(EXIT_FAILURE);
+ }
+
+ madvise(result, len, MADV_HUGEPAGE);
+
+ for (i = 0; i < len; i++)
+ result[i] = (char)0;
+
+ return result;
+}
+
+static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
+{
+ unsigned long rss_anon_before, rss_anon_after;
+ size_t i;
+
+ if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
+ printf("No THP is allocated\n");
+ exit(EXIT_FAILURE);
+ }
+
+ rss_anon_before = rss_anon();
+ if (!rss_anon_before) {
+ printf("No RssAnon is allocated before split\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* split all THPs */
+ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+ (uint64_t)one_page + len, 0);
+
+ for (i = 0; i < len; i++)
+ if (one_page[i] != (char)0) {
+ printf("%ld byte corrupted\n", i);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
+ printf("Still AnonHugePages not split\n");
+ exit(EXIT_FAILURE);
+ }
+
+ rss_anon_after = rss_anon();
+ if (rss_anon_after >= rss_anon_before) {
+ printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
+ rss_anon_before, rss_anon_after);
+ exit(EXIT_FAILURE);
+ }
+}
+
+void split_pmd_zero_pages(void)
+{
+ char *one_page;
+ int nr_hpages = 4;
+ size_t len = nr_hpages * pmd_pagesize;
+
+ one_page = allocate_zero_filled_hugepage(len);
+ verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
+ printf("Split zero filled huge pages successful\n");
+ free(one_page);
+}
+
void split_pmd_thp(void)
{
char *one_page;
@@ -431,6 +501,7 @@ int main(int argc, char **argv)
fd_size = 2 * pmd_pagesize;
+ split_pmd_zero_pages();
split_pmd_thp();
split_pte_mapped_thp();
split_file_backed_thp();
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
index 5a62530da3b5..d8d0cf04bb57 100644
--- a/tools/testing/selftests/mm/vm_util.c
+++ b/tools/testing/selftests/mm/vm_util.c
@@ -12,6 +12,7 @@
#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define SMAP_FILE_PATH "/proc/self/smaps"
+#define STATUS_FILE_PATH "/proc/self/status"
#define MAX_LINE_LENGTH 500
unsigned int __page_size;
@@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void)
return strtoul(buf, NULL, 10);
}
+unsigned long rss_anon(void)
+{
+ unsigned long rss_anon = 0;
+ FILE *fp;
+ char buffer[MAX_LINE_LENGTH];
+
+ fp = fopen(STATUS_FILE_PATH, "r");
+ if (!fp)
+ ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
+
+ if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
+ goto err_out;
+
+ if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
+ ksft_exit_fail_msg("Reading status error\n");
+
+err_out:
+ fclose(fp);
+ return rss_anon;
+}
+
bool __check_huge(void *addr, char *pattern, int nr_hpages,
uint64_t hpage_size)
{
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 9007c420d52c..71b75429f4a5 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);
+uint64_t rss_anon(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
--
2.43.5
* Re: [PATCH v4 3/6] mm: selftest to verify zero-filled pages are mapped to zeropage
2024-08-19 2:30 ` [PATCH v4 3/6] mm: selftest to verify zero-filled pages are mapped to zeropage Usama Arif
@ 2024-08-21 19:09 ` Usama Arif
0 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-21 19:09 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Alexander Zhu
On 18/08/2024 22:30, Usama Arif wrote:
> From: Alexander Zhu <alexlzhu@fb.com>
>
> When a THP is split, any subpage that is zero-filled will be mapped
> to the shared zeropage, hence saving memory. Add selftest to verify
> this by allocating zero-filled THP and comparing RssAnon before and
> after split.
>
> Signed-off-by: Alexander Zhu <alexlzhu@fb.com>
> Acked-by: Rik van Riel <riel@surriel.com>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
> .../selftests/mm/split_huge_page_test.c | 71 +++++++++++++++++++
> tools/testing/selftests/mm/vm_util.c | 22 ++++++
> tools/testing/selftests/mm/vm_util.h | 1 +
> 3 files changed, 94 insertions(+)
>
> diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
> index e5e8dafc9d94..eb6d1b9fc362 100644
> --- a/tools/testing/selftests/mm/split_huge_page_test.c
> +++ b/tools/testing/selftests/mm/split_huge_page_test.c
> @@ -84,6 +84,76 @@ static void write_debugfs(const char *fmt, ...)
> write_file(SPLIT_DEBUGFS, input, ret + 1);
> }
>
> +static char *allocate_zero_filled_hugepage(size_t len)
> +{
> + char *result;
> + size_t i;
> +
> + result = memalign(pmd_pagesize, len);
> + if (!result) {
> + printf("Fail to allocate memory\n");
> + exit(EXIT_FAILURE);
> + }
> +
> + madvise(result, len, MADV_HUGEPAGE);
> +
> + for (i = 0; i < len; i++)
> + result[i] = (char)0;
> +
> + return result;
> +}
> +
> +static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hpages, size_t len)
> +{
> + unsigned long rss_anon_before, rss_anon_after;
> + size_t i;
> +
> + if (!check_huge_anon(one_page, 4, pmd_pagesize)) {
> + printf("No THP is allocated\n");
> + exit(EXIT_FAILURE);
> + }
> +
> + rss_anon_before = rss_anon();
> + if (!rss_anon_before) {
> + printf("No RssAnon is allocated before split\n");
> + exit(EXIT_FAILURE);
> + }
> +
> + /* split all THPs */
> + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
> + (uint64_t)one_page + len, 0);
> +
> + for (i = 0; i < len; i++)
> + if (one_page[i] != (char)0) {
> + printf("%ld byte corrupted\n", i);
> + exit(EXIT_FAILURE);
> + }
> +
> + if (!check_huge_anon(one_page, 0, pmd_pagesize)) {
> + printf("Still AnonHugePages not split\n");
> + exit(EXIT_FAILURE);
> + }
> +
> + rss_anon_after = rss_anon();
> + if (rss_anon_after >= rss_anon_before) {
> + printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
> + rss_anon_before, rss_anon_after);
> + exit(EXIT_FAILURE);
> + }
> +}
> +
> +void split_pmd_zero_pages(void)
> +{
> + char *one_page;
> + int nr_hpages = 4;
> + size_t len = nr_hpages * pmd_pagesize;
> +
> + one_page = allocate_zero_filled_hugepage(len);
> + verify_rss_anon_split_huge_page_all_zeroes(one_page, nr_hpages, len);
> + printf("Split zero filled huge pages successful\n");
> + free(one_page);
> +}
> +
> void split_pmd_thp(void)
> {
> char *one_page;
> @@ -431,6 +501,7 @@ int main(int argc, char **argv)
>
> fd_size = 2 * pmd_pagesize;
>
> + split_pmd_zero_pages();
> split_pmd_thp();
> split_pte_mapped_thp();
> split_file_backed_thp();
> diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
> index 5a62530da3b5..d8d0cf04bb57 100644
> --- a/tools/testing/selftests/mm/vm_util.c
> +++ b/tools/testing/selftests/mm/vm_util.c
> @@ -12,6 +12,7 @@
>
> #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
> #define SMAP_FILE_PATH "/proc/self/smaps"
> +#define STATUS_FILE_PATH "/proc/self/status"
> #define MAX_LINE_LENGTH 500
>
> unsigned int __page_size;
> @@ -171,6 +172,27 @@ uint64_t read_pmd_pagesize(void)
> return strtoul(buf, NULL, 10);
> }
>
> +unsigned long rss_anon(void)
> +{
> + unsigned long rss_anon = 0;
> + FILE *fp;
> + char buffer[MAX_LINE_LENGTH];
> +
> + fp = fopen(STATUS_FILE_PATH, "r");
> + if (!fp)
> + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
> +
> + if (!check_for_pattern(fp, "RssAnon:", buffer, sizeof(buffer)))
> + goto err_out;
> +
> + if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
> + ksft_exit_fail_msg("Reading status error\n");
> +
> +err_out:
> + fclose(fp);
> + return rss_anon;
> +}
> +
> bool __check_huge(void *addr, char *pattern, int nr_hpages,
> uint64_t hpage_size)
> {
> diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
> index 9007c420d52c..71b75429f4a5 100644
> --- a/tools/testing/selftests/mm/vm_util.h
> +++ b/tools/testing/selftests/mm/vm_util.h
> @@ -39,6 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
> void clear_softdirty(void);
> bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
> uint64_t read_pmd_pagesize(void);
> +uint64_t rss_anon(void);
> bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
> bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
> bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
Need below fixlet as well
From 2476b7a46908c801ab446f41566f4af52b939ac7 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Wed, 21 Aug 2024 20:06:53 +0100
Subject: [PATCH] mm: selftest to verify zero-filled pages are mapped to
zeropage fix
change uint64_t to unsigned long for rss_anon
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
tools/testing/selftests/mm/vm_util.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
index 71b75429f4a5..2eaed8209925 100644
--- a/tools/testing/selftests/mm/vm_util.h
+++ b/tools/testing/selftests/mm/vm_util.h
@@ -39,7 +39,7 @@ unsigned long pagemap_get_pfn(int fd, char *start);
void clear_softdirty(void);
bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
uint64_t read_pmd_pagesize(void);
-uint64_t rss_anon(void);
+unsigned long rss_anon(void);
bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
--
2.43.5
* [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
` (2 preceding siblings ...)
2024-08-19 2:30 ` [PATCH v4 3/6] mm: selftest to verify zero-filled pages are mapped to zeropage Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
2024-08-19 8:29 ` Barry Song
2024-08-19 2:30 ` [PATCH v4 5/6] mm: split underused THPs Usama Arif
2024-08-19 2:30 ` [PATCH v4 6/6] mm: add sysfs entry to disable splitting " Usama Arif
5 siblings, 1 reply; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Usama Arif
Currently folio->_deferred_list is used to keep track of
partially_mapped folios that are going to be split under memory
pressure. In the next patch, all THPs that are faulted in and collapsed
by khugepaged are also going to be tracked using _deferred_list.
This patch introduces a pageflag to be able to distinguish between
partially mapped folios and others in the deferred_list at split time in
deferred_split_scan. It's needed because __folio_remove_rmap decrements
_mapcount, _large_mapcount and _entire_mapcount, hence it won't be
possible to distinguish between partially mapped folios and others in
deferred_split_scan.
Even though it introduces an extra flag to track if the folio is
partially mapped, there is no functional change intended with this
patch and the flag is not useful in this patch itself; it will
become useful in the next patch, when _deferred_list also holds
folios that are not partially mapped.
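As a minimal sketch (the helper name is made up; assumption: the caller
holds split_queue_lock, which is what makes the non-atomic set safe), the
intended use of the new flag when queueing a folio mirrors
deferred_split_folio() below:

	static void queue_partially_mapped(struct folio *folio,
					   struct deferred_split *ds_queue)
	{
		/* caller holds ds_queue->split_queue_lock */
		if (!folio_test_partially_mapped(folio))
			__folio_set_partially_mapped(folio);

		if (list_empty(&folio->_deferred_list)) {
			list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
			ds_queue->split_queue_len++;
		}
	}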
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
include/linux/huge_mm.h | 4 ++--
include/linux/page-flags.h | 11 +++++++++++
mm/huge_memory.c | 23 ++++++++++++++++-------
mm/internal.h | 4 +++-
mm/memcontrol.c | 3 ++-
mm/migrate.c | 3 ++-
mm/page_alloc.c | 5 +++--
mm/rmap.c | 5 +++--
mm/vmscan.c | 3 ++-
9 files changed, 44 insertions(+), 17 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4c32058cacfe..969f11f360d2 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
{
return split_huge_page_to_list_to_order(page, NULL, 0);
}
-void deferred_split_folio(struct folio *folio);
+void deferred_split_folio(struct folio *folio, bool partially_mapped);
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio);
@@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-static inline void deferred_split_folio(struct folio *folio) {}
+static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a0a29bd092f8..c3bb0e0da581 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -182,6 +182,7 @@ enum pageflags {
/* At least one page in this folio has the hwpoison flag set */
PG_has_hwpoisoned = PG_active,
PG_large_rmappable = PG_workingset, /* anon or file-backed */
+ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
};
#define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
@@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
ClearPageHead(page);
}
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
+FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+/*
+ * PG_partially_mapped is protected by deferred_split split_queue_lock,
+ * so it's safe to use non-atomic set/clear.
+ */
+__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
+__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
#else
FOLIO_FLAG_FALSE(large_rmappable)
+FOLIO_TEST_FLAG_FALSE(partially_mapped)
+__FOLIO_SET_FLAG_NOOP(partially_mapped)
+__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
#endif
#define PG_head_mask ((1UL << PG_head))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2d77b5d2291e..70ee49dfeaad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
* page_deferred_list.
*/
list_del_init(&folio->_deferred_list);
+ __folio_clear_partially_mapped(folio);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
@@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
list_del_init(&folio->_deferred_list);
+ __folio_clear_partially_mapped(folio);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
@@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
if (folio_test_swapcache(folio))
return;
- if (!list_empty(&folio->_deferred_list))
- return;
-
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (partially_mapped) {
+ if (!folio_test_partially_mapped(folio)) {
+ __folio_set_partially_mapped(folio);
+ if (folio_test_pmd_mappable(folio))
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+ }
+ } else {
+ /* partially mapped folios cannot become non-partially mapped */
+ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+ }
if (list_empty(&folio->_deferred_list)) {
- if (folio_test_pmd_mappable(folio))
- count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
@@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with folio_put() */
list_del_init(&folio->_deferred_list);
+ __folio_clear_partially_mapped(folio);
ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
diff --git a/mm/internal.h b/mm/internal.h
index 52f7fc4e8ac3..27cbb5365841 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
- if (order > 1)
+ if (order > 1) {
INIT_LIST_HEAD(&folio->_deferred_list);
+ __folio_clear_partially_mapped(folio);
+ }
}
static inline void prep_compound_tail(struct page *head, int tail_idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1ffd2950393..0fd95daecf9a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
!folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
+ !list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio), folio);
/*
* Nobody should be changing or seriously looking at
diff --git a/mm/migrate.c b/mm/migrate.c
index 2d2e65d69427..ef4a732f22b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
* use _deferred_list.
*/
if (nr_pages > 2 &&
- !list_empty(&folio->_deferred_list)) {
+ !list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) {
if (!try_split_folio(folio, split_folios, mode)) {
nr_failed++;
stats->nr_thp_failed += is_thp;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 408ef3d25cf5..a145c550dd2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
break;
case 2:
/* the second tail page: deferred_list overlaps ->mapping */
- if (unlikely(!list_empty(&folio->_deferred_list))) {
- bad_page(page, "on deferred list");
+ if (unlikely(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio))) {
+ bad_page(page, "partially mapped folio on deferred list");
goto out;
}
break;
diff --git a/mm/rmap.c b/mm/rmap.c
index a6b9cd0b2b18..4c330635aa4e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
* Check partially_mapped first to ensure it is a large folio.
*/
if (partially_mapped && folio_test_anon(folio) &&
- list_empty(&folio->_deferred_list))
- deferred_split_folio(folio);
+ !folio_test_partially_mapped(folio))
+ deferred_split_folio(folio, true);
+
__folio_mod_stat(folio, -nr, -nr_pmdmapped);
/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 25e43bb3b574..25f4e8403f41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
* Split partially mapped folios right away.
* We can free the unmapped pages without IO.
*/
- if (data_race(!list_empty(&folio->_deferred_list)) &&
+ if (data_race(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) &&
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
--
2.43.5
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 2:30 ` [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios Usama Arif
@ 2024-08-19 8:29 ` Barry Song
2024-08-19 8:30 ` Barry Song
2024-08-19 14:17 ` Usama Arif
0 siblings, 2 replies; 21+ messages in thread
From: Barry Song @ 2024-08-19 8:29 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
Hi Usama,
I feel it is much better now! thanks!
On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>
> Currently folio->_deferred_list is used to keep track of
> partially_mapped folios that are going to be split under memory
> pressure. In the next patch, all THPs that are faulted in and collapsed
> by khugepaged are also going to be tracked using _deferred_list.
>
> This patch introduces a pageflag to be able to distinguish between
> partially mapped folios and others in the deferred_list at split time in
> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> possible to distinguish between partially mapped folios and others in
> deferred_split_scan.
>
> Eventhough it introduces an extra flag to track if the folio is
> partially mapped, there is no functional change intended with this
> patch and the flag is not useful in this patch itself, it will
> become useful in the next patch when _deferred_list has non partially
> mapped folios.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
> include/linux/huge_mm.h | 4 ++--
> include/linux/page-flags.h | 11 +++++++++++
> mm/huge_memory.c | 23 ++++++++++++++++-------
> mm/internal.h | 4 +++-
> mm/memcontrol.c | 3 ++-
> mm/migrate.c | 3 ++-
> mm/page_alloc.c | 5 +++--
> mm/rmap.c | 5 +++--
> mm/vmscan.c | 3 ++-
> 9 files changed, 44 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4c32058cacfe..969f11f360d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> {
> return split_huge_page_to_list_to_order(page, NULL, 0);
> }
> -void deferred_split_folio(struct folio *folio);
> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>
> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> unsigned long address, bool freeze, struct folio *folio);
> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> {
> return 0;
> }
> -static inline void deferred_split_folio(struct folio *folio) {}
> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> #define split_huge_pmd(__vma, __pmd, __address) \
> do { } while (0)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index a0a29bd092f8..c3bb0e0da581 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -182,6 +182,7 @@ enum pageflags {
> /* At least one page in this folio has the hwpoison flag set */
> PG_has_hwpoisoned = PG_active,
> PG_large_rmappable = PG_workingset, /* anon or file-backed */
> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> };
>
> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> ClearPageHead(page);
> }
> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> +/*
> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> + * so its safe to use non-atomic set/clear.
> + */
> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> #else
> FOLIO_FLAG_FALSE(large_rmappable)
> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> #endif
>
> #define PG_head_mask ((1UL << PG_head))
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2d77b5d2291e..70ee49dfeaad 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> * page_deferred_list.
> */
> list_del_init(&folio->_deferred_list);
> + __folio_clear_partially_mapped(folio);
> }
> spin_unlock(&ds_queue->split_queue_lock);
> if (mapping) {
> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> if (!list_empty(&folio->_deferred_list)) {
> ds_queue->split_queue_len--;
> list_del_init(&folio->_deferred_list);
> + __folio_clear_partially_mapped(folio);
is it possible to make things clearer by
if (folio_clear_partially_mapped)
__folio_clear_partially_mapped(folio);
While writing without conditions isn't necessarily wrong, adding a condition
will improve the readability of the code and enhance the clarity of my mTHP
counters series. It would also help decrease SMP cache syncing if we can
avoid unnecessary writes?
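A sketch of the suggested variant, written out as an illustrative helper
(the name is made up; assumption: split_queue_lock is held, as required for
the non-atomic clear):

	static inline void folio_clear_partially_mapped_if_set(struct folio *folio)
	{
		/*
		 * Testing first skips the store, and the cache-line
		 * dirtying, when the flag is already clear.
		 */
		if (folio_test_partially_mapped(folio))
			__folio_clear_partially_mapped(folio);
	}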
> }
> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> }
>
> -void deferred_split_folio(struct folio *folio)
> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> {
> struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> #ifdef CONFIG_MEMCG
> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
> if (folio_test_swapcache(folio))
> return;
>
> - if (!list_empty(&folio->_deferred_list))
> - return;
> -
> spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> + if (partially_mapped) {
> + if (!folio_test_partially_mapped(folio)) {
> + __folio_set_partially_mapped(folio);
> + if (folio_test_pmd_mappable(folio))
> + count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> + }
> + } else {
> + /* partially mapped folios cannot become non-partially mapped */
> + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> + }
> if (list_empty(&folio->_deferred_list)) {
> - if (folio_test_pmd_mappable(folio))
> - count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> ds_queue->split_queue_len++;
> #ifdef CONFIG_MEMCG
> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> } else {
> /* We lost race with folio_put() */
> list_del_init(&folio->_deferred_list);
> + __folio_clear_partially_mapped(folio);
as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
> ds_queue->split_queue_len--;
> }
> if (!--sc->nr_to_scan)
> diff --git a/mm/internal.h b/mm/internal.h
> index 52f7fc4e8ac3..27cbb5365841 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> atomic_set(&folio->_entire_mapcount, -1);
> atomic_set(&folio->_nr_pages_mapped, 0);
> atomic_set(&folio->_pincount, 0);
> - if (order > 1)
> + if (order > 1) {
> INIT_LIST_HEAD(&folio->_deferred_list);
> + __folio_clear_partially_mapped(folio);
if partially_mapped is true for a new folio, does it mean we already have
a bug somewhere?
How is it possible for a new folio to be partially mapped?
> + }
> }
>
> static inline void prep_compound_tail(struct page *head, int tail_idx)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1ffd2950393..0fd95daecf9a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> !folio_test_hugetlb(folio) &&
> - !list_empty(&folio->_deferred_list), folio);
> + !list_empty(&folio->_deferred_list) &&
> + folio_test_partially_mapped(folio), folio);
>
> /*
> * Nobody should be changing or seriously looking at
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 2d2e65d69427..ef4a732f22b1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
> * use _deferred_list.
> */
> if (nr_pages > 2 &&
> - !list_empty(&folio->_deferred_list)) {
> + !list_empty(&folio->_deferred_list) &&
> + folio_test_partially_mapped(folio)) {
> if (!try_split_folio(folio, split_folios, mode)) {
> nr_failed++;
> stats->nr_thp_failed += is_thp;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 408ef3d25cf5..a145c550dd2a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> break;
> case 2:
> /* the second tail page: deferred_list overlaps ->mapping */
> - if (unlikely(!list_empty(&folio->_deferred_list))) {
> - bad_page(page, "on deferred list");
> + if (unlikely(!list_empty(&folio->_deferred_list) &&
> + folio_test_partially_mapped(folio))) {
> + bad_page(page, "partially mapped folio on deferred list");
> goto out;
> }
> break;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a6b9cd0b2b18..4c330635aa4e 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> * Check partially_mapped first to ensure it is a large folio.
> */
> if (partially_mapped && folio_test_anon(folio) &&
> - list_empty(&folio->_deferred_list))
> - deferred_split_folio(folio);
> + !folio_test_partially_mapped(folio))
> + deferred_split_folio(folio, true);
> +
> __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>
> /*
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 25e43bb3b574..25f4e8403f41 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> * Split partially mapped folios right away.
> * We can free the unmapped pages without IO.
> */
> - if (data_race(!list_empty(&folio->_deferred_list)) &&
> + if (data_race(!list_empty(&folio->_deferred_list) &&
> + folio_test_partially_mapped(folio)) &&
> split_folio_to_list(folio, folio_list))
> goto activate_locked;
> }
> --
> 2.43.5
>
Thanks
Barry
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 8:29 ` Barry Song
@ 2024-08-19 8:30 ` Barry Song
2024-08-19 14:17 ` Usama Arif
1 sibling, 0 replies; 21+ messages in thread
From: Barry Song @ 2024-08-19 8:30 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Mon, Aug 19, 2024 at 8:29 PM Barry Song <baohua@kernel.org> wrote:
>
> Hi Usama,
>
> I feel it is much better now! thanks!
>
> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> > Currently folio->_deferred_list is used to keep track of
> > partially_mapped folios that are going to be split under memory
> > pressure. In the next patch, all THPs that are faulted in and collapsed
> > by khugepaged are also going to be tracked using _deferred_list.
> >
> > This patch introduces a pageflag to be able to distinguish between
> > partially mapped folios and others in the deferred_list at split time in
> > deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > possible to distinguish between partially mapped folios and others in
> > deferred_split_scan.
> >
> > Eventhough it introduces an extra flag to track if the folio is
> > partially mapped, there is no functional change intended with this
> > patch and the flag is not useful in this patch itself, it will
> > become useful in the next patch when _deferred_list has non partially
> > mapped folios.
> >
> > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > ---
> > include/linux/huge_mm.h | 4 ++--
> > include/linux/page-flags.h | 11 +++++++++++
> > mm/huge_memory.c | 23 ++++++++++++++++-------
> > mm/internal.h | 4 +++-
> > mm/memcontrol.c | 3 ++-
> > mm/migrate.c | 3 ++-
> > mm/page_alloc.c | 5 +++--
> > mm/rmap.c | 5 +++--
> > mm/vmscan.c | 3 ++-
> > 9 files changed, 44 insertions(+), 17 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 4c32058cacfe..969f11f360d2 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> > {
> > return split_huge_page_to_list_to_order(page, NULL, 0);
> > }
> > -void deferred_split_folio(struct folio *folio);
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >
> > void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > unsigned long address, bool freeze, struct folio *folio);
> > @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> > {
> > return 0;
> > }
> > -static inline void deferred_split_folio(struct folio *folio) {}
> > +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > #define split_huge_pmd(__vma, __pmd, __address) \
> > do { } while (0)
> >
> > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > index a0a29bd092f8..c3bb0e0da581 100644
> > --- a/include/linux/page-flags.h
> > +++ b/include/linux/page-flags.h
> > @@ -182,6 +182,7 @@ enum pageflags {
> > /* At least one page in this folio has the hwpoison flag set */
> > PG_has_hwpoisoned = PG_active,
> > PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> > };
> >
> > #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> > @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> > ClearPageHead(page);
> > }
> > FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > +/*
> > + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> > + * so its safe to use non-atomic set/clear.
> > + */
> > +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > #else
> > FOLIO_FLAG_FALSE(large_rmappable)
> > +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> > +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> > +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> > #endif
> >
> > #define PG_head_mask ((1UL << PG_head))
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 2d77b5d2291e..70ee49dfeaad 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > * page_deferred_list.
> > */
> > list_del_init(&folio->_deferred_list);
> > + __folio_clear_partially_mapped(folio);
> > }
> > spin_unlock(&ds_queue->split_queue_lock);
> > if (mapping) {
> > @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> > if (!list_empty(&folio->_deferred_list)) {
> > ds_queue->split_queue_len--;
> > list_del_init(&folio->_deferred_list);
> > + __folio_clear_partially_mapped(folio);
>
> is it possible to make things clearer by
>
> if (folio_clear_partially_mapped)
> __folio_clear_partially_mapped(folio);
sorry for typo,
if (folio_test_partially_mapped)
__folio_clear_partially_mapped(folio);
>
> While writing without conditions isn't necessarily wrong, adding a condition
> will improve the readability of the code and enhance the clarity of my mTHP
> counters series. also help decrease smp cache sync if we can avoid
> unnecessary writing?
>
> > }
> > spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> > }
> >
> > -void deferred_split_folio(struct folio *folio)
> > +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> > +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> > {
> > struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> > #ifdef CONFIG_MEMCG
> > @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
> > if (folio_test_swapcache(folio))
> > return;
> >
> > - if (!list_empty(&folio->_deferred_list))
> > - return;
> > -
> > spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> > + if (partially_mapped) {
> > + if (!folio_test_partially_mapped(folio)) {
> > + __folio_set_partially_mapped(folio);
> > + if (folio_test_pmd_mappable(folio))
> > + count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > + }
> > + } else {
> > + /* partially mapped folios cannot become non-partially mapped */
> > + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> > + }
> > if (list_empty(&folio->_deferred_list)) {
> > - if (folio_test_pmd_mappable(folio))
> > - count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> > - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> > list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> > ds_queue->split_queue_len++;
> > #ifdef CONFIG_MEMCG
> > @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> > } else {
> > /* We lost race with folio_put() */
> > list_del_init(&folio->_deferred_list);
> > + __folio_clear_partially_mapped(folio);
>
> as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
>
> > ds_queue->split_queue_len--;
> > }
> > if (!--sc->nr_to_scan)
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 52f7fc4e8ac3..27cbb5365841 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> > atomic_set(&folio->_entire_mapcount, -1);
> > atomic_set(&folio->_nr_pages_mapped, 0);
> > atomic_set(&folio->_pincount, 0);
> > - if (order > 1)
> > + if (order > 1) {
> > INIT_LIST_HEAD(&folio->_deferred_list);
> > + __folio_clear_partially_mapped(folio);
>
> if partially_mapped is true for a new folio, does it mean we already have
> a bug somewhere?
>
> How is it possible for a new folio to be partially mapped?
>
> > + }
> > }
> >
> > static inline void prep_compound_tail(struct page *head, int tail_idx)
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index e1ffd2950393..0fd95daecf9a 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> > VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> > VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> > !folio_test_hugetlb(folio) &&
> > - !list_empty(&folio->_deferred_list), folio);
> > + !list_empty(&folio->_deferred_list) &&
> > + folio_test_partially_mapped(folio), folio);
> >
> > /*
> > * Nobody should be changing or seriously looking at
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 2d2e65d69427..ef4a732f22b1 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
> > * use _deferred_list.
> > */
> > if (nr_pages > 2 &&
> > - !list_empty(&folio->_deferred_list)) {
> > + !list_empty(&folio->_deferred_list) &&
> > + folio_test_partially_mapped(folio)) {
> > if (!try_split_folio(folio, split_folios, mode)) {
> > nr_failed++;
> > stats->nr_thp_failed += is_thp;
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 408ef3d25cf5..a145c550dd2a 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> > break;
> > case 2:
> > /* the second tail page: deferred_list overlaps ->mapping */
> > - if (unlikely(!list_empty(&folio->_deferred_list))) {
> > - bad_page(page, "on deferred list");
> > + if (unlikely(!list_empty(&folio->_deferred_list) &&
> > + folio_test_partially_mapped(folio))) {
> > + bad_page(page, "partially mapped folio on deferred list");
> > goto out;
> > }
> > break;
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index a6b9cd0b2b18..4c330635aa4e 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> > * Check partially_mapped first to ensure it is a large folio.
> > */
> > if (partially_mapped && folio_test_anon(folio) &&
> > - list_empty(&folio->_deferred_list))
> > - deferred_split_folio(folio);
> > + !folio_test_partially_mapped(folio))
> > + deferred_split_folio(folio, true);
> > +
> > __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >
> > /*
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 25e43bb3b574..25f4e8403f41 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> > * Split partially mapped folios right away.
> > * We can free the unmapped pages without IO.
> > */
> > - if (data_race(!list_empty(&folio->_deferred_list)) &&
> > + if (data_race(!list_empty(&folio->_deferred_list) &&
> > + folio_test_partially_mapped(folio)) &&
> > split_folio_to_list(folio, folio_list))
> > goto activate_locked;
> > }
> > --
> > 2.43.5
> >
>
> Thanks
> Barry
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 8:29 ` Barry Song
2024-08-19 8:30 ` Barry Song
@ 2024-08-19 14:17 ` Usama Arif
2024-08-19 19:00 ` Barry Song
1 sibling, 1 reply; 21+ messages in thread
From: Usama Arif @ 2024-08-19 14:17 UTC (permalink / raw)
To: Barry Song
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On 19/08/2024 09:29, Barry Song wrote:
> Hi Usama,
>
> I feel it is much better now! thanks!
>
> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>> Currently folio->_deferred_list is used to keep track of
>> partially_mapped folios that are going to be split under memory
>> pressure. In the next patch, all THPs that are faulted in and collapsed
>> by khugepaged are also going to be tracked using _deferred_list.
>>
>> This patch introduces a pageflag to be able to distinguish between
>> partially mapped folios and others in the deferred_list at split time in
>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>> possible to distinguish between partially mapped folios and others in
>> deferred_split_scan.
>>
>> Eventhough it introduces an extra flag to track if the folio is
>> partially mapped, there is no functional change intended with this
>> patch and the flag is not useful in this patch itself, it will
>> become useful in the next patch when _deferred_list has non partially
>> mapped folios.
>>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>> include/linux/huge_mm.h | 4 ++--
>> include/linux/page-flags.h | 11 +++++++++++
>> mm/huge_memory.c | 23 ++++++++++++++++-------
>> mm/internal.h | 4 +++-
>> mm/memcontrol.c | 3 ++-
>> mm/migrate.c | 3 ++-
>> mm/page_alloc.c | 5 +++--
>> mm/rmap.c | 5 +++--
>> mm/vmscan.c | 3 ++-
>> 9 files changed, 44 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 4c32058cacfe..969f11f360d2 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>> {
>> return split_huge_page_to_list_to_order(page, NULL, 0);
>> }
>> -void deferred_split_folio(struct folio *folio);
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>
>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>> unsigned long address, bool freeze, struct folio *folio);
>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>> {
>> return 0;
>> }
>> -static inline void deferred_split_folio(struct folio *folio) {}
>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>> #define split_huge_pmd(__vma, __pmd, __address) \
>> do { } while (0)
>>
>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>> index a0a29bd092f8..c3bb0e0da581 100644
>> --- a/include/linux/page-flags.h
>> +++ b/include/linux/page-flags.h
>> @@ -182,6 +182,7 @@ enum pageflags {
>> /* At least one page in this folio has the hwpoison flag set */
>> PG_has_hwpoisoned = PG_active,
>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>> };
>>
>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>> ClearPageHead(page);
>> }
>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>> +/*
>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>> + * so its safe to use non-atomic set/clear.
>> + */
>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>> #else
>> FOLIO_FLAG_FALSE(large_rmappable)
>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>> #endif
>>
>> #define PG_head_mask ((1UL << PG_head))
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2d77b5d2291e..70ee49dfeaad 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> * page_deferred_list.
>> */
>> list_del_init(&folio->_deferred_list);
>> + __folio_clear_partially_mapped(folio);
>> }
>> spin_unlock(&ds_queue->split_queue_lock);
>> if (mapping) {
>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>> if (!list_empty(&folio->_deferred_list)) {
>> ds_queue->split_queue_len--;
>> list_del_init(&folio->_deferred_list);
>> + __folio_clear_partially_mapped(folio);
>
> is it possible to make things clearer by
>
> if (folio_clear_partially_mapped)
> __folio_clear_partially_mapped(folio);
>
> While writing without conditions isn't necessarily wrong, adding a condition
> will improve the readability of the code and enhance the clarity of my mTHP
> counters series. also help decrease smp cache sync if we can avoid
> unnecessary writing?
>
Do you mean if (folio_test_partially_mapped(folio))?
I don't like this idea. I think it makes the readability worse. If I were looking at if (test) -> clear for the first time, I would be confused about why the flag is being tested if it's going to be cleared at the end anyway.
>> }
>> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>> }
>>
>> -void deferred_split_folio(struct folio *folio)
>> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
>> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
>> {
>> struct deferred_split *ds_queue = get_deferred_split_queue(folio);
>> #ifdef CONFIG_MEMCG
>> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
>> if (folio_test_swapcache(folio))
>> return;
>>
>> - if (!list_empty(&folio->_deferred_list))
>> - return;
>> -
>> spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>> + if (partially_mapped) {
>> + if (!folio_test_partially_mapped(folio)) {
>> + __folio_set_partially_mapped(folio);
>> + if (folio_test_pmd_mappable(folio))
>> + count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> + }
>> + } else {
>> + /* partially mapped folios cannot become non-partially mapped */
>> + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
>> + }
>> if (list_empty(&folio->_deferred_list)) {
>> - if (folio_test_pmd_mappable(folio))
>> - count_vm_event(THP_DEFERRED_SPLIT_PAGE);
>> - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
>> list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
>> ds_queue->split_queue_len++;
>> #ifdef CONFIG_MEMCG
>> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>> } else {
>> /* We lost race with folio_put() */
>> list_del_init(&folio->_deferred_list);
>> + __folio_clear_partially_mapped(folio);
>
> as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
>
>> ds_queue->split_queue_len--;
>> }
>> if (!--sc->nr_to_scan)
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 52f7fc4e8ac3..27cbb5365841 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
>> atomic_set(&folio->_entire_mapcount, -1);
>> atomic_set(&folio->_nr_pages_mapped, 0);
>> atomic_set(&folio->_pincount, 0);
>> - if (order > 1)
>> + if (order > 1) {
>> INIT_LIST_HEAD(&folio->_deferred_list);
>> + __folio_clear_partially_mapped(folio);
>
> if partially_mapped is true for a new folio, does it mean we already have
> a bug somewhere?
>
> How is it possible for a new folio to be partially mapped?
>
It's not; I did it because I wanted to make it explicit that the folio is being initialized, similar to how INIT_LIST_HEAD(&folio->_deferred_list) is already done here.
There is no functional issue in removing it, because I believe the flag is initialized to false from the start.
>> + }
>> }
>>
>> static inline void prep_compound_tail(struct page *head, int tail_idx)
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e1ffd2950393..0fd95daecf9a 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
>> VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>> VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
>> !folio_test_hugetlb(folio) &&
>> - !list_empty(&folio->_deferred_list), folio);
>> + !list_empty(&folio->_deferred_list) &&
>> + folio_test_partially_mapped(folio), folio);
>>
>> /*
>> * Nobody should be changing or seriously looking at
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 2d2e65d69427..ef4a732f22b1 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
>> * use _deferred_list.
>> */
>> if (nr_pages > 2 &&
>> - !list_empty(&folio->_deferred_list)) {
>> + !list_empty(&folio->_deferred_list) &&
>> + folio_test_partially_mapped(folio)) {
>> if (!try_split_folio(folio, split_folios, mode)) {
>> nr_failed++;
>> stats->nr_thp_failed += is_thp;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 408ef3d25cf5..a145c550dd2a 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
>> break;
>> case 2:
>> /* the second tail page: deferred_list overlaps ->mapping */
>> - if (unlikely(!list_empty(&folio->_deferred_list))) {
>> - bad_page(page, "on deferred list");
>> + if (unlikely(!list_empty(&folio->_deferred_list) &&
>> + folio_test_partially_mapped(folio))) {
>> + bad_page(page, "partially mapped folio on deferred list");
>> goto out;
>> }
>> break;
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index a6b9cd0b2b18..4c330635aa4e 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>> * Check partially_mapped first to ensure it is a large folio.
>> */
>> if (partially_mapped && folio_test_anon(folio) &&
>> - list_empty(&folio->_deferred_list))
>> - deferred_split_folio(folio);
>> + !folio_test_partially_mapped(folio))
>> + deferred_split_folio(folio, true);
>> +
>> __folio_mod_stat(folio, -nr, -nr_pmdmapped);
>>
>> /*
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 25e43bb3b574..25f4e8403f41 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>> * Split partially mapped folios right away.
>> * We can free the unmapped pages without IO.
>> */
>> - if (data_race(!list_empty(&folio->_deferred_list)) &&
>> + if (data_race(!list_empty(&folio->_deferred_list) &&
>> + folio_test_partially_mapped(folio)) &&
>> split_folio_to_list(folio, folio_list))
>> goto activate_locked;
>> }
>> --
>> 2.43.5
>>
>
> Thanks
> Barry
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 14:17 ` Usama Arif
@ 2024-08-19 19:00 ` Barry Song
2024-08-19 20:16 ` Usama Arif
0 siblings, 1 reply; 21+ messages in thread
From: Barry Song @ 2024-08-19 19:00 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 09:29, Barry Song wrote:
> > Hi Usama,
> >
> > I feel it is much better now! thanks!
> >
> > On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >> Currently folio->_deferred_list is used to keep track of
> >> partially_mapped folios that are going to be split under memory
> >> pressure. In the next patch, all THPs that are faulted in and collapsed
> >> by khugepaged are also going to be tracked using _deferred_list.
> >>
> >> This patch introduces a pageflag to be able to distinguish between
> >> partially mapped folios and others in the deferred_list at split time in
> >> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >> possible to distinguish between partially mapped folios and others in
> >> deferred_split_scan.
> >>
> >> Eventhough it introduces an extra flag to track if the folio is
> >> partially mapped, there is no functional change intended with this
> >> patch and the flag is not useful in this patch itself, it will
> >> become useful in the next patch when _deferred_list has non partially
> >> mapped folios.
> >>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >> include/linux/huge_mm.h | 4 ++--
> >> include/linux/page-flags.h | 11 +++++++++++
> >> mm/huge_memory.c | 23 ++++++++++++++++-------
> >> mm/internal.h | 4 +++-
> >> mm/memcontrol.c | 3 ++-
> >> mm/migrate.c | 3 ++-
> >> mm/page_alloc.c | 5 +++--
> >> mm/rmap.c | 5 +++--
> >> mm/vmscan.c | 3 ++-
> >> 9 files changed, 44 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >> index 4c32058cacfe..969f11f360d2 100644
> >> --- a/include/linux/huge_mm.h
> >> +++ b/include/linux/huge_mm.h
> >> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >> {
> >> return split_huge_page_to_list_to_order(page, NULL, 0);
> >> }
> >> -void deferred_split_folio(struct folio *folio);
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>
> >> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >> unsigned long address, bool freeze, struct folio *folio);
> >> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >> {
> >> return 0;
> >> }
> >> -static inline void deferred_split_folio(struct folio *folio) {}
> >> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >> #define split_huge_pmd(__vma, __pmd, __address) \
> >> do { } while (0)
> >>
> >> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >> index a0a29bd092f8..c3bb0e0da581 100644
> >> --- a/include/linux/page-flags.h
> >> +++ b/include/linux/page-flags.h
> >> @@ -182,6 +182,7 @@ enum pageflags {
> >> /* At least one page in this folio has the hwpoison flag set */
> >> PG_has_hwpoisoned = PG_active,
> >> PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >> };
> >>
> >> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> >> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >> ClearPageHead(page);
> >> }
> >> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >> +/*
> >> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >> + * so its safe to use non-atomic set/clear.
> >> + */
> >> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >> #else
> >> FOLIO_FLAG_FALSE(large_rmappable)
> >> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >> #endif
> >>
> >> #define PG_head_mask ((1UL << PG_head))
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 2d77b5d2291e..70ee49dfeaad 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >> * page_deferred_list.
> >> */
> >> list_del_init(&folio->_deferred_list);
> >> + __folio_clear_partially_mapped(folio);
> >> }
> >> spin_unlock(&ds_queue->split_queue_lock);
> >> if (mapping) {
> >> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >> if (!list_empty(&folio->_deferred_list)) {
> >> ds_queue->split_queue_len--;
> >> list_del_init(&folio->_deferred_list);
> >> + __folio_clear_partially_mapped(folio);
> >
> > is it possible to make things clearer by
> >
> > if (folio_clear_partially_mapped)
> > __folio_clear_partially_mapped(folio);
> >
> > While writing without conditions isn't necessarily wrong, adding a condition
> > will improve the readability of the code and enhance the clarity of my mTHP
> > counters series. also help decrease smp cache sync if we can avoid
> > unnecessary writing?
> >
>
> Do you mean if(folio_test_partially_mapped(folio))?
>
> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
In the pmd-order case, the majority of folios are not partially mapped.
Unconditional writes will trigger cache synchronization across all
CPUs (related to the MESI protocol), making them more costly. By
using conditional writes, such as "if(test) write," we can avoid
most unnecessary writes, which is much more efficient. Additionally,
we only need to manage nr_split_deferred when the condition
is met. We are carefully evaluating all scenarios to determine
if modifications to the partially_mapped flag are necessary.
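
To make the suggestion concrete, here is a minimal standalone sketch of the
test-before-write idiom (userspace C, illustrative only; the flag name and
helper here are made up, but the folio helpers in the patch behave
analogously):

	#include <stdio.h>

	#define PG_PARTIALLY_MAPPED (1UL << 0)

	static void clear_partially_mapped(unsigned long *flags)
	{
		/* read-only in the common (not partially mapped) case */
		if (*flags & PG_PARTIALLY_MAPPED)
			/* dirty the cache line only when the bit is actually set */
			*flags &= ~PG_PARTIALLY_MAPPED;
	}

	int main(void)
	{
		unsigned long flags = PG_PARTIALLY_MAPPED;

		clear_partially_mapped(&flags); /* bit set: write happens */
		clear_partially_mapped(&flags); /* bit clear: no write at all */
		printf("flags = %#lx\n", flags);
		return 0;
	}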
>
>
> >> }
> >> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >> }
> >>
> >> -void deferred_split_folio(struct folio *folio)
> >> +/* partially_mapped=false won't clear PG_partially_mapped folio flag */
> >> +void deferred_split_folio(struct folio *folio, bool partially_mapped)
> >> {
> >> struct deferred_split *ds_queue = get_deferred_split_queue(folio);
> >> #ifdef CONFIG_MEMCG
> >> @@ -3486,14 +3489,19 @@ void deferred_split_folio(struct folio *folio)
> >> if (folio_test_swapcache(folio))
> >> return;
> >>
> >> - if (!list_empty(&folio->_deferred_list))
> >> - return;
> >> -
> >> spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> >> + if (partially_mapped) {
> >> + if (!folio_test_partially_mapped(folio)) {
> >> + __folio_set_partially_mapped(folio);
> >> + if (folio_test_pmd_mappable(folio))
> >> + count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> + count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >> + }
> >> + } else {
> >> + /* partially mapped folios cannot become non-partially mapped */
> >> + VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
> >> + }
> >> if (list_empty(&folio->_deferred_list)) {
> >> - if (folio_test_pmd_mappable(folio))
> >> - count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> >> - count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
> >> list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
> >> ds_queue->split_queue_len++;
> >> #ifdef CONFIG_MEMCG
> >> @@ -3542,6 +3550,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >> } else {
> >> /* We lost race with folio_put() */
> >> list_del_init(&folio->_deferred_list);
> >> + __folio_clear_partially_mapped(folio);
> >
> > as above? Do we also need if(test) for split_huge_page_to_list_to_order()?
> >
> >> ds_queue->split_queue_len--;
> >> }
> >> if (!--sc->nr_to_scan)
> >> diff --git a/mm/internal.h b/mm/internal.h
> >> index 52f7fc4e8ac3..27cbb5365841 100644
> >> --- a/mm/internal.h
> >> +++ b/mm/internal.h
> >> @@ -662,8 +662,10 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> >> atomic_set(&folio->_entire_mapcount, -1);
> >> atomic_set(&folio->_nr_pages_mapped, 0);
> >> atomic_set(&folio->_pincount, 0);
> >> - if (order > 1)
> >> + if (order > 1) {
> >> INIT_LIST_HEAD(&folio->_deferred_list);
> >> + __folio_clear_partially_mapped(folio);
> >
> > if partially_mapped is true for a new folio, does it mean we already have
> > a bug somewhere?
> >
> > How is it possible for a new folio to be partially mapped?
> >
>
> Its not, I did it because I wanted to make it explicit that the folio is being initialized, similar to how before this INIT_LIST_HEAD(&folio->_deferred_list) is done here.
>
> There is no functional issue in removing it here, because I believe the flag is initialized to false from start.
> >> + }
> >> }
> >>
> >> static inline void prep_compound_tail(struct page *head, int tail_idx)
> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >> index e1ffd2950393..0fd95daecf9a 100644
> >> --- a/mm/memcontrol.c
> >> +++ b/mm/memcontrol.c
> >> @@ -4669,7 +4669,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
> >> VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >> VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
> >> !folio_test_hugetlb(folio) &&
> >> - !list_empty(&folio->_deferred_list), folio);
> >> + !list_empty(&folio->_deferred_list) &&
> >> + folio_test_partially_mapped(folio), folio);
> >>
> >> /*
> >> * Nobody should be changing or seriously looking at
> >> diff --git a/mm/migrate.c b/mm/migrate.c
> >> index 2d2e65d69427..ef4a732f22b1 100644
> >> --- a/mm/migrate.c
> >> +++ b/mm/migrate.c
> >> @@ -1735,7 +1735,8 @@ static int migrate_pages_batch(struct list_head *from,
> >> * use _deferred_list.
> >> */
> >> if (nr_pages > 2 &&
> >> - !list_empty(&folio->_deferred_list)) {
> >> + !list_empty(&folio->_deferred_list) &&
> >> + folio_test_partially_mapped(folio)) {
> >> if (!try_split_folio(folio, split_folios, mode)) {
> >> nr_failed++;
> >> stats->nr_thp_failed += is_thp;
> >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >> index 408ef3d25cf5..a145c550dd2a 100644
> >> --- a/mm/page_alloc.c
> >> +++ b/mm/page_alloc.c
> >> @@ -957,8 +957,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
> >> break;
> >> case 2:
> >> /* the second tail page: deferred_list overlaps ->mapping */
> >> - if (unlikely(!list_empty(&folio->_deferred_list))) {
> >> - bad_page(page, "on deferred list");
> >> + if (unlikely(!list_empty(&folio->_deferred_list) &&
> >> + folio_test_partially_mapped(folio))) {
> >> + bad_page(page, "partially mapped folio on deferred list");
> >> goto out;
> >> }
> >> break;
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index a6b9cd0b2b18..4c330635aa4e 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -1578,8 +1578,9 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
> >> * Check partially_mapped first to ensure it is a large folio.
> >> */
> >> if (partially_mapped && folio_test_anon(folio) &&
> >> - list_empty(&folio->_deferred_list))
> >> - deferred_split_folio(folio);
> >> + !folio_test_partially_mapped(folio))
> >> + deferred_split_folio(folio, true);
> >> +
> >> __folio_mod_stat(folio, -nr, -nr_pmdmapped);
> >>
> >> /*
> >> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> index 25e43bb3b574..25f4e8403f41 100644
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >> @@ -1233,7 +1233,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> >> * Split partially mapped folios right away.
> >> * We can free the unmapped pages without IO.
> >> */
> >> - if (data_race(!list_empty(&folio->_deferred_list)) &&
> >> + if (data_race(!list_empty(&folio->_deferred_list) &&
> >> + folio_test_partially_mapped(folio)) &&
> >> split_folio_to_list(folio, folio_list))
> >> goto activate_locked;
> >> }
> >> --
> >> 2.43.5
> >>
> >
> > Thanks
> > Barry
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 19:00 ` Barry Song
@ 2024-08-19 20:16 ` Usama Arif
2024-08-19 21:34 ` Barry Song
0 siblings, 1 reply; 21+ messages in thread
From: Usama Arif @ 2024-08-19 20:16 UTC (permalink / raw)
To: Barry Song
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On 19/08/2024 20:00, Barry Song wrote:
> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>
>>
>>
>> On 19/08/2024 09:29, Barry Song wrote:
>>> Hi Usama,
>>>
>>> I feel it is much better now! thanks!
>>>
>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>
>>>> Currently folio->_deferred_list is used to keep track of
>>>> partially_mapped folios that are going to be split under memory
>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>
>>>> This patch introduces a pageflag to be able to distinguish between
>>>> partially mapped folios and others in the deferred_list at split time in
>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>> possible to distinguish between partially mapped folios and others in
>>>> deferred_split_scan.
>>>>
>>>> Eventhough it introduces an extra flag to track if the folio is
>>>> partially mapped, there is no functional change intended with this
>>>> patch and the flag is not useful in this patch itself, it will
>>>> become useful in the next patch when _deferred_list has non partially
>>>> mapped folios.
>>>>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>> include/linux/huge_mm.h | 4 ++--
>>>> include/linux/page-flags.h | 11 +++++++++++
>>>> mm/huge_memory.c | 23 ++++++++++++++++-------
>>>> mm/internal.h | 4 +++-
>>>> mm/memcontrol.c | 3 ++-
>>>> mm/migrate.c | 3 ++-
>>>> mm/page_alloc.c | 5 +++--
>>>> mm/rmap.c | 5 +++--
>>>> mm/vmscan.c | 3 ++-
>>>> 9 files changed, 44 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 4c32058cacfe..969f11f360d2 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>> {
>>>> return split_huge_page_to_list_to_order(page, NULL, 0);
>>>> }
>>>> -void deferred_split_folio(struct folio *folio);
>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>
>>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>> unsigned long address, bool freeze, struct folio *folio);
>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>> {
>>>> return 0;
>>>> }
>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>> #define split_huge_pmd(__vma, __pmd, __address) \
>>>> do { } while (0)
>>>>
>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>> index a0a29bd092f8..c3bb0e0da581 100644
>>>> --- a/include/linux/page-flags.h
>>>> +++ b/include/linux/page-flags.h
>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>> /* At least one page in this folio has the hwpoison flag set */
>>>> PG_has_hwpoisoned = PG_active,
>>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>>>> };
>>>>
>>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>>>> ClearPageHead(page);
>>>> }
>>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>> +/*
>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>>>> + * so its safe to use non-atomic set/clear.
>>>> + */
>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>> #else
>>>> FOLIO_FLAG_FALSE(large_rmappable)
>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>>>> #endif
>>>>
>>>> #define PG_head_mask ((1UL << PG_head))
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 2d77b5d2291e..70ee49dfeaad 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>> * page_deferred_list.
>>>> */
>>>> list_del_init(&folio->_deferred_list);
>>>> + __folio_clear_partially_mapped(folio);
>>>> }
>>>> spin_unlock(&ds_queue->split_queue_lock);
>>>> if (mapping) {
>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>> if (!list_empty(&folio->_deferred_list)) {
>>>> ds_queue->split_queue_len--;
>>>> list_del_init(&folio->_deferred_list);
>>>> + __folio_clear_partially_mapped(folio);
>>>
>>> is it possible to make things clearer by
>>>
>>> if (folio_clear_partially_mapped)
>>> __folio_clear_partially_mapped(folio);
>>>
>>> While writing without conditions isn't necessarily wrong, adding a condition
>>> will improve the readability of the code and enhance the clarity of my mTHP
>>> counters series. also help decrease smp cache sync if we can avoid
>>> unnecessary writing?
>>>
>>
>> Do you mean if(folio_test_partially_mapped(folio))?
>>
>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
>
> In the pmd-order case, the majority of folios are not partially mapped.
> Unconditional writes will trigger cache synchronization across all
> CPUs (related to the MESI protocol), making them more costly. By
> using conditional writes, such as "if(test) write," we can avoid
> most unnecessary writes, which is much more efficient. Additionally,
> we only need to manage nr_split_deferred when the condition
> is met. We are carefully evaluating all scenarios to determine
> if modifications to the partially_mapped flag are necessary.
>
Hmm okay, as you said it's needed for nr_split_deferred anyway. Is something like the below OK to fold in?
commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
Author: Usama Arif <usamaarif642@gmail.com>
Date: Mon Aug 19 21:07:16 2024 +0100
mm: Introduce a pageflag for partially mapped folios fix
Test partially_mapped flag before clearing it. This should
avoid unnecessary writes and will be needed in the nr_split_deferred
series.
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d67d3b3c1b2..ccde60aaaa0f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
list_del_init(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
+ if (folio_test_partially_mapped(folio))
+ __folio_clear_partially_mapped(folio);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
@@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with folio_put() */
list_del_init(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
+ if (folio_test_partially_mapped(folio))
+ __folio_clear_partially_mapped(folio);
ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
^ permalink raw reply related [flat|nested] 21+ messages in thread* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 20:16 ` Usama Arif
@ 2024-08-19 21:34 ` Barry Song
2024-08-19 21:55 ` Barry Song
0 siblings, 1 reply; 21+ messages in thread
From: Barry Song @ 2024-08-19 21:34 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 20:00, Barry Song wrote:
> > On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>
> >>
> >>
> >> On 19/08/2024 09:29, Barry Song wrote:
> >>> Hi Usama,
> >>>
> >>> I feel it is much better now! thanks!
> >>>
> >>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>
> >>>> Currently folio->_deferred_list is used to keep track of
> >>>> partially_mapped folios that are going to be split under memory
> >>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> >>>> by khugepaged are also going to be tracked using _deferred_list.
> >>>>
> >>>> This patch introduces a pageflag to be able to distinguish between
> >>>> partially mapped folios and others in the deferred_list at split time in
> >>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >>>> possible to distinguish between partially mapped folios and others in
> >>>> deferred_split_scan.
> >>>>
> >>>> Eventhough it introduces an extra flag to track if the folio is
> >>>> partially mapped, there is no functional change intended with this
> >>>> patch and the flag is not useful in this patch itself, it will
> >>>> become useful in the next patch when _deferred_list has non partially
> >>>> mapped folios.
> >>>>
> >>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>> ---
> >>>> include/linux/huge_mm.h | 4 ++--
> >>>> include/linux/page-flags.h | 11 +++++++++++
> >>>> mm/huge_memory.c | 23 ++++++++++++++++-------
> >>>> mm/internal.h | 4 +++-
> >>>> mm/memcontrol.c | 3 ++-
> >>>> mm/migrate.c | 3 ++-
> >>>> mm/page_alloc.c | 5 +++--
> >>>> mm/rmap.c | 5 +++--
> >>>> mm/vmscan.c | 3 ++-
> >>>> 9 files changed, 44 insertions(+), 17 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>> index 4c32058cacfe..969f11f360d2 100644
> >>>> --- a/include/linux/huge_mm.h
> >>>> +++ b/include/linux/huge_mm.h
> >>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>>> {
> >>>> return split_huge_page_to_list_to_order(page, NULL, 0);
> >>>> }
> >>>> -void deferred_split_folio(struct folio *folio);
> >>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>>>
> >>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>>> unsigned long address, bool freeze, struct folio *folio);
> >>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>>> {
> >>>> return 0;
> >>>> }
> >>>> -static inline void deferred_split_folio(struct folio *folio) {}
> >>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>>> #define split_huge_pmd(__vma, __pmd, __address) \
> >>>> do { } while (0)
> >>>>
> >>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >>>> index a0a29bd092f8..c3bb0e0da581 100644
> >>>> --- a/include/linux/page-flags.h
> >>>> +++ b/include/linux/page-flags.h
> >>>> @@ -182,6 +182,7 @@ enum pageflags {
> >>>> /* At least one page in this folio has the hwpoison flag set */
> >>>> PG_has_hwpoisoned = PG_active,
> >>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >>>> };
> >>>>
> >>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> >>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >>>> ClearPageHead(page);
> >>>> }
> >>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>> +/*
> >>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >>>> + * so its safe to use non-atomic set/clear.
> >>>> + */
> >>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>> #else
> >>>> FOLIO_FLAG_FALSE(large_rmappable)
> >>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >>>> #endif
> >>>>
> >>>> #define PG_head_mask ((1UL << PG_head))
> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>> index 2d77b5d2291e..70ee49dfeaad 100644
> >>>> --- a/mm/huge_memory.c
> >>>> +++ b/mm/huge_memory.c
> >>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>> * page_deferred_list.
> >>>> */
> >>>> list_del_init(&folio->_deferred_list);
> >>>> + __folio_clear_partially_mapped(folio);
> >>>> }
> >>>> spin_unlock(&ds_queue->split_queue_lock);
> >>>> if (mapping) {
> >>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>>> if (!list_empty(&folio->_deferred_list)) {
> >>>> ds_queue->split_queue_len--;
> >>>> list_del_init(&folio->_deferred_list);
> >>>> + __folio_clear_partially_mapped(folio);
> >>>
> >>> is it possible to make things clearer by
> >>>
> >>> if (folio_clear_partially_mapped)
> >>> __folio_clear_partially_mapped(folio);
> >>>
> >>> While writing without conditions isn't necessarily wrong, adding a condition
> >>> will improve the readability of the code and enhance the clarity of my mTHP
> >>> counters series. also help decrease smp cache sync if we can avoid
> >>> unnecessary writing?
> >>>
> >>
> >> Do you mean if(folio_test_partially_mapped(folio))?
> >>
> >> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> >
> > In the pmd-order case, the majority of folios are not partially mapped.
> > Unconditional writes will trigger cache synchronization across all
> > CPUs (related to the MESI protocol), making them more costly. By
> > using conditional writes, such as "if(test) write," we can avoid
> > most unnecessary writes, which is much more efficient. Additionally,
> > we only need to manage nr_split_deferred when the condition
> > is met. We are carefully evaluating all scenarios to determine
> > if modifications to the partially_mapped flag are necessary.
> >
>
>
> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
>
> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> Author: Usama Arif <usamaarif642@gmail.com>
> Date: Mon Aug 19 21:07:16 2024 +0100
>
> mm: Introduce a pageflag for partially mapped folios fix
>
> Test partially_mapped flag before clearing it. This should
> avoid unnecessary writes and will be needed in the nr_split_deferred
> series.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5d67d3b3c1b2..ccde60aaaa0f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> if (!list_empty(&folio->_deferred_list)) {
> ds_queue->split_queue_len--;
> list_del_init(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> + if (folio_test_partially_mapped(folio))
> + __folio_clear_partially_mapped(folio);
> }
> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> }
> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> } else {
> /* We lost race with folio_put() */
> list_del_init(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> + if (folio_test_partially_mapped(folio))
> + __folio_clear_partially_mapped(folio);
> ds_queue->split_queue_len--;
> }
> if (!--sc->nr_to_scan)
>
Do we also need if (folio_test_partially_mapped(folio)) in
split_huge_page_to_list_to_order()?
I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
non-partially-mapped folios. To be future-proof, we might want to handle
both cases equally.
By the way, we might not need to clear the flag for a new folio. This differs
from the init_list, which is necessary. If a new folio has the partially_mapped
flag, it indicates that we failed to clear it when freeing the folio to
the buddy system, which is a bug we need to fix in the free path.
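
To make that concrete, a purely illustrative sketch (not from the posted
patches) would be to assert instead of clear in prep_compound_head(), so a
missed clear on the free path becomes visible:

	if (order > 1) {
		INIT_LIST_HEAD(&folio->_deferred_list);
		/* a newly prepared folio must never carry the flag */
		VM_WARN_ON_ONCE_FOLIO(folio_test_partially_mapped(folio), folio);
	}
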
Thanks
Barry
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 21:34 ` Barry Song
@ 2024-08-19 21:55 ` Barry Song
2024-08-20 19:35 ` Usama Arif
0 siblings, 1 reply; 21+ messages in thread
From: Barry Song @ 2024-08-19 21:55 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
>
> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >
> >
> >
> > On 19/08/2024 20:00, Barry Song wrote:
> > > On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>
> > >>
> > >>
> > >> On 19/08/2024 09:29, Barry Song wrote:
> > >>> Hi Usama,
> > >>>
> > >>> I feel it is much better now! thanks!
> > >>>
> > >>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> > >>>>
> > >>>> Currently folio->_deferred_list is used to keep track of
> > >>>> partially_mapped folios that are going to be split under memory
> > >>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> > >>>> by khugepaged are also going to be tracked using _deferred_list.
> > >>>>
> > >>>> This patch introduces a pageflag to be able to distinguish between
> > >>>> partially mapped folios and others in the deferred_list at split time in
> > >>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> > >>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> > >>>> possible to distinguish between partially mapped folios and others in
> > >>>> deferred_split_scan.
> > >>>>
> > >>>> Eventhough it introduces an extra flag to track if the folio is
> > >>>> partially mapped, there is no functional change intended with this
> > >>>> patch and the flag is not useful in this patch itself, it will
> > >>>> become useful in the next patch when _deferred_list has non partially
> > >>>> mapped folios.
> > >>>>
> > >>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >>>> ---
> > >>>> include/linux/huge_mm.h | 4 ++--
> > >>>> include/linux/page-flags.h | 11 +++++++++++
> > >>>> mm/huge_memory.c | 23 ++++++++++++++++-------
> > >>>> mm/internal.h | 4 +++-
> > >>>> mm/memcontrol.c | 3 ++-
> > >>>> mm/migrate.c | 3 ++-
> > >>>> mm/page_alloc.c | 5 +++--
> > >>>> mm/rmap.c | 5 +++--
> > >>>> mm/vmscan.c | 3 ++-
> > >>>> 9 files changed, 44 insertions(+), 17 deletions(-)
> > >>>>
> > >>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > >>>> index 4c32058cacfe..969f11f360d2 100644
> > >>>> --- a/include/linux/huge_mm.h
> > >>>> +++ b/include/linux/huge_mm.h
> > >>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> > >>>> {
> > >>>> return split_huge_page_to_list_to_order(page, NULL, 0);
> > >>>> }
> > >>>> -void deferred_split_folio(struct folio *folio);
> > >>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> > >>>>
> > >>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > >>>> unsigned long address, bool freeze, struct folio *folio);
> > >>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> > >>>> {
> > >>>> return 0;
> > >>>> }
> > >>>> -static inline void deferred_split_folio(struct folio *folio) {}
> > >>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> > >>>> #define split_huge_pmd(__vma, __pmd, __address) \
> > >>>> do { } while (0)
> > >>>>
> > >>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> > >>>> index a0a29bd092f8..c3bb0e0da581 100644
> > >>>> --- a/include/linux/page-flags.h
> > >>>> +++ b/include/linux/page-flags.h
> > >>>> @@ -182,6 +182,7 @@ enum pageflags {
> > >>>> /* At least one page in this folio has the hwpoison flag set */
> > >>>> PG_has_hwpoisoned = PG_active,
> > >>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
> > >>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> > >>>> };
> > >>>>
> > >>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> > >>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> > >>>> ClearPageHead(page);
> > >>>> }
> > >>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> > >>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>> +/*
> > >>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> > >>>> + * so its safe to use non-atomic set/clear.
> > >>>> + */
> > >>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> > >>>> #else
> > >>>> FOLIO_FLAG_FALSE(large_rmappable)
> > >>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> > >>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> > >>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> > >>>> #endif
> > >>>>
> > >>>> #define PG_head_mask ((1UL << PG_head))
> > >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >>>> index 2d77b5d2291e..70ee49dfeaad 100644
> > >>>> --- a/mm/huge_memory.c
> > >>>> +++ b/mm/huge_memory.c
> > >>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> > >>>> * page_deferred_list.
> > >>>> */
> > >>>> list_del_init(&folio->_deferred_list);
> > >>>> + __folio_clear_partially_mapped(folio);
> > >>>> }
> > >>>> spin_unlock(&ds_queue->split_queue_lock);
> > >>>> if (mapping) {
> > >>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> > >>>> if (!list_empty(&folio->_deferred_list)) {
> > >>>> ds_queue->split_queue_len--;
> > >>>> list_del_init(&folio->_deferred_list);
> > >>>> + __folio_clear_partially_mapped(folio);
> > >>>
> > >>> is it possible to make things clearer by
> > >>>
> > >>> if (folio_clear_partially_mapped)
> > >>> __folio_clear_partially_mapped(folio);
> > >>>
> > >>> While writing without conditions isn't necessarily wrong, adding a condition
> > >>> will improve the readability of the code and enhance the clarity of my mTHP
> > >>> counters series. also help decrease smp cache sync if we can avoid
> > >>> unnecessary writing?
> > >>>
> > >>
> > >> Do you mean if(folio_test_partially_mapped(folio))?
> > >>
> > >> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> > >
> > > In the pmd-order case, the majority of folios are not partially mapped.
> > > Unconditional writes will trigger cache synchronization across all
> > > CPUs (related to the MESI protocol), making them more costly. By
> > > using conditional writes, such as "if(test) write," we can avoid
> > > most unnecessary writes, which is much more efficient. Additionally,
> > > we only need to manage nr_split_deferred when the condition
> > > is met. We are carefully evaluating all scenarios to determine
> > > if modifications to the partially_mapped flag are necessary.
> > >
> >
> >
> > Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
> >
> > commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> > Author: Usama Arif <usamaarif642@gmail.com>
> > Date: Mon Aug 19 21:07:16 2024 +0100
> >
> > mm: Introduce a pageflag for partially mapped folios fix
> >
> > Test partially_mapped flag before clearing it. This should
> > avoid unnecessary writes and will be needed in the nr_split_deferred
> > series.
> >
> > Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 5d67d3b3c1b2..ccde60aaaa0f 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> > if (!list_empty(&folio->_deferred_list)) {
> > ds_queue->split_queue_len--;
> > list_del_init(&folio->_deferred_list);
> > - __folio_clear_partially_mapped(folio);
> > + if (folio_test_partially_mapped(folio))
> > + __folio_clear_partially_mapped(folio);
> > }
> > spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> > }
> > @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> > } else {
> > /* We lost race with folio_put() */
> > list_del_init(&folio->_deferred_list);
> > - __folio_clear_partially_mapped(folio);
> > + if (folio_test_partially_mapped(folio))
> > + __folio_clear_partially_mapped(folio);
> > ds_queue->split_queue_len--;
> > }
> > if (!--sc->nr_to_scan)
> >
>
> Do we also need if (folio_test_partially_mapped(folio)) in
> split_huge_page_to_list_to_order()?
>
> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
> non-partially-mapped folios. To be future-proof, we might want to handle
> both cases equally.
I recall we also have a real case that can split an entirely_mapped
folio:
mm: huge_memory: enable debugfs to split huge pages to any order
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4
>
> By the way, we might not need to clear the flag for a new folio. This differs
> from the init_list, which is necessary. If a new folio has the partially_mapped
> flag, it indicates that we failed to clear it when freeing the folio to
> the buddy system, which is a bug we need to fix in the free path.
>
> Thanks
> Barry
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-19 21:55 ` Barry Song
@ 2024-08-20 19:35 ` Usama Arif
2024-08-20 21:30 ` Barry Song
0 siblings, 1 reply; 21+ messages in thread
From: Usama Arif @ 2024-08-20 19:35 UTC (permalink / raw)
To: Barry Song, akpm
Cc: linux-mm, hannes, riel, shakeel.butt, roman.gushchin, yuzhao,
david, ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn,
corbet, linux-kernel, linux-doc, kernel-team
On 19/08/2024 22:55, Barry Song wrote:
> On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
>>
>> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>
>>>
>>>
>>> On 19/08/2024 20:00, Barry Song wrote:
>>>> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 19/08/2024 09:29, Barry Song wrote:
>>>>>> Hi Usama,
>>>>>>
>>>>>> I feel it is much better now! thanks!
>>>>>>
>>>>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
>>>>>>>
>>>>>>> Currently folio->_deferred_list is used to keep track of
>>>>>>> partially_mapped folios that are going to be split under memory
>>>>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
>>>>>>> by khugepaged are also going to be tracked using _deferred_list.
>>>>>>>
>>>>>>> This patch introduces a pageflag to be able to distinguish between
>>>>>>> partially mapped folios and others in the deferred_list at split time in
>>>>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
>>>>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
>>>>>>> possible to distinguish between partially mapped folios and others in
>>>>>>> deferred_split_scan.
>>>>>>>
>>>>>>> Eventhough it introduces an extra flag to track if the folio is
>>>>>>> partially mapped, there is no functional change intended with this
>>>>>>> patch and the flag is not useful in this patch itself, it will
>>>>>>> become useful in the next patch when _deferred_list has non partially
>>>>>>> mapped folios.
>>>>>>>
>>>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>>>>> ---
>>>>>>> include/linux/huge_mm.h | 4 ++--
>>>>>>> include/linux/page-flags.h | 11 +++++++++++
>>>>>>> mm/huge_memory.c | 23 ++++++++++++++++-------
>>>>>>> mm/internal.h | 4 +++-
>>>>>>> mm/memcontrol.c | 3 ++-
>>>>>>> mm/migrate.c | 3 ++-
>>>>>>> mm/page_alloc.c | 5 +++--
>>>>>>> mm/rmap.c | 5 +++--
>>>>>>> mm/vmscan.c | 3 ++-
>>>>>>> 9 files changed, 44 insertions(+), 17 deletions(-)
>>>>>>>
>>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>>>> index 4c32058cacfe..969f11f360d2 100644
>>>>>>> --- a/include/linux/huge_mm.h
>>>>>>> +++ b/include/linux/huge_mm.h
>>>>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
>>>>>>> {
>>>>>>> return split_huge_page_to_list_to_order(page, NULL, 0);
>>>>>>> }
>>>>>>> -void deferred_split_folio(struct folio *folio);
>>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
>>>>>>>
>>>>>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>>>>>>> unsigned long address, bool freeze, struct folio *folio);
>>>>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
>>>>>>> {
>>>>>>> return 0;
>>>>>>> }
>>>>>>> -static inline void deferred_split_folio(struct folio *folio) {}
>>>>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
>>>>>>> #define split_huge_pmd(__vma, __pmd, __address) \
>>>>>>> do { } while (0)
>>>>>>>
>>>>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
>>>>>>> index a0a29bd092f8..c3bb0e0da581 100644
>>>>>>> --- a/include/linux/page-flags.h
>>>>>>> +++ b/include/linux/page-flags.h
>>>>>>> @@ -182,6 +182,7 @@ enum pageflags {
>>>>>>> /* At least one page in this folio has the hwpoison flag set */
>>>>>>> PG_has_hwpoisoned = PG_active,
>>>>>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
>>>>>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
>>>>>>> };
>>>>>>>
>>>>>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
>>>>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
>>>>>>> ClearPageHead(page);
>>>>>>> }
>>>>>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
>>>>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>> +/*
>>>>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
>>>>>>> + * so its safe to use non-atomic set/clear.
>>>>>>> + */
>>>>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
>>>>>>> #else
>>>>>>> FOLIO_FLAG_FALSE(large_rmappable)
>>>>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
>>>>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
>>>>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
>>>>>>> #endif
>>>>>>>
>>>>>>> #define PG_head_mask ((1UL << PG_head))
>>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>>>> index 2d77b5d2291e..70ee49dfeaad 100644
>>>>>>> --- a/mm/huge_memory.c
>>>>>>> +++ b/mm/huge_memory.c
>>>>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>>>>> * page_deferred_list.
>>>>>>> */
>>>>>>> list_del_init(&folio->_deferred_list);
>>>>>>> + __folio_clear_partially_mapped(folio);
>>>>>>> }
>>>>>>> spin_unlock(&ds_queue->split_queue_lock);
>>>>>>> if (mapping) {
>>>>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>>>>>> if (!list_empty(&folio->_deferred_list)) {
>>>>>>> ds_queue->split_queue_len--;
>>>>>>> list_del_init(&folio->_deferred_list);
>>>>>>> + __folio_clear_partially_mapped(folio);
>>>>>>
>>>>>> is it possible to make things clearer by
>>>>>>
>>>>>> if (folio_clear_partially_mapped)
>>>>>> __folio_clear_partially_mapped(folio);
>>>>>>
>>>>>> While writing without conditions isn't necessarily wrong, adding a condition
>>>>>> will improve the readability of the code and enhance the clarity of my mTHP
>>>>>> counters series. also help decrease smp cache sync if we can avoid
>>>>>> unnecessary writing?
>>>>>>
>>>>>
>>>>> Do you mean if(folio_test_partially_mapped(folio))?
>>>>>
>>>>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
>>>>
>>>> In the pmd-order case, the majority of folios are not partially mapped.
>>>> Unconditional writes will trigger cache synchronization across all
>>>> CPUs (related to the MESI protocol), making them more costly. By
>>>> using conditional writes, such as "if(test) write," we can avoid
>>>> most unnecessary writes, which is much more efficient. Additionally,
>>>> we only need to manage nr_split_deferred when the condition
>>>> is met. We are carefully evaluating all scenarios to determine
>>>> if modifications to the partially_mapped flag are necessary.
>>>>
>>>
>>>
>>> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
>>>
>>> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
>>> Author: Usama Arif <usamaarif642@gmail.com>
>>> Date: Mon Aug 19 21:07:16 2024 +0100
>>>
>>> mm: Introduce a pageflag for partially mapped folios fix
>>>
>>> Test partially_mapped flag before clearing it. This should
>>> avoid unnecessary writes and will be needed in the nr_split_deferred
>>> series.
>>>
>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 5d67d3b3c1b2..ccde60aaaa0f 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
>>> if (!list_empty(&folio->_deferred_list)) {
>>> ds_queue->split_queue_len--;
>>> list_del_init(&folio->_deferred_list);
>>> - __folio_clear_partially_mapped(folio);
>>> + if (folio_test_partially_mapped(folio))
>>> + __folio_clear_partially_mapped(folio);
>>> }
>>> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>>> }
>>> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
>>> } else {
>>> /* We lost race with folio_put() */
>>> list_del_init(&folio->_deferred_list);
>>> - __folio_clear_partially_mapped(folio);
>>> + if (folio_test_partially_mapped(folio))
>>> + __folio_clear_partially_mapped(folio);
>>> ds_queue->split_queue_len--;
>>> }
>>> if (!--sc->nr_to_scan)
>>>
>>
>> Do we also need if (folio_test_partially_mapped(folio)) in
>> split_huge_page_to_list_to_order()?
>>
>> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
>> non-partially-mapped folios. To be future-proof, we might want to handle
>> both cases equally.
>
> we recall we also have a real case which can split entirely_mapped
> folio:
>
> mm: huge_memory: enable debugfs to split huge pages to any order
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4
>
>>
>> By the way, we might not need to clear the flag for a new folio. This differs
>> from the init_list, which is necessary. If a new folio has the partially_mapped
>> flag, it indicates that we failed to clear it when freeing the folio to
>> the buddy system, which is a bug we need to fix in the free path.
>>
>> Thanks
>> Barry
I believe the below fixlet should address all concerns:
From 95492a51b1929ea274b4e5b78fc74e7736645d58 Mon Sep 17 00:00:00 2001
From: Usama Arif <usamaarif642@gmail.com>
Date: Mon, 19 Aug 2024 21:07:16 +0100
Subject: [PATCH] mm: Introduce a pageflag for partially mapped folios fix
Test partially_mapped flag before clearing it. This should
avoid unnecessary writes and will be needed in the nr_split_deferred
series.
Also, there is no need to clear partially_mapped when prepping the compound
head, as it should already start cleared.
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
include/linux/page-flags.h | 2 +-
mm/huge_memory.c | 9 ++++++---
mm/internal.h | 4 +---
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c3bb0e0da581..f1602695daf2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1182,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
#define PAGE_FLAGS_SECOND \
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
- 1UL << PG_large_rmappable)
+ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5d67d3b3c1b2..402b9d933de0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3422,7 +3422,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
* page_deferred_list.
*/
list_del_init(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
+ if (folio_test_partially_mapped(folio))
+ __folio_clear_partially_mapped(folio);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
@@ -3479,7 +3480,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
list_del_init(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
+ if (folio_test_partially_mapped(folio))
+ __folio_clear_partially_mapped(folio);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
@@ -3610,7 +3612,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
} else {
/* We lost race with folio_put() */
list_del_init(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
+ if (folio_test_partially_mapped(folio))
+ __folio_clear_partially_mapped(folio);
ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
diff --git a/mm/internal.h b/mm/internal.h
index 27cbb5365841..52f7fc4e8ac3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -662,10 +662,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
- if (order > 1) {
+ if (order > 1)
INIT_LIST_HEAD(&folio->_deferred_list);
- __folio_clear_partially_mapped(folio);
- }
}
static inline void prep_compound_tail(struct page *head, int tail_idx)
--
2.43.5
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-20 19:35 ` Usama Arif
@ 2024-08-20 21:30 ` Barry Song
2024-08-21 19:04 ` Usama Arif
0 siblings, 1 reply; 21+ messages in thread
From: Barry Song @ 2024-08-20 21:30 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Wed, Aug 21, 2024 at 7:35 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 19/08/2024 22:55, Barry Song wrote:
> > On Tue, Aug 20, 2024 at 9:34 AM Barry Song <baohua@kernel.org> wrote:
> >>
> >> On Tue, Aug 20, 2024 at 8:16 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>
> >>>
> >>>
> >>> On 19/08/2024 20:00, Barry Song wrote:
> >>>> On Tue, Aug 20, 2024 at 2:17 AM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 19/08/2024 09:29, Barry Song wrote:
> >>>>>> Hi Usama,
> >>>>>>
> >>>>>> I feel it is much better now! thanks!
> >>>>>>
> >>>>>> On Mon, Aug 19, 2024 at 2:31 PM Usama Arif <usamaarif642@gmail.com> wrote:
> >>>>>>>
> >>>>>>> Currently folio->_deferred_list is used to keep track of
> >>>>>>> partially_mapped folios that are going to be split under memory
> >>>>>>> pressure. In the next patch, all THPs that are faulted in and collapsed
> >>>>>>> by khugepaged are also going to be tracked using _deferred_list.
> >>>>>>>
> >>>>>>> This patch introduces a pageflag to be able to distinguish between
> >>>>>>> partially mapped folios and others in the deferred_list at split time in
> >>>>>>> deferred_split_scan. Its needed as __folio_remove_rmap decrements
> >>>>>>> _mapcount, _large_mapcount and _entire_mapcount, hence it won't be
> >>>>>>> possible to distinguish between partially mapped folios and others in
> >>>>>>> deferred_split_scan.
> >>>>>>>
> >>>>>>> Eventhough it introduces an extra flag to track if the folio is
> >>>>>>> partially mapped, there is no functional change intended with this
> >>>>>>> patch and the flag is not useful in this patch itself, it will
> >>>>>>> become useful in the next patch when _deferred_list has non partially
> >>>>>>> mapped folios.
> >>>>>>>
> >>>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>>>>> ---
> >>>>>>> include/linux/huge_mm.h | 4 ++--
> >>>>>>> include/linux/page-flags.h | 11 +++++++++++
> >>>>>>> mm/huge_memory.c | 23 ++++++++++++++++-------
> >>>>>>> mm/internal.h | 4 +++-
> >>>>>>> mm/memcontrol.c | 3 ++-
> >>>>>>> mm/migrate.c | 3 ++-
> >>>>>>> mm/page_alloc.c | 5 +++--
> >>>>>>> mm/rmap.c | 5 +++--
> >>>>>>> mm/vmscan.c | 3 ++-
> >>>>>>> 9 files changed, 44 insertions(+), 17 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> >>>>>>> index 4c32058cacfe..969f11f360d2 100644
> >>>>>>> --- a/include/linux/huge_mm.h
> >>>>>>> +++ b/include/linux/huge_mm.h
> >>>>>>> @@ -321,7 +321,7 @@ static inline int split_huge_page(struct page *page)
> >>>>>>> {
> >>>>>>> return split_huge_page_to_list_to_order(page, NULL, 0);
> >>>>>>> }
> >>>>>>> -void deferred_split_folio(struct folio *folio);
> >>>>>>> +void deferred_split_folio(struct folio *folio, bool partially_mapped);
> >>>>>>>
> >>>>>>> void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> >>>>>>> unsigned long address, bool freeze, struct folio *folio);
> >>>>>>> @@ -495,7 +495,7 @@ static inline int split_huge_page(struct page *page)
> >>>>>>> {
> >>>>>>> return 0;
> >>>>>>> }
> >>>>>>> -static inline void deferred_split_folio(struct folio *folio) {}
> >>>>>>> +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
> >>>>>>> #define split_huge_pmd(__vma, __pmd, __address) \
> >>>>>>> do { } while (0)
> >>>>>>>
> >>>>>>> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> >>>>>>> index a0a29bd092f8..c3bb0e0da581 100644
> >>>>>>> --- a/include/linux/page-flags.h
> >>>>>>> +++ b/include/linux/page-flags.h
> >>>>>>> @@ -182,6 +182,7 @@ enum pageflags {
> >>>>>>> /* At least one page in this folio has the hwpoison flag set */
> >>>>>>> PG_has_hwpoisoned = PG_active,
> >>>>>>> PG_large_rmappable = PG_workingset, /* anon or file-backed */
> >>>>>>> + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
> >>>>>>> };
> >>>>>>>
> >>>>>>> #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1)
> >>>>>>> @@ -861,8 +862,18 @@ static inline void ClearPageCompound(struct page *page)
> >>>>>>> ClearPageHead(page);
> >>>>>>> }
> >>>>>>> FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
> >>>>>>> +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>> +/*
> >>>>>>> + * PG_partially_mapped is protected by deferred_split split_queue_lock,
> >>>>>>> + * so its safe to use non-atomic set/clear.
> >>>>>>> + */
> >>>>>>> +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>> +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
> >>>>>>> #else
> >>>>>>> FOLIO_FLAG_FALSE(large_rmappable)
> >>>>>>> +FOLIO_TEST_FLAG_FALSE(partially_mapped)
> >>>>>>> +__FOLIO_SET_FLAG_NOOP(partially_mapped)
> >>>>>>> +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped)
> >>>>>>> #endif
> >>>>>>>
> >>>>>>> #define PG_head_mask ((1UL << PG_head))
> >>>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>>>>>> index 2d77b5d2291e..70ee49dfeaad 100644
> >>>>>>> --- a/mm/huge_memory.c
> >>>>>>> +++ b/mm/huge_memory.c
> >>>>>>> @@ -3398,6 +3398,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> >>>>>>> * page_deferred_list.
> >>>>>>> */
> >>>>>>> list_del_init(&folio->_deferred_list);
> >>>>>>> + __folio_clear_partially_mapped(folio);
> >>>>>>> }
> >>>>>>> spin_unlock(&ds_queue->split_queue_lock);
> >>>>>>> if (mapping) {
> >>>>>>> @@ -3454,11 +3455,13 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>>>>>> if (!list_empty(&folio->_deferred_list)) {
> >>>>>>> ds_queue->split_queue_len--;
> >>>>>>> list_del_init(&folio->_deferred_list);
> >>>>>>> + __folio_clear_partially_mapped(folio);
> >>>>>>
> >>>>>> is it possible to make things clearer by
> >>>>>>
> >>>>>> if (folio_clear_partially_mapped)
> >>>>>> __folio_clear_partially_mapped(folio);
> >>>>>>
> >>>>>> While writing without conditions isn't necessarily wrong, adding a condition
> >>>>>> will improve the readability of the code and enhance the clarity of my mTHP
> >>>>>> counters series. also help decrease smp cache sync if we can avoid
> >>>>>> unnecessary writing?
> >>>>>>
> >>>>>
> >>>>> Do you mean if(folio_test_partially_mapped(folio))?
> >>>>>
> >>>>> I don't like this idea. I think it makes the readability worse? If I was looking at if (test) -> clear for the first time, I would become confused why its being tested if its going to be clear at the end anyways?
> >>>>
> >>>> In the pmd-order case, the majority of folios are not partially mapped.
> >>>> Unconditional writes will trigger cache synchronization across all
> >>>> CPUs (related to the MESI protocol), making them more costly. By
> >>>> using conditional writes, such as "if(test) write," we can avoid
> >>>> most unnecessary writes, which is much more efficient. Additionally,
> >>>> we only need to manage nr_split_deferred when the condition
> >>>> is met. We are carefully evaluating all scenarios to determine
> >>>> if modifications to the partially_mapped flag are necessary.
> >>>>
> >>>
> >>>
> >>> Hmm okay, as you said its needed for nr_split_deferred anyways. Something like below is ok to fold in?
> >>>
> >>> commit 4ae9e2067346effd902b342296987b97dee29018 (HEAD)
> >>> Author: Usama Arif <usamaarif642@gmail.com>
> >>> Date: Mon Aug 19 21:07:16 2024 +0100
> >>>
> >>> mm: Introduce a pageflag for partially mapped folios fix
> >>>
> >>> Test partially_mapped flag before clearing it. This should
> >>> avoid unnecessary writes and will be needed in the nr_split_deferred
> >>> series.
> >>>
> >>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >>>
> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >>> index 5d67d3b3c1b2..ccde60aaaa0f 100644
> >>> --- a/mm/huge_memory.c
> >>> +++ b/mm/huge_memory.c
> >>> @@ -3479,7 +3479,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> >>> if (!list_empty(&folio->_deferred_list)) {
> >>> ds_queue->split_queue_len--;
> >>> list_del_init(&folio->_deferred_list);
> >>> - __folio_clear_partially_mapped(folio);
> >>> + if (folio_test_partially_mapped(folio))
> >>> + __folio_clear_partially_mapped(folio);
> >>> }
> >>> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> >>> }
> >>> @@ -3610,7 +3611,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> >>> } else {
> >>> /* We lost race with folio_put() */
> >>> list_del_init(&folio->_deferred_list);
> >>> - __folio_clear_partially_mapped(folio);
> >>> + if (folio_test_partially_mapped(folio))
> >>> + __folio_clear_partially_mapped(folio);
> >>> ds_queue->split_queue_len--;
> >>> }
> >>> if (!--sc->nr_to_scan)
> >>>
> >>
> >> Do we also need if (folio_test_partially_mapped(folio)) in
> >> split_huge_page_to_list_to_order()?
> >>
> >> I recall that in Yu Zhao's TAO, there’s a chance of splitting (shattering)
> >> non-partially-mapped folios. To be future-proof, we might want to handle
> >> both cases equally.
> >
> > we recall we also have a real case which can split entirely_mapped
> > folio:
> >
> > mm: huge_memory: enable debugfs to split huge pages to any order
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fc4d182316bd5309b4066fd9ef21529ea397a7d4
> >
> >>
> >> By the way, we might not need to clear the flag for a new folio. This differs
> >> from the init_list, which is necessary. If a new folio has the partially_mapped
> >> flag, it indicates that we failed to clear it when freeing the folio to
> >> the buddy system, which is a bug we need to fix in the free path.
> >>
> >> Thanks
> >> Barry
>
> I believe the below fixlet should address all concerns:
Hi Usama,
thanks! I can't judge if we need this partially_mapped flag. but if we
need, the code
looks correct to me. I'd like to leave this to David and other experts to ack.
an alternative approach might be two lists? one for entirely_mapped,
the other one
for split_deferred. also seems ugly ?
On the other hand, when we want to extend your patchset to mTHP other than PMD-
order, will the only deferred_list create huge lock contention while
adding or removing
folios from it?
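The contention in question is on ds_queue->split_queue_lock: every add or remove of a folio serializes on it, roughly as in the simplified add-path sketch below (condensed from the code in this series, error handling and counters elided).

	/* simplified: one spinlock per queue guards both the list and its length */
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	if (list_empty(&folio->_deferred_list)) {
		list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
		ds_queue->split_queue_len++;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);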
>
>
> From 95492a51b1929ea274b4e5b78fc74e7736645d58 Mon Sep 17 00:00:00 2001
> From: Usama Arif <usamaarif642@gmail.com>
> Date: Mon, 19 Aug 2024 21:07:16 +0100
> Subject: [PATCH] mm: Introduce a pageflag for partially mapped folios fix
>
> Test partially_mapped flag before clearing it. This should
> avoid unnecessary writes and will be needed in the nr_split_deferred
> series.
> Also no need to clear partially_mapped when prepping the compound head, as it
> should already start cleared.
>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
> include/linux/page-flags.h | 2 +-
> mm/huge_memory.c | 9 ++++++---
> mm/internal.h | 4 +---
> 3 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index c3bb0e0da581..f1602695daf2 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -1182,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
> */
> #define PAGE_FLAGS_SECOND \
> (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
> - 1UL << PG_large_rmappable)
> + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped)
>
> #define PAGE_FLAGS_PRIVATE \
> (1UL << PG_private | 1UL << PG_private_2)
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5d67d3b3c1b2..402b9d933de0 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3422,7 +3422,8 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> * page_deferred_list.
> */
> list_del_init(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> + if (folio_test_partially_mapped(folio))
> + __folio_clear_partially_mapped(folio);
> }
> spin_unlock(&ds_queue->split_queue_lock);
> if (mapping) {
> @@ -3479,7 +3480,8 @@ void __folio_undo_large_rmappable(struct folio *folio)
> if (!list_empty(&folio->_deferred_list)) {
> ds_queue->split_queue_len--;
> list_del_init(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> + if (folio_test_partially_mapped(folio))
> + __folio_clear_partially_mapped(folio);
> }
> spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> }
> @@ -3610,7 +3612,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
> } else {
> /* We lost race with folio_put() */
> list_del_init(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> + if (folio_test_partially_mapped(folio))
> + __folio_clear_partially_mapped(folio);
> ds_queue->split_queue_len--;
> }
> if (!--sc->nr_to_scan)
> diff --git a/mm/internal.h b/mm/internal.h
> index 27cbb5365841..52f7fc4e8ac3 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -662,10 +662,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
> atomic_set(&folio->_entire_mapcount, -1);
> atomic_set(&folio->_nr_pages_mapped, 0);
> atomic_set(&folio->_pincount, 0);
> - if (order > 1) {
> + if (order > 1)
> INIT_LIST_HEAD(&folio->_deferred_list);
> - __folio_clear_partially_mapped(folio);
> - }
> }
>
> static inline void prep_compound_tail(struct page *head, int tail_idx)
> --
> 2.43.5
>
>
Thanks
Barry
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-20 21:30 ` Barry Song
@ 2024-08-21 19:04 ` Usama Arif
2024-08-21 21:25 ` Barry Song
2024-09-01 12:48 ` Bang Li
0 siblings, 2 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-21 19:04 UTC (permalink / raw)
To: Barry Song
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On 20/08/2024 17:30, Barry Song wrote:
> Hi Usama,
> thanks! I can't judge if we need this partially_mapped flag. but if we
> need, the code
> looks correct to me. I'd like to leave this to David and other experts to ack.
>
Thanks for the reviews!
> an alternative approach might be two lists? one for entirely_mapped,
> the other one
> for split_deferred. also seems ugly ?
>
That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
> On the other hand, when we want to extend your patchset to mTHP other than PMD-
> order, will the only deferred_list create huge lock contention while
> adding or removing
> folios from it?
>
Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
Thanks
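On the per-memcg/node point: the queue a folio lands on is chosen roughly as in the sketch below, a paraphrase of get_deferred_split_queue() in mm/huge_memory.c with CONFIG_MEMCG (exact code varies by kernel version), so contention is scoped to a memcg, or to a node for uncharged folios, rather than to one global list.

static struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	/* charged folios use their memcg's queue, others fall back to the node's */
	if (memcg)
		return &memcg->deferred_split_queue;
	return &pgdat->deferred_split_queue;
}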
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-21 19:04 ` Usama Arif
@ 2024-08-21 21:25 ` Barry Song
2024-09-01 12:48 ` Bang Li
1 sibling, 0 replies; 21+ messages in thread
From: Barry Song @ 2024-08-21 21:25 UTC (permalink / raw)
To: Usama Arif
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team
On Thu, Aug 22, 2024 at 3:04 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
>
>
> On 20/08/2024 17:30, Barry Song wrote:
>
> > Hi Usama,
> > thanks! I can't judge if we need this partially_mapped flag. but if we
> > need, the code
> > looks correct to me. I'd like to leave this to David and other experts to ack.
> >
>
> Thanks for the reviews!
>
> > an alternative approach might be two lists? one for entirely_mapped,
> > the other one
> > for split_deferred. also seems ugly ?
> >
>
> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>
Thanks for the explanation; I missed the earlier discussion.
> > On the other hand, when we want to extend your patchset to mTHP other than PMD-
> > order, will the only deferred_list create huge lock contention while
> > adding or removing
> > folios from it?
> >
>
> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>
Right.
> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>
> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>
Agreed. I suspect this will also be useful for larger mTHPs like 1M and 512KB.
While lock contention might be a concern, it shouldn't be a barrier to
proceeding
with your patchset, which only supports PMD-order.
> Thanks
>
Thanks
Barry
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-08-21 19:04 ` Usama Arif
2024-08-21 21:25 ` Barry Song
@ 2024-09-01 12:48 ` Bang Li
2024-09-02 10:09 ` Usama Arif
1 sibling, 1 reply; 21+ messages in thread
From: Bang Li @ 2024-09-01 12:48 UTC (permalink / raw)
To: Usama Arif, Barry Song
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team, libang.li,
Lance Yang
hi, Usama
On 2024/8/22 3:04, Usama Arif wrote:
>
> On 20/08/2024 17:30, Barry Song wrote:
>
>> Hi Usama,
>> thanks! I can't judge if we need this partially_mapped flag. but if we
>> need, the code
>> looks correct to me. I'd like to leave this to David and other experts to ack.
>>
> Thanks for the reviews!
>
>> an alternative approach might be two lists? one for entirely_mapped,
>> the other one
>> for split_deferred. also seems ugly ?
>>
> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>
>> On the other hand, when we want to extend your patchset to mTHP other than PMD-
>> order, will the only deferred_list create huge lock contention while
>> adding or removing
>> folios from it?
>>
> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>
> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>
> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>
> Thanks
Below is the core code snippet to support "split underused mTHP". Can we extend the
khugepaged_max_ptes_none value to mTHP and keep its semantics unchanged? With a small
modification, only folios with a page count greater than khugepaged_max_ptes_none - 1
can be added to the deferred_split list and can be split. What do you think?
diff --git a/mm/memory.c b/mm/memory.c
index b95fce7d190f..ef503958d6a0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4789,6 +4789,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
}
folio_ref_add(folio, nr_pages - 1);
+ if (nr_pages > 1 && nr_pages > khugepaged_max_ptes_none - 1)
+ deferred_split_folio(folio, false);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
shmem THP has the same memory expansion problem when the shmem_enabled configuration is
set to always. In my opinion, it is necessary to support "split underused shmem THP",
but I am not sure if there is any gap in the implementation?
Bang
Thanks
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios
2024-09-01 12:48 ` Bang Li
@ 2024-09-02 10:09 ` Usama Arif
0 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-09-02 10:09 UTC (permalink / raw)
To: Bang Li, Barry Song
Cc: akpm, linux-mm, hannes, riel, shakeel.butt, roman.gushchin,
yuzhao, david, ryan.roberts, rppt, willy, cerasuolodomenico,
ryncsn, corbet, linux-kernel, linux-doc, kernel-team, libang.li,
Lance Yang
On 01/09/2024 08:48, Bang Li wrote:
> hi, Usama
>
> On 2024/8/22 3:04, Usama Arif wrote:
>
>>
>> On 20/08/2024 17:30, Barry Song wrote:
>>
>>> Hi Usama,
>>> thanks! I can't judge if we need this partially_mapped flag. but if we
>>> need, the code
>>> looks correct to me. I'd like to leave this to David and other experts to ack.
>>>
>> Thanks for the reviews!
>>
>>> an alternative approach might be two lists? one for entirely_mapped,
>>> the other one
>>> for split_deferred. also seems ugly ?
>>>
>> That was my very first prototype! I shifted to using a bool which I sent in v1, and then a bit in _flags_1 as David suggested. I believe a bit in _flags_1 is the best way forward, as it leaves the most space in folio for future work.
>>
>>> On the other hand, when we want to extend your patchset to mTHP other than PMD-
>>> order, will the only deferred_list create huge lock contention while
>>> adding or removing
>>> folios from it?
>>>
>> Yes, I would imagine so. the deferred_split_queue is per memcg/node, so that helps.
>>
>> Also, this work is tied to khugepaged. So would need some thought when doing it for mTHP.
>>
>> I would imagine doing underused shrinker for mTHP would be less beneficial compared to doing it for 2M THP. But probably needs experimentation.
>>
>> Thanks
>
> Below is the core code snippet to support "split underused mTHP". Can we extend the
> khugepaged_max_ptes_none value to mTHP and keep its semantics unchanged? With a small
> modification, only folios with a page count greater than khugepaged_max_ptes_none - 1
> can be added to the deferred_split list and can be split. What do you think?
>
Hmm, so I believe it's not as simple as that.
First, mTHP support would need to be added to khugepaged. The entire khugepaged code needs
to be audited for it and significantly tested. If you just look at all the instances of
HPAGE_PMD_NR in khugepaged.c, you will see it would be a significant change that needs to
be a series of its own.
Also, different values of max_ptes_none can have different significance for mTHP sizes.
A max_ptes_none of 200 does not mean anything for 512K and below, while it means 78% of a
1M mTHP and 39% of a 2M THP. We might want a different max_ptes_none for each mTHP size if
we were to do this.
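To make the arithmetic explicit (assuming 4K base pages): a 2M THP has 512 subpages, so max_ptes_none = 200 covers 200/512 ≈ 39% of the folio; a 1M mTHP has 256 subpages, so the same value covers roughly 78%; and a 512K mTHP has only 128 subpages, so a threshold of 200 could never be exceeded at all.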
The benefit of splitting underused mTHPs of lower order might not be worth the work
needed to split them. Someone would need to experiment with different workloads and show
some improvement for these lower orders.
So probably a lot more than the below diff is needed for mTHP support!
> diff --git a/mm/memory.c b/mm/memory.c
> index b95fce7d190f..ef503958d6a0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4789,6 +4789,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
> }
>
> folio_ref_add(folio, nr_pages - 1);
> + if (nr_pages > 1 && nr_pages > khugepaged_max_ptes_none - 1)
> + deferred_split_folio(folio, false);
> add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
> count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
> folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
>
> shmem THP has the same memory expansion problem when the shmem_enabled configuration is
> set to always. In my opinion, it is necessary to support "split underused shmem THP",
> but I am not sure if there is any gap in the implementation?
>
> Bang
> Thanks
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v4 5/6] mm: split underused THPs
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
` (3 preceding siblings ...)
2024-08-19 2:30 ` [PATCH v4 4/6] mm: Introduce a pageflag for partially mapped folios Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
2024-08-19 2:30 ` [PATCH v4 6/6] mm: add sysfs entry to disable splitting " Usama Arif
5 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Usama Arif
This is an attempt to mitigate the issue of running out of memory when
THP is always enabled. During runtime whenever a THP is being faulted in
(__do_huge_pmd_anonymous_page) or collapsed by khugepaged
(collapse_huge_page), the THP is added to _deferred_list. Whenever
memory reclaim happens in Linux, the kernel runs the deferred_split
shrinker which goes through the _deferred_list.
If the folio was partially mapped, the shrinker attempts to split it.
If the folio is not partially mapped, the shrinker checks whether the THP
is underused, i.e. how many of the base 4K pages of the entire THP
are zero-filled. If this number goes above a certain threshold (decided by
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none), the
shrinker will attempt to split that THP. Then at remap time, the pages
that were zero-filled are mapped to the shared zeropage, hence saving
memory.
Suggested-by: Rik van Riel <riel@surriel.com>
Co-authored-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
Documentation/admin-guide/mm/transhuge.rst | 6 +++
include/linux/khugepaged.h | 1 +
include/linux/vm_event_item.h | 1 +
mm/huge_memory.c | 60 +++++++++++++++++++++-
mm/khugepaged.c | 3 +-
mm/vmstat.c | 1 +
6 files changed, 69 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 058485daf186..40741b892aff 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -447,6 +447,12 @@ thp_deferred_split_page
splitting it would free up some memory. Pages on split queue are
going to be split under memory pressure.
+thp_underused_split_page
+ is incremented when a huge page on the split queue was split
+ because it was underused. A THP is underused if the number of
+ zero pages in the THP is above a certain threshold
+ (/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none).
+
thp_split_pmd
is incremented every time a PMD split into table of PTEs.
This can happen, for instance, when application calls mprotect() or
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index f68865e19b0b..30baae91b225 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -4,6 +4,7 @@
#include <linux/sched/coredump.h> /* MMF_VM_HUGEPAGE */
+extern unsigned int khugepaged_max_ptes_none __read_mostly;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern struct attribute_group khugepaged_attr_group;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aae5c7c5cfb4..aed952d04132 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_SPLIT_PAGE,
THP_SPLIT_PAGE_FAILED,
THP_DEFERRED_SPLIT_PAGE,
+ THP_UNDERUSED_SPLIT_PAGE,
THP_SPLIT_PMD,
THP_SCAN_EXCEED_NONE_PTE,
THP_SCAN_EXCEED_SWAP_PTE,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 70ee49dfeaad..f5363cf900f9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1087,6 +1087,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
+ deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -3526,6 +3527,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
return READ_ONCE(ds_queue->split_queue_len);
}
+static bool thp_underused(struct folio *folio)
+{
+ int num_zero_pages = 0, num_filled_pages = 0;
+ void *kaddr;
+ int i;
+
+ if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
+ return false;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
+ if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+ num_zero_pages++;
+ if (num_zero_pages > khugepaged_max_ptes_none) {
+ kunmap_local(kaddr);
+ return true;
+ }
+ } else {
+ /*
+ * Another path for early exit once the number
+ * of non-zero filled pages exceeds threshold.
+ */
+ num_filled_pages++;
+ if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ kunmap_local(kaddr);
+ return false;
+ }
+ }
+ kunmap_local(kaddr);
+ }
+ return false;
+}
+
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
@@ -3559,13 +3593,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ bool did_split = false;
+ bool underused = false;
+
+ if (!folio_test_partially_mapped(folio)) {
+ underused = thp_underused(folio);
+ if (!underused)
+ goto next;
+ }
if (!folio_trylock(folio))
goto next;
- /* split_huge_page() removes page from list on success */
- if (!split_folio(folio))
+ if (!split_folio(folio)) {
+ did_split = true;
+ if (underused)
+ count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
split++;
+ }
folio_unlock(folio);
next:
+ /*
+ * split_folio() removes folio from list on success.
+ * Only add back to the queue if folio is partially mapped.
+ * If thp_underused returns false, or if split_folio fails
+ * in the case it was underused, then consider it used and
+ * don't add it back to split_queue.
+ */
+ if (!did_split && !folio_test_partially_mapped(folio)) {
+ list_del_init(&folio->_deferred_list);
+ ds_queue->split_queue_len--;
+ }
folio_put(folio);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c42062478c1..2e138b22d939 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*
* Note that these are only respected if collapse was initiated by khugepaged.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
@@ -1235,6 +1235,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
+ deferred_split_folio(folio, false);
spin_unlock(pmd_ptl);
folio = NULL;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3a402ea91f0..6060bb7bbb44 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1384,6 +1384,7 @@ const char * const vmstat_text[] = {
"thp_split_page",
"thp_split_page_failed",
"thp_deferred_split_page",
+ "thp_underused_split_page",
"thp_split_pmd",
"thp_scan_exceed_none_pte",
"thp_scan_exceed_swap_pte",
--
2.43.5
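To make thp_underused()'s threshold semantics concrete: with the default max_ptes_none of HPAGE_PMD_NR - 1 (511 with 4K pages) the check is disabled entirely, while with e.g. 256 a 2M THP is considered underused once 257 of its 512 subpages are zero-filled. Below is a userspace restatement of the counting logic for intuition only; SUBPAGE_SZ, NR_SUBPAGES, chunk_is_zero() and region_underused() are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define SUBPAGE_SZ	4096
#define NR_SUBPAGES	512	/* 2M / 4K, i.e. HPAGE_PMD_NR on x86-64 */

/* Stand-in for the kernel's !memchr_inv(kaddr, 0, PAGE_SIZE) check. */
static bool chunk_is_zero(const unsigned char *p)
{
	static const unsigned char zero[SUBPAGE_SZ];

	return memcmp(p, zero, SUBPAGE_SZ) == 0;
}

/* Same shape as thp_underused(): early exit in both directions. */
static bool region_underused(const unsigned char *buf, int max_ptes_none)
{
	int nr_zero = 0, nr_filled = 0;

	if (max_ptes_none == NR_SUBPAGES - 1)	/* default 511: check disabled */
		return false;

	for (int i = 0; i < NR_SUBPAGES; i++) {
		if (chunk_is_zero(buf + (size_t)i * SUBPAGE_SZ)) {
			if (++nr_zero > max_ptes_none)
				return true;
		} else {
			if (++nr_filled >= NR_SUBPAGES - max_ptes_none)
				return false;
		}
	}
	return false;
}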
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v4 6/6] mm: add sysfs entry to disable splitting underused THPs
2024-08-19 2:30 [PATCH v4 0/6] mm: split underused THPs Usama Arif
` (4 preceding siblings ...)
2024-08-19 2:30 ` [PATCH v4 5/6] mm: split underused THPs Usama Arif
@ 2024-08-19 2:30 ` Usama Arif
5 siblings, 0 replies; 21+ messages in thread
From: Usama Arif @ 2024-08-19 2:30 UTC (permalink / raw)
To: akpm, linux-mm
Cc: hannes, riel, shakeel.butt, roman.gushchin, yuzhao, david, baohua,
ryan.roberts, rppt, willy, cerasuolodomenico, ryncsn, corbet,
linux-kernel, linux-doc, kernel-team, Usama Arif
If disabled, THPs faulted in or collapsed will not be added to
_deferred_list, and therefore won't be considered for splitting under
memory pressure if underused.
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
Documentation/admin-guide/mm/transhuge.rst | 10 +++++++++
mm/huge_memory.c | 26 ++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 40741b892aff..02ae7bc9efbd 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -202,6 +202,16 @@ PMD-mappable transparent hugepage::
cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
+All THPs at fault and collapse time will be added to _deferred_list,
+and will therefore be split under memory pressure if they are considered
+"underused". A THP is underused if the number of zero-filled pages in
+the THP is above max_ptes_none (see below). It is possible to disable
+this behaviour by writing 0 to shrink_underused, and enable it by writing
+1 to it::
+
+ echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused
+ echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused
+
khugepaged will be automatically started when PMD-sized THP is enabled
(either of the per-size anon control or the top-level control are set
to "always" or "madvise"), and it'll be automatically shutdown when
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5363cf900f9..5d67d3b3c1b2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -74,6 +74,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
+static bool split_underused_thp = true;
static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
@@ -439,6 +440,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
+static ssize_t split_underused_thp_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", split_underused_thp);
+}
+
+static ssize_t split_underused_thp_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err = kstrtobool(buf, &split_underused_thp);
+
+ if (err < 0)
+ return err;
+
+ return count;
+}
+
+static struct kobj_attribute split_underused_thp_attr = __ATTR(
+ shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
+
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
@@ -447,6 +469,7 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
+ &split_underused_thp_attr.attr,
NULL,
};
@@ -3477,6 +3500,9 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
if (folio_order(folio) <= 1)
return;
+ if (!partially_mapped && !split_underused_thp)
+ return;
+
/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
--
2.43.5
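One illustrative way to observe patches 5/6 and 6/6 together is to watch the thp_underused_split_page counter (added to vmstat_text in patch 5/6 and hence visible in /proc/vmstat) while toggling shrink_underused. The small reader below is only an example, not part of the series.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* print the underused-split event counter added by this series */
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "thp_underused_split_page",
			     strlen("thp_underused_split_page")))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}

Running it before and after inducing memory pressure, with shrink_underused set to 1 and then 0, should show the counter advancing only while the shrinker is enabled.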
^ permalink raw reply related [flat|nested] 21+ messages in thread