linux-perf-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Chih-En Lin <shiyn.lin@gmail.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Qi Zheng <zhengqi.arch@bytedance.com>,
	David Hildenbrand <david@redhat.com>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Christophe Leroy <christophe.leroy@csgroup.eu>,
	John Hubbard <jhubbard@nvidia.com>, Nadav Amit <namit@vmware.com>,
	Barry Song <baohua@kernel.org>,
	Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Jiri Olsa <jolsa@kernel.org>, Namhyung Kim <namhyung@kernel.org>,
	Ian Rogers <irogers@google.com>,
	Adrian Hunter <adrian.hunter@intel.com>,
	Yu Zhao <yuzhao@google.com>, Steven Barrett <steven@liquorix.net>,
	Juergen Gross <jgross@suse.com>, Peter Xu <peterx@redhat.com>,
	Kefeng Wang <wangkefeng.wang@huawei.com>,
	Tong Tiangen <tongtiangen@huawei.com>,
	Christoph Hellwig <hch@infradead.org>,
	"Liam R. Howlett" <Liam.Howlett@Oracle.com>,
	Yang Shi <shy828301@gmail.com>, Vlastimil Babka <vbabka@suse.cz>,
	Alex Sierra <alex.sierra@amd.com>,
	Vincent Whitchurch <vincent.whitchurch@axis.com>,
	Anshuman Khandual <anshuman.khandual@arm.com>,
	Li kunyu <kunyu@nfschina.com>, Liu Shixin <liushixin2@huawei.com>,
	Hugh Dickins <hughd@google.com>, Minchan Kim <minchan@kernel.org>,
	Joey Gouly <joey.gouly@arm.com>,
	Chih-En Lin <shiyn.lin@gmail.com>, Michal Hocko <mhocko@suse.com>,
	Suren Baghdasaryan <surenb@google.com>,
	"Zach O'Keefe" <zokeefe@google.com>,
	Gautam Menghani <gautammenghani201@gmail.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Mark Brown <broonie@kernel.org>,
	"Eric W. Biederman" <ebiederm@xmission.com>,
	Andrei Vagin <avagin@gmail.com>,
	Shakeel Butt <shakeelb@google.com>,
	Daniel Bristot de Oliveira <bristot@kernel.org>,
	"Jason A. Donenfeld" <Jason@zx2c4.com>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Alexey Gladkov <legion@kernel.org>,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-trace-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org,
	Dinglan Peng <peng301@purdue.edu>,
	Pedro Fonseca <pfonseca@purdue.edu>,
	Jim Huang <jserv@ccns.ncku.edu.tw>,
	Huichun Feng <foxhoundsk.tw@gmail.com>
Subject: [PATCH v5 01/17] mm: Split out the present cases from zap_pte_range()
Date: Fri, 14 Apr 2023 22:23:25 +0800	[thread overview]
Message-ID: <20230414142341.354556-2-shiyn.lin@gmail.com> (raw)
In-Reply-To: <20230414142341.354556-1-shiyn.lin@gmail.com>

As the complexity of zap_pte_range() has increased, its readability
and maintainability have suffered. To simplify the zap PTE code and
make it easier to extend, split the present and non-present cases out
of zap_pte_range() and replace the individual flag variable with a
single flags field that is manipulated with bitwise operations.

Signed-off-by: Chih-En Lin <shiyn.lin@gmail.com>
---
 mm/memory.c | 217 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 129 insertions(+), 88 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..0476cf22ea33 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1351,29 +1351,147 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
 }
 
+#define ZAP_PTE_INIT 0x0000
+#define ZAP_PTE_FORCE_FLUSH 0x0001
+
+struct zap_pte_details {
+	pte_t **pte;
+	unsigned long *addr;
+	unsigned int flags;
+	int rss[NR_MM_COUNTERS];
+};
+
+/* Return 0 to continue, 1 to break. */
+static inline int
+zap_present_pte(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		struct zap_details *details,
+		struct zap_pte_details *pte_details)
+{
+	struct mm_struct *mm = tlb->mm;
+	struct page *page;
+	unsigned int delay_rmap;
+	unsigned long addr = *pte_details->addr;
+	pte_t *pte = *pte_details->pte;
+	pte_t ptent = *pte;
+
+	page = vm_normal_page(vma, addr, ptent);
+	if (unlikely(!should_zap_page(details, page)))
+		return 0;
+
+	ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+	tlb_remove_tlb_entry(tlb, pte, addr);
+	zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+	if (unlikely(!page))
+		return 0;
+
+	delay_rmap = 0;
+	if (!PageAnon(page)) {
+		if (pte_dirty(ptent)) {
+			set_page_dirty(page);
+			if (tlb_delay_rmap(tlb)) {
+				delay_rmap = 1;
+				pte_details->flags |= ZAP_PTE_FORCE_FLUSH;
+			}
+		}
+		if (pte_young(ptent) && likely(vma_has_recency(vma)))
+			mark_page_accessed(page);
+
+	}
+	pte_details->rss[mm_counter(page)]--;
+	if (!delay_rmap) {
+		page_remove_rmap(page, vma, false);
+		if (unlikely(page_mapcount(page) < 0))
+			print_bad_pte(vma, addr, ptent, page);
+	}
+	if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
+		*pte_details->addr += PAGE_SIZE;
+		pte_details->flags |= ZAP_PTE_FORCE_FLUSH;
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline void
+zap_nopresent_pte(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		  struct zap_details *details,
+		  struct zap_pte_details *pte_details)
+{
+	struct mm_struct *mm = tlb->mm;
+	struct page *page;
+	unsigned long addr = *pte_details->addr;
+	pte_t *pte = *pte_details->pte;
+	pte_t ptent = *pte;
+	swp_entry_t entry = pte_to_swp_entry(ptent);
+
+	if (is_device_private_entry(entry) ||
+	    is_device_exclusive_entry(entry)) {
+		page = pfn_swap_entry_to_page(entry);
+		if (unlikely(!should_zap_page(details, page)))
+			return;
+		/*
+		 * Both device private/exclusive mappings should only
+		 * work with anonymous page so far, so we don't need to
+		 * consider uffd-wp bit when zap. For more information,
+		 * see zap_install_uffd_wp_if_needed().
+		 */
+		WARN_ON_ONCE(!vma_is_anonymous(vma));
+		pte_details->rss[mm_counter(page)]--;
+		if (is_device_private_entry(entry))
+			page_remove_rmap(page, vma, false);
+		put_page(page);
+	} else if (!non_swap_entry(entry)) {
+		/* Genuine swap entry, hence a private anon page */
+		if (!should_zap_cows(details))
+			return;
+		pte_details->rss[MM_SWAPENTS]--;
+		if (unlikely(!free_swap_and_cache(entry)))
+			print_bad_pte(vma, addr, ptent, NULL);
+	} else if (is_migration_entry(entry)) {
+		page = pfn_swap_entry_to_page(entry);
+		if (!should_zap_page(details, page))
+			return;
+		pte_details->rss[mm_counter(page)]--;
+	} else if (pte_marker_entry_uffd_wp(entry)) {
+		/* Only drop the uffd-wp marker if explicitly requested */
+		if (!zap_drop_file_uffd_wp(details))
+			return;
+	} else if (is_hwpoison_entry(entry) ||
+		   is_swapin_error_entry(entry)) {
+		if (!should_zap_cows(details))
+			return;
+	} else {
+		/* We should have covered all the swap entry types */
+		WARN_ON_ONCE(1);
+	}
+	pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+	zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+}
+
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
 	struct mm_struct *mm = tlb->mm;
-	int force_flush = 0;
-	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
-	swp_entry_t entry;
+	struct zap_pte_details pte_details = {
+		.addr = &addr,
+		.flags = ZAP_PTE_INIT,
+		.pte = &pte,
+	};
 
 	tlb_change_page_size(tlb, PAGE_SIZE);
 again:
-	init_rss_vec(rss);
+	init_rss_vec(pte_details.rss);
 	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	pte = start_pte;
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
-		struct page *page;
 
 		if (pte_none(ptent))
 			continue;
@@ -1382,95 +1500,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			break;
 
 		if (pte_present(ptent)) {
-			unsigned int delay_rmap;
-
-			page = vm_normal_page(vma, addr, ptent);
-			if (unlikely(!should_zap_page(details, page)))
-				continue;
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			tlb_remove_tlb_entry(tlb, pte, addr);
-			zap_install_uffd_wp_if_needed(vma, addr, pte, details,
-						      ptent);
-			if (unlikely(!page))
-				continue;
-
-			delay_rmap = 0;
-			if (!PageAnon(page)) {
-				if (pte_dirty(ptent)) {
-					set_page_dirty(page);
-					if (tlb_delay_rmap(tlb)) {
-						delay_rmap = 1;
-						force_flush = 1;
-					}
-				}
-				if (pte_young(ptent) && likely(vma_has_recency(vma)))
-					mark_page_accessed(page);
-			}
-			rss[mm_counter(page)]--;
-			if (!delay_rmap) {
-				page_remove_rmap(page, vma, false);
-				if (unlikely(page_mapcount(page) < 0))
-					print_bad_pte(vma, addr, ptent, page);
-			}
-			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
-				force_flush = 1;
-				addr += PAGE_SIZE;
+			if (zap_present_pte(tlb, vma, details, &pte_details))
 				break;
-			}
 			continue;
 		}
-
-		entry = pte_to_swp_entry(ptent);
-		if (is_device_private_entry(entry) ||
-		    is_device_exclusive_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
-			if (unlikely(!should_zap_page(details, page)))
-				continue;
-			/*
-			 * Both device private/exclusive mappings should only
-			 * work with anonymous page so far, so we don't need to
-			 * consider uffd-wp bit when zap. For more information,
-			 * see zap_install_uffd_wp_if_needed().
-			 */
-			WARN_ON_ONCE(!vma_is_anonymous(vma));
-			rss[mm_counter(page)]--;
-			if (is_device_private_entry(entry))
-				page_remove_rmap(page, vma, false);
-			put_page(page);
-		} else if (!non_swap_entry(entry)) {
-			/* Genuine swap entry, hence a private anon page */
-			if (!should_zap_cows(details))
-				continue;
-			rss[MM_SWAPENTS]--;
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
-		} else if (is_migration_entry(entry)) {
-			page = pfn_swap_entry_to_page(entry);
-			if (!should_zap_page(details, page))
-				continue;
-			rss[mm_counter(page)]--;
-		} else if (pte_marker_entry_uffd_wp(entry)) {
-			/* Only drop the uffd-wp marker if explicitly requested */
-			if (!zap_drop_file_uffd_wp(details))
-				continue;
-		} else if (is_hwpoison_entry(entry) ||
-			   is_swapin_error_entry(entry)) {
-			if (!should_zap_cows(details))
-				continue;
-		} else {
-			/* We should have covered all the swap entry types */
-			WARN_ON_ONCE(1);
-		}
-		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+		zap_nopresent_pte(tlb, vma, details, &pte_details);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss_vec(mm, rss);
+	add_mm_rss_vec(mm, pte_details.rss);
 	arch_leave_lazy_mmu_mode();
 
 	/* Do the actual TLB flush before dropping ptl */
-	if (force_flush) {
+	if (pte_details.flags & ZAP_PTE_FORCE_FLUSH) {
 		tlb_flush_mmu_tlbonly(tlb);
 		tlb_flush_rmaps(tlb, vma);
 	}
@@ -1482,8 +1523,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	 * entries before releasing the ptl), free the batched
 	 * memory too. Restart if we didn't do everything.
 	 */
-	if (force_flush) {
-		force_flush = 0;
+	if (pte_details.flags & ZAP_PTE_FORCE_FLUSH) {
+		pte_details.flags &= ~ZAP_PTE_FORCE_FLUSH;
 		tlb_flush_mmu(tlb);
 	}
 
-- 
2.34.1


  reply	other threads:[~2023-04-14 14:24 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-14 14:23 [PATCH v5 00/17] Introduce Copy-On-Write to Page Table Chih-En Lin
2023-04-14 14:23 ` Chih-En Lin [this message]
2023-04-14 14:23 ` [PATCH v5 02/17] mm: Allow user to control COW PTE via prctl Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 03/17] mm: Add Copy-On-Write PTE to fork() Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 04/17] mm: Add break COW PTE fault and helper functions Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 05/17] mm: Handle COW-ed PTE during zapping Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 06/17] mm/rmap: Break COW PTE in rmap walking Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 07/17] mm/khugepaged: Break COW PTE before scanning pte Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 08/17] mm/ksm: Break COW PTE before modify shared PTE Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 09/17] mm/madvise: Handle COW-ed PTE with madvise() Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 10/17] mm/gup: Trigger break COW PTE before calling follow_pfn_pte() Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 11/17] mm/mprotect: Break COW PTE before changing protection Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 12/17] mm/userfaultfd: Support COW PTE Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 13/17] mm/migrate_device: " Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 14/17] fs/proc: Support COW PTE with clear_refs_write Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 15/17] events/uprobes: Break COW PTE before replacing page Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 16/17] mm: fork: Enable COW PTE to fork system call Chih-En Lin
2023-04-14 14:23 ` [PATCH v5 17/17] mm: Check the unexpected modification of COW-ed PTE Chih-En Lin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230414142341.354556-2-shiyn.lin@gmail.com \
    --to=shiyn.lin@gmail.com \
    --cc=Jason@zx2c4.com \
    --cc=Liam.Howlett@Oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alex.sierra@amd.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=anshuman.khandual@arm.com \
    --cc=avagin@gmail.com \
    --cc=baohua@kernel.org \
    --cc=bp@alien8.de \
    --cc=bristot@kernel.org \
    --cc=broonie@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=christophe.leroy@csgroup.eu \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=ebiederm@xmission.com \
    --cc=foxhoundsk.tw@gmail.com \
    --cc=gautammenghani201@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=hch@infradead.org \
    --cc=hpa@zytor.com \
    --cc=hughd@google.com \
    --cc=irogers@google.com \
    --cc=jgross@suse.com \
    --cc=jhubbard@nvidia.com \
    --cc=joey.gouly@arm.com \
    --cc=jolsa@kernel.org \
    --cc=jserv@ccns.ncku.edu.tw \
    --cc=kunyu@nfschina.com \
    --cc=legion@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=liushixin2@huawei.com \
    --cc=mark.rutland@arm.com \
    --cc=mhiramat@kernel.org \
    --cc=mhocko@suse.com \
    --cc=minchan@kernel.org \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=namit@vmware.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=peng301@purdue.edu \
    --cc=peterx@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pfonseca@purdue.edu \
    --cc=rostedt@goodmis.org \
    --cc=shakeelb@google.com \
    --cc=shy828301@gmail.com \
    --cc=steven@liquorix.net \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=tongtiangen@huawei.com \
    --cc=vbabka@suse.cz \
    --cc=vincent.whitchurch@axis.com \
    --cc=wangkefeng.wang@huawei.com \
    --cc=willy@infradead.org \
    --cc=x86@kernel.org \
    --cc=yuzhao@google.com \
    --cc=zhengqi.arch@bytedance.com \
    --cc=zokeefe@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).