Linux-mm Archive on lore.kernel.org

Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 2/2] fork: group allocation of per-cpu counters for mm struct
From: Mateusz Guzik @ 2023-08-22 18:41 UTC (permalink / raw)
  To: linux-kernel; +Cc: dennis, tj, cl, akpm, shakeelb, linux-mm, Mateusz Guzik
In-Reply-To: <20230822184152.2194558-1-mjguzik@gmail.com>

A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):
before:	133543.63
after:	186061.81 (+39%)

While with the patch these allocations remain a significant problem,
the primary bottleneck shifts to:

    __pv_queued_spin_lock_slowpath+1
    _raw_spin_lock_irqsave+57
    folio_lruvec_lock_irqsave+91
    release_pages+590
    tlb_batch_pages_flush+61
    tlb_finish_mmu+101
    exit_mmap+327
    __mmput+61
    begin_new_exec+1245
    load_elf_binary+712
    bprm_execve+644
    do_execveat_common.isra.0+429
    __x64_sys_execve+50
    do_syscall_64+46
    entry_SYSCALL_64_after_hwframe+110

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 kernel/fork.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..4f0ada33457e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
-- 
2.39.2



^ permalink raw reply related

* [PATCH v2 1/2] pcpcntr: add group allocation/free
From: Mateusz Guzik @ 2023-08-22 18:41 UTC (permalink / raw)
  To: linux-kernel; +Cc: dennis, tj, cl, akpm, shakeelb, linux-mm, Mateusz Guzik
In-Reply-To: <20230822184152.2194558-1-mjguzik@gmail.com>

Allocations and frees are globally serialized on the pcpu lock (and the
CPU hotplug lock if enabled, which is the case on Debian).

At least one frequent consumer allocates 4 back-to-back counters (and
frees them in the same manner), exacerbating the problem.

While this does not fully remedy scalability issues, it is a step
towards that goal and provides immediate relief.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 include/linux/percpu_counter.h | 20 ++++++++---
 lib/percpu_counter.c           | 61 +++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 24 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 75b73c83bc9d..518a4088b964 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -30,17 +30,27 @@ struct percpu_counter {
 
 extern int percpu_counter_batch;
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key);
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+			  u32 nr_counters, struct lock_class_key *key);
 
-#define percpu_counter_init(fbc, value, gfp)				\
+#define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
-		__percpu_counter_init(fbc, value, gfp, &__key);		\
+		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
+					   &__key);			\
 	})
 
-void percpu_counter_destroy(struct percpu_counter *fbc);
+
+#define percpu_counter_init(fbc, value, gfp)				\
+	percpu_counter_init_many(fbc, value, gfp, 1)
+
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
+static inline void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+	percpu_counter_destroy_many(fbc, 1);
+}
+
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
 			      s32 batch);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 5004463c4f9f..9338b27f1cdd 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -151,48 +151,71 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 }
 EXPORT_SYMBOL(__percpu_counter_sum);
 
-int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
-			  struct lock_class_key *key)
+int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
+			  u32 nr_counters, struct lock_class_key *key)
 {
 	unsigned long flags __maybe_unused;
-
-	raw_spin_lock_init(&fbc->lock);
-	lockdep_set_class(&fbc->lock, key);
-	fbc->count = amount;
-	fbc->counters = alloc_percpu_gfp(s32, gfp);
-	if (!fbc->counters)
+	size_t counter_size;
+	s32 __percpu *counters;
+	u32 i;
+
+	counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
+	counters = __alloc_percpu_gfp(nr_counters * counter_size,
+				      __alignof__(*counters), gfp);
+	if (!counters) {
+		fbc[0].counters = NULL;
 		return -ENOMEM;
+	}
 
-	debug_percpu_counter_activate(fbc);
+	for (i = 0; i < nr_counters; i++) {
+		raw_spin_lock_init(&fbc[i].lock);
+		lockdep_set_class(&fbc[i].lock, key);
+#ifdef CONFIG_HOTPLUG_CPU
+		INIT_LIST_HEAD(&fbc[i].list);
+#endif
+		fbc[i].count = amount;
+		fbc[i].counters = (void *)counters + (i * counter_size);
+
+		debug_percpu_counter_activate(&fbc[i]);
+	}
 
 #ifdef CONFIG_HOTPLUG_CPU
-	INIT_LIST_HEAD(&fbc->list);
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_add(&fbc->list, &percpu_counters);
+	for (i = 0; i < nr_counters; i++)
+		list_add(&fbc[i].list, &percpu_counters);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
 	return 0;
 }
-EXPORT_SYMBOL(__percpu_counter_init);
+EXPORT_SYMBOL(__percpu_counter_init_many);
 
-void percpu_counter_destroy(struct percpu_counter *fbc)
+void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
 {
 	unsigned long flags __maybe_unused;
+	u32 i;
+
+	if (WARN_ON_ONCE(!fbc))
+		return;
 
-	if (!fbc->counters)
+	if (!fbc[0].counters)
 		return;
 
-	debug_percpu_counter_deactivate(fbc);
+	for (i = 0; i < nr_counters; i++)
+		debug_percpu_counter_deactivate(&fbc[i]);
 
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock_irqsave(&percpu_counters_lock, flags);
-	list_del(&fbc->list);
+	for (i = 0; i < nr_counters; i++)
+		list_del(&fbc[i].list);
 	spin_unlock_irqrestore(&percpu_counters_lock, flags);
 #endif
-	free_percpu(fbc->counters);
-	fbc->counters = NULL;
+
+	free_percpu(fbc[0].counters);
+
+	for (i = 0; i < nr_counters; i++)
+		fbc[i].counters = NULL;
 }
-EXPORT_SYMBOL(percpu_counter_destroy);
+EXPORT_SYMBOL(percpu_counter_destroy_many);
 
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
-- 
2.39.2



^ permalink raw reply related

* [PATCH v2 0/2] execve scalability issues, part 1
From: Mateusz Guzik @ 2023-08-22 18:41 UTC (permalink / raw)
  To: linux-kernel; +Cc: dennis, tj, cl, akpm, shakeelb, linux-mm, Mateusz Guzik

To start I figured I'm going to bench about as friendly case as it gets
-- statically linked *separate* binaries all doing execve in a loop.

I borrowed the bench from here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

It prints a result every second (warning: first line is garbage).

My test box is temporarily only 26 cores and even at this scale I run
into massive lock contention stemming from back-to-back calls to
percpu_counter_init (and _destroy later).

While not a panacea, one simple thing to do here is to batch these ops.
Since the term "batching" is already used in the file, I decided to
refer to it as "grouping" instead.

Even if this code could be patched to dodge these counters,  I would
argue a high-traffic alloc/free consumer is only a matter of time so it
makes sense to facilitate it.

With the fix I get an ok win, to quote from the commit:
> Even at a very modest scale of 26 cores (ops/s):
> before: 133543.63
> after:  186061.81 (+39%)

> While with the patch these allocations remain a significant problem,
> the primary bottleneck shifts to:
> 
>     __pv_queued_spin_lock_slowpath+1
>     _raw_spin_lock_irqsave+57
>     folio_lruvec_lock_irqsave+91
>     release_pages+590
>     tlb_batch_pages_flush+61
>     tlb_finish_mmu+101
>     exit_mmap+327
>     __mmput+61
>     begin_new_exec+1245
>     load_elf_binary+712
>     bprm_execve+644
>     do_execveat_common.isra.0+429
>     __x64_sys_execve+50
>     do_syscall_64+46
>     entry_SYSCALL_64_after_hwframe+110

I intend to do more work on the area to mostly sort it out, but I would
not mind if someone else took the hammer to folio. :)

With this out of the way I'll be looking at some form of caching to
eliminate these allocs as a problem.

Thoughts?

v2:
- force bigger alignment on alloc
- rename "counters" to "nr_counters" and pass prior to lock key
- drop {}'s for single-statement loops

Mateusz Guzik (2):
  pcpcntr: add group allocation/free
  fork: group allocation of per-cpu counters for mm struct

 include/linux/percpu_counter.h | 20 ++++++++---
 kernel/fork.c                  | 14 ++------
 lib/percpu_counter.c           | 61 +++++++++++++++++++++++-----------
 3 files changed, 60 insertions(+), 35 deletions(-)

-- 
2.39.2

^ permalink raw reply

* Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
From: Hugh Dickins @ 2023-08-22 18:34 UTC (permalink / raw)
  To: Peter Xu
  Cc: Hugh Dickins, Jann Horn, Andrew Morton, Mike Kravetz,
	Mike Rapoport, Kirill A. Shutemov, Matthew Wilcox,
	David Hildenbrand, Suren Baghdasaryan, Qi Zheng, Yang Shi,
	Mel Gorman, Peter Zijlstra, Will Deacon, Yu Zhao, Alistair Popple,
	Ralph Campbell, Ira Weiny, Steven Price, SeongJae Park,
	Lorenzo Stoakes, Huang Ying, Naoya Horiguchi, Christophe Leroy,
	Zack Rusin, Jason Gunthorpe, Axel Rasmussen, Anshuman Khandual,
	Pasha Tatashin, Miaohe Lin, Minchan Kim, Christoph Hellwig,
	Song Liu, Thomas Hellstrom, Russell King, David S. Miller,
	Michael Ellerman, Aneesh Kumar K.V, Heiko Carstens,
	Christian Borntraeger, Claudio Imbrenda, Alexander Gordeev,
	Gerald Schaefer, Vasily Gorbik, Vishal Moola, Vlastimil Babka,
	Zi Yan, Zach O'Keefe, Linux ARM, sparclinux, linuxppc-dev,
	linux-s390, kernel list, Linux-MM
In-Reply-To: <ZOTGvfO31pleXrPF@x1n>

On Tue, 22 Aug 2023, Peter Xu wrote:
> On Mon, Aug 21, 2023 at 07:51:38PM -0700, Hugh Dickins wrote:
> > On Mon, 21 Aug 2023, Jann Horn wrote:
...
> > > 
> > > I guess an alternative would be to use a spin_trylock() instead of the
> > > current pmd_lock(), and if that fails, temporarily drop the page table
> > > lock and then restart from step 2 with both locks held - and at that
> > > point the page table scan should be fast since we expect it to usually
> > > be empty.
> > 
> > That's certainly a good idea, if collapse on userfaultfd_armed private
> > is anything of a common case (I doubt, but I don't know).  It may be a
> > better idea anyway (saving a drop and retake of ptlock).
> > 
> > I gave it a try, expecting to end up with something that would lead
> > me to say "I tried it, but it didn't work out well"; but actually it
> > looks okay to me.  I wouldn't say I prefer it, but it seems reasonable,
> > and no more complicated (as Peter rightly observes) than the original.
> > 
> > It's up to you and Peter, and whoever has strong feelings about it,
> > to choose between them: I don't mind (but I shall be sad if someone
> > demands that I indent that comment deeper - I'm not a fan of long
> > multi-line comments near column 80).
> 
> No strong opinion here, either.  Just one trivial comment/question below on
> the new patch (if that will be preferred)..

I'm going to settle for the original v1 for now (I'll explain why in reply
to Jann next) - which already has the blessing of your Acked-by, thanks.

(Yes, the locking is a bit confusing: but mainly for the unrelated reason,
that with the split locking configs, we never quite know whether this lock
is the same as that lock or not, and so have to be rather careful.)

> > [PATCH mm-unstable v2] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd
...
> > @@ -1572,9 +1572,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
> >  				haddr, haddr + HPAGE_PMD_SIZE);
> >  	mmu_notifier_invalidate_range_start(&range);
> >  	notified = true;
> > -	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
> > -	if (!start_pte)		/* mmap_lock + page lock should prevent this */
> > -		goto abort;
> > +	spin_lock(ptl);
> 
> .. here will the ptl always be valid?
> 
> That comes from the previous round of pte_offset_map_lock(), and I assume
> after this whole "thp collapse without write lock" work landed, it has the
> same lifecycle with the *pte pointer, so can be invalid right after the rcu
> read lock released; mmap read lock isn't strong enough to protect the ptl,
> not anymore.
> 
> Maybe it's all fine because the thp collapse path is the solo path(s) that
> will release the pte pgtable page without write mmap lock (so as to release
> the ptl too when doing so), and we at least still hold the page lock, so
> the worst case is the other concurrent "thp collapse" will still serialize
> with this one on the huge page lock. But that doesn't look as solid as
> fetching again the ptl from another pte_offset_map_nolock().  So still just
> raise this question up.  It's possible I just missed something.

It is safe, as you say because of us holding the hpage lock, which stops
any racing callers of collapse_pte_mapped_thp() or retract_page_tables():
and these are the functions which (currently) make the *pmd transition
which pte_offset_map_lock() etc. are being careful to guard against.

[In future we can imagine empty page table removal making that transition
too: and that wouldn't even have any hpage to lock.  Will it rely on
mmap_lock for write? or pmd_lock? probably both, but no need to design
for it now.]

But I agree that it does *look* more questionable in this patch: there was
a reassuring pte_offset_map_lock() there before, and now I rely more on
the assumptions and just use the "previous" ptl (and that's why I chose
to make the !start_pte case a VM_BUG_ON a few lines later).

I expect, with more time spent, I could cast it back into more reassuring
form: but it's all a bit of a con trick - if you look further down (even
before v2 or v1 fixes) to "step 4", there we have "if (ptl != pml)" which
is also relying on the fact that ptl cannot have changed.  And no doubt
that too could be recast into more reassuring-looking form, but it
wouldn't actually be worthwhile.

Thanks for considering these, Peter: I'll recommend v1 to Andrew.

Hugh


^ permalink raw reply

* Re: [PATCH RFC 0/2] mm/page_alloc: free_pcppages_bulk safeguard
From: Matthew Wilcox @ 2023-08-22 18:28 UTC (permalink / raw)
  To: Chris Li
  Cc: Mel Gorman, Andrew Morton, Kemeng Shi, baolin.wang, Michal Hocko,
	david, linux-mm, Namhyung Kim, Greg Thelen, linux-kernel,
	John Sperbeck, Huang Ying, Alexei Starovoitov
In-Reply-To: <CAF8kJuOsWo5RfDcfnWZfnqYXjf6bkkxdXG1JCwjaEZ1nn29AaA@mail.gmail.com>

On Tue, Aug 22, 2023 at 10:48:42AM -0700, Chris Li wrote:
> Hi Mel,
> 
> Adding Alexei to the discussion.
> 
> On Mon, Aug 21, 2023 at 3:32 AM Mel Gorman <mgorman@techsingularity.net> wrote:
> >
> > On Thu, Aug 17, 2023 at 11:05:22PM -0700, Chris Li wrote:
> > > In this patch series I want to safeguard
> > > the free_pcppage_bulk against change in the
> > > pcp->count outside of this function. e.g.
> > > by BPF program inject on the function tracepoint.
> > >
> > > I break up the patches into two seperate patches
> > > for the safeguard and clean up.
> > >
> > > Hopefully that is easier to review.
> > >
> > > Signed-off-by: Chris Li <chrisl@kernel.org>
> >
> > This sounds like a maintenance nightmare if internal state can be arbitrary
> > modified by a BPF program and still expected to work properly in all cases.
> > Every review would have to take into account "what if a BPF script modifies
> > state behind our back?"
> 
> Thanks for the feedback.
> 
> I agree that it is hard to support if we allow BPF to change any internal
> stage as a rule.  That is why it is a RFC. Would you consider it case
> by case basis?
> The kernel panic is bad, the first patch is actually very small. I can
> also change it
> to generate warnings if we detect the inconsistent state.

We wouldn't allow C code that hooks spinlocks (eg lockdep) to allocate
memory.  I don't understand why BPF code should allocate memory either.


^ permalink raw reply

* [PATCH v4] proc/ksm: add ksm stats to /proc/pid/smaps
From: Stefan Roesch @ 2023-08-22 18:05 UTC (permalink / raw)
  To: kernel-team
  Cc: shr, akpm, david, linux-fsdevel, hannes, riel, linux-kernel,
	linux-mm

With madvise and prctl KSM can be enabled for different VMA's. Once it
is enabled we can query how effective KSM is overall. However we cannot
easily query if an individual VMA benefits from KSM.

This commit adds a KSM section to the /prod/<pid>/smaps file. It reports
how many of the pages are KSM pages. The returned value for KSM is
independent of the use of the shared zeropage.

Here is a typical output:

7f420a000000-7f421a000000 rw-p 00000000 00:00 0
Size:             262144 kB
KernelPageSize:        4 kB
MMUPageSize:           4 kB
Rss:               51212 kB
Pss:                8276 kB
Shared_Clean:        172 kB
Shared_Dirty:      42996 kB
Private_Clean:       196 kB
Private_Dirty:      7848 kB
Referenced:        15388 kB
Anonymous:         51212 kB
KSM:               41376 kB
LazyFree:              0 kB
AnonHugePages:         0 kB
ShmemPmdMapped:        0 kB
FilePmdMapped:         0 kB
Shared_Hugetlb:        0 kB
Private_Hugetlb:       0 kB
Swap:             202016 kB
SwapPss:            3882 kB
Locked:                0 kB
THPeligible:    0
ProtectionKey:         0
ksm_state:          0
ksm_skip_base:      0
ksm_skip_count:     0
VmFlags: rd wr mr mw me nr mg anon

This information also helps with the following workflow:
- First enable KSM for all the VMA's of a process with prctl.
- Then analyze with the above smaps report which VMA's benefit the most
- Change the application (if possible) to add the corresponding madvise
calls for the VMA's that benefit the most

Signed-off-by: Stefan Roesch <shr@devkernel.io>
---
 Documentation/filesystems/proc.rst |  4 ++++
 fs/proc/task_mmu.c                 | 16 +++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 7897a7dafcbc..d5bdfd59f5b0 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -461,6 +461,7 @@ Memory Area, or VMA) there is a series of lines such as the following::
     Private_Dirty:         0 kB
     Referenced:          892 kB
     Anonymous:             0 kB
+    KSM:                   0 kB
     LazyFree:              0 kB
     AnonHugePages:         0 kB
     ShmemPmdMapped:        0 kB
@@ -501,6 +502,9 @@ accessed.
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
 
+"KSM" shows the amount of anonymous memory that has been de-duplicated. The
+value is independent of the use of shared zeropage.
+
 "LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
 The memory isn't freed immediately with madvise(). It's freed in memory
 pressure if the memory is clean. Please note that the printed value might
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 51315133cdc2..4532caa8011c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
+#include <linux/ksm.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
@@ -396,6 +397,7 @@ struct mem_size_stats {
 	unsigned long swap;
 	unsigned long shared_hugetlb;
 	unsigned long private_hugetlb;
+	unsigned long ksm;
 	u64 pss;
 	u64 pss_anon;
 	u64 pss_file;
@@ -435,9 +437,9 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
 	}
 }
 
-static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked,
-		bool migration)
+static void smaps_account(struct mem_size_stats *mss, pte_t *pte,
+		struct page *page, bool compound, bool young, bool dirty,
+		bool locked, bool migration)
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -452,6 +454,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 			mss->lazyfree += size;
 	}
 
+	if (PageKsm(page) && (!pte || !is_ksm_zero_pte(*pte)))
+		mss->ksm += size;
+
 	mss->resident += size;
 	/* Accumulate the size in pages that have been accessed. */
 	if (young || page_is_young(page) || PageReferenced(page))
@@ -557,7 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, young, dirty, locked, migration);
+	smaps_account(mss, pte, page, false, young, dirty, locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -591,7 +596,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	else
 		mss->file_thp += HPAGE_PMD_SIZE;
 
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+	smaps_account(mss, NULL, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
 		      locked, migration);
 }
 #else
@@ -822,6 +827,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
 	SEQ_PUT_DEC(" kB\nPrivate_Dirty:  ", mss->private_dirty);
 	SEQ_PUT_DEC(" kB\nReferenced:     ", mss->referenced);
 	SEQ_PUT_DEC(" kB\nAnonymous:      ", mss->anonymous);
+	SEQ_PUT_DEC(" kB\nKSM:            ", mss->ksm);
 	SEQ_PUT_DEC(" kB\nLazyFree:       ", mss->lazyfree);
 	SEQ_PUT_DEC(" kB\nAnonHugePages:  ", mss->anonymous_thp);
 	SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);

base-commit: f4a280e5bb4a764a75d3215b61bc0f02b4c26417
-- 
2.39.3



^ permalink raw reply related

* Re: [PATCH v3] proc/ksm: add ksm stats to /proc/pid/smaps
From: Stefan Roesch @ 2023-08-22 18:03 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: kernel-team, akpm, linux-fsdevel, hannes, riel, linux-kernel,
	linux-mm
In-Reply-To: <886b6a56-8acb-e975-b5f3-d8098a2285ab@redhat.com>


David Hildenbrand <david@redhat.com> writes:

> On 17.08.23 18:23, Stefan Roesch wrote:
>> With madvise and prctl KSM can be enabled for different VMA's. Once it
>> is enabled we can query how effective KSM is overall. However we cannot
>> easily query if an individual VMA benefits from KSM.
>> This commit adds a KSM section to the /prod/<pid>/smaps file. It reports
>> how many of the pages are KSM pages. The returned value for KSM is
>> independent of the use of the shared zeropage.
>> Here is a typical output:
>> 7f420a000000-7f421a000000 rw-p 00000000 00:00 0
>> Size:             262144 kB
>> KernelPageSize:        4 kB
>> MMUPageSize:           4 kB
>> Rss:               51212 kB
>> Pss:                8276 kB
>> Shared_Clean:        172 kB
>> Shared_Dirty:      42996 kB
>> Private_Clean:       196 kB
>> Private_Dirty:      7848 kB
>> Referenced:        15388 kB
>> Anonymous:         51212 kB
>> KSM:               41376 kB
>> LazyFree:              0 kB
>> AnonHugePages:         0 kB
>> ShmemPmdMapped:        0 kB
>> FilePmdMapped:         0 kB
>> Shared_Hugetlb:        0 kB
>> Private_Hugetlb:       0 kB
>> Swap:             202016 kB
>> SwapPss:            3882 kB
>> Locked:                0 kB
>> THPeligible:    0
>> ProtectionKey:         0
>> ksm_state:          0
>> ksm_skip_base:      0
>> ksm_skip_count:     0
>> VmFlags: rd wr mr mw me nr mg anon
>> This information also helps with the following workflow:
>> - First enable KSM for all the VMA's of a process with prctl.
>> - Then analyze with the above smaps report which VMA's benefit the most
>> - Change the application (if possible) to add the corresponding madvise
>> calls for the VMA's that benefit the most
>> Signed-off-by: Stefan Roesch <shr@devkernel.io>
>> ---
>>   Documentation/filesystems/proc.rst | 4 ++++
>>   fs/proc/task_mmu.c                 | 5 +++++
>>   2 files changed, 9 insertions(+)
>> diff --git a/Documentation/filesystems/proc.rst
>> b/Documentation/filesystems/proc.rst
>> index 7897a7dafcbc..d5bdfd59f5b0 100644
>> --- a/Documentation/filesystems/proc.rst
>> +++ b/Documentation/filesystems/proc.rst
>> @@ -461,6 +461,7 @@ Memory Area, or VMA) there is a series of lines such as the following::
>>       Private_Dirty:         0 kB
>>       Referenced:          892 kB
>>       Anonymous:             0 kB
>> +    KSM:                   0 kB
>>       LazyFree:              0 kB
>>       AnonHugePages:         0 kB
>>       ShmemPmdMapped:        0 kB
>> @@ -501,6 +502,9 @@ accessed.
>>   a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
>>   and a page is modified, the file page is replaced by a private anonymous copy.
>>   +"KSM" shows the amount of anonymous memory that has been de-duplicated. The
>> +value is independent of the use of shared zeropage.
>> +
>>   "LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
>>   The memory isn't freed immediately with madvise(). It's freed in memory
>>   pressure if the memory is clean. Please note that the printed value might
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 51315133cdc2..f591c750ffda 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -396,6 +396,7 @@ struct mem_size_stats {
>>   	unsigned long swap;
>>   	unsigned long shared_hugetlb;
>>   	unsigned long private_hugetlb;
>> +	unsigned long ksm;
>>   	u64 pss;
>>   	u64 pss_anon;
>>   	u64 pss_file;
>> @@ -452,6 +453,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
>>   			mss->lazyfree += size;
>>   	}
>>   +	if (PageKsm(page))
>> +		mss->ksm += size;
>> +
>
> Did you accidentally not include handling of the KSM-shared zeropage?
>
> Or did I misinterpret "independent of the use of the shared zeropage."

I think I misunderstood your review comment. I'll address it in the next version.


^ permalink raw reply

* Re: [PATCH v5 11/37] mm: Define VM_SHADOW_STACK for arm64 when we support GCS
From: Mark Brown @ 2023-08-22 17:55 UTC (permalink / raw)
  To: Edgecombe, Rick P
  Cc: david@redhat.com, corbet@lwn.net, ardb@kernel.org, maz@kernel.org,
	shuah@kernel.org, Szabolcs.Nagy@arm.com, keescook@chromium.org,
	james.morse@arm.com, debug@rivosinc.com,
	akpm@linux-foundation.org, palmer@dabbelt.com,
	catalin.marinas@arm.com, hjl.tools@gmail.com,
	paul.walmsley@sifive.com, linux-mm@kvack.org,
	aou@eecs.berkeley.edu, oleg@redhat.com, arnd@arndb.de,
	ebiederm@xmission.com, will@kernel.org, suzuki.poulose@arm.com,
	linux-kernel@vger.kernel.org, kvmarm@lists.linux.dev,
	linux-doc@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linux-fsdevel@vger.kernel.org, linux-kselftest@vger.kernel.org,
	linux-arch@vger.kernel.org, oliver.upton@linux.dev,
	linux-riscv@lists.infradead.org
In-Reply-To: <e10e729392c5fa421baf08b4ea7aaac6ffada0f5.camel@intel.com>

[-- Attachment #1: Type: text/plain, Size: 788 bytes --]

On Tue, Aug 22, 2023 at 04:47:26PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2023-08-22 at 16:41 +0100, Mark Brown wrote:

> > I can certainly update it to do that, I was just trying to fit in
> > with
> > how the code was written on the basis that there was probably a good
> > reason for it that had been discussed somewhere.  I can send an
> > incremental patch for this on top of the x86 patches assuming they go
> > in
> > during the merge window.

> There was something like that on the x86 series way back, but it was
> dropped[0]. IIRC risc-v was going to try to do something other than
> VM_SHADOW_STACK, so they may conflict some day. But in the meantime,
> adding a CONFIG_HAVE_ARCH_SHADOW_STACK here in the arm series makes
> sense to me.

OK, I'll do that.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH v4 03/36] arm64/gcs: Document the ABI for Guarded Control Stacks
From: Mark Brown @ 2023-08-22 17:53 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Will Deacon, Jonathan Corbet, Andrew Morton, Marc Zyngier,
	Oliver Upton, James Morse, Suzuki K Poulose, Arnd Bergmann,
	Oleg Nesterov, Eric Biederman, Kees Cook, Shuah Khan,
	Rick P. Edgecombe, Deepak Gupta, Ard Biesheuvel, Szabolcs Nagy,
	H.J. Lu, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	linux-arm-kernel, linux-doc, kvmarm, linux-fsdevel, linux-arch,
	linux-mm, linux-kselftest, linux-kernel, linux-riscv
In-Reply-To: <ZOTnL1SDJWZjHPUW@arm.com>

[-- Attachment #1: Type: text/plain, Size: 2635 bytes --]

On Tue, Aug 22, 2023 at 05:49:51PM +0100, Catalin Marinas wrote:
> On Fri, Aug 18, 2023 at 08:38:02PM +0100, Mark Brown wrote:

> > > stack and pass the pointer/size to clone3()? It saves us from having to
> > > guess what the right size we'd need. struct clone_args is extensible.

> > I can't recall or locate the specific reasoning there right now, perhaps
> > Rick or someone else can?  I'd guess there would be compat concerns for
> > things that don't go via libc which would complicate the story with
> > identifying and marking things as GCS/SS safe, it's going to be more
> > robust to just supply a GCS if the process is using it.  That said
> > having a default doesn't preclude us using the extensibility to allow
> > userspace directly to control the GCS size, I would certainly be in
> > favour of adding support for that.

> It would be good if someone provided a summary of the x86 decision (I'll
> get to those thread but most likely in September). I think we concluded
> that we can't deploy GCS entirely transparently, so we need a libc
> change (apart from the ELF annotations). Since libc is opting in to GCS,

Right, we need changes for setjmp()/longjmp() for example.

> we could also update the pthread_create() etc. to allocate the shadow
> together with the standard stack.

> Anyway, that's my preference but maybe there were good reasons not to do
> this.

Yeah, it'd be good to understand.  I've been through quite a lot of old
versions of the x86 series (I've not found them all, there's 30 versions
or something of the old series plus the current one is on v9) and the
code always appears to have been this way with changelogs that explain
the what but not the why.  For example roughly the current behaviour was
already in place in v10 of the original series:

   https://lore.kernel.org/lkml/20200429220732.31602-26-yu-cheng.yu@intel.com/

I do worry about the story for users calling the underlying clone3() API
(or legacy clone() for that matter) directly, and we would also need to
handle the initial GCS enable via prctl() - that's not insurmountable,
we could add a size argument there that only gets interpreted during the
initial enable for example.

My sense is that they deployment story is going to be smoother with
defaults being provided since it avoids dealing with the issue of what
to do if userspace creates a thread without a GCS in a GCS enabled
process but like I say I'd be totally happy to extend clone3().  I will
put some patches together for that (probably once the x86 stuff lands).
Given the size of this series it might be better split out for
manageability if nothing else.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH] shmem: add support for user extended attributes
From: Oleksandr Tymoshenko @ 2023-08-22 17:50 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Christian Brauner, Franklin “Snaipe” Mathieu, corbet,
	akpm, linux-doc, linux-kernel, linux-mm
In-Reply-To: <924ed61c-5681-aa8b-d943-7f73694d159@google.com>

[-- Attachment #1: Type: text/plain, Size: 2528 bytes --]

Thanks for working on this.

On Mon, Aug 21, 2023 at 10:52 AM Hugh Dickins <hughd@google.com> wrote:

> On Tue, 15 Aug 2023, Christian Brauner wrote:
> > On Tue, Aug 15, 2023 at 09:46:22AM +0200, Franklin “Snaipe” Mathieu
> wrote:
> > >
> > > So, it's likely that there's some more work to do in that area; I'd
> > > certainly expect the OOM killer to take the overall memory footprint
> > > of mount namespaces into account when selecting which processes to
> > > kill. It's also possible my experiment was flawed and not
> > > representative of a real-life scenario, as I clearly have interacted
> > > with misbehaving containers before, which got killed when they wrote
> > > too much to tmpfs. But then again, my experiment also didn't take
> > > memory cgroups into account.
> >
> > So mount namespaces are orthogonal to that and they would be the wrong
> > layer to handle this.
> >
> > Note that an unprivileged user (regular or via containers) on the system
> > can just exhaust all memory in various ways. Ultimately the container or
> > user would likely be taken down by in-kernel OOM or systemd-oomd or
> > similar tools under memory pressure.
> >
> > Of course, all that means is that untrusted workloads need to have
> > cgroup memory limits. That also limits tmpfs instances and prevents
> > unprivileged user from using all memory.
> >
> > If you don't set a memory limit then yes, the container might be able to
> > exhaust all memory but that's a bug in the container runtime. Also, at
> > some point the OOM killer or related userspace tools will select the
> > container init process for termination at which point all the namespaces
> > and mounts go away. That's probably what you experience as misbehaving
> > containers. The real bug there is probably that they're allowed to run
> > without memory limits in the first place.
>
> Thanks, this was a good reminder that I very much needed to look back at
> the memory cgroup limiting of xattrs on tmpfs - I'd had the patch in the
> original series, then was alarmed to find shmem_alloc_inode() using
> GFP_KERNEL, so there seemed no point in accounting the xattrs if the
> inodes were not being accounted: so dropped it temporarily.  I had
> forgotten that SLAB_ACCOUNT on the kmem_cache ensures that accounting.
>
> "tmpfs,xattr: GFP_KERNEL_ACCOUNT for simple xattrs" just sent to fix it:
>
> https://lore.kernel.org/linux-fsdevel/f6953e5a-4183-8314-38f2-40be60998615@google.com/
>
> Thanks,
> Hugh

[-- Attachment #2: Type: text/html, Size: 3244 bytes --]

^ permalink raw reply

* Re: [PATCH RFC 0/2] mm/page_alloc: free_pcppages_bulk safeguard
From: Chris Li @ 2023-08-22 17:48 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Andrew Morton, Kemeng Shi, baolin.wang, Michal Hocko, david,
	willy, linux-mm, Namhyung Kim, Greg Thelen, linux-kernel,
	John Sperbeck, Huang Ying, Alexei Starovoitov
In-Reply-To: <20230821103225.qntnsotdzuthxn2y@techsingularity.net>

Hi Mel,

Adding Alexei to the discussion.

On Mon, Aug 21, 2023 at 3:32 AM Mel Gorman <mgorman@techsingularity.net> wrote:
>
> On Thu, Aug 17, 2023 at 11:05:22PM -0700, Chris Li wrote:
> > In this patch series I want to safeguard
> > the free_pcppage_bulk against change in the
> > pcp->count outside of this function. e.g.
> > by BPF program inject on the function tracepoint.
> >
> > I break up the patches into two seperate patches
> > for the safeguard and clean up.
> >
> > Hopefully that is easier to review.
> >
> > Signed-off-by: Chris Li <chrisl@kernel.org>
>
> This sounds like a maintenance nightmare if internal state can be arbitrary
> modified by a BPF program and still expected to work properly in all cases.
> Every review would have to take into account "what if a BPF script modifies
> state behind our back?"

Thanks for the feedback.

I agree that it is hard to support if we allow BPF to change any internal
stage as a rule.  That is why it is a RFC. Would you consider it case
by case basis?
The kernel panic is bad, the first patch is actually very small. I can
also change it
to generate warnings if we detect the inconsistent state.

How about the second (clean up) patch or Keming's clean up version? I can modify
it to take out the pcp->count if the verdict is just not supporting
BPF changing internal
state at all. I do wish to get rid of the pindex_min and pindex_max.

Thanks

Chris

^ permalink raw reply

* Re: [syzbot] [mm?] kernel BUG in free_unref_page_prepare
From: Mike Kravetz @ 2023-08-22 17:25 UTC (permalink / raw)
  To: syzbot
  Cc: akpm, linux-kernel, linux-mm, llvm, muchun.song, nathan,
	ndesaulniers, syzkaller-bugs, trix, willy
In-Reply-To: <0000000000001f4d4806037fc7b8@google.com>

On 08/22/23 02:39, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:    7271b2a53042 Add linux-next specific files for 20230818
> git tree:       linux-next
> console+strace: https://syzkaller.appspot.com/x/log.txt?x=119dbe4ba80000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=1936af09cdef7dd6
> dashboard link: https://syzkaller.appspot.com/bug?extid=7423f066b632d9ff23d5
> compiler:       gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=112386f3a80000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15044717a80000
> 
> Downloadable assets:
> disk image: https://storage.googleapis.com/syzbot-assets/d81109bc02c1/disk-7271b2a5.raw.xz
> vmlinux: https://storage.googleapis.com/syzbot-assets/4b3bf8e2a4f7/vmlinux-7271b2a5.xz
> kernel image: https://storage.googleapis.com/syzbot-assets/6404cd473c1e/bzImage-7271b2a5.xz
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+7423f066b632d9ff23d5@syzkaller.appspotmail.com

Looks to be caused by Matthew's series "Remove _folio_dtor and _folio_order"
It has already been noted that this series caused issues.

-- 
Mike Kravetz


^ permalink raw reply

* [PATCH] hugetlb: Add documentation for vma_kernel_pagesize()
From: Matthew Wilcox (Oracle) @ 2023-08-22 17:24 UTC (permalink / raw)
  To: Andrew Morton, linux-mm; +Cc: Matthew Wilcox (Oracle)

This is an exported symbol, so it should have kernel-doc.  Update
it to mention folios, and point out that they might be larger than
the supported page size for this VMA.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/hugetlb.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 283cd5290515..dfadb3bfd284 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -968,9 +968,14 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
-/*
- * Return the size of the pages allocated when backing a VMA. In the majority
- * cases this will be same size as used by the page table entries.
+/**
+ * vma_kernel_pagesize - Page size granularity for this VMA.
+ * @vma: The user mapping.
+ *
+ * Folios in this VMA will be aligned to, and at least the size of the
+ * number of bytes returned by this function.
+ *
+ * Return: The default size of the folios allocated when backing a VMA.
  */
 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 {
-- 
2.40.1



^ permalink raw reply related

* Re: [PATCH 5.4 0/1] mm: allow a controlled amount of unfairness in the page lock
From: Saeed Mirzamohammadi @ 2023-08-22 17:20 UTC (permalink / raw)
  To: Greg KH
  Cc: # v4 . 16+, Andrew Morton, Ingo Molnar, Peter Zijlstra,
	Juri Lelli, Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Ben Segall, Mel Gorman, Luis Chamberlain, Kees Cook,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	linux-fsdevel@vger.kernel.org
In-Reply-To: <2023082248-parting-backed-2ab0@gregkh>



> On Aug 22, 2023, at 12:08 AM, Greg KH <gregkh@linuxfoundation.org> wrote:
> 
> On Mon, Aug 21, 2023 at 03:25:45PM -0700, Saeed Mirzamohammadi wrote:
>> We observed a 35% of regression running phoronix pts/ramspeed and also 16%
>> with unixbench. Regression is caused by the following commit:
>> dd0f194cfeb5 | mm: rewrite wait_on_page_bit_common() logic
> 
> That is not a valid git id in Linus's or in the linux-stable repo that I
> can see.  Are you sure that it is correct?

Sorry for the incorrect sha. Here are the correct ones:

  kernel_dot_org/linux-stable.git    linux-5.4.y            - c32ab1c1959a
  kernel_dot_org/torvalds_linux.git  master                 - 2a9127fcf229
  kernel_dot_org/linux-stable.git    master                 - 2a9127fcf229
---
  subject          : mm: rewrite wait_on_page_bit_common() logic
  author           : torvalds@linux-foundation.org
  author date      : 2020-07-23 17:16:49

Thanks,
Saeed

> 
> thanks,
> 
> greg k-h



^ permalink raw reply

* Re: [PATCH v2 07/13] mm: Remove HUGETLB_PAGE_DTOR
From: Mike Kravetz @ 2023-08-22 17:19 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Andrew Morton, Jens Axboe, io-uring, linux-mm
In-Reply-To: <ZOQsUaUA2JnY22Nw@casper.infradead.org>

On 08/22/23 04:32, Matthew Wilcox wrote:
> On Mon, Aug 21, 2023 at 08:13:00PM -0700, Mike Kravetz wrote:
> > On 08/16/23 16:11, Matthew Wilcox (Oracle) wrote:
> > > We can use a bit in page[1].flags to indicate that this folio belongs
> > > to hugetlb instead of using a value in page[1].dtors.  That lets
> > > folio_test_hugetlb() become an inline function like it should be.
> > > We can also get rid of NULL_COMPOUND_DTOR.
> > 
> > Not 100% sure yet, but I suspect this patch/series causes the following
> > BUG in today's linux-next.  I can do a deeper dive tomorrow.
> > 
> > # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> > # echo 0 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> > 
> > [  352.631099] page:ffffea0007a30000 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x1e8c00
> > [  352.633674] head:ffffea0007a30000 order:8 entire_mapcount:0 nr_pages_mapped:0 pincount:0
> 
> order 8?  Well, that's exciting.  This is clearly x86, so it should be
> order 9.  Did we mistakenly clear bit 0 of tail->flags?
> 
> Oh.  Oh yes, we do.
> 
> __update_and_free_hugetlb_folio:
> 
>         for (i = 0; i < pages_per_huge_page(h); i++) {
>                 subpage = folio_page(folio, i);
>                 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
>                                 1 << PG_referenced | 1 << PG_dirty |
>                                 1 << PG_active | 1 << PG_private |
>                                 1 << PG_writeback);
>         }
> 
> locked		PF_NO_TAIL
> error		PF_NO_TAIL
> referenced	PF_HEAD
> dirty		PF_HEAD
> active		PF_HEAD
> private		PF_ANY
> writeback	PF_NO_TAIL
> 
> So it seems to me there's no way to set any of these bits other than
> PG_private.  And, well, you control PG_private in hugetlbfs.  Do you
> ever set it on tail pages?

Nope

> 
> I think you can remove this entire loop and be happy?

I believe you are correct.  Although, this is clearing flags in the head
page as well as tail pages.  So, I think we still need to clear referenced,
dirty and active as well as private on the head page.

Will take a look today.
-- 
Mike Kravetz


^ permalink raw reply

* Re: [PATCH v6] mm/filemap: remove hugetlb special casing in filemap.c
From: Matthew Wilcox @ 2023-08-22 17:15 UTC (permalink / raw)
  To: Sidhartha Kumar; +Cc: linux-kernel, linux-mm, akpm, songmuchun, mike.kravetz
In-Reply-To: <20230817181836.103744-1-sidhartha.kumar@oracle.com>

On Thu, Aug 17, 2023 at 11:18:36AM -0700, Sidhartha Kumar wrote:
> @@ -890,8 +867,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
>  					unsigned long address)
>  {
>  	pgoff_t pgoff;
> -	if (unlikely(is_vm_hugetlb_page(vma)))
> -		return linear_hugepage_index(vma, address);
>  	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
>  	pgoff += vma->vm_pgoff;
>  	return pgoff;

This is the last use of linear_hugepage_index(), so please remove the
function and its declaration too.


^ permalink raw reply

* Re: [PATCH mm-unstable v1 1/4] mm/swap: stop using page->private on tail pages for THP_SWAP
From: David Hildenbrand @ 2023-08-22 17:14 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: linux-kernel, linux-mm, linux-arm-kernel, Andrew Morton,
	Matthew Wilcox, Peter Xu, Catalin Marinas, Will Deacon,
	Hugh Dickins, Seth Jennings, Dan Streetman, Vitaly Wool
In-Reply-To: <CAJD7tkZNjwq6sWE5hCh6rYpSdD0quGFevXWq9eO_t3tHQBTNmA@mail.gmail.com>

>>
>> +static inline swp_entry_t page_swap_entry(struct page *page)
>> +{
>> +       struct folio *folio = page_folio(page);
>> +       swp_entry_t entry = folio_swap_entry(folio);
>> +
>> +       entry.val += page - &folio->page;
> 
> Would it be better to use folio_page_idx() here?

Sounds reasonable!

-- 
Cheers,

David / dhildenb



^ permalink raw reply

* Re: [PATCH v4 21/36] arm64/mm: Implement map_shadow_stack()
From: Mark Brown @ 2023-08-22 17:05 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Will Deacon, Jonathan Corbet, Andrew Morton, Marc Zyngier,
	Oliver Upton, James Morse, Suzuki K Poulose, Arnd Bergmann,
	Oleg Nesterov, Eric Biederman, Kees Cook, Shuah Khan,
	Rick P. Edgecombe, Deepak Gupta, Ard Biesheuvel, Szabolcs Nagy,
	H.J. Lu, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	linux-arm-kernel, linux-doc, kvmarm, linux-fsdevel, linux-arch,
	linux-mm, linux-kselftest, linux-kernel, linux-riscv
In-Reply-To: <ZOTlBpAbSX6TSZzW@arm.com>

[-- Attachment #1: Type: text/plain, Size: 461 bytes --]

On Tue, Aug 22, 2023 at 05:40:38PM +0100, Catalin Marinas wrote:
> On Fri, Aug 18, 2023 at 06:08:52PM +0100, Mark Brown wrote:

> > mprotect() uses arch_validate_flags() which we're already having cover
> > this so it's already covered.

> I searched the patches and there's no change to the arm64
> arch_validate_flags(). Maybe I missed it.

It's in v5, the update to arch_validate_flags() was one of your comments
from another patch in the series.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH 1/2] pcpcntr: add group allocation/free
From: Dennis Zhou @ 2023-08-22 17:02 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: linux-kernel, tj, cl, akpm, shakeelb, linux-mm
In-Reply-To: <20230821202829.2163744-2-mjguzik@gmail.com>

Hi Mateusz,

On Mon, Aug 21, 2023 at 10:28:28PM +0200, Mateusz Guzik wrote:
> Allocations and frees are globally serialized on the pcpu lock (and the
> CPU hotplug lock if enabled, which is the case on Debian).
> 
> At least one frequent consumer allocates 4 back-to-back counters (and
> frees them in the same manner), exacerbating the problem.
> 
> While this does not fully remedy scalability issues, it is a step
> towards that goal and provides immediate relief.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>

As I mentioned yesterday, I like this approach because instead of making
percpu smarter, we're batching against the higher level inits which I'm
assuming will have additional synchronization requirements.

Below is mostly style changes to conform and a few nits wrt naming.

> ---
>  include/linux/percpu_counter.h | 19 ++++++++---
>  lib/percpu_counter.c           | 61 ++++++++++++++++++++++++----------
>  2 files changed, 57 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 75b73c83bc9d..ff5850b07124 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -30,17 +30,26 @@ struct percpu_counter {
>  
>  extern int percpu_counter_batch;
>  
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key);
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count);

1. Can we move count to before the lock_class_key?
2. count is an overloaded term between percpu_counters and
percpu_refcount. Maybe `nr` or `nr_counters` is better?

>  
> -#define percpu_counter_init(fbc, value, gfp)				\
> +#define percpu_counter_init_many(fbc, value, gfp, count)		\
>  	({								\
>  		static struct lock_class_key __key;			\
>  									\
> -		__percpu_counter_init(fbc, value, gfp, &__key);		\
> +		__percpu_counter_init_many(fbc, value, gfp, &__key, count);\
>  	})
>  
> -void percpu_counter_destroy(struct percpu_counter *fbc);
> +
> +#define percpu_counter_init(fbc, value, gfp)				\
> +	percpu_counter_init_many(fbc, value, gfp, 1)
> +
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count);
> +static inline void percpu_counter_destroy(struct percpu_counter *fbc)
> +{
> +	percpu_counter_destroy_many(fbc, 1);
> +}
> +
>  void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
>  void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
>  			      s32 batch);
> diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
> index 5004463c4f9f..2a33cf23df55 100644
> --- a/lib/percpu_counter.c
> +++ b/lib/percpu_counter.c
> @@ -151,48 +151,73 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
>  }
>  EXPORT_SYMBOL(__percpu_counter_sum);
>  
> -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> -			  struct lock_class_key *key)
> +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
> +			  struct lock_class_key *key, u32 count)
>  {
>  	unsigned long flags __maybe_unused;
> +	s32 __percpu *counters;
> +	u32 i;
>  
> -	raw_spin_lock_init(&fbc->lock);
> -	lockdep_set_class(&fbc->lock, key);
> -	fbc->count = amount;
> -	fbc->counters = alloc_percpu_gfp(s32, gfp);
> -	if (!fbc->counters)
> +	counters = __alloc_percpu_gfp(sizeof(*counters) * count,
> +				      sizeof(*counters), gfp);


So while a bit moot, I think it'd be nice to clarify what we should do
here wrt alignment and batch allocation. There has been a case in the
past where alignment > size.

This is from my batch api implementation:
+       /*
+        * When allocating a batch we need to align the size so that each
+        * element in the batch will have the appropriate alignment.
+        */
+       size = ALIGN(size, align);

So I think some variation of:
    element_size = ALIGN(sizeof(*counters, __alignof__(*counters)));
    counters = __alloc_percpu_gfp(nr * element_size, __alignof__(*counters), gfp);

While this isn't necessary here for s32's, I think it would be nice to
just set the precedent so we preserve alignment asks for future users if
they use this as a template.

> +	if (!counters) {
> +		fbc[0].counters = NULL;
>  		return -ENOMEM;
> +	}
>  
> -	debug_percpu_counter_activate(fbc);
> +	for (i = 0; i < count; i++) {
> +		raw_spin_lock_init(&fbc[i].lock);
> +		lockdep_set_class(&fbc[i].lock, key);
> +#ifdef CONFIG_HOTPLUG_CPU
> +		INIT_LIST_HEAD(&fbc[i].list);
> +#endif
> +		fbc[i].count = amount;
> +		fbc[i].counters = &counters[i];

This would have to become some (void *) math tho with element_size.

> +
> +		debug_percpu_counter_activate(&fbc[i]);
> +	}
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> -	INIT_LIST_HEAD(&fbc->list);
>  	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_add(&fbc->list, &percpu_counters);
> +	for (i = 0; i < count; i++) {
> +		list_add(&fbc[i].list, &percpu_counters);
> +	}

nit: we don't add {} for single line loops.

>  	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>  #endif
>  	return 0;
>  }
> -EXPORT_SYMBOL(__percpu_counter_init);
> +EXPORT_SYMBOL(__percpu_counter_init_many);
>  
> -void percpu_counter_destroy(struct percpu_counter *fbc)
> +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 count)
>  {
>  	unsigned long flags __maybe_unused;
> +	u32 i;
>  
> -	if (!fbc->counters)
> +	if (WARN_ON_ONCE(!fbc))
>  		return;
>  
> -	debug_percpu_counter_deactivate(fbc);
> +	if (!fbc[0].counters)
> +		return;
> +
> +	for (i = 0; i < count; i++) {
> +		debug_percpu_counter_deactivate(&fbc[i]);
> +	}
>  

nit: no {}.

>  #ifdef CONFIG_HOTPLUG_CPU
>  	spin_lock_irqsave(&percpu_counters_lock, flags);
> -	list_del(&fbc->list);
> +	for (i = 0; i < count; i++) {
> +		list_del(&fbc[i].list);
> +	}

nit: no {}.

>  	spin_unlock_irqrestore(&percpu_counters_lock, flags);
>  #endif
> -	free_percpu(fbc->counters);
> -	fbc->counters = NULL;
> +
> +	free_percpu(fbc[0].counters);
> +
> +	for (i = 0; i < count; i++) {
> +		fbc[i].counters = NULL;
> +	}

nit: no {}.

>  }
> -EXPORT_SYMBOL(percpu_counter_destroy);
> +EXPORT_SYMBOL(percpu_counter_destroy_many);
>  
>  int percpu_counter_batch __read_mostly = 32;
>  EXPORT_SYMBOL(percpu_counter_batch);
> -- 
> 2.39.2
> 

Thanks,
Dennis


^ permalink raw reply

* Re: [PATCH v4 18/36] arm64/gcs: Context switch GCS state for EL0
From: Mark Brown @ 2023-08-22 17:01 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Will Deacon, Jonathan Corbet, Andrew Morton, Marc Zyngier,
	Oliver Upton, James Morse, Suzuki K Poulose, Arnd Bergmann,
	Oleg Nesterov, Eric Biederman, Kees Cook, Shuah Khan,
	Rick P. Edgecombe, Deepak Gupta, Ard Biesheuvel, Szabolcs Nagy,
	H.J. Lu, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	linux-arm-kernel, linux-doc, kvmarm, linux-fsdevel, linux-arch,
	linux-mm, linux-kselftest, linux-kernel, linux-riscv
In-Reply-To: <ZOTjnmwwZ+iMsi6Y@arm.com>

[-- Attachment #1: Type: text/plain, Size: 1195 bytes --]

On Tue, Aug 22, 2023 at 05:34:38PM +0100, Catalin Marinas wrote:
> On Wed, Aug 16, 2023 at 07:15:53PM +0100, Mark Brown wrote:

> > Right, it's for the GCS memory rather than the registers.  I'm fairly
> > sure it's excessive but but was erring on the side of caution until I
> > have convinced myself that the interactions between GCS barriers and
> > regular barriers were doing the right thing, until we have physical
> > implementations to contend with I'd guess the practical impact will be
> > minimal.

> Well, I'd say either we are clear about why it's (not) needed or we ask
> the architects to clarify the spec. I haven't checked your latest
> series but in principle I don't like adding barriers just because we are
> not sure they are needed (and I don't think having hardware eventually
> changes this).

I should probably also mention that another part of my thinking was that
when we implement GCS for EL1 we'll need to ensure that everything is
synced during the pivot of the EL1 GCS (each EL needs an independent
GCS).  We won't be able to rely on having an ERET there so it's going to
have more stringent requirements, I was partly punting to think things
through fully there.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH mm-unstable v1 1/4] mm/swap: stop using page->private on tail pages for THP_SWAP
From: Yosry Ahmed @ 2023-08-22 17:00 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-kernel, linux-mm, linux-arm-kernel, Andrew Morton,
	Matthew Wilcox, Peter Xu, Catalin Marinas, Will Deacon,
	Hugh Dickins, Seth Jennings, Dan Streetman, Vitaly Wool
In-Reply-To: <20230821160849.531668-2-david@redhat.com>

On Mon, Aug 21, 2023 at 9:09 AM David Hildenbrand <david@redhat.com> wrote:
>
> Let's stop using page->private on tail pages, making it possible to
> just unconditionally reuse that field in the tail pages of large folios.
>
> The remaining usage of the private field for THP_SWAP is in the THP
> splitting code (mm/huge_memory.c), that we'll handle separately later.
>
> Update the THP_SWAP documentation and sanity checks in mm_types.h and
> __split_huge_page_tail().
>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/arm64/mm/mteswap.c  |  5 +++--
>  include/linux/mm_types.h | 12 +-----------
>  include/linux/swap.h     |  9 +++++++++
>  mm/huge_memory.c         | 15 ++++++---------
>  mm/memory.c              |  2 +-
>  mm/rmap.c                |  2 +-
>  mm/swap_state.c          |  5 +++--
>  mm/swapfile.c            |  4 ++--
>  8 files changed, 26 insertions(+), 28 deletions(-)
>
> diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
> index cd508ba80ab1..a31833e3ddc5 100644
> --- a/arch/arm64/mm/mteswap.c
> +++ b/arch/arm64/mm/mteswap.c
> @@ -33,8 +33,9 @@ int mte_save_tags(struct page *page)
>
>         mte_save_page_tags(page_address(page), tag_storage);
>
> -       /* page_private contains the swap entry.val set in do_swap_page */
> -       ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL);
> +       /* lookup the swap entry.val from the page */
> +       ret = xa_store(&mte_pages, page_swap_entry(page).val, tag_storage,
> +                      GFP_KERNEL);
>         if (WARN(xa_is_err(ret), "Failed to store MTE tags")) {
>                 mte_free_tag_storage(tag_storage);
>                 return xa_err(ret);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index b9b6c88875b9..61361f1750c3 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -333,11 +333,8 @@ struct folio {
>                         atomic_t _pincount;
>  #ifdef CONFIG_64BIT
>                         unsigned int _folio_nr_pages;
> -                       /* 4 byte gap here */
> -       /* private: the union with struct page is transitional */
> -                       /* Fix THP_SWAP to not use tail->private */
> -                       unsigned long _private_1;
>  #endif
> +       /* private: the union with struct page is transitional */
>                 };
>                 struct page __page_1;
>         };
> @@ -358,9 +355,6 @@ struct folio {
>         /* public: */
>                         struct list_head _deferred_list;
>         /* private: the union with struct page is transitional */
> -                       unsigned long _avail_2a;
> -                       /* Fix THP_SWAP to not use tail->private */
> -                       unsigned long _private_2a;
>                 };
>                 struct page __page_2;
>         };
> @@ -385,9 +379,6 @@ FOLIO_MATCH(memcg_data, memcg_data);
>                         offsetof(struct page, pg) + sizeof(struct page))
>  FOLIO_MATCH(flags, _flags_1);
>  FOLIO_MATCH(compound_head, _head_1);
> -#ifdef CONFIG_64BIT
> -FOLIO_MATCH(private, _private_1);
> -#endif
>  #undef FOLIO_MATCH
>  #define FOLIO_MATCH(pg, fl)                                            \
>         static_assert(offsetof(struct folio, fl) ==                     \
> @@ -396,7 +387,6 @@ FOLIO_MATCH(flags, _flags_2);
>  FOLIO_MATCH(compound_head, _head_2);
>  FOLIO_MATCH(flags, _flags_2a);
>  FOLIO_MATCH(compound_head, _head_2a);
> -FOLIO_MATCH(private, _private_2a);
>  #undef FOLIO_MATCH
>
>  /**
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index bb5adc604144..84fe0e94f5cd 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -339,6 +339,15 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio)
>         return entry;
>  }
>
> +static inline swp_entry_t page_swap_entry(struct page *page)
> +{
> +       struct folio *folio = page_folio(page);
> +       swp_entry_t entry = folio_swap_entry(folio);
> +
> +       entry.val += page - &folio->page;

Would it be better to use folio_page_idx() here?

> +       return entry;
> +}
> +
>  static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry)
>  {
>         folio->private = (void *)entry.val;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index cc2f65f8cc62..c04702ae71d2 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2446,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
>         page_tail->index = head->index + tail;
>
>         /*
> -        * page->private should not be set in tail pages with the exception
> -        * of swap cache pages that store the swp_entry_t in tail pages.
> -        * Fix up and warn once if private is unexpectedly set.
> -        *
> -        * What of 32-bit systems, on which folio->_pincount overlays
> -        * head[1].private?  No problem: THP_SWAP is not enabled on 32-bit, and
> -        * pincount must be 0 for folio_ref_freeze() to have succeeded.
> +        * page->private should not be set in tail pages. Fix up and warn once
> +        * if private is unexpectedly set.
>          */
> -       if (!folio_test_swapcache(page_folio(head))) {
> -               VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
> +       if (unlikely(page_tail->private)) {
> +               VM_WARN_ON_ONCE_PAGE(true, page_tail);
>                 page_tail->private = 0;
>         }
> +       if (PageSwapCache(head))
> +               set_page_private(page_tail, (unsigned long)head->private + tail);
>
>         /* Page flags must be visible before we make the page non-compound. */
>         smp_wmb();
> diff --git a/mm/memory.c b/mm/memory.c
> index d003076b218d..ff13242c1589 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3882,7 +3882,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>                  * changed.
>                  */
>                 if (unlikely(!folio_test_swapcache(folio) ||
> -                            page_private(page) != entry.val))
> +                            page_swap_entry(page).val != entry.val))
>                         goto out_page;
>
>                 /*
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 1f04debdc87a..ec7f8e6c9e48 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1647,7 +1647,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
>                          */
>                         dec_mm_counter(mm, mm_counter(&folio->page));
>                 } else if (folio_test_anon(folio)) {
> -                       swp_entry_t entry = { .val = page_private(subpage) };
> +                       swp_entry_t entry = page_swap_entry(subpage);
>                         pte_t swp_pte;
>                         /*
>                          * Store the swap location in the pte.
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 01f15139b7d9..2f2417810052 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -100,6 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
>
>         folio_ref_add(folio, nr);
>         folio_set_swapcache(folio);
> +       folio_set_swap_entry(folio, entry);
>
>         do {
>                 xas_lock_irq(&xas);
> @@ -113,7 +114,6 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
>                                 if (shadowp)
>                                         *shadowp = old;
>                         }
> -                       set_page_private(folio_page(folio, i), entry.val + i);
>                         xas_store(&xas, folio);
>                         xas_next(&xas);
>                 }
> @@ -154,9 +154,10 @@ void __delete_from_swap_cache(struct folio *folio,
>         for (i = 0; i < nr; i++) {
>                 void *entry = xas_store(&xas, shadow);
>                 VM_BUG_ON_PAGE(entry != folio, entry);
> -               set_page_private(folio_page(folio, i), 0);
>                 xas_next(&xas);
>         }
> +       entry.val = 0;
> +       folio_set_swap_entry(folio, entry);
>         folio_clear_swapcache(folio);
>         address_space->nrpages -= nr;
>         __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index d46933adf789..bd9d904671b9 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -3369,7 +3369,7 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
>
>  struct swap_info_struct *page_swap_info(struct page *page)
>  {
> -       swp_entry_t entry = { .val = page_private(page) };
> +       swp_entry_t entry = page_swap_entry(page);
>         return swp_swap_info(entry);
>  }
>
> @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping);
>
>  pgoff_t __page_file_index(struct page *page)
>  {
> -       swp_entry_t swap = { .val = page_private(page) };
> +       swp_entry_t swap = page_swap_entry(page);
>         return swp_offset(swap);
>  }
>  EXPORT_SYMBOL_GPL(__page_file_index);
> --
> 2.41.0
>
>


^ permalink raw reply

* Re: [PATCH v2 2/2] selftests: cachestat: catch failing fsync test on tmpfs
From: Nhat Pham @ 2023-08-22 16:58 UTC (permalink / raw)
  To: Andre Przywara
  Cc: Shuah Khan, Johannes Weiner, linux-kselftest, linux-mm,
	linux-kernel
In-Reply-To: <CAKEwX=NEZjeFFQsC3gWLgkh=PMHDh44Uzo3aZFH07y1xL=VvWg@mail.gmail.com>

On Tue, Aug 22, 2023 at 8:55 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Mon, Aug 21, 2023 at 9:05 AM Andre Przywara <andre.przywara@arm.com> wrote:
> >
> > The cachestat kselftest runs a test on a normal file, which is created
> > temporarily in the current directory. Among the tests it runs there is a
> > call to fsync(), which is expected to clean all dirty pages used by the
> > file.
> > However the tmpfs filesystem implements fsync() as noop_fsync(), so the
> > call will not even attempt to clean anything when this test file happens
> > to live on a tmpfs instance. This happens in an initramfs, or when the
> > current directory is in /dev/shm or sometimes /tmp.
> >
> > To avoid this test failing wrongly, use statfs() to check which filesystem
> > the test file lives on. If that is "tmpfs", we skip the fsync() test.
> >
> > Since the fsync test is only one part of the "normal file" test, we now
> > execute this twice, skipping the fsync part on the first call.
> > This way only the second test, including the fsync part, would be skipped.
> >
> > Signed-off-by: Andre Przywara <andre.przywara@arm.com>
> > ---
> >  .../selftests/cachestat/test_cachestat.c      | 62 ++++++++++++++-----
> >  1 file changed, 47 insertions(+), 15 deletions(-)
> >
> > diff --git a/tools/testing/selftests/cachestat/test_cachestat.c b/tools/testing/selftests/cachestat/test_cachestat.c
> > index 8f8f46c24846d..4804c7dc7b312 100644
> > --- a/tools/testing/selftests/cachestat/test_cachestat.c
> > +++ b/tools/testing/selftests/cachestat/test_cachestat.c
> > @@ -4,10 +4,12 @@
> >  #include <stdio.h>
> >  #include <stdbool.h>
> >  #include <linux/kernel.h>
> > +#include <linux/magic.h>
> >  #include <linux/mman.h>
> >  #include <sys/mman.h>
> >  #include <sys/shm.h>
> >  #include <sys/syscall.h>
> > +#include <sys/vfs.h>
> >  #include <unistd.h>
> >  #include <string.h>
> >  #include <fcntl.h>
> > @@ -15,7 +17,7 @@
> >
> >  #include "../kselftest.h"
> >
> > -#define NR_TESTS       8
> > +#define NR_TESTS       9
> >
> >  static const char * const dev_files[] = {
> >         "/dev/zero", "/dev/null", "/dev/urandom",
> > @@ -91,6 +93,20 @@ bool write_exactly(int fd, size_t filesize)
> >         return ret;
> >  }
> >
> > +/*
> > + * fsync() is implemented via noop_fsync() on tmpfs. This makes the fsync()
> > + * test fail below, so we need to check for test file living on a tmpfs.
> > + */
> > +static bool is_on_tmpfs(int fd)
> > +{
> > +       struct statfs statfs_buf;
> > +
> > +       if (fstatfs(fd, &statfs_buf))
> > +               return false;
> > +
> > +       return statfs_buf.f_type == TMPFS_MAGIC;
> > +}
> > +
> >  /*
> >   * Open/create the file at filename, (optionally) write random data to it
> >   * (exactly num_pages), then test the cachestat syscall on this file.
> > @@ -98,13 +114,13 @@ bool write_exactly(int fd, size_t filesize)
> >   * If test_fsync == true, fsync the file, then check the number of dirty
> >   * pages.
> >   */
> > -bool test_cachestat(const char *filename, bool write_random, bool create,
> > -               bool test_fsync, unsigned long num_pages, int open_flags,
> > -               mode_t open_mode)
> > +static int test_cachestat(const char *filename, bool write_random, bool create,
> > +                         bool test_fsync, unsigned long num_pages,
> > +                         int open_flags, mode_t open_mode)
> >  {
> Hmm would the semantic change a bit here?
>
> Previously, this function returned true if passed.
> But it seems like KSFT_PASS is defined as 0:
> https://github.com/torvalds/linux/blob/9e38be7/tools/testing/selftests/kselftest.h#L74
>
> So maybe we have to change from:
>
> if (test_<test-name>())
>
>
> to:
>
> if (!test_<test-name>)())
>
> or, explicitly as:
>
> if (test_<test-name>() == KSFT_PASS)
Ah never mind, ignore this. I didn't see your change
down in the main function.

Everything LGTM!
Acked-by: Nhat Pham <nphamcs@gmail.com>
>
>
> >         size_t PS = sysconf(_SC_PAGESIZE);
> >         int filesize = num_pages * PS;
> > -       bool ret = true;
> > +       int ret = KSFT_PASS;
> >         long syscall_ret;
> >         struct cachestat cs;
> >         struct cachestat_range cs_range = { 0, filesize };
> > @@ -113,7 +129,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
> >
> >         if (fd == -1) {
> >                 ksft_print_msg("Unable to create/open file.\n");
> > -               ret = false;
> > +               ret = KSFT_FAIL;
> >                 goto out;
> >         } else {
> >                 ksft_print_msg("Create/open %s\n", filename);
> > @@ -122,7 +138,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
> >         if (write_random) {
> >                 if (!write_exactly(fd, filesize)) {
> >                         ksft_print_msg("Unable to access urandom.\n");
> > -                       ret = false;
> > +                       ret = KSFT_FAIL;
> >                         goto out1;
> >                 }
> >         }
> > @@ -133,7 +149,7 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
> >
> >         if (syscall_ret) {
> >                 ksft_print_msg("Cachestat returned non-zero.\n");
> > -               ret = false;
> > +               ret = KSFT_FAIL;
> >                 goto out1;
> >
> >         } else {
> > @@ -143,15 +159,17 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
> >                         if (cs.nr_cache + cs.nr_evicted != num_pages) {
> >                                 ksft_print_msg(
> >                                         "Total number of cached and evicted pages is off.\n");
> > -                               ret = false;
> > +                               ret = KSFT_FAIL;
> >                         }
> >                 }
> >         }
> >
> >         if (test_fsync) {
> > -               if (fsync(fd)) {
> > +               if (is_on_tmpfs(fd)) {
> > +                       ret = KSFT_SKIP;
> > +               } else if (fsync(fd)) {
> >                         ksft_print_msg("fsync fails.\n");
> > -                       ret = false;
> > +                       ret = KSFT_FAIL;
> >                 } else {
> >                         syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0);
> >
> > @@ -162,13 +180,13 @@ bool test_cachestat(const char *filename, bool write_random, bool create,
> >                                 print_cachestat(&cs);
> >
> >                                 if (cs.nr_dirty) {
> > -                                       ret = false;
> > +                                       ret = KSFT_FAIL;
> >                                         ksft_print_msg(
> >                                                 "Number of dirty should be zero after fsync.\n");
> >                                 }
> >                         } else {
> >                                 ksft_print_msg("Cachestat (after fsync) returned non-zero.\n");
> > -                               ret = false;
> > +                               ret = KSFT_FAIL;
> >                                 goto out1;
> >                         }
> >                 }
> > @@ -259,7 +277,7 @@ int main(void)
> >                 const char *dev_filename = dev_files[i];
> >
> >                 if (test_cachestat(dev_filename, false, false, false,
> > -                       4, O_RDONLY, 0400))
> > +                       4, O_RDONLY, 0400) == KSFT_PASS)
> >                         ksft_test_result_pass("cachestat works with %s\n", dev_filename);
> >                 else {
> >                         ksft_test_result_fail("cachestat fails with %s\n", dev_filename);
> > @@ -268,13 +286,27 @@ int main(void)
> >         }
> >
> >         if (test_cachestat("tmpfilecachestat", true, true,
> > -               true, 4, O_CREAT | O_RDWR, 0400 | 0600))
> > +               false, 4, O_CREAT | O_RDWR, 0600) == KSFT_PASS)
> >                 ksft_test_result_pass("cachestat works with a normal file\n");
> >         else {
> >                 ksft_test_result_fail("cachestat fails with normal file\n");
> >                 ret = 1;
> >         }
> >
> > +       switch (test_cachestat("tmpfilecachestat", true, true,
> > +               true, 4, O_CREAT | O_RDWR, 0600)) {
> > +       case KSFT_FAIL:
> > +               ksft_test_result_fail("cachestat fsync fails with normal file\n");
> > +               ret = KSFT_FAIL;
> > +               break;
> > +       case KSFT_PASS:
> > +               ksft_test_result_pass("cachestat fsync works with a normal file\n");
> > +               break;
> > +       case KSFT_SKIP:
> > +               ksft_test_result_skip("tmpfilecachestat is on tmpfs\n");
> > +               break;
> > +       }
> > +
> >         if (test_cachestat_shmem())
> >                 ksft_test_result_pass("cachestat works with a shmem file\n");
> >         else {
> > --
> > 2.25.1
> >


^ permalink raw reply

* Re: [PATCH v4 03/36] arm64/gcs: Document the ABI for Guarded Control Stacks
From: Catalin Marinas @ 2023-08-22 16:49 UTC (permalink / raw)
  To: Mark Brown
  Cc: Will Deacon, Jonathan Corbet, Andrew Morton, Marc Zyngier,
	Oliver Upton, James Morse, Suzuki K Poulose, Arnd Bergmann,
	Oleg Nesterov, Eric Biederman, Kees Cook, Shuah Khan,
	Rick P. Edgecombe, Deepak Gupta, Ard Biesheuvel, Szabolcs Nagy,
	H.J. Lu, Paul Walmsley, Palmer Dabbelt, Albert Ou,
	linux-arm-kernel, linux-doc, kvmarm, linux-fsdevel, linux-arch,
	linux-mm, linux-kselftest, linux-kernel, linux-riscv
In-Reply-To: <aaea542c-929c-4c9b-8caa-ca67e0eb9c1e@sirena.org.uk>

On Fri, Aug 18, 2023 at 08:38:02PM +0100, Mark Brown wrote:
> On Fri, Aug 18, 2023 at 06:29:54PM +0100, Catalin Marinas wrote:
> > A related question - it may have been discussed intensively on the x86
> > thread (I may read it sometime) - why not have the libc map the shadow
> 
> Your assumption that this is a single thread feels optimistic there.

Yeah and I unfortunately ignored all of them.

> > stack and pass the pointer/size to clone3()? It saves us from having to
> > guess what the right size we'd need. struct clone_args is extensible.
> 
> I can't recall or locate the specific reasoning there right now, perhaps
> Rick or someone else can?  I'd guess there would be compat concerns for
> things that don't go via libc which would complicate the story with
> identifying and marking things as GCS/SS safe, it's going to be more
> robust to just supply a GCS if the process is using it.  That said
> having a default doesn't preclude us using the extensibility to allow
> userspace directly to control the GCS size, I would certainly be in
> favour of adding support for that.

It would be good if someone provided a summary of the x86 decision (I'll
get to those thread but most likely in September). I think we concluded
that we can't deploy GCS entirely transparently, so we need a libc
change (apart from the ELF annotations). Since libc is opting in to GCS,
we could also update the pthread_create() etc. to allocate the shadow
together with the standard stack.

Anyway, that's my preference but maybe there were good reasons not to do
this.

-- 
Catalin


^ permalink raw reply

* Re: [PATCH 2/3] mm: memcg: add a helper for non-unified stats flushing
From: Yosry Ahmed @ 2023-08-22 16:48 UTC (permalink / raw)
  To: Michal Koutný
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Roman Gushchin,
	Shakeel Butt, Muchun Song, Ivan Babrou, Tejun Heo, linux-mm,
	cgroups, linux-kernel
In-Reply-To: <5y3e32ek6owren3q5e3gzonzxzdhs53ihywj3mtbhz56hnizfy@fctafygsnfaq>

On Tue, Aug 22, 2023 at 9:35 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Tue, Aug 22, 2023 at 09:00:06AM -0700, Yosry Ahmed <yosryahmed@google.com> wrote:
> > We can probably also kick FLUSH_TIME forward as well.
>
> True, they guard same work.
>
> > Perhaps I can move both into do_stats_flush() then. If I understand
> > correctly this is what you mean?
>
> Yes.
>
> > What do you think?
>
> The latter is certainly better looking code.
>
> I wasn't sure at first about moving stats_flush_threshold reset before
> actual flush but on second thought it should not be a significant change
> - readers: may skip flushing, the values that they read should still be
>   below the error threshold,

Unified readers will skip anyway as there's an ongoing flush,
non-unified readers don't check the threshold anyway (with this patch
series). So for readers it should not be a change.

> - writers: may be slowed down a bit (because of conditional atomic write
>   optimization in memcg_rstat_updates), presumably not on average
>   though.

Yeah writers will start doing atomic writes once a flush starts
instead of once it ends. I don't think it will matter in practice
though. The optimization is only effective if we manage to surpass the
threshold before the periodic flusher (or any unified flusher) kicks
in and resets it. If we start doing atomic writes earlier, then we
will also stop earlier; the number of atomic writes should stay the
same.

I think the only difference will be the scenario where we start atomic
writes early but the periodic flush happens before we reach the
threshold, in which case we aren't doing a lot of updates anyway.

I hope this makes _any_ sense :)

>
> So the latter should work too.
>

I will include that in v2. I will wait for a bit of further review
comments on this version first.

Thanks for taking a look!

> Michal

^ permalink raw reply

* Re: [PATCH v5 11/37] mm: Define VM_SHADOW_STACK for arm64 when we support GCS
From: Edgecombe, Rick P @ 2023-08-22 16:47 UTC (permalink / raw)
  To: broonie@kernel.org, david@redhat.com
  Cc: corbet@lwn.net, ardb@kernel.org, maz@kernel.org, shuah@kernel.org,
	Szabolcs.Nagy@arm.com, keescook@chromium.org, james.morse@arm.com,
	debug@rivosinc.com, akpm@linux-foundation.org, palmer@dabbelt.com,
	catalin.marinas@arm.com, hjl.tools@gmail.com,
	paul.walmsley@sifive.com, linux-mm@kvack.org,
	aou@eecs.berkeley.edu, oleg@redhat.com, arnd@arndb.de,
	ebiederm@xmission.com, will@kernel.org, suzuki.poulose@arm.com,
	linux-kernel@vger.kernel.org, kvmarm@lists.linux.dev,
	linux-doc@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	linux-fsdevel@vger.kernel.org, linux-kselftest@vger.kernel.org,
	linux-arch@vger.kernel.org, oliver.upton@linux.dev,
	linux-riscv@lists.infradead.org
In-Reply-To: <54b2c1e5-a99d-42c0-b686-1b5cbb849581@sirena.org.uk>

On Tue, 2023-08-22 at 16:41 +0100, Mark Brown wrote:
> On Tue, Aug 22, 2023 at 05:21:09PM +0200, David Hildenbrand wrote:
> > On 22.08.23 15:56, Mark Brown wrote:
> 
> > > @@ -372,7 +372,17 @@ extern unsigned int kobjsize(const void
> > > *objp);
> > >     * having a PAGE_SIZE guard gap.
> > >     */
> > >    # define VM_SHADOW_STACK	VM_HIGH_ARCH_5
> > > -#else
> > > +#endif
> > > +
> > > +#if defined(CONFIG_ARM64_GCS)
> > > +/*
> > > + * arm64's Guarded Control Stack implements similar
> > > functionality and
> > > + * has similar constraints to shadow stacks.
> > > + */
> > > +# define VM_SHADOW_STACK	VM_HIGH_ARCH_5
> > > +#endif
> 
> > Shouldn't that all just merged with the previous define(s)?
> 
> > Also, I wonder if we now want to have CONFIG_HAVE_ARCH_SHADOW_STACK
> > or
> > similar.
> 
> I can certainly update it to do that, I was just trying to fit in
> with
> how the code was written on the basis that there was probably a good
> reason for it that had been discussed somewhere.  I can send an
> incremental patch for this on top of the x86 patches assuming they go
> in
> during the merge window.

There was something like that on the x86 series way back, but it was
dropped[0]. IIRC risc-v was going to try to do something other than
VM_SHADOW_STACK, so they may conflict some day. But in the meantime,
adding a CONFIG_HAVE_ARCH_SHADOW_STACK here in the arm series makes
sense to me.

[0]
https://lore.kernel.org/lkml/d09e952d8ae696f687f0787dfeb7be7699c02913.camel@intel.com/

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox