linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/3] mm: make vm_area_struct anon_name field RCU-safe
@ 2024-01-22  7:13 Suren Baghdasaryan
  2024-01-22  7:13 ` [PATCH 2/3] mm: add mm_struct sequence number to detect write locks Suren Baghdasaryan
  2024-01-22  7:13 ` [PATCH 3/3] mm/maps: read proc/pid/maps under RCU Suren Baghdasaryan
  0 siblings, 2 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2024-01-22  7:13 UTC (permalink / raw)
  To: akpm
  Cc: viro, brauner, jack, dchinner, casey, ben.wolsieffer, paulmck,
	david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team, surenb

For lockless /proc/pid/maps reading we have to ensure all the fields
used when generating the output are RCU-safe. The only pointer fields
in vm_area_struct which are used to generate that file's output are
vm_file and anon_name. vm_file is RCU-safe but anon_name is not. Make
anon_name RCU-safe as well.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm_inline.h | 10 +++++++++-
 include/linux/mm_types.h  |  3 ++-
 mm/madvise.c              | 30 ++++++++++++++++++++++++++----
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f4fe593c1400..bbdb0ca857f1 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -389,7 +389,7 @@ static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
 	struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
 
 	if (anon_name)
-		new_vma->anon_name = anon_vma_name_reuse(anon_name);
+		rcu_assign_pointer(new_vma->anon_name, anon_vma_name_reuse(anon_name));
 }
 
 static inline void free_anon_vma_name(struct vm_area_struct *vma)
@@ -411,6 +411,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 		!strcmp(anon_name1->name, anon_name2->name);
 }
 
+struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma);
+
 #else /* CONFIG_ANON_VMA_NAME */
 static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
 static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
@@ -424,6 +426,12 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
 	return true;
 }
 
+static inline
+struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 #endif  /* CONFIG_ANON_VMA_NAME */
 
 static inline void init_tlb_flush_pending(struct mm_struct *mm)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..bbe1223cd992 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -545,6 +545,7 @@ struct vm_userfaultfd_ctx {};
 
 struct anon_vma_name {
 	struct kref kref;
+	struct rcu_head rcu;
 	/* The name needs to be at the end because it is dynamically sized. */
 	char name[];
 };
@@ -699,7 +700,7 @@ struct vm_area_struct {
 	 * terminated string containing the name given to the vma, or NULL if
 	 * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
 	 */
-	struct anon_vma_name *anon_name;
+	struct anon_vma_name __rcu *anon_name;
 #endif
 #ifdef CONFIG_SWAP
 	atomic_long_t swap_readahead_info;
diff --git a/mm/madvise.c b/mm/madvise.c
index 912155a94ed5..0f222d464254 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -88,14 +88,15 @@ void anon_vma_name_free(struct kref *kref)
 {
 	struct anon_vma_name *anon_name =
 			container_of(kref, struct anon_vma_name, kref);
-	kfree(anon_name);
+	kfree_rcu(anon_name, rcu);
 }
 
 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
 {
 	mmap_assert_locked(vma->vm_mm);
 
-	return vma->anon_name;
+	return rcu_dereference_protected(vma->anon_name,
+		rwsem_is_locked(&vma->vm_mm->mmap_lock));
 }
 
 /* mmap_lock should be write-locked */
@@ -105,7 +106,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 	struct anon_vma_name *orig_name = anon_vma_name(vma);
 
 	if (!anon_name) {
-		vma->anon_name = NULL;
+		rcu_assign_pointer(vma->anon_name, NULL);
 		anon_vma_name_put(orig_name);
 		return 0;
 	}
@@ -113,11 +114,32 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 	if (anon_vma_name_eq(orig_name, anon_name))
 		return 0;
 
-	vma->anon_name = anon_vma_name_reuse(anon_name);
+	rcu_assign_pointer(vma->anon_name, anon_vma_name_reuse(anon_name));
 	anon_vma_name_put(orig_name);
 
 	return 0;
 }
+
+/*
+ * Returned anon_vma_name is stable due to elevated refcount but not guaranteed
+ * to be assigned to the original VMA after the call.
+ */
+struct anon_vma_name *anon_vma_name_get_rcu(struct vm_area_struct *vma)
+{
+	struct anon_vma_name __rcu *anon_name;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	anon_name = rcu_dereference(vma->anon_name);
+	if (!anon_name)
+		return NULL;
+
+	if (unlikely(!kref_get_unless_zero(&anon_name->kref)))
+		return NULL;
+
+	return anon_name;
+}
+
 #else /* CONFIG_ANON_VMA_NAME */
 static int replace_anon_vma_name(struct vm_area_struct *vma,
 				 struct anon_vma_name *anon_name)
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/3] mm: add mm_struct sequence number to detect write locks
  2024-01-22  7:13 [PATCH 1/3] mm: make vm_area_struct anon_name field RCU-safe Suren Baghdasaryan
@ 2024-01-22  7:13 ` Suren Baghdasaryan
  2024-01-22  7:13 ` [PATCH 3/3] mm/maps: read proc/pid/maps under RCU Suren Baghdasaryan
  1 sibling, 0 replies; 7+ messages in thread
From: Suren Baghdasaryan @ 2024-01-22  7:13 UTC (permalink / raw)
  To: akpm
  Cc: viro, brauner, jack, dchinner, casey, ben.wolsieffer, paulmck,
	david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team, surenb

Provide a way for lockless mm_struct users to detect whether mm might have
been changed since some specific point in time. The API provided allows
the user to record a counter when it starts using the mm and later use
that counter to check if anyone write-locked mmap_lock since the counter
was recorded. Recording the counter value should be done while holding
mmap_lock at least for reading to prevent the counter from concurrent
changes. Every time mmap_lock is write-locked mm_struct updates its
mm_wr_seq counter so that checks against counters recorded before that
would fail, indicating a possibility of mm being modified.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm_types.h  |  2 ++
 include/linux/mmap_lock.h | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bbe1223cd992..e749f7f09314 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -846,6 +846,8 @@ struct mm_struct {
 		 */
 		int mm_lock_seq;
 #endif
+		/* Counter incremented each time mm gets write-locked */
+		unsigned long mm_wr_seq;
 
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 8d38dcb6d044..0197079cb6fe 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -106,6 +106,8 @@ static inline void mmap_write_lock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_start_locking(mm, true);
 	down_write(&mm->mmap_lock);
+	/* Pairs with ACQUIRE semantics in mmap_write_seq_read */
+	smp_store_release(&mm->mm_wr_seq, mm->mm_wr_seq + 1);
 	__mmap_lock_trace_acquire_returned(mm, true, true);
 }
 
@@ -113,6 +115,8 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
 {
 	__mmap_lock_trace_start_locking(mm, true);
 	down_write_nested(&mm->mmap_lock, subclass);
+	/* Pairs with ACQUIRE semantics in mmap_write_seq_read */
+	smp_store_release(&mm->mm_wr_seq, mm->mm_wr_seq + 1);
 	__mmap_lock_trace_acquire_returned(mm, true, true);
 }
 
@@ -122,6 +126,10 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
 
 	__mmap_lock_trace_start_locking(mm, true);
 	ret = down_write_killable(&mm->mmap_lock);
+	if (!ret) {
+		/* Pairs with ACQUIRE semantics in mmap_write_seq_read */
+		smp_store_release(&mm->mm_wr_seq, mm->mm_wr_seq + 1);
+	}
 	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
 	return ret;
 }
@@ -140,6 +148,20 @@ static inline void mmap_write_downgrade(struct mm_struct *mm)
 	downgrade_write(&mm->mmap_lock);
 }
 
+static inline unsigned long mmap_write_seq_read(struct mm_struct *mm)
+{
+	/* Pairs with RELEASE semantics in mmap_write_lock */
+	return smp_load_acquire(&mm->mm_wr_seq);
+}
+
+static inline void mmap_write_seq_record(struct mm_struct *mm,
+					 unsigned long *mm_wr_seq)
+{
+	mmap_assert_locked(mm);
+	/* Nobody can concurrently modify since we hold the mmap_lock */
+	*mm_wr_seq = mm->mm_wr_seq;
+}
+
 static inline void mmap_read_lock(struct mm_struct *mm)
 {
 	__mmap_lock_trace_start_locking(mm, false);
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/3] mm/maps: read proc/pid/maps under RCU
  2024-01-22  7:13 [PATCH 1/3] mm: make vm_area_struct anon_name field RCU-safe Suren Baghdasaryan
  2024-01-22  7:13 ` [PATCH 2/3] mm: add mm_struct sequence number to detect write locks Suren Baghdasaryan
@ 2024-01-22  7:13 ` Suren Baghdasaryan
  2024-01-23  5:36   ` SeongJae Park
  1 sibling, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2024-01-22  7:13 UTC (permalink / raw)
  To: akpm
  Cc: viro, brauner, jack, dchinner, casey, ben.wolsieffer, paulmck,
	david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team, surenb

With maple_tree supporting vma tree traversal under RCU and per-vma locks
making vma access RCU-safe, /proc/pid/maps can be read under RCU and
without the need to read-lock mmap_lock. However vma content can change
from under us, therefore we make a copy of the vma and we pin pointer
fields used when generating the output (currently only vm_file and
anon_name). Afterwards we check for concurrent address space
modifications, wait for them to end and retry. That last check is needed
to avoid possibility of missing a vma during concurrent maple_tree
node replacement, which might report a NULL when a vma is replaced
with another one. While we take the mmap_lock for reading during such
contention, we do that momentarily only to record new mm_wr_seq counter.
This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low priority task, such as
monitoring/data collection services) from blocking address space updates.

Note that this change has a userspace visible disadvantage: it allows for
sub-page data tearing as opposed to the previous mechanism where data
tearing could happen only between pages of generated output data.
Since current userspace considers data tearing between pages to be
acceptable, we assume is will be able to handle sub-page data tearing
as well.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 fs/proc/internal.h |   2 +
 fs/proc/task_mmu.c | 114 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a71ac5379584..e0247225bb68 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -290,6 +290,8 @@ struct proc_maps_private {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct vma_iterator iter;
+	unsigned long mm_wr_seq;
+	struct vm_area_struct vma_copy;
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795f..3886d04afc01 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -126,11 +126,96 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
-						loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static const struct seq_operations proc_pid_maps_op;
+/*
+ * Take VMA snapshot and pin vm_file and anon_name as they are used by
+ * show_map_vma.
+ */
+static int get_vma_snapshow(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
+	struct vm_area_struct *copy = &priv->vma_copy;
+	int ret = -EAGAIN;
+
+	memcpy(copy, vma, sizeof(*vma));
+	if (copy->vm_file && !get_file_rcu(&copy->vm_file))
+		goto out;
+
+	if (copy->anon_name && !anon_vma_name_get_rcu(copy))
+		goto put_file;
+
+	if (priv->mm_wr_seq == mmap_write_seq_read(priv->mm))
+		return 0;
+
+	/* Address space got modified, vma might be stale. Wait and retry. */
+	rcu_read_unlock();
+	ret = mmap_read_lock_killable(priv->mm);
+	mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
+	mmap_read_unlock(priv->mm);
+	rcu_read_lock();
+
+	if (!ret)
+		ret = -EAGAIN; /* no other errors, ok to retry */
+
+	if (copy->anon_name)
+		anon_vma_name_put(copy->anon_name);
+put_file:
+	if (copy->vm_file)
+		fput(copy->vm_file);
+out:
+	return ret;
+}
+
+static void put_vma_snapshot(struct proc_maps_private *priv)
+{
+	struct vm_area_struct *vma = &priv->vma_copy;
+
+	if (vma->anon_name)
+		anon_vma_name_put(vma->anon_name);
+	if (vma->vm_file)
+		fput(vma->vm_file);
+}
+
+static inline bool needs_mmap_lock(struct seq_file *m)
+{
+	/*
+	 * smaps and numa_maps perform page table walk, therefore require
+	 * mmap_lock but maps can be read under RCU.
+	 */
+	return m->op != &proc_pid_maps_op;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+/* Without per-vma locks VMA access is not RCU-safe */
+static inline bool needs_mmap_lock(struct seq_file *m) { return true; }
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = vma_next(&priv->iter);
 
+#ifdef CONFIG_PER_VMA_LOCK
+	if (vma && !needs_mmap_lock(m)) {
+		int ret;
+
+		put_vma_snapshot(priv);
+		while ((ret = get_vma_snapshow(priv, vma)) == -EAGAIN) {
+			/* lookup the vma at the last position again */
+			vma_iter_init(&priv->iter, priv->mm, *ppos);
+			vma = vma_next(&priv->iter);
+		}
+
+		if (ret) {
+			put_vma_snapshot(priv);
+			return NULL;
+		}
+		vma = &priv->vma_copy;
+	}
+#endif
 	if (vma) {
 		*ppos = vma->vm_start;
 	} else {
@@ -169,12 +254,20 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 		return ERR_PTR(-EINTR);
 	}
 
+	/* Drop mmap_lock if possible */
+	if (!needs_mmap_lock(m)) {
+		mmap_write_seq_record(priv->mm, &priv->mm_wr_seq);
+		mmap_read_unlock(priv->mm);
+		rcu_read_lock();
+		memset(&priv->vma_copy, 0, sizeof(priv->vma_copy));
+	}
+
 	vma_iter_init(&priv->iter, mm, last_addr);
 	hold_task_mempolicy(priv);
 	if (last_addr == -2UL)
 		return get_gate_vma(mm);
 
-	return proc_get_vma(priv, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
@@ -183,7 +276,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 		*ppos = -1UL;
 		return NULL;
 	}
-	return proc_get_vma(m->private, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -195,7 +288,10 @@ static void m_stop(struct seq_file *m, void *v)
 		return;
 
 	release_task_mempolicy(priv);
-	mmap_read_unlock(mm);
+	if (needs_mmap_lock(m))
+		mmap_read_unlock(mm);
+	else
+		rcu_read_unlock();
 	mmput(mm);
 	put_task_struct(priv->task);
 	priv->task = NULL;
@@ -283,8 +379,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	start = vma->vm_start;
 	end = vma->vm_end;
 	show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
-	if (mm)
-		anon_name = anon_vma_name(vma);
+	if (mm) {
+		anon_name = needs_mmap_lock(m) ? anon_vma_name(vma) :
+				anon_vma_name_get_rcu(vma);
+	}
 
 	/*
 	 * Print the dentry name for named mappings, and a
@@ -338,6 +436,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		seq_puts(m, name);
 	}
 	seq_putc(m, '\n');
+	if (anon_name && !needs_mmap_lock(m))
+		anon_vma_name_put(anon_name);
 }
 
 static int show_map(struct seq_file *m, void *v)
-- 
2.43.0.429.g432eaa2c6b-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH 3/3] mm/maps: read proc/pid/maps under RCU
  2024-01-22  7:13 ` [PATCH 3/3] mm/maps: read proc/pid/maps under RCU Suren Baghdasaryan
@ 2024-01-23  5:36   ` SeongJae Park
  2024-01-23  6:07     ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: SeongJae Park @ 2024-01-23  5:36 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: akpm, viro, brauner, jack, dchinner, casey, ben.wolsieffer,
	paulmck, david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team

Hi Suren,

On Sun, 21 Jan 2024 23:13:24 -0800 Suren Baghdasaryan <surenb@google.com> wrote:

> With maple_tree supporting vma tree traversal under RCU and per-vma locks
> making vma access RCU-safe, /proc/pid/maps can be read under RCU and
> without the need to read-lock mmap_lock. However vma content can change
> from under us, therefore we make a copy of the vma and we pin pointer
> fields used when generating the output (currently only vm_file and
> anon_name). Afterwards we check for concurrent address space
> modifications, wait for them to end and retry. That last check is needed
> to avoid possibility of missing a vma during concurrent maple_tree
> node replacement, which might report a NULL when a vma is replaced
> with another one. While we take the mmap_lock for reading during such
> contention, we do that momentarily only to record new mm_wr_seq counter.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such as
> monitoring/data collection services) from blocking address space updates.
> 
> Note that this change has a userspace visible disadvantage: it allows for
> sub-page data tearing as opposed to the previous mechanism where data
> tearing could happen only between pages of generated output data.
> Since current userspace considers data tearing between pages to be
> acceptable, we assume is will be able to handle sub-page data tearing
> as well.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/proc/internal.h |   2 +
>  fs/proc/task_mmu.c | 114 ++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 109 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index a71ac5379584..e0247225bb68 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -290,6 +290,8 @@ struct proc_maps_private {
>  	struct task_struct *task;
>  	struct mm_struct *mm;
>  	struct vma_iterator iter;
> +	unsigned long mm_wr_seq;
> +	struct vm_area_struct vma_copy;
>  #ifdef CONFIG_NUMA
>  	struct mempolicy *task_mempolicy;
>  #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 3f78ebbb795f..3886d04afc01 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -126,11 +126,96 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
>  }
>  #endif
>  
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> -						loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static const struct seq_operations proc_pid_maps_op;
> +/*
> + * Take VMA snapshot and pin vm_file and anon_name as they are used by
> + * show_map_vma.
> + */
> +static int get_vma_snapshow(struct proc_maps_private *priv, struct vm_area_struct *vma)
>  {
> +	struct vm_area_struct *copy = &priv->vma_copy;
> +	int ret = -EAGAIN;
> +
> +	memcpy(copy, vma, sizeof(*vma));
> +	if (copy->vm_file && !get_file_rcu(&copy->vm_file))
> +		goto out;
> +
> +	if (copy->anon_name && !anon_vma_name_get_rcu(copy))
> +		goto put_file;

From today updated mm-unstable which containing this patch, I'm getting below
build error when CONFIG_ANON_VMA_NAME is not set.  Seems this patch needs to
handle the case?

    .../linux/fs/proc/task_mmu.c: In function ‘get_vma_snapshow’:
    .../linux/fs/proc/task_mmu.c:145:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
      145 |         if (copy->anon_name && !anon_vma_name_get_rcu(copy))
          |                   ^~~~~~~~~
          |                   anon_vma
    .../linux/fs/proc/task_mmu.c:161:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
      161 |         if (copy->anon_name)
          |                   ^~~~~~~~~
          |                   anon_vma
    .../linux/fs/proc/task_mmu.c:162:41: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
      162 |                 anon_vma_name_put(copy->anon_name);
          |                                         ^~~~~~~~~
          |                                         anon_vma
    .../linux/fs/proc/task_mmu.c: In function ‘put_vma_snapshot’:
    .../linux/fs/proc/task_mmu.c:174:18: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
      174 |         if (vma->anon_name)
          |                  ^~~~~~~~~
          |                  anon_vma
    .../linux/fs/proc/task_mmu.c:175:40: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
      175 |                 anon_vma_name_put(vma->anon_name);
          |                                        ^~~~~~~~~
          |                                        anon_vma

[...]


Thanks,
SJ

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 3/3] mm/maps: read proc/pid/maps under RCU
  2024-01-23  5:36   ` SeongJae Park
@ 2024-01-23  6:07     ` Suren Baghdasaryan
  2024-01-23 23:12       ` Suren Baghdasaryan
  0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2024-01-23  6:07 UTC (permalink / raw)
  To: SeongJae Park
  Cc: akpm, viro, brauner, jack, dchinner, casey, ben.wolsieffer,
	paulmck, david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team

On Mon, Jan 22, 2024 at 9:36 PM SeongJae Park <sj@kernel.org> wrote:
>
> Hi Suren,
>
> On Sun, 21 Jan 2024 23:13:24 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
>
> > With maple_tree supporting vma tree traversal under RCU and per-vma locks
> > making vma access RCU-safe, /proc/pid/maps can be read under RCU and
> > without the need to read-lock mmap_lock. However vma content can change
> > from under us, therefore we make a copy of the vma and we pin pointer
> > fields used when generating the output (currently only vm_file and
> > anon_name). Afterwards we check for concurrent address space
> > modifications, wait for them to end and retry. That last check is needed
> > to avoid possibility of missing a vma during concurrent maple_tree
> > node replacement, which might report a NULL when a vma is replaced
> > with another one. While we take the mmap_lock for reading during such
> > contention, we do that momentarily only to record new mm_wr_seq counter.
> > This change is designed to reduce mmap_lock contention and prevent a
> > process reading /proc/pid/maps files (often a low priority task, such as
> > monitoring/data collection services) from blocking address space updates.
> >
> > Note that this change has a userspace visible disadvantage: it allows for
> > sub-page data tearing as opposed to the previous mechanism where data
> > tearing could happen only between pages of generated output data.
> > Since current userspace considers data tearing between pages to be
> > acceptable, we assume is will be able to handle sub-page data tearing
> > as well.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > ---
> >  fs/proc/internal.h |   2 +
> >  fs/proc/task_mmu.c | 114 ++++++++++++++++++++++++++++++++++++++++++---
> >  2 files changed, 109 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index a71ac5379584..e0247225bb68 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -290,6 +290,8 @@ struct proc_maps_private {
> >       struct task_struct *task;
> >       struct mm_struct *mm;
> >       struct vma_iterator iter;
> > +     unsigned long mm_wr_seq;
> > +     struct vm_area_struct vma_copy;
> >  #ifdef CONFIG_NUMA
> >       struct mempolicy *task_mempolicy;
> >  #endif
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 3f78ebbb795f..3886d04afc01 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -126,11 +126,96 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
> >  }
> >  #endif
> >
> > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> > -                                             loff_t *ppos)
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +
> > +static const struct seq_operations proc_pid_maps_op;
> > +/*
> > + * Take VMA snapshot and pin vm_file and anon_name as they are used by
> > + * show_map_vma.
> > + */
> > +static int get_vma_snapshow(struct proc_maps_private *priv, struct vm_area_struct *vma)
> >  {
> > +     struct vm_area_struct *copy = &priv->vma_copy;
> > +     int ret = -EAGAIN;
> > +
> > +     memcpy(copy, vma, sizeof(*vma));
> > +     if (copy->vm_file && !get_file_rcu(&copy->vm_file))
> > +             goto out;
> > +
> > +     if (copy->anon_name && !anon_vma_name_get_rcu(copy))
> > +             goto put_file;
>
> From today updated mm-unstable which containing this patch, I'm getting below
> build error when CONFIG_ANON_VMA_NAME is not set.  Seems this patch needs to
> handle the case?

Hi SeongJae,
Thanks for reporting! I'll post an updated version fixing this config.
Suren.


>
>     .../linux/fs/proc/task_mmu.c: In function ‘get_vma_snapshow’:
>     .../linux/fs/proc/task_mmu.c:145:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
>       145 |         if (copy->anon_name && !anon_vma_name_get_rcu(copy))
>           |                   ^~~~~~~~~
>           |                   anon_vma
>     .../linux/fs/proc/task_mmu.c:161:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
>       161 |         if (copy->anon_name)
>           |                   ^~~~~~~~~
>           |                   anon_vma
>     .../linux/fs/proc/task_mmu.c:162:41: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
>       162 |                 anon_vma_name_put(copy->anon_name);
>           |                                         ^~~~~~~~~
>           |                                         anon_vma
>     .../linux/fs/proc/task_mmu.c: In function ‘put_vma_snapshot’:
>     .../linux/fs/proc/task_mmu.c:174:18: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
>       174 |         if (vma->anon_name)
>           |                  ^~~~~~~~~
>           |                  anon_vma
>     .../linux/fs/proc/task_mmu.c:175:40: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
>       175 |                 anon_vma_name_put(vma->anon_name);
>           |                                        ^~~~~~~~~
>           |                                        anon_vma
>
> [...]
>
>
> Thanks,
> SJ

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 3/3] mm/maps: read proc/pid/maps under RCU
  2024-01-23  6:07     ` Suren Baghdasaryan
@ 2024-01-23 23:12       ` Suren Baghdasaryan
  2024-01-23 23:48         ` SeongJae Park
  0 siblings, 1 reply; 7+ messages in thread
From: Suren Baghdasaryan @ 2024-01-23 23:12 UTC (permalink / raw)
  To: SeongJae Park
  Cc: akpm, viro, brauner, jack, dchinner, casey, ben.wolsieffer,
	paulmck, david, avagin, usama.anjum, peterx, hughd, ryan.roberts,
	wangkefeng.wang, Liam.Howlett, yuzhao, axelrasmussen, lstoakes,
	talumbau, willy, vbabka, mgorman, jhubbard, vishal.moola,
	mathieu.desnoyers, dhowells, jgg, sidhartha.kumar,
	andriy.shevchenko, yangxingui, keescook, linux-kernel,
	linux-fsdevel, linux-mm, kernel-team

On Mon, Jan 22, 2024 at 10:07 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jan 22, 2024 at 9:36 PM SeongJae Park <sj@kernel.org> wrote:
> >
> > Hi Suren,
> >
> > On Sun, 21 Jan 2024 23:13:24 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > > With maple_tree supporting vma tree traversal under RCU and per-vma locks
> > > making vma access RCU-safe, /proc/pid/maps can be read under RCU and
> > > without the need to read-lock mmap_lock. However vma content can change
> > > from under us, therefore we make a copy of the vma and we pin pointer
> > > fields used when generating the output (currently only vm_file and
> > > anon_name). Afterwards we check for concurrent address space
> > > modifications, wait for them to end and retry. That last check is needed
> > > to avoid possibility of missing a vma during concurrent maple_tree
> > > node replacement, which might report a NULL when a vma is replaced
> > > with another one. While we take the mmap_lock for reading during such
> > > contention, we do that momentarily only to record new mm_wr_seq counter.
> > > This change is designed to reduce mmap_lock contention and prevent a
> > > process reading /proc/pid/maps files (often a low priority task, such as
> > > monitoring/data collection services) from blocking address space updates.
> > >
> > > Note that this change has a userspace visible disadvantage: it allows for
> > > sub-page data tearing as opposed to the previous mechanism where data
> > > tearing could happen only between pages of generated output data.
> > > Since current userspace considers data tearing between pages to be
> > > acceptable, we assume is will be able to handle sub-page data tearing
> > > as well.
> > >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > ---
> > >  fs/proc/internal.h |   2 +
> > >  fs/proc/task_mmu.c | 114 ++++++++++++++++++++++++++++++++++++++++++---
> > >  2 files changed, 109 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > > index a71ac5379584..e0247225bb68 100644
> > > --- a/fs/proc/internal.h
> > > +++ b/fs/proc/internal.h
> > > @@ -290,6 +290,8 @@ struct proc_maps_private {
> > >       struct task_struct *task;
> > >       struct mm_struct *mm;
> > >       struct vma_iterator iter;
> > > +     unsigned long mm_wr_seq;
> > > +     struct vm_area_struct vma_copy;
> > >  #ifdef CONFIG_NUMA
> > >       struct mempolicy *task_mempolicy;
> > >  #endif
> > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > > index 3f78ebbb795f..3886d04afc01 100644
> > > --- a/fs/proc/task_mmu.c
> > > +++ b/fs/proc/task_mmu.c
> > > @@ -126,11 +126,96 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
> > >  }
> > >  #endif
> > >
> > > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> > > -                                             loff_t *ppos)
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +
> > > +static const struct seq_operations proc_pid_maps_op;
> > > +/*
> > > + * Take VMA snapshot and pin vm_file and anon_name as they are used by
> > > + * show_map_vma.
> > > + */
> > > +static int get_vma_snapshow(struct proc_maps_private *priv, struct vm_area_struct *vma)
> > >  {
> > > +     struct vm_area_struct *copy = &priv->vma_copy;
> > > +     int ret = -EAGAIN;
> > > +
> > > +     memcpy(copy, vma, sizeof(*vma));
> > > +     if (copy->vm_file && !get_file_rcu(&copy->vm_file))
> > > +             goto out;
> > > +
> > > +     if (copy->anon_name && !anon_vma_name_get_rcu(copy))
> > > +             goto put_file;
> >
> > From today updated mm-unstable which containing this patch, I'm getting below
> > build error when CONFIG_ANON_VMA_NAME is not set.  Seems this patch needs to
> > handle the case?
>
> Hi SeongJae,
> Thanks for reporting! I'll post an updated version fixing this config.

Fix is posted at
https://lore.kernel.org/all/20240123231014.3801041-3-surenb@google.com/
as part of v2 of this patchset.
Thanks,
Suren.

> Suren.
>
>
> >
> >     .../linux/fs/proc/task_mmu.c: In function ‘get_vma_snapshow’:
> >     .../linux/fs/proc/task_mmu.c:145:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
> >       145 |         if (copy->anon_name && !anon_vma_name_get_rcu(copy))
> >           |                   ^~~~~~~~~
> >           |                   anon_vma
> >     .../linux/fs/proc/task_mmu.c:161:19: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
> >       161 |         if (copy->anon_name)
> >           |                   ^~~~~~~~~
> >           |                   anon_vma
> >     .../linux/fs/proc/task_mmu.c:162:41: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
> >       162 |                 anon_vma_name_put(copy->anon_name);
> >           |                                         ^~~~~~~~~
> >           |                                         anon_vma
> >     .../linux/fs/proc/task_mmu.c: In function ‘put_vma_snapshot’:
> >     .../linux/fs/proc/task_mmu.c:174:18: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
> >       174 |         if (vma->anon_name)
> >           |                  ^~~~~~~~~
> >           |                  anon_vma
> >     .../linux/fs/proc/task_mmu.c:175:40: error: ‘struct vm_area_struct’ has no member named ‘anon_name’; did you mean ‘anon_vma’?
> >       175 |                 anon_vma_name_put(vma->anon_name);
> >           |                                        ^~~~~~~~~
> >           |                                        anon_vma
> >
> > [...]
> >
> >
> > Thanks,
> > SJ

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH 3/3] mm/maps: read proc/pid/maps under RCU
  2024-01-23 23:12       ` Suren Baghdasaryan
@ 2024-01-23 23:48         ` SeongJae Park
  0 siblings, 0 replies; 7+ messages in thread
From: SeongJae Park @ 2024-01-23 23:48 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: SeongJae Park, akpm, viro, brauner, jack, dchinner, casey,
	ben.wolsieffer, paulmck, david, avagin, usama.anjum, peterx,
	hughd, ryan.roberts, wangkefeng.wang, Liam.Howlett, yuzhao,
	axelrasmussen, lstoakes, talumbau, willy, vbabka, mgorman,
	jhubbard, vishal.moola, mathieu.desnoyers, dhowells, jgg,
	sidhartha.kumar, andriy.shevchenko, yangxingui, keescook,
	linux-kernel, linux-fsdevel, linux-mm, kernel-team

Hi Suren,

On Tue, 23 Jan 2024 15:12:44 -0800 Suren Baghdasaryan <surenb@google.com> wrote:

> On Mon, Jan 22, 2024 at 10:07 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Mon, Jan 22, 2024 at 9:36 PM SeongJae Park <sj@kernel.org> wrote:
> > >
> > > Hi Suren,
> > >
> > > On Sun, 21 Jan 2024 23:13:24 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> > >
[...]
> > > From today updated mm-unstable which containing this patch, I'm getting below
> > > build error when CONFIG_ANON_VMA_NAME is not set.  Seems this patch needs to
> > > handle the case?
> >
> > Hi SeongJae,
> > Thanks for reporting! I'll post an updated version fixing this config.
> 
> Fix is posted at
> https://lore.kernel.org/all/20240123231014.3801041-3-surenb@google.com/
> as part of v2 of this patchset.

I confirmed the new version fixes the build error.  Thank you for the kind and
nice fix!


Thanks,
SJ

[...]

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-01-23 23:48 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-22  7:13 [PATCH 1/3] mm: make vm_area_struct anon_name field RCU-safe Suren Baghdasaryan
2024-01-22  7:13 ` [PATCH 2/3] mm: add mm_struct sequence number to detect write locks Suren Baghdasaryan
2024-01-22  7:13 ` [PATCH 3/3] mm/maps: read proc/pid/maps under RCU Suren Baghdasaryan
2024-01-23  5:36   ` SeongJae Park
2024-01-23  6:07     ` Suren Baghdasaryan
2024-01-23 23:12       ` Suren Baghdasaryan
2024-01-23 23:48         ` SeongJae Park

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).