Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v3 3/5] pid: use pid_has_task() in __change_pid()
From: Christian Brauner @ 2019-10-17 10:18 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191017101832.5985-1-christian.brauner@ubuntu.com>

Replace hlist_empty() with the new pid_has_task() helper which is more
idiomatic, easier to grep for, and unifies how callers perform this
check.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
/* pidfd selftests */
passed

/* v1 */
patch not present

/* v2 */
Link: https://lore.kernel.org/r/20191016153606.2326-3-christian.brauner@ubuntu.com
patch introduced

/* v2 */
- Oleg Nesterov <oleg@redhat.com>:
  - s/task_alive/pid_has_task/g
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..124d40b239b1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -299,7 +299,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 	*pid_ptr = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (!hlist_empty(&pid->tasks[tmp]))
+		if (pid_has_task(pid, tmp))
 			return;
 
 	free_pid(pid);
-- 
2.23.0

^ permalink raw reply related

* [PATCH v3 2/5] test: verify fdinfo for pidfd of reaped process
From: Christian Brauner @ 2019-10-17 10:18 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191017101832.5985-1-christian.brauner@ubuntu.com>

Test that the fdinfo field of a pidfd referring to a dead process
correctly shows Pid: -1 and NSpid: -1.

Cc: Christian Kellner <christian@kellner.me>
Cc: linux-kselftest@vger.kernel.org
Reviewed-by: Christian Kellner <christian@kellner.me>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* pidfd selftests */
passed

/* v1 */
Link: https://lore.kernel.org/r/20191015141332.4055-2-christian.brauner@ubuntu.com

/* v2 */
Link: https://lore.kernel.org/r/20191016153606.2326-2-christian.brauner@ubuntu.com
unchanged

/* v3 */
unchanged
---
 .../selftests/pidfd/pidfd_fdinfo_test.c       | 59 ++++++++++++++-----
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
index 3721be994abd..22558524f71c 100644
--- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -113,11 +113,15 @@ static struct child clone_newns(int (*fn)(void *), void *args,
 	return ret;
 }
 
+static inline void child_close(struct child *child)
+{
+	close(child->fd);
+}
+
 static inline int child_join(struct child *child, struct error *err)
 {
 	int r;
 
-	(void)close(child->fd);
 	r = wait_for_pid(child->pid);
 	if (r < 0)
 		error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
@@ -128,6 +132,12 @@ static inline int child_join(struct child *child, struct error *err)
 	return r;
 }
 
+static inline int child_join_close(struct child *child, struct error *err)
+{
+	child_close(child);
+	return child_join(child, err);
+}
+
 static inline void trim_newline(char *str)
 {
 	char *pos = strrchr(str, '\n');
@@ -136,8 +146,8 @@ static inline void trim_newline(char *str)
 		*pos = '\0';
 }
 
-static int verify_fdinfo_nspid(int pidfd, struct error *err,
-			       const char *expect, ...)
+static int verify_fdinfo(int pidfd, struct error *err, const char *prefix,
+			 size_t prefix_len, const char *expect, ...)
 {
 	char buffer[512] = {0, };
 	char path[512] = {0, };
@@ -160,17 +170,20 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err,
 				 pidfd);
 
 	while (getline(&line, &n, f) != -1) {
-		if (strncmp(line, "NSpid:", 6))
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
 			continue;
 
 		found = 1;
 
-		r = strcmp(line + 6, buffer);
+		val = line + prefix_len;
+		r = strcmp(val, buffer);
 		if (r != 0) {
 			trim_newline(line);
 			trim_newline(buffer);
-			error_set(err, PIDFD_FAIL, "NSpid: '%s' != '%s'",
-				  line + 6, buffer);
+			error_set(err, PIDFD_FAIL, "%s '%s' != '%s'",
+				  prefix, val, buffer);
 		}
 		break;
 	}
@@ -179,8 +192,8 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err,
 	fclose(f);
 
 	if (found == 0)
-		return error_set(err, PIDFD_FAIL, "NSpid not found for fd %d",
-				 pidfd);
+		return error_set(err, PIDFD_FAIL, "%s not found for fd %d",
+				 prefix, pidfd);
 
 	return PIDFD_PASS;
 }
@@ -213,7 +226,7 @@ static int child_fdinfo_nspid_test(void *args)
 	}
 
 	pidfd = *(int *)args;
-	r = verify_fdinfo_nspid(pidfd, &err, "\t0\n");
+	r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n");
 
 	if (r != PIDFD_PASS)
 		ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
@@ -242,24 +255,42 @@ static void test_pidfd_fdinfo_nspid(void)
 	/* The children will have pid 1 in the new pid namespace,
 	 * so the line must be 'NSPid:\t<pid>\t1'.
 	 */
-	verify_fdinfo_nspid(a.fd, &err, "\t%d\t%d\n", a.pid, 1);
-	verify_fdinfo_nspid(b.fd, &err, "\t%d\t%d\n", b.pid, 1);
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1);
+	verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1);
 
 	/* wait for the process, check the exit status and set
 	 * 'err' accordingly, if it is not already set.
 	 */
+	child_join_close(&a, &err);
+	child_join_close(&b, &err);
+
+	error_report(&err, test_name);
+}
+
+static void test_pidfd_dead_fdinfo(void)
+{
+	struct child a;
+	struct error err = {0, };
+	const char *test_name = "pidfd check fdinfo for dead process";
+
+	/* Create a new child in a new pid and mount namespace */
+	a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+	error_check(&err, test_name);
 	child_join(&a, &err);
-	child_join(&b, &err);
 
+	verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n");
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n");
+	child_close(&a);
 	error_report(&err, test_name);
 }
 
 int main(int argc, char **argv)
 {
 	ksft_print_header();
-	ksft_set_plan(1);
+	ksft_set_plan(2);
 
 	test_pidfd_fdinfo_nspid();
+	test_pidfd_dead_fdinfo();
 
 	return ksft_exit_pass();
 }
-- 
2.23.0

^ permalink raw reply related

* [PATCH v3 1/5] pidfd: check pid has attached task in fdinfo
From: Christian Brauner @ 2019-10-17 10:18 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

Currently, when a task is dead we still print the pid it used to use in
the fdinfo files of its pidfds. This doesn't make much sense since the
pid may have already been reused. So verify that the task is still alive
by introducing the pid_has_task() helper which will be used by other
callers in follow-up patches.
If the task is not alive anymore, we will print -1. This allows us to
differentiate between a task not being present in a given pid namespace
- in which case we already print 0 - and a task having been reaped.

Note that this uses PIDTYPE_PID for the check. Technically, we could've
checked PIDTYPE_TGID since pidfds currently only refer to thread-group
leaders but if they won't anymore in the future then this check becomes
problematic without it being immediately obvious to non-experts imho. If
a thread is created via clone(CLONE_THREAD) than struct pid has a single
non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a
PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even
though the thread-group leader might still be very much alive. So
checking PIDTYPE_PID is fine and is easier to maintain should we ever
allow pidfds to refer to threads.

Cc: Jann Horn <jannh@google.com>
Cc: Christian Kellner <christian@kellner.me>
Cc: linux-api@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
/* pidfd selftests */
passed

/* v1 */
Link: https://lore.kernel.org/r/20191015141332.4055-1-christian.brauner@ubuntu.com

/* v2 */
Link: https://lore.kernel.org/r/20191016153606.2326-1-christian.brauner@ubuntu.com
- Oleg Nesterov <oleg@redhat.com>:
  - simplify check whether task is still alive to hlist_empty()
  - optionally introduce generic helper to replace open coded
    hlist_emtpy() checks whether or not a task is alive
- Christian Brauner <christian.brauner@ubuntu.com>:
  - introduce task_alive() helper and use in pidfd_show_fdinfo()

/* v3 */
- Oleg Nesterov <oleg@redhat.com>:
  - s/task_alive/pid_has_task/g
- Christian Brauner <christian.brauner@ubuntu.com>:
  - reword commit message to better reflect naming switch from
    task_alive() to pid_has_task()
---
 include/linux/pid.h |  4 ++++
 kernel/fork.c       | 17 +++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9645b1194c98..034e3cd60dc0 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid)
 
 extern void put_pid(struct pid *pid);
 extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
+static inline bool pid_has_task(struct pid *pid, enum pid_type type)
+{
+	return !hlist_empty(&pid->tasks[type]);
+}
 extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
 
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
diff --git a/kernel/fork.c b/kernel/fork.c
index 782986962d47..ffa314838b43 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file)
  */
 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
-	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
 	struct pid *pid = f->private_data;
-	pid_t nr = pid_nr_ns(pid, ns);
+	struct pid_namespace *ns;
+	pid_t nr = -1;
 
-	seq_put_decimal_ull(m, "Pid:\t", nr);
+	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+		ns = proc_pid_ns(file_inode(m->file));
+		nr = pid_nr_ns(pid, ns);
+	}
+
+	seq_put_decimal_ll(m, "Pid:\t", nr);
 
 #ifdef CONFIG_PID_NS
-	seq_put_decimal_ull(m, "\nNSpid:\t", nr);
-	if (nr) {
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
 		int i;
 
 		/* If nr is non-zero it means that 'pid' is valid and that
@@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 		 * Start at one below the already printed level.
 		 */
 		for (i = ns->level + 1; i <= pid->level; i++)
-			seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
 	}
 #endif
 	seq_putc(m, '\n');
-- 
2.23.0

^ permalink raw reply related

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Vincenzo Frascino @ 2019-10-17  9:47 UTC (permalink / raw)
  To: Thomas Gleixner, Andrei Vagin
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Andrei Vagin,
	Adrian Reber, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86
In-Reply-To: <alpine.DEB.2.21.1910171039500.1824@nanos.tec.linutronix.de>

On 10/17/19 10:20 AM, Thomas Gleixner wrote:

[...]

> The architectures which implement VDSO are:
> 
>     arm, arm64, mips, nds32, powerpc, riscv, s390, sparc, x86, um
> 
> arm64, mips, x86 use the generic VDSO. Patches for arm are floating
> around. UM is special as it just traps into the syscalls. No idea about the
> rest. Vincenzo might know.
> 

There a couple of cases: hexagon and csky that have vDSOs for signal trampolines
if I recall correctly, but they do not fall into the category we are exploring
at the moment.

> The bad news is that we have no information (except on arm which has a
> config switch for VDSO) whether an architecture provides VDSO support or
> not.
> 
> So unless you add something like
> 
>    config HAS_VDSO
>    	  bool
> 
> which is selected by all architectures which provide VDSO support, the only
> sane solution is to depend on GENERIC_VDSO_TIME_NS.
> 
> TBH, I would not even bother. The architectures which matter and are going
> to use time namespaces already support VDSO and they need to move to the
> generic implementation anyway as we discussed and agreed on in Vancouver.
> 
> Providing time name spaces for the non VDSO archs is a purely academic
> exercise.

I totally agree with this.

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [PATCHv7 00/33] kernel: Introduce Time Namespace
From: Thomas Gleixner @ 2019-10-17  9:24 UTC (permalink / raw)
  To: Dmitry Safonov
  Cc: linux-kernel, Dmitry Safonov, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	Vincenzo Frascino, containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <20191011012341.846266-1-dima@arista.com>

On Fri, 11 Oct 2019, Dmitry Safonov wrote:
> We wrote two small benchmarks. The first one gettime_perf.c calls
> clock_gettime() in a loop for 3 seconds. It shows us performance with
> a hot CPU cache (more clock_gettime() cycles - the better):
> 
>         | before    | CONFIG_TIME_NS=n | host      | inside timens
> --------------------------------------------------------------
>         | 153242367 | 153567617        | 150933203 | 139310914
>         | 153324800 | 153115132        | 150919828 | 139299761
>         | 153125401 | 153686868        | 150930471 | 139273917
>         | 153399355 | 153694866        | 151083410 | 139286081
>         | 153489417 | 153739716        | 150997262 | 139146403
>         | 153494270 | 153724332        | 150035651 | 138835612
> -----------------------------------------------------------
> avg     | 153345935 | 153588088        | 150816637 | 139192114
> diff %  | 100       | 100.1            | 98.3      | 90.7


That host 98.3% number is weird and does not match the tests I did with the
fallback code I provided you. On my limited testing that fallback hidden in
the slowpath did not show any difference to the TIME_NS=n case when not
inside a time namespace.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Vincenzo Frascino @ 2019-10-17  9:23 UTC (permalink / raw)
  To: Andrei Vagin, Thomas Gleixner
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Andrei Vagin,
	Adrian Reber, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86
In-Reply-To: <20191016233342.GA3075@gmail.com>

Hi Andrei,

On 10/17/19 12:33 AM, Andrei Vagin wrote:
>>> Having CONFIG_TIME_NS "default y" makes so that the option is selected even on
>>> the architectures that have no support for time namespaces.
>>> The direct consequence is that the fallbacks defined in this patch are never
>>> selected and this ends up in kernel compilation errors due to missing symbols.
>>>
>>> The error below shows what happens on arm64 (similar behavior on other
>>> architectures):
>>>
>>> aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_on_fork':
>>> kernel/time/namespace.c:321: undefined reference to `vdso_join_timens'
>>>
>>> My proposal is to keep TIME_NS "default n" (just remove "default y"), let the
>>> architectures that enable time namespaces select it and make CONFIG_TIME_NS
>>> select GENERIC_VDSO_TIME_NS if arch has HAVE_GENERIC_VDSO.
>> Nah.
>>
>> config TIME_NS
>> 	bool "TIME namespace"
>> 	depends on GENERIC_VDSO_TIME_NS
> I was thinking to fix this by the same way with a small difference.
> 
> If GENERIC_GETTIMEOFDAY isn't set, it should be safe to allow enabling
> TIME_NS. In this case, clock_gettime works via system call and we don't
> have arch-specific code in this case. Does this sound reasonable?
> 
>         depends on (!GENERIC_GETTIMEOFDAY || GENERIC_VDSO_TIME_NS)

This option would not work because not all the architectures have been converted
to the unified vdso approach. For example if you try to build ppc64 that does
not set GENERIC_GETTIMEOFDAY but has vdso support you would get:

powerpc64-linux-ld: warning: orphan section `.gnu.hash' from `linker stubs'
being placed in section `.gnu.hash'.
kernel/time/namespace.o: In function `.timens_set_vvar_page.isra.2.part.3':
namespace.c:(.text+0x178): undefined reference to `.arch_get_vdso_data'
kernel/time/namespace.o: In function `.timens_install':
namespace.c:(.text+0x798): undefined reference to `.vdso_join_timens'
kernel/time/namespace.o: In function `.timens_on_fork':
namespace.c:(.text+0x90c): undefined reference to `.vdso_join_timens'
/home/vinfra01/Projects/linux/Makefile:1074: recipe for target 'vmlinux' failed
make[1]: *** [vmlinux] Error 1
make[1]: Leaving directory '/home/vinfra01/Projects/linux-out'
Makefile:179: recipe for target 'sub-make' failed
make: *** [sub-make] Error 2

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Thomas Gleixner @ 2019-10-17  9:20 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Vincenzo Frascino, Dmitry Safonov, linux-kernel, Dmitry Safonov,
	Andrei Vagin, Adrian Reber, Andy Lutomirski, Arnd Bergmann,
	Christian Brauner, Cyrill Gorcunov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jann Horn, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, containers, criu, linux-api, x86
In-Reply-To: <20191016233342.GA3075@gmail.com>

Andrei!

On Wed, 16 Oct 2019, Andrei Vagin wrote:
> On Wed, Oct 16, 2019 at 12:39:11PM +0200, Thomas Gleixner wrote:
> > Nah.
> > 
> > config TIME_NS
> > 	bool "TIME namespace"
> > 	depends on GENERIC_VDSO_TIME_NS
> 
> I was thinking to fix this by the same way with a small difference.
> 
> If GENERIC_GETTIMEOFDAY isn't set, it should be safe to allow enabling
> TIME_NS. In this case, clock_gettime works via system call and we don't
> have arch-specific code in this case. Does this sound reasonable?
> 
>         depends on (!GENERIC_GETTIMEOFDAY || GENERIC_VDSO_TIME_NS)

No, that's wrong. If GENERIC_GETTIMEOFDAY is not set, then the architecture
still might have its own VDSO implementation and we agreed in Vancouver a
year ago that we are not going to support per architecture time namespace
VDSO implementations.

So if at all then you want:

   depends on HAVE_GENERIC_VDSO && (!GENERIC_GETTIMEOFDAY || GENERIC_VDSO_TIME_NS)

But that's crap, really.

The reason why HAVE_GENERIC_VDSO and GENERIC_GETTIMEOFDAY exist as separate
config items is not a functional issue. It's there to ease the migration to
the generic VDSO implementation. Having generic VDSO in production without
implementing GENERIC_GETTIMEOFDAY does not make any sense at all.

The architectures which implement VDSO are:

    arm, arm64, mips, nds32, powerpc, riscv, s390, sparc, x86, um

arm64, mips, x86 use the generic VDSO. Patches for arm are floating
around. UM is special as it just traps into the syscalls. No idea about the
rest. Vincenzo might know.

The bad news is that we have no information (except on arm which has a
config switch for VDSO) whether an architecture provides VDSO support or
not.

So unless you add something like

   config HAS_VDSO
   	  bool

which is selected by all architectures which provide VDSO support, the only
sane solution is to depend on GENERIC_VDSO_TIME_NS.

TBH, I would not even bother. The architectures which matter and are going
to use time namespaces already support VDSO and they need to move to the
generic implementation anyway as we discussed and agreed on in Vancouver.

Providing time name spaces for the non VDSO archs is a purely academic
exercise.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCH v2 1/5] pidfd: verify task is alive when printing fdinfo
From: Oleg Nesterov @ 2019-10-17  8:54 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, aarcange, akpm, christian, cyphar, elena.reshetova,
	guro, jannh, ldv, linux-api, linux-kselftest, mhocko, mingo,
	peterz, shuah, tglx, viro
In-Reply-To: <20191016163107.5zwt6fmjyd5mkqqw@wittgenstein>

On 10/16, Christian Brauner wrote:
>
> On Wed, Oct 16, 2019 at 06:24:09PM +0200, Oleg Nesterov wrote:
> >
> > And why task_ if it accepts pid+pid_type? May be pid_has_task() or
> > something like this...
>
> Given what I said above that might be a decent name.
>
> >
> > OK, since I can't suggest a better name I won't really argue. Feel free
> > to add my reviewed-by to this series.
>
> No, naming is important. Thanks for being picky about that too and I'll
> happily resend. :)

Thanks ;) May be pid_in_use() ? Up to you, anything which starts with pid_
looks better to me than task_alive().

Oleg.

^ permalink raw reply

* Re: [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Christian Brauner @ 2019-10-17  6:09 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: cyphar, mingo, peterz, alexander.shishkin, jolsa, namhyung,
	christian, keescook, linux, viro, torvalds, linux-api,
	linux-kernel
In-Reply-To: <871rvctkof.fsf@mpe.ellerman.id.au>

On Thu, Oct 17, 2019 at 09:00:48AM +1100, Michael Ellerman wrote:
> Christian Brauner <christian.brauner@ubuntu.com> writes:
> > On Wed, Oct 16, 2019 at 11:27:32PM +1100, Michael Ellerman wrote:
> >> On a machine with a 64K PAGE_SIZE, the nested for loops in
> >> test_check_nonzero_user() can lead to soft lockups, eg:
> >> 
> >>   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
> >>   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
> >>   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
> >>   ...
> >>   NIP __might_sleep+0x20/0xc0
> >>   LR  __might_fault+0x40/0x60
> >>   Call Trace:
> >>     check_zeroed_user+0x12c/0x200
> >>     test_user_copy_init+0x67c/0x1210 [test_user_copy]
> >>     do_one_initcall+0x60/0x340
> >>     do_init_module+0x7c/0x2f0
> >>     load_module+0x2d94/0x30e0
> >>     __do_sys_finit_module+0xc8/0x150
> >>     system_call+0x5c/0x68
> >> 
> >> Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
> >> tweak it to only scan a 1024 byte region, but make it cross the
> >> page boundary.
> >> 
> >> Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
> >> Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
> >> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> >
> > With Aleksa's Reviewed-by I've picked this up:
> > https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=copy_struct_from_user
> 
> Thanks. Are you planning to send that to Linus for v5.4 or v5.5 ?

This looks like a pretty straight bugfix to me since it's clearly
causing an issue for you on power so v5.4-rc4 is what I'd aim for. I
just want it to be in linux-next until tomorrow.

Christian

^ permalink raw reply

* Re: [PATCH v7 0/3] add thermal/power management features for FPGA DFL drivers
From: Moritz Fischer @ 2019-10-17  2:35 UTC (permalink / raw)
  To: Wu Hao
  Cc: mdf, linux-fpga, linux-kernel, linux-api, linux-hwmon, linux,
	jdelvare, gregkh
In-Reply-To: <1571031723-12101-1-git-send-email-hao.wu@intel.com>

On Mon, Oct 14, 2019 at 01:42:00PM +0800, Wu Hao wrote:
> Hi Moritz and all,
> 
> This patchset adds thermal and power management features for FPGA DFL
> drivers. Both patches are using hwmon as userspace interfaces.
> 
> This patchset is created on top of 5.4-rc3, please help with review to see
> if any comments, thank you very much!
> 
> Main changes from v6:
>  - update kernel version and date in sysfs doc.
> 
> Main changes from v5:
>  - rebase and clean up (remove empty uinit function) per changes in recent
>    merged dfl patches.
>  - update date in sysfs doc.
> 
> Main changes from v4:
>  - rebase due to Documentation format change (dfl.txt -> rst).
>  - clamp threshold inputs for sysfs interfaces. (patch#3)
>  - update sysfs doc to add more description for ltr sysfs interfaces.
>    (patch#3)
> 
> Main changes from v3:
>  - use HWMON_CHANNEL_INFO.
> 
> Main changes from v2:
>  - switch to standard hwmon APIs for thermal hwmon:
>      temp1_alarm        --> temp1_max
>      temp1_alarm_status --> temp1_max_alarm
>      temp1_crit_status  --> temp1_crit_alarm
>      temp1_alarm_policy --> temp1_max_policy
>  - switch to standard hwmon APIs for power hwmon:
>      power1_cap         --> power1_max
>      power1_cap_status  --> power1_max_alarm
>      power1_crit_status --> power1_crit_alarm
> 
> Wu Hao (2):
>   fpga: dfl: fme: add thermal management support
>   fpga: dfl: fme: add power management support
> 
> Xu Yilun (1):
>   Documentation: fpga: dfl: add descriptions for thermal/power
>     management interfaces
> 
>  Documentation/ABI/testing/sysfs-platform-dfl-fme | 132 ++++++++
>  Documentation/fpga/dfl.rst                       |  10 +
>  drivers/fpga/Kconfig                             |   2 +-
>  drivers/fpga/dfl-fme-main.c                      | 385 +++++++++++++++++++++++
>  4 files changed, 528 insertions(+), 1 deletion(-)
> 
> -- 
> 1.8.3.1
> 

Series applied.

Thanks,
Moritz

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Andrei Vagin @ 2019-10-16 23:33 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Vincenzo Frascino, Dmitry Safonov, linux-kernel, Dmitry Safonov,
	Andrei Vagin, Adrian Reber, Andy Lutomirski, Arnd Bergmann,
	Christian Brauner, Cyrill Gorcunov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jann Horn, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, containers, criu, linux-api, x86
In-Reply-To: <alpine.DEB.2.21.1910161230070.2046@nanos.tec.linutronix.de>

On Wed, Oct 16, 2019 at 12:39:11PM +0200, Thomas Gleixner wrote:
> On Wed, 16 Oct 2019, Vincenzo Frascino wrote:
> 
> < Trim 250+ lines ( 3+ pages) of pointlessly wasted electrons >
> 
> > > --- a/init/Kconfig
> > > +++ b/init/Kconfig
> > > @@ -1096,6 +1096,13 @@ config UTS_NS
> > >  	  In this namespace tasks see different info provided with the
> > >  	  uname() system call
> > >  
> > > +config TIME_NS
> > > +	bool "TIME namespace"
> > > +	default y
> > 
> > Having CONFIG_TIME_NS "default y" makes so that the option is selected even on
> > the architectures that have no support for time namespaces.
> > The direct consequence is that the fallbacks defined in this patch are never
> > selected and this ends up in kernel compilation errors due to missing symbols.
> > 
> > The error below shows what happens on arm64 (similar behavior on other
> > architectures):
> > 
> > aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_on_fork':
> > kernel/time/namespace.c:321: undefined reference to `vdso_join_timens'
> > 
> > My proposal is to keep TIME_NS "default n" (just remove "default y"), let the
> > architectures that enable time namespaces select it and make CONFIG_TIME_NS
> > select GENERIC_VDSO_TIME_NS if arch has HAVE_GENERIC_VDSO.
> 
> Nah.
> 
> config TIME_NS
> 	bool "TIME namespace"
> 	depends on GENERIC_VDSO_TIME_NS

I was thinking to fix this by the same way with a small difference.

If GENERIC_GETTIMEOFDAY isn't set, it should be safe to allow enabling
TIME_NS. In this case, clock_gettime works via system call and we don't
have arch-specific code in this case. Does this sound reasonable?

        depends on (!GENERIC_GETTIMEOFDAY || GENERIC_VDSO_TIME_NS)

Thanks,
Andrei

^ permalink raw reply

* Re: [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Michael Ellerman @ 2019-10-16 22:00 UTC (permalink / raw)
  To: Christian Brauner
  Cc: cyphar, mingo, peterz, alexander.shishkin, jolsa, namhyung,
	christian, keescook, linux, viro, torvalds, linux-api,
	linux-kernel
In-Reply-To: <20191016130319.vcc2mqac3ta5jjat@wittgenstein>

Christian Brauner <christian.brauner@ubuntu.com> writes:
> On Wed, Oct 16, 2019 at 11:27:32PM +1100, Michael Ellerman wrote:
>> On a machine with a 64K PAGE_SIZE, the nested for loops in
>> test_check_nonzero_user() can lead to soft lockups, eg:
>> 
>>   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
>>   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
>>   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
>>   ...
>>   NIP __might_sleep+0x20/0xc0
>>   LR  __might_fault+0x40/0x60
>>   Call Trace:
>>     check_zeroed_user+0x12c/0x200
>>     test_user_copy_init+0x67c/0x1210 [test_user_copy]
>>     do_one_initcall+0x60/0x340
>>     do_init_module+0x7c/0x2f0
>>     load_module+0x2d94/0x30e0
>>     __do_sys_finit_module+0xc8/0x150
>>     system_call+0x5c/0x68
>> 
>> Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
>> tweak it to only scan a 1024 byte region, but make it cross the
>> page boundary.
>> 
>> Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
>> Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
>> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>
> With Aleksa's Reviewed-by I've picked this up:
> https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=copy_struct_from_user

Thanks. Are you planning to send that to Linus for v5.4 or v5.5 ?

cheers

^ permalink raw reply

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: Tim Chen @ 2019-10-16 17:02 UTC (permalink / raw)
  To: Linus Torvalds, David Howells, Kan Liang
  Cc: Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	Nicolas Dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, LSM List, linux-fsdevel, Linux API,
	Linux Kernel Mailing List
In-Reply-To: <CAHk-=whiz1sHXu8SVZKEC2dup=r5JMrftPtEt6ff9Ea8dyH8yQ@mail.gmail.com>

On 10/15/19 3:14 PM, Linus Torvalds wrote:
> On Tue, Oct 15, 2019 at 2:48 PM David Howells <dhowells@redhat.com> wrote:
>>
>> Add a wakeup call for a case whereby the caller already has the waitqueue
>> spinlock held.
> 
> That naming is crazy.
> 
> We already have helper functions like this, and they are just called
> "wake_up_locked()".
> 
> So the "prelocked" naming is just odd. Make it be
> wake_up_interruptible_sync_poll_locked().
> 
> The helper function should likely be
> 
>   void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
> unsigned int mode, void *key)
>   {
>         __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
>   }
>   EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);
> 
> to match the other naming patterns there.
> 
> [ Unrelated ]
> 
> Looking at that mess of functions, I also wonder if we should try to
> just remove the bookmark code again. It was cute, and it was useful,
> but I think the problem with the page lock list may have been fixed by
> commit 9a1ea439b16b ("mm: put_and_wait_on_page_locked() while page is
> migrated") which avoids the retry condition with
> migrate_page_move_mapping().
> 
> Tim/Kan? Do you have the problematic load still?
> 

Unfortunately, we do not have ready access to that problematic load
which was run by a customer on 8 socket system.  They were not
willing to give the workload to us, and have not responded to my
request to rerun their load with commit 9a1ea439b16b.

The commit greatly reduced migration failures with concurrent page faulting.
And successful migrations could have prevented the big
pile up of waiters faulting and waiting on the page, which was the
problem the bookmark code was trying to solve.

So I also tend to think that the problem should have been resolved.
But unfortunately I don't have a ready workload to confirm.

Tim

^ permalink raw reply

* Re: [PATCHv7 19/33] lib/vdso: Prepare for time namespace support
From: Vincenzo Frascino @ 2019-10-16 16:36 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Adrian Reber,
	Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <alpine.DEB.2.21.1910161704120.2046@nanos.tec.linutronix.de>

Hi Thomas,

On 10/16/19 4:07 PM, Thomas Gleixner wrote:
[...]

> Can you pretty please finally start to trim your replies?

I really apologize. I can't explain to myself how did I miss the information. It
will not happen again.

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [PATCH v2 1/5] pidfd: verify task is alive when printing fdinfo
From: Christian Brauner @ 2019-10-16 16:31 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: linux-kernel, aarcange, akpm, christian, cyphar, elena.reshetova,
	guro, jannh, ldv, linux-api, linux-kselftest, mhocko, mingo,
	peterz, shuah, tglx, viro
In-Reply-To: <20191016162408.GB31585@redhat.com>

On Wed, Oct 16, 2019 at 06:24:09PM +0200, Oleg Nesterov wrote:
> On 10/16, Christian Brauner wrote:
> >
> > +static inline bool task_alive(struct pid *pid, enum pid_type type)
> > +{
> > +	return !hlist_empty(&pid->tasks[type]);
> > +}
> 
> So you decided to add a helper ;) OK, but note that its name is very
> confusing and misleading. Even more than pid_alive() we already have.

That's why I chose that name. This is the second time I get bitten by
taking inspiration from prior naming examples. :)

> 
> What does "alive" actually mean? Say, task_alive(pid, PIDTYPE_SID) == F
> after fork(). Then it becomes T if this task does setsid().

Yes, that annoyed me to. If you think about pidfd_open() you have a
similar problem. The question we are asking in pidfd_open() is not
task_alive() but rather was-this-pid-used-as.

> 
> And why task_ if it accepts pid+pid_type? May be pid_has_task() or
> something like this...

Given what I said above that might be a decent name.

> 
> OK, since I can't suggest a better name I won't really argue. Feel free
> to add my reviewed-by to this series.

No, naming is important. Thanks for being picky about that too and I'll
happily resend. :)

Thanks!
Christian

^ permalink raw reply

* Re: [PATCH v2 1/5] pidfd: verify task is alive when printing fdinfo
From: Oleg Nesterov @ 2019-10-16 16:24 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-kernel, aarcange, akpm, christian, cyphar, elena.reshetova,
	guro, jannh, ldv, linux-api, linux-kselftest, mhocko, mingo,
	peterz, shuah, tglx, viro
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

On 10/16, Christian Brauner wrote:
>
> +static inline bool task_alive(struct pid *pid, enum pid_type type)
> +{
> +	return !hlist_empty(&pid->tasks[type]);
> +}

So you decided to add a helper ;) OK, but note that its name is very
confusing and misleading. Even more than pid_alive() we already have.

What does "alive" actually mean? Say, task_alive(pid, PIDTYPE_SID) == F
after fork(). Then it becomes T if this task does setsid().

And why task_ if it accepts pid+pid_type? May be pid_has_task() or
something like this...

OK, since I can't suggest a better name I won't really argue. Feel free
to add my reviewed-by to this series.

Oleg.

^ permalink raw reply

* [PATCH v2 5/5] pid: use task_alive() in pidfd_open()
From: Christian Brauner @ 2019-10-16 15:36 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

Use the new task_alive() helper in pidfd_open(). This simplifies the
code and avoids taking rcu_read_{lock,unlock}() and leads to overall
nicer code.

Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
patch not present

/* v2 */
patch introduced
---
 kernel/pid.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 70ad4a9f728c..1f425b6c4c47 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -497,7 +497,7 @@ static int pidfd_create(struct pid *pid)
  */
 SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 {
-	int fd, ret;
+	int fd;
 	struct pid *p;
 
 	if (flags)
@@ -510,13 +510,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	if (!p)
 		return -ESRCH;
 
-	ret = 0;
-	rcu_read_lock();
-	if (!pid_task(p, PIDTYPE_TGID))
-		ret = -EINVAL;
-	rcu_read_unlock();
+	if (task_alive(p, PIDTYPE_TGID))
+		fd = pidfd_create(p);
+	else
+		fd = -EINVAL;
 
-	fd = ret ?: pidfd_create(p);
 	put_pid(p);
 	return fd;
 }
-- 
2.23.0

^ permalink raw reply related

* [PATCH v2 4/5] exit: use task_alive() in do_wait()
From: Christian Brauner @ 2019-10-16 15:36 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

Replace hlist_empty() with the new task_alive() helper which is more
idiomatic, easier to grep for, and unifies how callers perform this check.

Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
patch not present

/* v2 */
patch introduced
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d67002..2bb0cbe94bda 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1457,7 +1457,7 @@ static long do_wait(struct wait_opts *wo)
 	 */
 	wo->notask_error = -ECHILD;
 	if ((wo->wo_type < PIDTYPE_MAX) &&
-	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+	   (!wo->wo_pid || !task_alive(wo->wo_pid, wo->wo_type)))
 		goto notask;
 
 	set_current_state(TASK_INTERRUPTIBLE);
-- 
2.23.0

^ permalink raw reply related

* [PATCH v2 3/5] pid: use task_alive() in __change_pid()
From: Christian Brauner @ 2019-10-16 15:36 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

Replace hlist_empty() with the new task_alive() helper which is more
idiomatic, easier to grep for, and unifies how callers perform this
check.

Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
patch not present

/* v2 */
patch introduced
---
 kernel/pid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..70ad4a9f728c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -299,7 +299,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 	*pid_ptr = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (!hlist_empty(&pid->tasks[tmp]))
+		if (task_alive(pid, tmp))
 			return;
 
 	free_pid(pid);
-- 
2.23.0

^ permalink raw reply related

* [PATCH v2 2/5] test: verify fdinfo for pidfd of reaped process
From: Christian Brauner @ 2019-10-16 15:36 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191016153606.2326-1-christian.brauner@ubuntu.com>

Test that the fdinfo field of a pidfd referring to a dead process
correctly shows Pid: -1 and NSpid: -1.

Cc: Christian Kellner <christian@kellner.me>
Reviewed-by: Christian Kellner <christian@kellner.me>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
Link: https://lore.kernel.org/r/20191015141332.4055-2-christian.brauner@ubuntu.com

/* v2 */
unchanged
---
 .../selftests/pidfd/pidfd_fdinfo_test.c       | 59 ++++++++++++++-----
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
index 3721be994abd..22558524f71c 100644
--- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -113,11 +113,15 @@ static struct child clone_newns(int (*fn)(void *), void *args,
 	return ret;
 }
 
+static inline void child_close(struct child *child)
+{
+	close(child->fd);
+}
+
 static inline int child_join(struct child *child, struct error *err)
 {
 	int r;
 
-	(void)close(child->fd);
 	r = wait_for_pid(child->pid);
 	if (r < 0)
 		error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
@@ -128,6 +132,12 @@ static inline int child_join(struct child *child, struct error *err)
 	return r;
 }
 
+static inline int child_join_close(struct child *child, struct error *err)
+{
+	child_close(child);
+	return child_join(child, err);
+}
+
 static inline void trim_newline(char *str)
 {
 	char *pos = strrchr(str, '\n');
@@ -136,8 +146,8 @@ static inline void trim_newline(char *str)
 		*pos = '\0';
 }
 
-static int verify_fdinfo_nspid(int pidfd, struct error *err,
-			       const char *expect, ...)
+static int verify_fdinfo(int pidfd, struct error *err, const char *prefix,
+			 size_t prefix_len, const char *expect, ...)
 {
 	char buffer[512] = {0, };
 	char path[512] = {0, };
@@ -160,17 +170,20 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err,
 				 pidfd);
 
 	while (getline(&line, &n, f) != -1) {
-		if (strncmp(line, "NSpid:", 6))
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
 			continue;
 
 		found = 1;
 
-		r = strcmp(line + 6, buffer);
+		val = line + prefix_len;
+		r = strcmp(val, buffer);
 		if (r != 0) {
 			trim_newline(line);
 			trim_newline(buffer);
-			error_set(err, PIDFD_FAIL, "NSpid: '%s' != '%s'",
-				  line + 6, buffer);
+			error_set(err, PIDFD_FAIL, "%s '%s' != '%s'",
+				  prefix, val, buffer);
 		}
 		break;
 	}
@@ -179,8 +192,8 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err,
 	fclose(f);
 
 	if (found == 0)
-		return error_set(err, PIDFD_FAIL, "NSpid not found for fd %d",
-				 pidfd);
+		return error_set(err, PIDFD_FAIL, "%s not found for fd %d",
+				 prefix, pidfd);
 
 	return PIDFD_PASS;
 }
@@ -213,7 +226,7 @@ static int child_fdinfo_nspid_test(void *args)
 	}
 
 	pidfd = *(int *)args;
-	r = verify_fdinfo_nspid(pidfd, &err, "\t0\n");
+	r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n");
 
 	if (r != PIDFD_PASS)
 		ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
@@ -242,24 +255,42 @@ static void test_pidfd_fdinfo_nspid(void)
 	/* The children will have pid 1 in the new pid namespace,
 	 * so the line must be 'NSPid:\t<pid>\t1'.
 	 */
-	verify_fdinfo_nspid(a.fd, &err, "\t%d\t%d\n", a.pid, 1);
-	verify_fdinfo_nspid(b.fd, &err, "\t%d\t%d\n", b.pid, 1);
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1);
+	verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1);
 
 	/* wait for the process, check the exit status and set
 	 * 'err' accordingly, if it is not already set.
 	 */
+	child_join_close(&a, &err);
+	child_join_close(&b, &err);
+
+	error_report(&err, test_name);
+}
+
+static void test_pidfd_dead_fdinfo(void)
+{
+	struct child a;
+	struct error err = {0, };
+	const char *test_name = "pidfd check fdinfo for dead process";
+
+	/* Create a new child in a new pid and mount namespace */
+	a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+	error_check(&err, test_name);
 	child_join(&a, &err);
-	child_join(&b, &err);
 
+	verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n");
+	verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n");
+	child_close(&a);
 	error_report(&err, test_name);
 }
 
 int main(int argc, char **argv)
 {
 	ksft_print_header();
-	ksft_set_plan(1);
+	ksft_set_plan(2);
 
 	test_pidfd_fdinfo_nspid();
+	test_pidfd_dead_fdinfo();
 
 	return ksft_exit_pass();
 }
-- 
2.23.0

^ permalink raw reply related

* [PATCH v2 1/5] pidfd: verify task is alive when printing fdinfo
From: Christian Brauner @ 2019-10-16 15:36 UTC (permalink / raw)
  To: oleg, linux-kernel
  Cc: aarcange, akpm, christian, cyphar, elena.reshetova, guro, jannh,
	ldv, linux-api, linux-kselftest, mhocko, mingo, peterz, shuah,
	tglx, viro, Christian Brauner
In-Reply-To: <20191015141332.4055-1-christian.brauner@ubuntu.com>

Currently, when a task is dead we still print the pid it used to use in
the fdinfo files of its pidfds. This doesn't make much sense since the
pid may have already been reused. So verify that the task is still
alive by introducing the task_alive() helper which will be used by other
callers in follow-up patches.
If the task is not alive anymore, we will print -1. This allows us to
differentiate between a task not being present in a given pid namespace
- in which case we already print 0 - and a task having been reaped.

Note that this uses PIDTYPE_PID for the check. Technically, we could've
checked PIDTYPE_TGID since pidfds currently only refer to thread-group
leaders but if they won't anymore in the future then this check becomes
problematic without it being immediately obvious to non-experts imho. If
a thread is created via clone(CLONE_THREAD) than struct pid has a single
non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a
PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even
though the thread-group leader might still be very much alive. So
checking PIDTYPE_PID is fine and is easier to maintain should we ever
allow pidfds to refer to threads.

Cc: Jann Horn <jannh@google.com>
Cc: Christian Kellner <christian@kellner.me>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: linux-api@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
Link: https://lore.kernel.org/r/20191015141332.4055-1-christian.brauner@ubuntu.com

/* v2 */
- Oleg Nesterov <oleg@redhat.com>:
  - simplify check whether task is still alive to hlist_empty()
  - optionally introduce generic helper to replace open coded
    hlist_emtpy() checks whether or not a task is alive
- Christian Brauner <christian.brauner@ubuntu.com>:
  - introduce task_alive() helper and use in pidfd_show_fdinfo()
---
 include/linux/pid.h |  4 ++++
 kernel/fork.c       | 17 +++++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9645b1194c98..5f1c8ce10b71 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid)
 
 extern void put_pid(struct pid *pid);
 extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
+static inline bool task_alive(struct pid *pid, enum pid_type type)
+{
+	return !hlist_empty(&pid->tasks[type]);
+}
 extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
 
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
diff --git a/kernel/fork.c b/kernel/fork.c
index 782986962d47..ef9a9d661079 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file)
  */
 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
-	struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
 	struct pid *pid = f->private_data;
-	pid_t nr = pid_nr_ns(pid, ns);
+	struct pid_namespace *ns;
+	pid_t nr = -1;
 
-	seq_put_decimal_ull(m, "Pid:\t", nr);
+	if (likely(task_alive(pid, PIDTYPE_PID))) {
+		ns = proc_pid_ns(file_inode(m->file));
+		nr = pid_nr_ns(pid, ns);
+	}
+
+	seq_put_decimal_ll(m, "Pid:\t", nr);
 
 #ifdef CONFIG_PID_NS
-	seq_put_decimal_ull(m, "\nNSpid:\t", nr);
-	if (nr) {
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
 		int i;
 
 		/* If nr is non-zero it means that 'pid' is valid and that
@@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 		 * Start at one below the already printed level.
 		 */
 		for (i = ns->level + 1; i <= pid->level; i++)
-			seq_put_decimal_ull(m, "\t", pid->numbers[i].nr);
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
 	}
 #endif
 	seq_putc(m, '\n');
-- 
2.23.0

^ permalink raw reply related

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: Linus Torvalds @ 2019-10-16 15:31 UTC (permalink / raw)
  To: David Howells
  Cc: Tim Chen, Kan Liang, Casey Schaufler, Stephen Smalley,
	Greg Kroah-Hartman, Nicolas Dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, LSM List, linux-fsdevel,
	Linux API, Linux Kernel Mailing List
In-Reply-To: <6900.1571235985@warthog.procyon.org.uk>

On Wed, Oct 16, 2019 at 7:26 AM David Howells <dhowells@redhat.com> wrote:
>
> Btw, is there any point in __wake_up_sync_key() taking a nr_exclusive
> argument since it clears WF_SYNC if nr_exclusive != 1 and doesn't make sense
> to be >1 anyway.

Ack, looks sane to me.

We have _very_ few users of nr_exclusive. I wonder if it's even worth
having at all, but it's definitely not worth it here.

I'd love for nr_exclusive to go away and be replaced by WF_ALL
instead. Right now it looks like there is one SGI driver that uses it,
and the sbitmap code. That was all I could find.

Oh well. You removing one case is at last a small amount of progress.

         Linus

^ permalink raw reply

* Re: [PATCHv7 19/33] lib/vdso: Prepare for time namespace support
From: Thomas Gleixner @ 2019-10-16 15:07 UTC (permalink / raw)
  To: Vincenzo Frascino
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Adrian Reber,
	Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <a726e64f-bf73-4eca-6acf-75926898d88a@arm.com>

On Wed, 16 Oct 2019, Vincenzo Frascino wrote:

< SNIP >

> > --- a/lib/vdso/Kconfig
> > +++ b/lib/vdso/Kconfig
> > @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO
> >  	help
> >  	  This config option enables the compat VDSO layer.
> >  
> > +config VDSO_TIMENS
> 
> To uniform the naming with the rest of the file and with CONFIG_TIME_NS, can we
> please change the name of this config option in GENERIC_VDSO_TIME_NS? And then
> follow the logic explained by Thomas in patch 1 of this series.

Can you pretty please finally start to trim your replies?

If you didn't read the link I sent you earlier, then here is the relevant
paragraph:

  It's an annoyance to have to scroll down through several pages of quoted
  text to find a single line of reply or to figure out that after that
  reply the rest of the e-mail is just useless ballast.

I know that corporate mail style does not care about that, but I pretty
much care.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCHv7 19/33] lib/vdso: Prepare for time namespace support
From: Vincenzo Frascino @ 2019-10-16 14:37 UTC (permalink / raw)
  To: Dmitry Safonov, linux-kernel
  Cc: Dmitry Safonov, Thomas Gleixner, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <20191011012341.846266-20-dima@arista.com>

Hi Dmitry,

On 10/11/19 2:23 AM, Dmitry Safonov wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> To support time namespaces in the vdso with a minimal impact on regular non
> time namespace affected tasks, the namespace handling needs to be hidden in
> a slow path.
> 
> The most obvious place is vdso_seq_begin(). If a task belongs to a time
> namespace then the VVAR page which contains the system wide vdso data is
> replaced with a namespace specific page which has the same layout as the
> VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path
> and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time
> namespace handling path.
> 
> The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
> update of the vdso data is in progress, is not really affecting regular
> tasks which are not part of a time namespace as the task is spin waiting
> for the update to finish and vdso_data->seq to become even again.
> 
> If a time namespace task hits that code path, it invokes the corresponding
> time getter function which retrieves the real VVAR page, reads host time
> and then adds the offset for the requested clock which is stored in the
> special VVAR page.
> 
> If VDSO time namespace support is disabled the whole magic is compiled out.
> 
> Initial testing shows that the disabled case is almost identical to the
> host case which does not take the slow timens path. With the special timens
> page installed the performance hit is constant time and in the range of
> 5-7%.
> 
> For the vdso functions which are not using the sequence count an
> unconditional check for vdso_data->clock_mode is added which switches to
> the real vdso when the clock_mode is VCLOCK_TIMENS.
> 
> Suggested-by: Andy Lutomirski <luto@kernel.org>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> Signed-off-by: Dmitry Safonov <dima@arista.com>
> ---
>  include/linux/time.h    |   6 ++
>  include/vdso/datapage.h |  19 +++++-
>  lib/vdso/Kconfig        |   6 ++
>  lib/vdso/gettimeofday.c | 128 +++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 155 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/time.h b/include/linux/time.h
> index 27d83fd2ae61..b1a592638d7d 100644
> --- a/include/linux/time.h
> +++ b/include/linux/time.h
> @@ -96,4 +96,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its)
>   */
>  #define time_after32(a, b)	((s32)((u32)(b) - (u32)(a)) < 0)
>  #define time_before32(b, a)	time_after32(a, b)
> +
> +struct timens_offset {
> +	s64	sec;
> +	u64	nsec;
> +};
> +
>  #endif
> diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
> index 2e302c0f41f7..65a38acce27e 100644
> --- a/include/vdso/datapage.h
> +++ b/include/vdso/datapage.h
> @@ -21,6 +21,8 @@
>  #define CS_RAW		1
>  #define CS_BASES	(CS_RAW + 1)
>  
> +#define VCLOCK_TIMENS	UINT_MAX
> +
>  /**
>   * struct vdso_timestamp - basetime per clock_id
>   * @sec:	seconds
> @@ -48,6 +50,7 @@ struct vdso_timestamp {
>   * @mult:		clocksource multiplier
>   * @shift:		clocksource shift
>   * @basetime[clock_id]:	basetime per clock_id
> + * @offset[clock_id]:	time namespace offset per clock_id
>   * @tz_minuteswest:	minutes west of Greenwich
>   * @tz_dsttime:		type of DST correction
>   * @hrtimer_res:	hrtimer resolution
> @@ -55,6 +58,17 @@ struct vdso_timestamp {
>   *
>   * vdso_data will be accessed by 64 bit and compat code at the same time
>   * so we should be careful before modifying this structure.
> + *
> + * @basetime is used to store the base time for the system wide time getter
> + * VVAR page.
> + *
> + * @offset is used by the special time namespace VVAR pages which are
> + * installed instead of the real VVAR page. These namespace pages must set
> + * @seq to 1 and @clock_mode to VLOCK_TIMENS to force the code into the
> + * time namespace slow path. The namespace aware functions retrieve the
> + * real system wide VVAR page, read host time and add the per clock offset.
> + * For clocks which are not affected by time namespace adjustement the
> + * offset must be zero.
>   */
>  struct vdso_data {
>  	u32			seq;
> @@ -65,7 +79,10 @@ struct vdso_data {
>  	u32			mult;
>  	u32			shift;
>  
> -	struct vdso_timestamp	basetime[VDSO_BASES];
> +	union {
> +		struct vdso_timestamp	basetime[VDSO_BASES];
> +		struct timens_offset	offset[VDSO_BASES];
> +	};
>  
>  	s32			tz_minuteswest;
>  	s32			tz_dsttime;
> diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
> index 9fe698ff62ec..85276de70dba 100644
> --- a/lib/vdso/Kconfig
> +++ b/lib/vdso/Kconfig
> @@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO
>  	help
>  	  This config option enables the compat VDSO layer.
>  
> +config VDSO_TIMENS

To uniform the naming with the rest of the file and with CONFIG_TIME_NS, can we
please change the name of this config option in GENERIC_VDSO_TIME_NS? And then
follow the logic explained by Thomas in patch 1 of this series.

Thanks,
Vincenzo

> +	bool
> +	help
> +	  Selected by architectures which support time namespaces in the
> +	  VDSO
> +
>  endif
> diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
> index e630e7ff57f1..25244b677823 100644
> --- a/lib/vdso/gettimeofday.c
> +++ b/lib/vdso/gettimeofday.c
> @@ -38,6 +38,51 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
>  }
>  #endif
>  
> +#ifndef CONFIG_VDSO_TIMENS
> +static __always_inline
> +const struct vdso_data *__arch_get_timens_vdso_data(void)
> +{
> +	return NULL;
> +}
> +#endif
> +
> +static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk,
> +		      struct __kernel_timespec *ts)
> +{
> +	const struct vdso_data *vd = __arch_get_timens_vdso_data();
> +	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
> +	const struct timens_offset *offs = &vdns->offset[clk];
> +	u64 cycles, last, ns;
> +	s64 sec;
> +	u32 seq;
> +
> +	do {
> +		seq = vdso_read_begin(vd);
> +		cycles = __arch_get_hw_counter(vd->clock_mode);
> +		ns = vdso_ts->nsec;
> +		last = vd->cycle_last;
> +		if (unlikely((s64)cycles < 0))
> +			return -1;
> +
> +		ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
> +		ns >>= vd->shift;
> +		sec = vdso_ts->sec;
> +	} while (unlikely(vdso_read_retry(vd, seq)));
> +
> +	/* Add the namespace offset */
> +	sec += offs->sec;
> +	ns += offs->nsec;
> +
> +	/*
> +	 * Do this outside the loop: a race inside the loop could result
> +	 * in __iter_div_u64_rem() being extremely slow.
> +	 */
> +	ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
> +	ts->tv_nsec = ns;
> +
> +	return 0;
> +}
> +
>  static int do_hres(const struct vdso_data *vd, clockid_t clk,
>  		   struct __kernel_timespec *ts)
>  {
> @@ -46,7 +91,28 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk,
>  	u32 seq;
>  
>  	do {
> -		seq = vdso_read_begin(vd);
> +		/*
> +		 * Open coded to handle VCLOCK_TIMENS. Time namespace
> +		 * enabled tasks have a special VVAR page installed which
> +		 * has vd->seq set to 1 and vd->clock_mode set to
> +		 * VCLOCK_TIMENS. For non time namespace affected tasks
> +		 * this does not affect performance because if vd->seq is
> +		 * odd, i.e. a concurrent update is in progress the extra
> +		 * check for vd->clock_mode is just a few extra
> +		 * instructions while spin waiting for vd->seq to become
> +		 * even again.
> +		 */
> +		while (1) {
> +			seq = READ_ONCE(vd->seq);
> +			if (likely(!(seq & 1)))
> +				break;
> +			if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
> +			    vd->clock_mode == VCLOCK_TIMENS)
> +				return do_hres_timens(vd, clk, ts);
> +			cpu_relax();
> +		}
> +		smp_rmb();
> +
>  		cycles = __arch_get_hw_counter(vd->clock_mode);
>  		ns = vdso_ts->nsec;
>  		last = vd->cycle_last;
> @@ -68,6 +134,34 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk,
>  	return 0;
>  }
>  
> +static void do_coarse_timens(const struct vdso_data *vdns, clockid_t clk,
> +			 struct __kernel_timespec *ts)
> +{
> +	const struct vdso_data *vd = __arch_get_timens_vdso_data();
> +	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
> +	const struct timens_offset *offs = &vdns->offset[clk];
> +	u64 nsec;
> +	s64 sec;
> +	s32 seq;
> +
> +	do {
> +		seq = vdso_read_begin(vd);
> +		sec = vdso_ts->sec;
> +		nsec = vdso_ts->nsec;
> +	} while (unlikely(vdso_read_retry(vd, seq)));
> +
> +	/* Add the namespace offset */
> +	sec += offs->sec;
> +	nsec += offs->nsec;
> +
> +	/*
> +	 * Do this outside the loop: a race inside the loop could result
> +	 * in __iter_div_u64_rem() being extremely slow.
> +	 */
> +	ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
> +	ts->tv_nsec = nsec;
> +}
> +
>  static void do_coarse(const struct vdso_data *vd, clockid_t clk,
>  		      struct __kernel_timespec *ts)
>  {
> @@ -75,7 +169,23 @@ static void do_coarse(const struct vdso_data *vd, clockid_t clk,
>  	u32 seq;
>  
>  	do {
> -		seq = vdso_read_begin(vd);
> +		/*
> +		 * Open coded to handle VCLOCK_TIMENS. See comment in
> +		 * do_hres().
> +		 */
> +		while (1) {
> +			seq = READ_ONCE(vd->seq);
> +			if (likely(!(seq & 1)))
> +				break;
> +			if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
> +			    vd->clock_mode == VCLOCK_TIMENS) {
> +				do_coarse_timens(vd, clk, ts);
> +				return;
> +			}
> +			cpu_relax();
> +		}
> +		smp_rmb();
> +
>  		ts->tv_sec = vdso_ts->sec;
>  		ts->tv_nsec = vdso_ts->nsec;
>  	} while (unlikely(vdso_read_retry(vd, seq)));
> @@ -156,6 +266,10 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
>  	}
>  
>  	if (unlikely(tz != NULL)) {
> +		if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
> +		    vd->clock_mode == VCLOCK_TIMENS)
> +			vd = __arch_get_timens_vdso_data();
> +
>  		tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest;
>  		tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime;
>  	}
> @@ -167,7 +281,12 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
>  static __maybe_unused time_t __cvdso_time(time_t *time)
>  {
>  	const struct vdso_data *vd = __arch_get_vdso_data();
> -	time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
> +	time_t t;
> +
> +	if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
> +		vd = __arch_get_timens_vdso_data();
> +
> +	t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
>  
>  	if (time)
>  		*time = t;
> @@ -189,6 +308,9 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
>  	if (unlikely((u32) clock >= MAX_CLOCKS))
>  		return -1;
>  
> +	if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
> +		vd = __arch_get_timens_vdso_data();
> +
>  	hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res);
>  	/*
>  	 * Convert the clockid to a bitmask and use it to check which
> 

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: David Howells @ 2019-10-16 14:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Tim Chen, Kan Liang, Casey Schaufler, Stephen Smalley,
	Greg Kroah-Hartman, Nicolas Dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, LSM List, linux-fsdevel,
	Linux API, Linux Kernel Mailing List
In-Reply-To: <CAHk-=whiz1sHXu8SVZKEC2dup=r5JMrftPtEt6ff9Ea8dyH8yQ@mail.gmail.com>

Btw, is there any point in __wake_up_sync_key() taking a nr_exclusive
argument since it clears WF_SYNC if nr_exclusive != 1 and doesn't make sense
to be >1 anyway.

David
---
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..bb7676d396cd 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -201,9 +201,9 @@ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
 		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
-void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -214,7 +214,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 #define wake_up_interruptible(x)	__wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
 #define wake_up_interruptible_nr(x, nr)	__wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
 #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
-#define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE)
 
 /*
  * Wakeup macros to be used to report events to the targets.
@@ -228,7 +228,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 #define wake_up_interruptible_poll(x, m)					\
 	__wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
 #define wake_up_interruptible_sync_poll(x, m)					\
-	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))
+	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
 
 #define ___wait_cond_timeout(condition)						\
 ({										\
diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d67002..a1ff25ef050e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
 {
 	__wake_up_sync_key(&parent->signal->wait_chldexit,
-				TASK_INTERRUPTIBLE, 1, p);
+			   TASK_INTERRUPTIBLE, p);
 }
 
 static long do_wait(struct wait_opts *wo)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..b4b52361dab7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
  * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: opaque value to be passed to wakeup targets
  *
  * The sync wakeup differs that the waker knows that it will schedule
@@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
  * accessing the task state.
  */
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, void *key)
+			void *key)
 {
-	int wake_flags = 1; /* XXX WF_SYNC */
-
 	if (unlikely(!wq_head))
 		return;
 
-	if (unlikely(nr_exclusive != 1))
-		wake_flags = 0;
-
-	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+	__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
 /*
  * __wake_up_sync - see __wake_up_sync_key()
  */
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
 {
-	__wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+	__wake_up_sync_key(wq_head, mode, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox