Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCHv3 12/27] x86/vdso: Restrict splitting VVAR VMA
From: Dmitry Safonov @ 2019-04-25 16:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

Although, time namespace can work with VVAR VMA split, it seems worth
to forbid splitting VVAR resulting in stricter ABI and reducing amount
of corner-cases to consider while working further on VDSO.

I don't think there is any use-case for partial mremap() of vvar,
but if there is any - this patch can be easily reverted.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 arch/x86/entry/vdso/vma.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index babc4e7a519c..ff9875a4d53b 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -84,6 +84,18 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
 	return 0;
 }
 
+static int vvar_mremap(const struct vm_special_mapping *sm,
+		struct vm_area_struct *new_vma)
+{
+	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+	const struct vdso_image *image = current->mm->context.vdso_image;
+
+	if (new_size != -image->sym_vvar_start)
+		return -EINVAL;
+
+	return 0;
+}
+
 static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 		      struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -136,6 +148,7 @@ static const struct vm_special_mapping vdso_mapping = {
 static const struct vm_special_mapping vvar_mapping = {
 	.name = "[vvar]",
 	.fault = vvar_fault,
+	.mremap = vvar_mremap,
 };
 
 /*
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 11/27] x86/vdso/Makefile: Add vobjs32
From: Dmitry Safonov @ 2019-04-25 16:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

Treat ia32/i386 objects in array the same As for 64-bit vdso objects.
This is a preparation ground to avoid code duplication on introduction
timens vdso.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 arch/x86/entry/vdso/Makefile | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 5bfe2243a08f..bc0bdbf49397 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -18,6 +18,8 @@ VDSO32-$(CONFIG_IA32_EMULATION)	:= y
 
 # files to link into the vdso
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+vobjs32-y += vdso32/vclock_gettime.o
 
 # files to link into kernel
 obj-y				+= vma.o
@@ -31,10 +33,12 @@ vdso_img-$(VDSO32-y)		+= 32
 obj-$(VDSO32-y)			+= vdso32-setup.o
 
 vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
 
 $(obj)/vdso.o: $(obj)/vdso.so
 
 targets += vdso.lds $(vobjs-y)
+targets += vdso32/vdso32.lds $(vobjs32-y)
 
 # Build the vDSO image C files and link them in.
 vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
@@ -125,10 +129,6 @@ $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
 VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
 
-targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-targets += vdso32/vclock_gettime.o
-
 KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
 $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
 $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
@@ -153,12 +153,7 @@ endif
 
 $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
 
-$(obj)/vdso32.so.dbg: FORCE \
-		      $(obj)/vdso32/vdso32.lds \
-		      $(obj)/vdso32/vclock_gettime.o \
-		      $(obj)/vdso32/note.o \
-		      $(obj)/vdso32/system_call.o \
-		      $(obj)/vdso32/sigreturn.o
+$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
 	$(call if_changed,vdso)
 
 #
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 10/27] x86/vdso2c: Convert iterator to unsigned
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

i and j are used everywhere with unsigned types.
Cleanup and prettify the code a bit.

Introduce syms_nr for readability and as a preparation for allocating an
array of vDSO entries that will be needed for creating two vdso .so's:
one for host tasks and another for processes inside time namespace.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 arch/x86/entry/vdso/vdso2c.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index fa847a620f40..61c8bb2e5af8 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -13,7 +13,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
 	unsigned long load_size = -1;  /* Work around bogus warning */
 	unsigned long mapping_size;
 	ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
-	int i;
+	unsigned int i, syms_nr;
 	unsigned long j;
 	ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
 		*alt_sec = NULL;
@@ -86,11 +86,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
 	strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
 		GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
 
+	syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
 	/* Walk the symbol table */
-	for (i = 0;
-	     i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
-	     i++) {
-		int k;
+	for (i = 0; i < syms_nr; i++) {
+		unsigned int k;
 		ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
 			GET_LE(&symtab_hdr->sh_entsize) * i;
 		const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 09/27] x86/vdso2c: Correct err messages on file opening
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

err() message in main() is misleading: it should print `outfilename`,
which is argv[3], not argv[2].

Correct error messages to be more precise about what failed and for
which file.

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 arch/x86/entry/vdso/vdso2c.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 8e470b018512..26d7177c119e 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -187,7 +187,7 @@ static void map_input(const char *name, void **addr, size_t *len, int prot)
 
 	int fd = open(name, O_RDONLY);
 	if (fd == -1)
-		err(1, "%s", name);
+		err(1, "open(%s)", name);
 
 	tmp_len = lseek(fd, 0, SEEK_END);
 	if (tmp_len == (off_t)-1)
@@ -240,7 +240,7 @@ int main(int argc, char **argv)
 	outfilename = argv[3];
 	outfile = fopen(outfilename, "w");
 	if (!outfile)
-		err(1, "%s", argv[2]);
+		err(1, "fopen(%s)", outfilename);
 
 	go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
 
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 08/27] timens: Shift /proc/uptime
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

Respect boottime inside time namespace for /proc/uptime

Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 fs/proc/uptime.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index a4c2791ab70b..5a1b228964fb 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/time_namespace.h>
 #include <linux/kernel_stat.h>
 
 static int uptime_proc_show(struct seq_file *m, void *v)
@@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 		nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
 	ktime_get_boottime_ts64(&uptime);
+	timens_add_boottime(&uptime);
+
 	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	idle.tv_nsec = rem;
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 07/27] timens/kernel: Take into account timens clock offsets in clock_nanosleep
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Dmitry Safonov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, Thomas Gleixner, Vincenzo Frascino,
	containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@gmail.com>

Wire up clock_nanosleep() to timens offsets.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 kernel/time/alarmtimer.c   |  2 ++
 kernel/time/posix-stubs.c  |  8 ++++----
 kernel/time/posix-timers.c | 12 ++++++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 23f5c39c8d23..409d518acbb3 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -816,6 +816,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 		ktime_t now = alarm_bases[type].gettime();
 
 		exp = ktime_add_safe(now, exp);
+	} else {
+		exp = timens_ktime_to_host(alarm_bases[type].base_clockid, exp);
 	}
 
 	ret = alarmtimer_do_nsleep(&alarm, exp, type);
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index edaf075d1ee4..f319d51e2e01 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,8 +147,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
-	return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
-				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+	return hrtimer_nanosleep(&t, HRTIMER_MODE_NS | (flags & TIMER_ABSTIME ?
+				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL),
 				 which_clock);
 }
 
@@ -233,8 +233,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
-	return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
-				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+	return hrtimer_nanosleep(&t, HRTIMER_MODE_NS | (flags & TIMER_ABSTIME ?
+				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL),
 				 which_clock);
 }
 #endif
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 7922731f98cd..39c0330464a3 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1188,6 +1188,14 @@ static int common_nsleep(const clockid_t which_clock, int flags,
 				 which_clock);
 }
 
+static int common_nsleep_timens(const clockid_t which_clock, int flags,
+			 const struct timespec64 *rqtp)
+{
+	return hrtimer_nanosleep(rqtp, HRTIMER_MODE_NS | (flags & TIMER_ABSTIME ?
+				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL),
+				 which_clock);
+}
+
 SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		const struct __kernel_timespec __user *, rqtp,
 		struct __kernel_timespec __user *, rmtp)
@@ -1262,7 +1270,7 @@ static const struct k_clock clock_realtime = {
 static const struct k_clock clock_monotonic = {
 	.clock_getres		= posix_get_hrtimer_res,
 	.clock_get		= posix_ktime_get_ts,
-	.nsleep			= common_nsleep,
+	.nsleep			= common_nsleep_timens,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
@@ -1307,7 +1315,7 @@ static const struct k_clock clock_tai = {
 static const struct k_clock clock_boottime = {
 	.clock_getres		= posix_get_hrtimer_res,
 	.clock_get		= posix_get_boottime,
-	.nsleep			= common_nsleep,
+	.nsleep			= common_nsleep_timens,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 06/27] posix-timers/timens: Take into account clock offsets
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Dmitry Safonov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, Thomas Gleixner, Vincenzo Frascino,
	containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@gmail.com>

Wire timer_settime() syscall into time namespace virtualization.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 kernel/time/alarmtimer.c   | 3 +++
 kernel/time/posix-timers.c | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 31f99361342e..23f5c39c8d23 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
 #include <linux/freezer.h>
 #include <linux/compat.h>
 #include <linux/module.h>
+#include <linux/time_namespace.h>
 
 #include "posix-timers.h"
 
@@ -621,6 +622,8 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
 
 	if (!absolute)
 		expires = ktime_add_safe(expires, base->gettime());
+	else
+		expires = timens_ktime_to_host(base->base_clockid, expires);
 	if (sigev_none)
 		alarm->node.expires = expires;
 	else
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index a723e63d55fd..7922731f98cd 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -803,6 +803,8 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 
 	if (!absolute)
 		expires = ktime_add_safe(expires, timer->base->get_time());
+	else
+		expires = timens_ktime_to_host(timer->base->clockid, expires);
 	hrtimer_set_expires(timer, expires);
 
 	if (!sigev_none)
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 05/27] timerfd/timens: Take into account ns clock offsets
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Dmitry Safonov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, Thomas Gleixner, Vincenzo Frascino,
	containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@gmail.com>

Make timerfd respect timens offsets.
Provide a helper timens_ktime_to_host() that is useful to wire up
timens to different kernel subsystems.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 fs/timerfd.c                   |  8 +++++--
 include/linux/hrtimer.h        |  1 +
 include/linux/time_namespace.h | 40 ++++++++++++++++++++++++++++++++++
 kernel/time/hrtimer.c          |  3 +++
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a6fc8aa1de7..a55ade09eee1 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/rcupdate.h>
+#include <linux/time_namespace.h>
 
 struct timerfd_ctx {
 	union {
@@ -179,6 +180,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	htmode = (flags & TFD_TIMER_ABSTIME) ?
 		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
 
+	htmode |= HRTIMER_MODE_NS;
+
 	texp = timespec64_to_ktime(ktmr->it_value);
 	ctx->expired = 0;
 	ctx->ticks = 0;
@@ -197,9 +200,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 
 	if (texp != 0) {
 		if (isalarm(ctx)) {
-			if (flags & TFD_TIMER_ABSTIME)
+			if (flags & TFD_TIMER_ABSTIME) {
+				texp = timens_ktime_to_host(clockid, texp);
 				alarm_start(&ctx->t.alarm, texp);
-			else
+			} else
 				alarm_start_relative(&ctx->t.alarm, texp);
 		} else {
 			hrtimer_start(&ctx->t.tmr, texp, htmode);
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2e8957eac4d4..4b9c89c797ee 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -38,6 +38,7 @@ enum hrtimer_mode {
 	HRTIMER_MODE_REL	= 0x01,
 	HRTIMER_MODE_PINNED	= 0x02,
 	HRTIMER_MODE_SOFT	= 0x04,
+	HRTIMER_MODE_NS		= 0x08,
 
 	HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
 	HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 5f0da6858b10..988414f7f791 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -56,6 +56,41 @@ static inline void timens_add_boottime(struct timespec64 *ts)
                 *ts = timespec64_add(*ts, ns_offsets->monotonic_boottime_offset);
 }
 
+static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
+{
+	struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets;
+	struct timespec64 *offset;
+	ktime_t koff;
+
+	if (!ns_offsets)
+		return tim;
+
+	switch (clockid) {
+		case CLOCK_MONOTONIC:
+		case CLOCK_MONOTONIC_RAW:
+		case CLOCK_MONOTONIC_COARSE:
+			offset = &ns_offsets->monotonic_time_offset;
+			break;
+		case CLOCK_BOOTTIME:
+		case CLOCK_BOOTTIME_ALARM:
+			offset = &ns_offsets->monotonic_boottime_offset;
+			break;
+		default:
+			return tim;
+	}
+
+	koff = timespec64_to_ktime(*offset);
+	if (tim < koff)
+		tim = 0;
+	else if (KTIME_MAX - tim < -koff)
+		tim = KTIME_MAX;
+	else
+		tim = ktime_sub(tim, koff);
+
+	return tim;
+}
+
+
 #else
 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
 {
@@ -82,6 +117,11 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts
 
 static inline void timens_add_monotonic(struct timespec64 *ts) {}
 static inline void timens_add_boottime(struct timespec64 *ts) {}
+
+static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
+{
+	return tim;
+}
 #endif
 
 #endif /* _LINUX_TIMENS_H */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..484a0efec1df 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -42,6 +42,7 @@
 #include <linux/timer.h>
 #include <linux/freezer.h>
 #include <linux/compat.h>
+#include <linux/time_namespace.h>
 
 #include <linux/uaccess.h>
 
@@ -1069,6 +1070,8 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	if (mode & HRTIMER_MODE_REL)
 		tim = ktime_add_safe(tim, base->get_time());
+	else if (mode & HRTIMER_MODE_NS)
+		tim = timens_ktime_to_host(base->clockid, tim);
 
 	tim = hrtimer_update_lowres(timer, tim, mode);
 
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 04/27] timens: Introduce CLOCK_BOOTTIME offset
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@openvz.org>

Adds boottime virtualisation for time namespace.
Introduce timespec for boottime clock into timens offsets and wire
clock_gettime() syscall.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 include/linux/time_namespace.h | 9 +++++++++
 include/linux/timens_offsets.h | 1 +
 kernel/time/alarmtimer.c       | 8 +++++++-
 kernel/time/posix-stubs.c      | 1 +
 kernel/time/posix-timers.c     | 2 ++
 5 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 8f75d34cf34a..5f0da6858b10 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -48,6 +48,14 @@ static inline void timens_add_monotonic(struct timespec64 *ts)
                 *ts = timespec64_add(*ts, ns_offsets->monotonic_time_offset);
 }
 
+static inline void timens_add_boottime(struct timespec64 *ts)
+{
+        struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets;
+
+        if (ns_offsets)
+                *ts = timespec64_add(*ts, ns_offsets->monotonic_boottime_offset);
+}
+
 #else
 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
 {
@@ -73,6 +81,7 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts
 }
 
 static inline void timens_add_monotonic(struct timespec64 *ts) {}
+static inline void timens_add_boottime(struct timespec64 *ts) {}
 #endif
 
 #endif /* _LINUX_TIMENS_H */
diff --git a/include/linux/timens_offsets.h b/include/linux/timens_offsets.h
index 248b0c0bb92a..777530c46852 100644
--- a/include/linux/timens_offsets.h
+++ b/include/linux/timens_offsets.h
@@ -4,6 +4,7 @@
 
 struct timens_offsets {
 	struct timespec64  monotonic_time_offset;
+	struct timespec64  monotonic_boottime_offset;
 };
 
 #endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0519a8805aab..31f99361342e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -653,12 +653,18 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
  */
 static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
 {
-	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+	struct alarm_base *base;
+
+	base = &alarm_bases[clock2alarm(which_clock & ~CLOCK_TIMENS)];
 
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
 	*tp = ktime_to_timespec64(base->gettime());
+
+	if (which_clock == (CLOCK_BOOTTIME_ALARM | CLOCK_TIMENS))
+		timens_add_boottime(tp);
+
 	return 0;
 }
 
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 17c67e0aecd8..edaf075d1ee4 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -82,6 +82,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 		break;
 	case CLOCK_BOOTTIME:
 		ktime_get_boottime_ts64(tp);
+		timens_add_boottime(tp);
 		break;
 	default:
 		return -EINVAL;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index a047d6b7c768..a723e63d55fd 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -232,6 +232,8 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
 static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
 {
 	ktime_get_boottime_ts64(tp);
+	if (which_clock & CLOCK_TIMENS)
+		timens_add_boottime(tp);
 	return 0;
 }
 
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 03/27] timens: Introduce CLOCK_MONOTONIC offsets
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@openvz.org>

Add monotonic time virtualisation for time namespace.
Introduce timespec for monotionic clock into timens offsets and wire
clock_gettime() syscall.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 include/linux/time_namespace.h |  8 ++++++++
 include/linux/timens_offsets.h |  1 +
 include/uapi/linux/time.h      |  2 ++
 kernel/time/posix-stubs.c      |  2 ++
 kernel/time/posix-timers.c     | 11 +++++++++--
 5 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index b6985aa87479..8f75d34cf34a 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -40,6 +40,13 @@ static inline void put_time_ns(struct time_namespace *ns)
 	kref_put(&ns->kref, free_time_ns);
 }
 
+static inline void timens_add_monotonic(struct timespec64 *ts)
+{
+        struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets;
+
+        if (ns_offsets)
+                *ts = timespec64_add(*ts, ns_offsets->monotonic_time_offset);
+}
 
 #else
 static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
@@ -65,6 +72,7 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts
 	return 0;
 }
 
+static inline void timens_add_monotonic(struct timespec64 *ts) {}
 #endif
 
 #endif /* _LINUX_TIMENS_H */
diff --git a/include/linux/timens_offsets.h b/include/linux/timens_offsets.h
index 7d7cb68ea778..248b0c0bb92a 100644
--- a/include/linux/timens_offsets.h
+++ b/include/linux/timens_offsets.h
@@ -3,6 +3,7 @@
 #define _LINUX_TIME_OFFSETS_H
 
 struct timens_offsets {
+	struct timespec64  monotonic_time_offset;
 };
 
 #endif
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 958932effc5e..32abadc68f72 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -65,6 +65,8 @@ struct itimerval {
 #define CLOCKS_MASK			(CLOCK_REALTIME | CLOCK_MONOTONIC)
 #define CLOCKS_MONO			CLOCK_MONOTONIC
 
+#define CLOCK_TIMENS			1024
+
 /*
  * The various flags for setting POSIX.1b interval timers:
  */
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 67df65f887ac..17c67e0aecd8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -14,6 +14,7 @@
 #include <linux/ktime.h>
 #include <linux/timekeeping.h>
 #include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
 #include <linux/compat.h>
 
 #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -77,6 +78,7 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 		break;
 	case CLOCK_MONOTONIC:
 		ktime_get_ts64(tp);
+		timens_add_monotonic(tp);
 		break;
 	case CLOCK_BOOTTIME:
 		ktime_get_boottime_ts64(tp);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 29176635991f..a047d6b7c768 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,6 +30,7 @@
 #include <linux/hashtable.h>
 #include <linux/compat.h>
 #include <linux/nospec.h>
+#include <linux/time_namespace.h>
 
 #include "timekeeping.h"
 #include "posix-timers.h"
@@ -190,6 +191,8 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
 static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
 {
 	ktime_get_ts64(tp);
+	if (which_clock & CLOCK_TIMENS)
+		timens_add_monotonic(tp);
 	return 0;
 }
 
@@ -199,6 +202,8 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
 static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
 {
 	ktime_get_raw_ts64(tp);
+	if (which_clock & CLOCK_TIMENS)
+		timens_add_monotonic(tp);
 	return 0;
 }
 
@@ -213,6 +218,8 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
 						struct timespec64 *tp)
 {
 	ktime_get_coarse_ts64(tp);
+	if (which_clock & CLOCK_TIMENS)
+		timens_add_monotonic(tp);
 	return 0;
 }
 
@@ -1039,7 +1046,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_get(which_clock, &kernel_tp);
+	error = kc->clock_get(which_clock | CLOCK_TIMENS, &kernel_tp);
 
 	if (!error && put_timespec64(&kernel_tp, tp))
 		error = -EFAULT;
@@ -1121,7 +1128,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
 	if (!kc)
 		return -EINVAL;
 
-	err = kc->clock_get(which_clock, &ts);
+	err = kc->clock_get(which_clock | CLOCK_TIMENS, &ts);
 
 	if (!err && put_old_timespec32(&ts, tp))
 		err = -EFAULT;
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 02/27] timens: Add timens_offsets
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@openvz.org>

Introduce offsets for time namespace. They will contain an adjustment
needed to convert clocks to/from host's.

Allocate one page for each time namespace that will be premapped into
userspace among vvar pages.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 MAINTAINERS                    |  1 +
 include/linux/time_namespace.h |  1 +
 include/linux/timens_offsets.h |  8 ++++++++
 kernel/time_namespace.c        | 14 ++++++++++++--
 4 files changed, 22 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/timens_offsets.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b50720361de7..6a57d94963f6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12397,6 +12397,7 @@ S:	Maintained
 F:	fs/timerfd.c
 F:	include/linux/timer*
 F:	include/linux/time_namespace.h
+F:	include/linux/timens_offsets.h
 F:	kernel/time_namespace.c
 F:	kernel/time/*timer*
 
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 9507ed7072fe..b6985aa87479 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -8,6 +8,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/err.h>
+#include <linux/timens_offsets.h>
 
 struct user_namespace;
 extern struct user_namespace init_user_ns;
diff --git a/include/linux/timens_offsets.h b/include/linux/timens_offsets.h
new file mode 100644
index 000000000000..7d7cb68ea778
--- /dev/null
+++ b/include/linux/timens_offsets.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIME_OFFSETS_H
+#define _LINUX_TIME_OFFSETS_H
+
+struct timens_offsets {
+};
+
+#endif
diff --git a/kernel/time_namespace.c b/kernel/time_namespace.c
index 8c600df9771d..4828447721ec 100644
--- a/kernel/time_namespace.c
+++ b/kernel/time_namespace.c
@@ -13,6 +13,7 @@
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/sched/task.h>
+#include <linux/mm.h>
 
 static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
 {
@@ -46,6 +47,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
 {
 	struct time_namespace *ns;
 	struct ucounts *ucounts;
+	struct page *page;
 	int err;
 
 	err = -ENOSPC;
@@ -58,15 +60,22 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
 	if (!ns)
 		goto fail_dec;
 
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		goto fail_free;
+	ns->offsets = page_address(page);
+	BUILD_BUG_ON(sizeof(*ns->offsets) > PAGE_SIZE);
+
 	err = ns_alloc_inum(&ns->ns);
 	if (err)
-		goto fail_free;
+		goto fail_page;
 
 	ns->ucounts = ucounts;
 	ns->ns.ops = &timens_operations;
 	ns->user_ns = get_user_ns(user_ns);
 	return ns;
-
+fail_page:
+	free_page((unsigned long)ns->offsets);
 fail_free:
 	kfree(ns);
 fail_dec:
@@ -94,6 +103,7 @@ void free_time_ns(struct kref *kref)
 	struct time_namespace *ns;
 
 	ns = container_of(kref, struct time_namespace, kref);
+	free_page((unsigned long)ns->offsets);
 	dec_time_namespaces(ns->ucounts);
 	put_user_ns(ns->user_ns);
 	ns_free_inum(&ns->ns);
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 01/27] ns: Introduce Time Namespace
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Andrei Vagin, Dmitry Safonov, Adrian Reber, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20190425161416.26600-1-dima@arista.com>

From: Andrei Vagin <avagin@openvz.org>

Time Namespace isolates clock values.

The kernel provides access to several clocks CLOCK_REALTIME,
CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc.

CLOCK_REALTIME
      System-wide clock that measures real (i.e., wall-clock) time.

CLOCK_MONOTONIC
      Clock that cannot be set and represents monotonic time since
      some unspecified starting point.

CLOCK_BOOTTIME
      Identical to CLOCK_MONOTONIC, except it also includes any time
      that the system is suspended.

For many users, the time namespace means the ability to changes date and
time in a container (CLOCK_REALTIME).

But in a context of the checkpoint/restore functionality, monotonic and
bootime clocks become interesting. Both clocks are monotonic with
unspecified staring points. These clocks are widely used to measure time
slices and set timers. After restoring or migrating processes, we have to
guarantee that they never go backward. In an ideal case, the behavior of
these clocks should be the same as for a case when a whole system is
suspended. All this means that we need to be able to set CLOCK_MONOTONIC
and CLOCK_BOOTTIME clocks, what can be done by adding per-namespace
offsets for clocks.

A time namespace is similar to a pid namespace in a way how it is
created: unshare(CLONE_NEWTIME) system call creates a new time namespace,
but doesn't set it to the current process. Then all children of
the process will be born in the new time namespace, or a process can
use the setns() system call to join a namespace.

This scheme allows setting clock offsets for a namespace, before any
processes appear in it.

Link: https://criu.org/Time_namespace
Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 MAINTAINERS                    |   2 +
 fs/proc/namespaces.c           |   4 +
 include/linux/nsproxy.h        |   2 +
 include/linux/proc_ns.h        |   2 +
 include/linux/time_namespace.h |  69 +++++++++++
 include/linux/user_namespace.h |   1 +
 include/uapi/linux/sched.h     |   1 +
 init/Kconfig                   |   7 ++
 kernel/Makefile                |   1 +
 kernel/fork.c                  |   3 +-
 kernel/nsproxy.c               |  41 +++++--
 kernel/time_namespace.c        | 215 +++++++++++++++++++++++++++++++++
 12 files changed, 340 insertions(+), 8 deletions(-)
 create mode 100644 include/linux/time_namespace.h
 create mode 100644 kernel/time_namespace.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 5c38f21aee78..b50720361de7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12396,6 +12396,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
 S:	Maintained
 F:	fs/timerfd.c
 F:	include/linux/timer*
+F:	include/linux/time_namespace.h
+F:	kernel/time_namespace.c
 F:	kernel/time/*timer*
 
 POWER MANAGEMENT CORE
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..8b5c720fe5d7 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = {
 #ifdef CONFIG_CGROUPS
 	&cgroupns_operations,
 #endif
+#ifdef CONFIG_TIME_NS
+	&timens_operations,
+	&timens_for_children_operations,
+#endif
 };
 
 static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 2ae1b1a4d84d..074f395b9ad2 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -35,6 +35,8 @@ struct nsproxy {
 	struct mnt_namespace *mnt_ns;
 	struct pid_namespace *pid_ns_for_children;
 	struct net 	     *net_ns;
+	struct time_namespace *time_ns;
+	struct time_namespace *time_ns_for_children;
 	struct cgroup_namespace *cgroup_ns;
 };
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb6215905..3e6f332da465 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations;
 extern const struct proc_ns_operations userns_operations;
 extern const struct proc_ns_operations mntns_operations;
 extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
 
 /*
  * We always define these enumerators
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
new file mode 100644
index 000000000000..9507ed7072fe
--- /dev/null
+++ b/include/linux/time_namespace.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMENS_H
+#define _LINUX_TIMENS_H
+
+
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <linux/ns_common.h>
+#include <linux/err.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct time_namespace {
+	struct kref kref;
+	struct user_namespace *user_ns;
+	struct ucounts *ucounts;
+	struct ns_common ns;
+	struct timens_offsets *offsets;
+	bool   initialized;
+} __randomize_layout;
+extern struct time_namespace init_time_ns;
+
+#ifdef CONFIG_TIME_NS
+static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
+{
+	kref_get(&ns->kref);
+	return ns;
+}
+
+extern struct time_namespace *copy_time_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct time_namespace *old_ns);
+extern void free_time_ns(struct kref *kref);
+extern int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
+
+static inline void put_time_ns(struct time_namespace *ns)
+{
+	kref_put(&ns->kref, free_time_ns);
+}
+
+
+#else
+static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
+{
+	return NULL;
+}
+
+static inline void put_time_ns(struct time_namespace *ns)
+{
+}
+
+static inline struct time_namespace *copy_time_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+	if (flags & CLONE_NEWTIME)
+		return ERR_PTR(-EINVAL);
+
+	return old_ns;
+}
+
+static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* _LINUX_TIMENS_H */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..bf84f93dc411 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -45,6 +45,7 @@ enum ucount_type {
 	UCOUNT_NET_NAMESPACES,
 	UCOUNT_MNT_NAMESPACES,
 	UCOUNT_CGROUP_NAMESPACES,
+	UCOUNT_TIME_NAMESPACES,
 #ifdef CONFIG_INOTIFY_USER
 	UCOUNT_INOTIFY_INSTANCES,
 	UCOUNT_INOTIFY_WATCHES,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 22627f80063e..adffac53c76e 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -10,6 +10,7 @@
 #define CLONE_FS	0x00000200	/* set if fs info shared between processes */
 #define CLONE_FILES	0x00000400	/* set if open files shared between processes */
 #define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+#define CLONE_NEWTIME	0x00001000	/* New time namespace */
 #define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
 #define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
 #define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
diff --git a/init/Kconfig b/init/Kconfig
index 4592bf7997c0..10eebeaadfaa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -982,6 +982,13 @@ config UTS_NS
 	  In this namespace tasks see different info provided with the
 	  uname() system call
 
+config TIME_NS
+	bool "TIME namespace"
+	default y
+	help
+	  In this namespace boottime and monotonic clocks can be set.
+	  The time will keep going with the same pace.
+
 config IPC_NS
 	bool "IPC namespace"
 	depends on (SYSVIPC || POSIX_MQUEUE)
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c57e78817da..91a3601b84d9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup/
 obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_TIME_NS) += time_namespace.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..326381a45d71 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2425,7 +2425,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
 	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
 				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
 				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
-				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+				CLONE_NEWTIME))
 		return -EINVAL;
 	/*
 	 * Not implemented, but pretend it works if there is nothing
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..586c9e2017dc 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,7 @@
 #include <linux/pid_namespace.h>
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/file.h>
 #include <linux/syscalls.h>
@@ -44,6 +45,10 @@ struct nsproxy init_nsproxy = {
 #ifdef CONFIG_CGROUPS
 	.cgroup_ns		= &init_cgroup_ns,
 #endif
+#ifdef CONFIG_TIME_NS
+	.time_ns		= &init_time_ns,
+	.time_ns_for_children	= &init_time_ns,
+#endif
 };
 
 static inline struct nsproxy *create_nsproxy(void)
@@ -110,8 +115,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		goto out_net;
 	}
 
+	new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+					tsk->nsproxy->time_ns_for_children);
+	if (IS_ERR(new_nsp->time_ns_for_children)) {
+		err = PTR_ERR(new_nsp->time_ns_for_children);
+		goto out_time;
+	}
+	new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
 	return new_nsp;
 
+out_time:
+	put_net(new_nsp->net_ns);
 out_net:
 	put_cgroup_ns(new_nsp->cgroup_ns);
 out_cgroup:
@@ -140,15 +155,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
+	int ret;
 
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
-			      CLONE_NEWCGROUP)))) {
-		get_nsproxy(old_ns);
-		return 0;
-	}
-
-	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+			      CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+		if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+			get_nsproxy(old_ns);
+			return 0;
+		}
+	} else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/*
@@ -166,6 +182,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	if (IS_ERR(new_ns))
 		return  PTR_ERR(new_ns);
 
+	ret = timens_on_fork(new_ns, tsk);
+	if (ret) {
+		free_nsproxy(new_ns);
+		return ret;
+	}
+
 	tsk->nsproxy = new_ns;
 	return 0;
 }
@@ -180,6 +202,10 @@ void free_nsproxy(struct nsproxy *ns)
 		put_ipc_ns(ns->ipc_ns);
 	if (ns->pid_ns_for_children)
 		put_pid_ns(ns->pid_ns_for_children);
+	if (ns->time_ns)
+		put_time_ns(ns->time_ns);
+	if (ns->time_ns_for_children)
+		put_time_ns(ns->time_ns_for_children);
 	put_cgroup_ns(ns->cgroup_ns);
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
@@ -196,7 +222,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 	int err = 0;
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
-			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+			       CLONE_NEWTIME)))
 		return 0;
 
 	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/time_namespace.c b/kernel/time_namespace.c
new file mode 100644
index 000000000000..8c600df9771d
--- /dev/null
+++ b/kernel/time_namespace.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/time_namespace.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+
+static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
+}
+
+static void dec_time_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
+}
+
+static struct time_namespace *create_time_ns(void)
+{
+	struct time_namespace *time_ns;
+
+	time_ns = kmalloc(sizeof(struct time_namespace), GFP_KERNEL);
+	if (time_ns) {
+		kref_init(&time_ns->kref);
+		time_ns->initialized = false;
+	}
+	return time_ns;
+}
+
+/*
+ * Clone a new ns copying @old_ns, setting refcount to 1
+ * @old_ns: namespace to clone
+ * Return the new ns or ERR_PTR.
+ */
+static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
+					  struct time_namespace *old_ns)
+{
+	struct time_namespace *ns;
+	struct ucounts *ucounts;
+	int err;
+
+	err = -ENOSPC;
+	ucounts = inc_time_namespaces(user_ns);
+	if (!ucounts)
+		goto fail;
+
+	err = -ENOMEM;
+	ns = create_time_ns();
+	if (!ns)
+		goto fail_dec;
+
+	err = ns_alloc_inum(&ns->ns);
+	if (err)
+		goto fail_free;
+
+	ns->ucounts = ucounts;
+	ns->ns.ops = &timens_operations;
+	ns->user_ns = get_user_ns(user_ns);
+	return ns;
+
+fail_free:
+	kfree(ns);
+fail_dec:
+	dec_time_namespaces(ucounts);
+fail:
+	return ERR_PTR(err);
+}
+
+/*
+ * Add a reference to old_ns, or clone it if @flags specify CLONE_NEWTIME.
+ * In latter case, changes to the time of this process won't be seen by parent,
+ * and vice versa.
+ */
+struct time_namespace *copy_time_ns(unsigned long flags,
+	struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+	if (!(flags & CLONE_NEWTIME))
+		return get_time_ns(old_ns);
+
+	return clone_time_ns(user_ns, old_ns);
+}
+
+void free_time_ns(struct kref *kref)
+{
+	struct time_namespace *ns;
+
+	ns = container_of(kref, struct time_namespace, kref);
+	dec_time_namespaces(ns->ucounts);
+	put_user_ns(ns->user_ns);
+	ns_free_inum(&ns->ns);
+	kfree(ns);
+}
+
+static struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+	return container_of(ns, struct time_namespace, ns);
+}
+
+static struct ns_common *timens_get(struct task_struct *task)
+{
+	struct time_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = nsproxy->time_ns;
+		get_time_ns(ns);
+	}
+	task_unlock(task);
+
+	return ns ? &ns->ns : NULL;
+}
+
+static struct ns_common *timens_for_children_get(struct task_struct *task)
+{
+	struct time_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = nsproxy->time_ns_for_children;
+		get_time_ns(ns);
+	}
+	task_unlock(task);
+
+	return ns ? &ns->ns : NULL;
+}
+
+static void timens_put(struct ns_common *ns)
+{
+	put_time_ns(to_time_ns(ns));
+}
+
+static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+	struct time_namespace *ns = to_time_ns(new);
+
+	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	get_time_ns(ns);
+	get_time_ns(ns);
+	put_time_ns(nsproxy->time_ns);
+	put_time_ns(nsproxy->time_ns_for_children);
+	nsproxy->time_ns = ns;
+	nsproxy->time_ns_for_children = ns;
+	ns->initialized = true;
+	return 0;
+}
+
+int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
+	struct time_namespace *ns = to_time_ns(nsc);
+
+	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
+		return 0;
+
+	get_time_ns(ns);
+	put_time_ns(nsproxy->time_ns);
+	nsproxy->time_ns = ns;
+	ns->initialized = true;
+
+	return 0;
+}
+
+static struct user_namespace *timens_owner(struct ns_common *ns)
+{
+	return to_time_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations timens_operations = {
+	.name		= "time",
+	.type		= CLONE_NEWTIME,
+	.get		= timens_get,
+	.put		= timens_put,
+	.install	= timens_install,
+	.owner		= timens_owner,
+};
+
+const struct proc_ns_operations timens_for_children_operations = {
+	.name		= "time_for_children",
+	.type		= CLONE_NEWTIME,
+	.get		= timens_for_children_get,
+	.put		= timens_put,
+	.install	= timens_install,
+	.owner		= timens_owner,
+};
+
+struct time_namespace init_time_ns = {
+	.kref = KREF_INIT(3),
+	.user_ns = &init_user_ns,
+	.ns.inum = PROC_UTS_INIT_INO,
+#ifdef CONFIG_UTS_NS
+	.ns.ops = &timens_operations,
+#endif
+};
+
+static int __init time_ns_init(void)
+{
+	return 0;
+}
+subsys_initcall(time_ns_init);
-- 
2.21.0

^ permalink raw reply related

* [PATCHv3 00/27] kernel: Introduce Time Namespace
From: Dmitry Safonov @ 2019-04-25 16:13 UTC (permalink / raw)
  To: linux-kernel
  Cc: Dmitry Safonov, Adrian Reber, Andrei Vagin, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov, Dmitry Safonov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jeff Dike,
	Oleg Nesterov, Pavel Emelyanov, Shuah Khan, Thomas Gleixner,
	Vincenzo Frascino, containers, criu, linux-api, x86, Andrei Vagin

Discussions around time namespace are there for a long time. The first
attempt to implement it was in 2006 by Jeff Dike. From that time, the
topic appears on and off in various discussions.

There are two main use cases for time namespaces:
1. change date and time inside a container;
2. adjust clocks for a container restored from a checkpoint.

“It seems like this might be one of the last major obstacles keeping
migration from being used in production systems, given that not all
containers and connections can be migrated as long as a time dependency
is capable of messing it up.” (by github.com/dav-ell)

The kernel provides access to several clocks: CLOCK_REALTIME,
CLOCK_MONOTONIC, CLOCK_BOOTTIME. Last two clocks are monotonous, but the
start points for them are not defined and are different for each
system. When a container is migrated from one node to another, all
clocks have to be restored into consistent states; in other words, they
have to continue running from the same points where they have been
dumped.

The main idea of this patch set is adding per-namespace offsets for
system clocks. When a process in a non-root time namespace requests
time of a clock, a namespace offset is added to the current value of
this clock and the sum is returned.

All offsets are placed on a separate page, this allows us to map it as
part of VVAR into user processes and use offsets from VDSO calls.

Now offsets are implemented for CLOCK_MONOTONIC and CLOCK_BOOTTIME
clocks.

v3: Major changes:

* Simplify two VDSO images by using static_branch() in vclock_gettime()
  Removes unwanted conflicts with generic VDSO movement patches and
  simplifies things by dropping too invasive linker magic.
  As an alternative to static_branch() we tested an attempt to introduce
  home-made dynamic patching called retcalls:
  https://github.com/0x7f454c46/linux/commit/4cc0180f6d65
  Considering some theoretical problems with toolchains, we decided to go
  with long well-tested nop-patching in static_branch(). Though, it was
  needed to provide backend for relative code.

* address Thomas' comments.
* add sanity checks for offsets:
  - the current clock time in a namespace has to be in [0, KTIME_MAX / 2).
    KTIME_MAX is divided by two here to be sure that the KTIME_MAX limit
    is still unreachable.
Link: https://lkml.org/lkml/2018/9/19/950
Link: https://lkml.org/lkml/2019/2/5/867

v2: There are two major changes:

* Two versions of the VDSO library to avoid a performance penalty for
  host tasks outside time namespace (as suggested by Andy and Thomas).

  As it has been discussed on timens RFC, adding a new conditional branch
  `if (inside_time_ns)` on VDSO for all processes is undesirable.
  It will add a penalty for everybody as branch predictor may mispredict
  the jump. Also there are instruction cache lines wasted on cmp/jmp.

  Those effects of introducing time namespace are very much unwanted
  having in mind how much work have been spent on micro-optimisation
  VDSO code.

  Addressing those problems, there are two versions of VDSO's .so:
  for host tasks (without any penalty) and for processes inside of time
  namespace with clk_to_ns() that subtracts offsets from host's time.


* Allow to set clock offsets for a namespace only before any processes
  appear in it.

  Now a time namespace looks similar to a pid namespace in a way how it is
  created: unshare(CLONE_NEWTIME) system call creates a new time namespace,
  but doesn't set it to the current process. Then all children of
  the process will be born in the new time namespace, or a process can
  use the setns() system call to join a namespace.

  This scheme allows to create a new time namespaces, set clock offsets
  and then populate the namespace with processes.

Our performance measurements show that the price of VDSO's clock_gettime()
in a child time namespace is about 8% with a hot CPU cache and about 90%
with a cold CPU cache. There is no performance regression for host
processes outside time namespace on those tests.

We wrote two small benchmarks. The first one gettime_perf.c calls
clock_gettime() in a loop for 3 seconds. It shows us performance with
a hot CPU cache (more clock_gettime() cycles - the better):

        | before     | CONFIG_TIME_NS=n | host        | inside timens
--------|------------|------------------|-------------|-------------
cycles  | 139887013  | 139453003        | 139899785   | 128792458
diff (%)| 100        | 99.7             | 100         | 92

The second one gettime_perf_cold.c calls rdtsc, clock_gettime(), rdtsc
and shows a difference of second and first rdtsc. We call this binary in
a loop 1000 times, get 1000 values and calculate MODE for them.
It should show us performance with a cold CPU cache
(lesser tsc per cycle - the better):

        | before     | CONFIG_TIME_NS=n | host        | inside timens
--------|------------|------------------|-------------|-------------
tsc     | 6748       | 6718             | 6862        | 12682
diff (%)| 100        | 99.6             | 101.7       | 188

The numbers gathered on Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz.

Cc: Adrian Reber <adrian@lisas.de>
Cc: Andrei Vagin <avagin@openvz.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: containers@lists.linux-foundation.org
Cc: criu@openvz.org
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org

v2: https://lore.kernel.org/lkml/20190206001107.16488-1-dima@arista.com/
RFC: https://lkml.kernel.org/r/20180919205037.9574-1-dima@arista.com/

Andrei Vagin (16):
  ns: Introduce Time Namespace
  timens: Add timens_offsets
  timens: Introduce CLOCK_MONOTONIC offsets
  timens: Introduce CLOCK_BOOTTIME offset
  timerfd/timens: Take into account ns clock offsets
  posix-timers/timens: Take into account clock offsets
  timens/kernel: Take into account timens clock offsets in
    clock_nanosleep
  x86/vdso: Add offsets page in vvar
  vdso: introduce timens_static_branch
  timens/fs/proc: Introduce /proc/pid/timens_offsets
  selftest/timens: Add a test for timerfd
  selftest/timens: Add a test for clock_nanosleep()
  selftest/timens: Add timer offsets test
  x86/vdso: Align VDSO functions by CPU L1 cache line
  selftests: Add a simple perf test for clock_gettime()
  selftest/timens: Check that a right vdso is mapped after fork and exec

Dmitry Safonov (11):
  timens: Shift /proc/uptime
  x86/vdso2c: Correct err messages on file opening
  x86/vdso2c: Convert iterator to unsigned
  x86/vdso/Makefile: Add vobjs32
  x86/vdso: Restrict splitting VVAR VMA
  x86/vdso: Rename vdso_image {.data=>.text}
  x86/vdso: Allocate timens vdso
  x86/vdso: Switch image on setns()/unshare()/clone()
  timens: Add align for timens_offsets
  selftest/timens: Add Time Namespace test for supported clocks
  selftest/timens: Add procfs selftest

 MAINTAINERS                                   |   3 +
 arch/Kconfig                                  |   5 +
 arch/x86/Kconfig                              |   1 +
 arch/x86/entry/vdso/Makefile                  |  16 +-
 arch/x86/entry/vdso/vclock_gettime.c          |  48 +++
 arch/x86/entry/vdso/vdso-layout.lds.S         |  10 +-
 arch/x86/entry/vdso/vdso2c.c                  |   7 +-
 arch/x86/entry/vdso/vdso2c.h                  |  24 +-
 arch/x86/entry/vdso/vma.c                     | 113 ++++++-
 arch/x86/include/asm/jump_label.h             |  14 +
 arch/x86/include/asm/vdso.h                   |  14 +-
 fs/proc/base.c                                | 101 ++++++
 fs/proc/namespaces.c                          |   4 +
 fs/proc/uptime.c                              |   3 +
 fs/timerfd.c                                  |   8 +-
 include/linux/hrtimer.h                       |   1 +
 include/linux/jump_label.h                    |   5 +
 include/linux/nsproxy.h                       |   2 +
 include/linux/proc_ns.h                       |   2 +
 include/linux/time_namespace.h                | 137 ++++++++
 include/linux/timens_offsets.h                |  18 +
 include/linux/user_namespace.h                |   1 +
 include/uapi/linux/sched.h                    |   1 +
 include/uapi/linux/time.h                     |   2 +
 init/Kconfig                                  |   8 +
 kernel/Makefile                               |   1 +
 kernel/fork.c                                 |   3 +-
 kernel/nsproxy.c                              |  41 ++-
 kernel/time/alarmtimer.c                      |  13 +-
 kernel/time/hrtimer.c                         |   3 +
 kernel/time/posix-stubs.c                     |  11 +-
 kernel/time/posix-timers.c                    |  27 +-
 kernel/time_namespace.c                       | 318 ++++++++++++++++++
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/timens/.gitignore     |   8 +
 tools/testing/selftests/timens/Makefile       |  12 +
 .../selftests/timens/clock_nanosleep.c        | 100 ++++++
 tools/testing/selftests/timens/config         |   1 +
 tools/testing/selftests/timens/exec.c         |  91 +++++
 tools/testing/selftests/timens/gettime_perf.c |  74 ++++
 .../selftests/timens/gettime_perf_cold.c      |  63 ++++
 tools/testing/selftests/timens/log.h          |  26 ++
 tools/testing/selftests/timens/procfs.c       | 142 ++++++++
 tools/testing/selftests/timens/timens.c       | 188 +++++++++++
 tools/testing/selftests/timens/timens.h       |  63 ++++
 tools/testing/selftests/timens/timer.c        | 116 +++++++
 tools/testing/selftests/timens/timerfd.c      | 127 +++++++
 47 files changed, 1928 insertions(+), 49 deletions(-)
 create mode 100644 include/linux/time_namespace.h
 create mode 100644 include/linux/timens_offsets.h
 create mode 100644 kernel/time_namespace.c
 create mode 100644 tools/testing/selftests/timens/.gitignore
 create mode 100644 tools/testing/selftests/timens/Makefile
 create mode 100644 tools/testing/selftests/timens/clock_nanosleep.c
 create mode 100644 tools/testing/selftests/timens/config
 create mode 100644 tools/testing/selftests/timens/exec.c
 create mode 100644 tools/testing/selftests/timens/gettime_perf.c
 create mode 100644 tools/testing/selftests/timens/gettime_perf_cold.c
 create mode 100644 tools/testing/selftests/timens/log.h
 create mode 100644 tools/testing/selftests/timens/procfs.c
 create mode 100644 tools/testing/selftests/timens/timens.c
 create mode 100644 tools/testing/selftests/timens/timens.h
 create mode 100644 tools/testing/selftests/timens/timer.c
 create mode 100644 tools/testing/selftests/timens/timerfd.c

-- 
2.21.0

^ permalink raw reply

* Re: [RFC PATCH v6 22/26] x86/cet/shstk: ELF header parsing of Shadow Stack
From: Dave Martin @ 2019-04-25 16:11 UTC (permalink / raw)
  To: Yu-cheng Yu
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190425153547.GG3567@e103592.cambridge.arm.com>

On Thu, Apr 25, 2019 at 04:35:48PM +0100, Dave Martin wrote:
> On Thu, Apr 25, 2019 at 08:14:52AM -0700, Yu-cheng Yu wrote:
> > On Thu, 2019-04-25 at 12:02 +0100, Dave Martin wrote:
> > > On Mon, Nov 19, 2018 at 01:48:05PM -0800, Yu-cheng Yu wrote:
> > > > Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> > > > to be enabled for the task.
> > > 
> > > What's the status of this series?  I don't see anything in linux-next
> > > yet.
> > > 
> > > For describing ELF features, Arm has recently adopted
> > > NT_GNU_PROPERTY_TYPE_0, with properties closely modelled on
> > > GNU_PROPERTY_X86_FEATURE_1_AND etc. [1]
> > > 
> > > So, arm64 will be need something like this patch for supporting new
> > > features (such as the Branch Target Identification feature of ARMv8.5-A
> > > [2]).
> > > 
> > > If this series isn't likely to merge soon, can we split this patch into
> > > generic and x86-specific parts and handle them separately?
> > > 
> > > It would be good to see the generic ELF note parsing move to common
> > > code -- I'll take a look and comment in more detail.
> > 
> > Yes, I will work on that.
> 
> Thanks.  I may try to hack something in the meantime based on your
> patch.
> 
> One other question: according to the draft spec at
> https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI, it
> looks like the .note.gnu.property section is supposed to be marked with
> SHF_ALLOC in object files.
> 
> I think that means that the linker will map it with a PT_LOAD entry in
> the program header table in addition to the PT_NOTE that describes the
> location of the note.  I need to check what the toolchain actually
> does.
> 
> If so, can we simply rely on the notes being already mapped, rather than
> needing to do additional I/O on the ELF file to fetch the notes?

[...]

BTW, it looks like this holds true for AArch64 (see below).

Providing this also works on other arches, I think we can just pick
PT_GNU_PROPERTY out of the program headers and rely on the
corresponding note being already mapped by the existing binfmt_elf
code.

Cheers
---Dave


--8<--

$ echo 'void f(void) { }' | \
	aarch64-linux-gnu-gcc -v -nostdlib -Wl,-ef \
		-mbranch-protection=standard -o /tmp/x -x c - && \
	aarch64-linux-gnu-readelf -nl /tmp/x

[...]

gcc version 9.0.1 20190425 (experimental) (GCC) 

[...]

GNU assembler version 2.32.51 (aarch64-linux-gnu) using BFD version (GNU Binutils) 2.32.51.20190425

[...]

Elf file type is EXEC (Executable file)
Entry point 0x400178
There are 5 program headers, starting at offset 64

Program Headers:
  Type           Offset             VirtAddr           PhysAddr
                 FileSiz            MemSiz              Flags  Align
  LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000
                 0x00000000000001c0 0x00000000000001c0  R E    0x10000
  NOTE           0x0000000000000158 0x0000000000400158 0x0000000000400158
                 0x0000000000000020 0x0000000000000020  R      0x8
  GNU_PROPERTY   0x0000000000000158 0x0000000000400158 0x0000000000400158
                 0x0000000000000020 0x0000000000000020  R      0x8
  GNU_EH_FRAME   0x0000000000000184 0x0000000000400184 0x0000000000400184
                 0x0000000000000014 0x0000000000000014  R      0x4
  GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
                 0x0000000000000000 0x0000000000000000  RW     0x10

 Section to Segment mapping:
  Segment Sections...
   00     .note.gnu.property .text .eh_frame_hdr .eh_frame 
   01     .note.gnu.property 
   02     .note.gnu.property 
   03     .eh_frame_hdr 
   04     

Displaying notes found in: .note.gnu.property
  Owner                 Data size	Description
  GNU                  0x00000010	NT_GNU_PROPERTY_TYPE_0
      Properties: AArch64 feature: BTI, PAC

^ permalink raw reply

* Re: [RFC PATCH v6 22/26] x86/cet/shstk: ELF header parsing of Shadow Stack
From: Dave Martin @ 2019-04-25 15:35 UTC (permalink / raw)
  To: Yu-cheng Yu
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <e7bbb51291434a9c8526d7b617929465d5784121.camel@intel.com>

On Thu, Apr 25, 2019 at 08:14:52AM -0700, Yu-cheng Yu wrote:
> On Thu, 2019-04-25 at 12:02 +0100, Dave Martin wrote:
> > On Mon, Nov 19, 2018 at 01:48:05PM -0800, Yu-cheng Yu wrote:
> > > Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> > > to be enabled for the task.
> > 
> > What's the status of this series?  I don't see anything in linux-next
> > yet.
> > 
> > For describing ELF features, Arm has recently adopted
> > NT_GNU_PROPERTY_TYPE_0, with properties closely modelled on
> > GNU_PROPERTY_X86_FEATURE_1_AND etc. [1]
> > 
> > So, arm64 will be need something like this patch for supporting new
> > features (such as the Branch Target Identification feature of ARMv8.5-A
> > [2]).
> > 
> > If this series isn't likely to merge soon, can we split this patch into
> > generic and x86-specific parts and handle them separately?
> > 
> > It would be good to see the generic ELF note parsing move to common
> > code -- I'll take a look and comment in more detail.
> 
> Yes, I will work on that.

Thanks.  I may try to hack something in the meantime based on your
patch.

One other question: according to the draft spec at
https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI, it
looks like the .note.gnu.property section is supposed to be marked with
SHF_ALLOC in object files.

I think that means that the linker will map it with a PT_LOAD entry in
the program header table in addition to the PT_NOTE that describes the
location of the note.  I need to check what the toolchain actually
does.

If so, can we simply rely on the notes being already mapped, rather than
needing to do additional I/O on the ELF file to fetch the notes?

[...]

Cheers
---Dave

^ permalink raw reply

* Re: [RFC PATCH v6 22/26] x86/cet/shstk: ELF header parsing of Shadow Stack
From: Yu-cheng Yu @ 2019-04-25 15:14 UTC (permalink / raw)
  To: Dave Martin
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190425110211.GZ3567@e103592.cambridge.arm.com>

On Thu, 2019-04-25 at 12:02 +0100, Dave Martin wrote:
> On Mon, Nov 19, 2018 at 01:48:05PM -0800, Yu-cheng Yu wrote:
> > Look in .note.gnu.property of an ELF file and check if Shadow Stack needs
> > to be enabled for the task.
> 
> What's the status of this series?  I don't see anything in linux-next
> yet.
> 
> For describing ELF features, Arm has recently adopted
> NT_GNU_PROPERTY_TYPE_0, with properties closely modelled on
> GNU_PROPERTY_X86_FEATURE_1_AND etc. [1]
> 
> So, arm64 will be need something like this patch for supporting new
> features (such as the Branch Target Identification feature of ARMv8.5-A
> [2]).
> 
> If this series isn't likely to merge soon, can we split this patch into
> generic and x86-specific parts and handle them separately?
> 
> It would be good to see the generic ELF note parsing move to common
> code -- I'll take a look and comment in more detail.

Yes, I will work on that.

> 
> [...]
> 
> > diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> > index 69c0f892e310..557ed0ba71c7 100644
> > --- a/arch/x86/include/asm/elf.h
> > +++ b/arch/x86/include/asm/elf.h
> > @@ -381,4 +381,9 @@ struct va_alignment {
> >  
> >  extern struct va_alignment va_align;
> >  extern unsigned long align_vdso_addr(unsigned long);
> > +
> > +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> > +extern int arch_setup_features(void *ehdr, void *phdr, struct file *file,
> > +			       bool interp);
> > +#endif
> >  #endif /* _ASM_X86_ELF_H */
> > diff --git a/arch/x86/include/uapi/asm/elf_property.h
> > b/arch/x86/include/uapi/asm/elf_property.h
> > new file mode 100644
> > index 000000000000..af361207718c
> > --- /dev/null
> > +++ b/arch/x86/include/uapi/asm/elf_property.h
> > @@ -0,0 +1,15 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _UAPI_ASM_X86_ELF_PROPERTY_H
> > +#define _UAPI_ASM_X86_ELF_PROPERTY_H
> > +
> > +/*
> > + * pr_type
> > + */
> > +#define GNU_PROPERTY_X86_FEATURE_1_AND (0xc0000002)
> > +
> > +/*
> > + * Bits for GNU_PROPERTY_X86_FEATURE_1_AND
> > + */
> > +#define GNU_PROPERTY_X86_FEATURE_1_SHSTK	(0x00000002)
> > +
> 
> Generally we seem to collect all ELF definitions in <linux/uapi/elf.h>,
> including arch-specific ones.

Agree.

> 
> Is a new header really needed here?
> 
> [...]
> 
> > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > index 54207327f98f..007ff0fbae84 100644
> > --- a/fs/binfmt_elf.c
> > +++ b/fs/binfmt_elf.c
> > @@ -1081,6 +1081,21 @@ static int load_elf_binary(struct linux_binprm *bprm)
> >  		goto out_free_dentry;
> >  	}
> >  
> > +#ifdef CONFIG_ARCH_HAS_PROGRAM_PROPERTIES
> > +	if (interpreter) {
> > +		retval = arch_setup_features(&loc->interp_elf_ex,
> > +					     interp_elf_phdata,
> > +					     interpreter, true);
> 
> Can we dummy no-op functions in the common headers to avoid this
> ifdeffery?  Logically all arches will always do this step, even if it's
> a no-op today.

Sure.

Thanks,

Yu-cheng

^ permalink raw reply

* [PATCH v18 3/3] Documentation/filesystems/proc.txt: add arch_status file
From: Aubrey Li @ 2019-04-25 14:32 UTC (permalink / raw)
  To: tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, dave.hansen, arjan, adobriyan, akpm, aubrey.li,
	linux-api, linux-kernel, Aubrey Li, Andy Lutomirski
In-Reply-To: <20190425143219.102258-1-aubrey.li@linux.intel.com>

Added /proc/<pid>/arch_status file, and added AVX512_elapsed_ms in
/proc/<pid>/arch_status. Report it in Documentation/filesystems/proc.txt

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linux API <linux-api@vger.kernel.org>
---
 Documentation/filesystems/proc.txt | 39 ++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 66cad5c86171..e8bc403d15df 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -45,6 +45,7 @@ Table of Contents
   3.9   /proc/<pid>/map_files - Information about memory mapped files
   3.10  /proc/<pid>/timerslack_ns - Task timerslack value
   3.11	/proc/<pid>/patch_state - Livepatch patch operation state
+  3.12	/proc/<pid>/arch_status - Task architecture specific information
 
   4	Configuring procfs
   4.1	Mount options
@@ -1948,6 +1949,44 @@ patched.  If the patch is being enabled, then the task has already been
 patched.  If the patch is being disabled, then the task hasn't been
 unpatched yet.
 
+3.12 /proc/<pid>/arch_status - task architecture specific status
+-------------------------------------------------------------------
+When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the
+architecture specific status of the task.
+
+Example
+-------
+ $ cat /proc/6753/arch_status
+ AVX512_elapsed_ms:      8
+
+Description
+-----------
+
+x86 specific entries:
+---------------------
+ AVX512_elapsed_ms:
+ ------------------
+  If AVX512 is supported on the machine, this entry shows the milliseconds
+  elapsed since the last time AVX512 usage was recorded. The recording
+  happens on a best effort basis when a task is scheduled out. This means
+  that the value depends on two factors:
+
+    1) The time which the task spent on the CPU without being scheduled
+       out. With CPU isolation and a single runnable task this can take
+       several seconds.
+
+    2) The time since the task was scheduled out last. Depending on the
+       reason for being scheduled out (time slice exhausted, syscall ...)
+       this can be arbitrary long time.
+
+  As a consequence the value cannot be considered precise and authoritative
+  information. The application which uses this information has to be aware
+  of the overall scenario on the system in order to determine whether a
+  task is a real AVX512 user or not.
+
+  A special value of '-1' indicates that no AVX512 usage was recorded, thus
+  the task is unlikely an AVX512 user, but depends on the workload and the
+  scheduling scenario, it also could be a false negative mentioned above.
 
 ------------------------------------------------------------------------------
 Configuring procfs
-- 
2.17.1

^ permalink raw reply related

* [PATCH v18 2/3] x86,/proc/pid/arch_status: Add AVX-512 usage elapsed time
From: Aubrey Li @ 2019-04-25 14:32 UTC (permalink / raw)
  To: tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, dave.hansen, arjan, adobriyan, akpm, aubrey.li,
	linux-api, linux-kernel, Aubrey Li, Andy Lutomirski
In-Reply-To: <20190425143219.102258-1-aubrey.li@linux.intel.com>

AVX-512 components use could cause core turbo frequency drop. So
it's useful to expose AVX-512 usage elapsed time as a heuristic hint
for the user space job scheduler to cluster the AVX-512 using tasks
together.

Tensorflow example:
$ while [ 1 ]; do cat /proc/tid/arch_status | grep AVX512; sleep 1; done
AVX512_elapsed_ms:      4
AVX512_elapsed_ms:      8
AVX512_elapsed_ms:      4

This means that 4 milliseconds have elapsed since the AVX512 usage
of tensorflow task was detected when the task was scheduled out.

Or:
$ cat /proc/tid/arch_status | grep AVX512
AVX512_elapsed_ms:      -1

The number '-1' indicates that no AVX512 usage recorded before
thus the task unlikely has frequency drop issue.

User space tools may want to further check by:

$ perf stat --pid <pid> -e core_power.lvl2_turbo_license -- sleep 1

 Performance counter stats for process id '3558':

     3,251,565,961      core_power.lvl2_turbo_license

       1.004031387 seconds time elapsed

Non-zero counter value confirms that the task causes frequency drop.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linux API <linux-api@vger.kernel.org>
---
 arch/x86/Kconfig             |  1 +
 arch/x86/kernel/fpu/xstate.c | 47 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5ad92419be19..d5a9c5ddd453 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -208,6 +208,7 @@ config X86
 	select USER_STACKTRACE_SUPPORT
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES		if PROC_FS
+	select PROC_PID_ARCH_STATUS		if PROC_FS
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d7432c2b1051..fcaaf21aa015 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -7,6 +7,8 @@
 #include <linux/cpu.h>
 #include <linux/mman.h>
 #include <linux/pkeys.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
 
 #include <asm/fpu/api.h>
 #include <asm/fpu/internal.h>
@@ -1243,3 +1245,48 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
 
 	return 0;
 }
+
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * Report the amount of time elapsed in millisecond since last AVX512
+ * use in the task.
+ */
+static void avx512_status(struct seq_file *m, struct task_struct *task)
+{
+	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
+	long delta;
+
+	if (!timestamp) {
+		/*
+		 * Report -1 if no AVX512 usage
+		 */
+		delta = -1;
+	} else {
+		delta = (long)(jiffies - timestamp);
+		/*
+		 * Cap to LONG_MAX if time difference > LONG_MAX
+		 */
+		if (delta < 0)
+			delta = LONG_MAX;
+		delta = jiffies_to_msecs(delta);
+	}
+
+	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
+	seq_putc(m, '\n');
+}
+
+/*
+ * Report architecture specific information
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	/*
+	 * Report AVX512 state if the processor and build option supported.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
+		avx512_status(m, task);
+
+	return 0;
+}
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
-- 
2.17.1

^ permalink raw reply related

* [PATCH v18 1/3] proc: add /proc/<pid>/arch_status
From: Aubrey Li @ 2019-04-25 14:32 UTC (permalink / raw)
  To: tglx, mingo, peterz, hpa
  Cc: ak, tim.c.chen, dave.hansen, arjan, adobriyan, akpm, aubrey.li,
	linux-api, linux-kernel, Aubrey Li, Andy Lutomirski

The architecture specific information of the running processes
could be useful to the userland. Add /proc/<pid>/arch_status
interface support to examine process architecture specific
information externally.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linux API <linux-api@vger.kernel.org>
---
 fs/proc/Kconfig         | 4 ++++
 fs/proc/base.c          | 6 ++++++
 include/linux/proc_fs.h | 9 +++++++++
 3 files changed, 19 insertions(+)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 817c02b13b1d..d80ebf19d5f1 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -97,3 +97,7 @@ config PROC_CHILDREN
 
 	  Say Y if you are running any user-space software which takes benefit from
 	  this interface. For example, rkt is such a piece of software.
+
+config PROC_PID_ARCH_STATUS
+	def_bool n
+	depends on PROC_FS
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..8c71eba47031 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3061,6 +3061,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_STACKLEAK_METRICS
 	ONE("stack_depth", S_IRUGO, proc_stack_depth),
 #endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3449,6 +3452,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 52a283ba0465..a705aa2d03f9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -75,6 +75,15 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 						    void *data);
 extern struct pid *tgid_pidfd_to_pid(const struct file *file);
 
+#ifdef CONFIG_PROC_PID_ARCH_STATUS
+/*
+ * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must
+ * provide proc_pid_arch_status() definition.
+ */
+int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task);
+#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
 #else /* CONFIG_PROC_FS */
 
 static inline void proc_root_init(void)
-- 
2.17.1

^ permalink raw reply related

* Re: [RFC PATCH for 5.2 10/10] rseq/selftests: mips: use break instruction for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-04-25 14:21 UTC (permalink / raw)
  To: Paul Burton
  Cc: Peter Zijlstra, Paul E . McKenney, Boqun Feng, linux-kernel,
	linux-api, Thomas Gleixner, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <20190424220609.4kryfcgsv46iu3ds@pburton-laptop>

----- On Apr 24, 2019, at 6:06 PM, Paul Burton paul.burton@mips.com wrote:

> Hi Mathieu,
> 
> On Wed, Apr 24, 2019 at 11:25:02AM -0400, Mathieu Desnoyers wrote:
>> diff --git a/tools/testing/selftests/rseq/rseq-mips.h
>> b/tools/testing/selftests/rseq/rseq-mips.h
>> index fe3eabcdcbe5..eb53a6adfbbb 100644
>> --- a/tools/testing/selftests/rseq/rseq-mips.h
>> +++ b/tools/testing/selftests/rseq/rseq-mips.h
>> @@ -7,7 +7,11 @@
>>   * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>>   */
>>  
>> -#define RSEQ_SIG	0x53053053
>> +/*
>> + * RSEQ_SIG uses the break instruction. The instruction pattern is
>> + *	0350000d        break    0x350
>> + */
>> +#define RSEQ_SIG	0x0350000d
> 
> My apologies for taking a while to get back to you on the various ISAs &
> endian issues here, but I think we'll want this to be something like:
> 
> #if defined(__nanomips__)
> # ifdef __MIPSEL__
> #  define RSEQ_SIG	0x03500010
> # else
> #  define RSEQ_SIG	0x00100350
> # endif
> #elif defined(__mips_micromips)
> # ifdef __MIPSEL__
> #  define RSEQ_SIG	0xd4070000
> # else
> #  define RSEQ_SIG	0x0000d407
> # endif
> #else
> # define RSEQ_SIG	0x0350000d
> #endif
> 
> For plain old MIPS the .word directive will be fine endian-wise, but for
> microMIPS & nanoMIPS we need to take into account that the instruction
> stream is encoded as 16b halfwords & swap those accordingly for little
> endian.

Considering that we have micromips and nanomips already, I guess something
along the lines of "picomips" is not that far away...

I've tried to figure out if we could find a way to have RSEQ_SIG left undefined
if it's not on the plain mips environment, but could not find anything that
would be #defined on plain mips, but #undefined on both micromips and nanomips.

What I'd like to do is e.g.:

#if defined(__nanomips__)
# ifdef __MIPSEL__
#  define RSEQ_SIG	0x03500010
# else
#  define RSEQ_SIG	0x00100350
# endif
#elif defined(__mips_micromips)
# ifdef __MIPSEL__
#  define RSEQ_SIG	0xd4070000
# else
#  define RSEQ_SIG	0x0000d407
# endif
#elif defined(__mips__)
# define RSEQ_SIG	0x0350000d
#else
/* Leave RSEQ_SIG as is. */
#endif

The idea here is to not allow code targeting future MIPS ISA to compile
with the wrong signature.

The delta between compiling without/with -mmicromips on a gcc-8 compiler
is only:

> #define __mips_micromips 1

Some interesting delta when compiling for plain little-endian mips with
gcc-8 compared to the nanomips compiler is:

< #define __mips__ 1
< #define _mips 1
< #define MIPSEL 1

> #define __nanomips__ 1

< #define __mips_isa_rev 2
> #define __mips_isa_rev 6

So let's say we have a picomips introduced in the future, can we rely
on it not defining __mips__ like the nanomips compiler does ? If so,
my "#elif defined(__mips__)" approach would indeed leave RSEQ_SIG undefined
as expected.

Thoughts ?

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH RESEND v5 0/5] namei: vfs flags to restrict path resolution
From: Adam Borowski @ 2019-04-25 13:22 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Kees Cook, Andy Lutomirski, Al Viro, Jeff Layton, J. Bruce Fields,
	Arnd Bergmann, David Howells, Eric Biederman, Jann Horn,
	Christian Brauner, David Drysdale, Tycho Andersen,
	Linux Containers, Linux FS Devel, Linux API, Andrew Morton,
	Alexei Starovoitov, Chanho Min, Oleg Nesterov, Aleksa Sarai
In-Reply-To: <20190424153806.64qkkmkudzodxnz2@yavin>

On Thu, Apr 25, 2019 at 01:38:06AM +1000, Aleksa Sarai wrote:
>  * openat(2) ignores unknown flags, meaning that old kernels will ignore
>    new programs trying to use O_THISROOT and might end up causing
>    security issues. Yes, it'd be trivial to check whether the new O_*
>    flags are supported at start-up, but I think a security feature
>    shouldn't have a foot-gun associated with it. In fact, I didn't know
>    openat(2) ignored unknown flags until I wrote this patchset -- I
>    doubt many other userspace developers do either.

For this reason, I propose every new syscall that has flags to follow a
bitmask scheme, where any flag assigned a bit in the upper half returns
EOPNOTSUPP when called on an old kernel.  That would allow defining which
flags can be safely ignored and which can't.

It otherwise takes major hacks to implement a fail-if-not-supported flag
while keeping compat with old kernels.  For example, for mmap(), MAP_SHARED
has been duplicated as MAP_SHARED_VALIDATE just to allow an unrelated flag
(MAP_SYNC) to fail on old kernels.


Meow!
-- 
⢀⣴⠾⠻⢶⣦⠀ 
⣾⠁⢰⠒⠀⣿⡁ Imagine there are bandits in your house, your kid is bleeding out,
⢿⡄⠘⠷⠚⠋⠀ the house is on fire, and seven giant trumpets are playing in the
⠈⠳⣄⠀⠀⠀⠀ sky.  Your cat demands food.  The priority should be obvious...

^ permalink raw reply

* Re: [PATCH V2] mm: Allow userland to request that the kernel clear memory on release
From: Jann Horn @ 2019-04-25 12:42 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Matthew Garrett, Linux-MM, kernel list, Matthew Garrett,
	Linux API
In-Reply-To: <20190425121410.GC1144@dhcp22.suse.cz>

On Thu, Apr 25, 2019 at 2:14 PM Michal Hocko <mhocko@kernel.org> wrote:
[...]
> On Wed 24-04-19 14:10:39, Matthew Garrett wrote:
> > From: Matthew Garrett <mjg59@google.com>
> >
> > Applications that hold secrets and wish to avoid them leaking can use
> > mlock() to prevent the page from being pushed out to swap and
> > MADV_DONTDUMP to prevent it from being included in core dumps. Applications
> > can also use atexit() handlers to overwrite secrets on application exit.
> > However, if an attacker can reboot the system into another OS, they can
> > dump the contents of RAM and extract secrets. We can avoid this by setting
> > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the
> > firmware wipe the contents of RAM before booting another OS, but this means
> > rebooting takes a *long* time - the expected behaviour is for a clean
> > shutdown to remove the request after scrubbing secrets from RAM in order to
> > avoid this.
> >
> > Unfortunately, if an application exits uncleanly, its secrets may still be
> > present in RAM. This can't be easily fixed in userland (eg, if the OOM
> > killer decides to kill a process holding secrets, we're not going to be able
> > to avoid that), so this patch adds a new flag to madvise() to allow userland
> > to request that the kernel clear the covered pages whenever the page
> > reference count hits zero. Since vm_flags is already full on 32-bit, it
> > will only work on 64-bit systems.
[...]
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 21a7881a2db4..989c2fde15cf 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma,
> >       case MADV_KEEPONFORK:
> >               new_flags &= ~VM_WIPEONFORK;
> >               break;
> > +     case MADV_WIPEONRELEASE:
> > +             /* MADV_WIPEONRELEASE is only supported on anonymous memory. */
> > +             if (VM_WIPEONRELEASE == 0 || vma->vm_file ||
> > +                 vma->vm_flags & VM_SHARED) {
> > +                     error = -EINVAL;
> > +                     goto out;
> > +             }
> > +             new_flags |= VM_WIPEONRELEASE;
> > +             break;

An interesting effect of this is that it will be possible to set this
on a CoW anon VMA in a fork() child, and then the semantics in the
parent will be subtly different - e.g. if the parent vmsplice()d a
CoWed page into a pipe, then forked an unprivileged child, the child
set MADV_WIPEONRELEASE on its VMA, the parent died somehow, and then
the child died, the page in the pipe would be zeroed out. A child
should not be able to affect its parent like this, I think. If this
was an mmap() flag instead of a madvise() command, that issue could be
avoided. Alternatively, if adding more mmap() flags doesn't work,
perhaps you could scan the VMA and ensure that it contains no pages
yet, or something like that?

> > diff --git a/mm/memory.c b/mm/memory.c
> > index ab650c21bccd..ff78b527660e 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> >                       page_remove_rmap(page, false);
> >                       if (unlikely(page_mapcount(page) < 0))
> >                               print_bad_pte(vma, addr, ptent, page);
> > +                     if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) &&
> > +                         page_mapcount(page) == 0)
> > +                             clear_highpage(page);
> >                       if (unlikely(__tlb_remove_page(tlb, page))) {
> >                               force_flush = 1;
> >                               addr += PAGE_SIZE;

Should something like this perhaps be added in page_remove_rmap()
instead? That's where the mapcount is decremented; and looking at
other callers of page_remove_rmap(), in particular the following ones
look interesting:

 - do_huge_pmd_wp_page()/do_huge_pmd_wp_page_fallback() might be
relevant in the case where a forking process contains transparent
hugepages?
 - zap_huge_pmd() is relevant when transparent hugepages are used, I
think (otherwise transparent hugepages might not be wiped?)
 - there are various callers related to migration; I think this is
relevant on a NUMA system where memory is moved between nodes to
improve locality (moving memory to a new page and freeing the old one,
in which case you'd want to wipe the old page)

I think all the callers have a reference to the VMA, so perhaps you
could add a VMA parameter to page_remove_rmap() and then look at the
VMA in there?

^ permalink raw reply

* Re: [PATCH V2] mm: Allow userland to request that the kernel clear memory on release
From: Vlastimil Babka @ 2019-04-25 12:40 UTC (permalink / raw)
  To: Michal Hocko, Matthew Garrett
  Cc: linux-mm, linux-kernel, Matthew Garrett, linux-api
In-Reply-To: <20190425121410.GC1144@dhcp22.suse.cz>

On 4/25/19 2:14 PM, Michal Hocko wrote:
> Please cc linux-api for user visible API proposals (now done). Keep the
> rest of the email intact for reference.
> 
> On Wed 24-04-19 14:10:39, Matthew Garrett wrote:
>> From: Matthew Garrett <mjg59@google.com>
>>
>> Applications that hold secrets and wish to avoid them leaking can use
>> mlock() to prevent the page from being pushed out to swap and
>> MADV_DONTDUMP to prevent it from being included in core dumps. Applications

So, do we really need a new madvise() flag and VMA flag, or can we just
infer this page clearing from mlock+MADV_DONTDUMP being both applied?

^ permalink raw reply

* Re: [PATCH V2] mm: Allow userland to request that the kernel clear memory on release
From: Michal Hocko @ 2019-04-25 12:37 UTC (permalink / raw)
  To: Matthew Garrett; +Cc: linux-mm, linux-kernel, Matthew Garrett, linux-api
In-Reply-To: <20190425121410.GC1144@dhcp22.suse.cz>

On Thu 25-04-19 14:14:10, Michal Hocko wrote:
> Please cc linux-api for user visible API proposals (now done). Keep the
> rest of the email intact for reference.
> 
> On Wed 24-04-19 14:10:39, Matthew Garrett wrote:
> > From: Matthew Garrett <mjg59@google.com>
> > 
> > Applications that hold secrets and wish to avoid them leaking can use
> > mlock() to prevent the page from being pushed out to swap and
> > MADV_DONTDUMP to prevent it from being included in core dumps. Applications
> > can also use atexit() handlers to overwrite secrets on application exit.
> > However, if an attacker can reboot the system into another OS, they can
> > dump the contents of RAM and extract secrets. We can avoid this by setting
> > CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the
> > firmware wipe the contents of RAM before booting another OS, but this means
> > rebooting takes a *long* time - the expected behaviour is for a clean
> > shutdown to remove the request after scrubbing secrets from RAM in order to
> > avoid this.
> > 
> > Unfortunately, if an application exits uncleanly, its secrets may still be
> > present in RAM. This can't be easily fixed in userland (eg, if the OOM
> > killer decides to kill a process holding secrets, we're not going to be able
> > to avoid that), so this patch adds a new flag to madvise() to allow userland
> > to request that the kernel clear the covered pages whenever the page
> > reference count hits zero. Since vm_flags is already full on 32-bit, it
> > will only work on 64-bit systems.

The changelog seems stale. You are hooking into unmap path where the
reference count might be still > 0 and the page still held by somebody.
A previous email from Willy said
"
It could be the target/source of direct I/O, or userspace could have
registered it with an RDMA device, or ...

It depends on the semantics you want.  There's no legacy code to
worry about here.  I was seeing this as the equivalent of an atexit()
handler; userspace is saying "When this page is unmapped, zero it".
So it doesn't matter that somebody else might be able to reference it --
userspace could have zeroed it themselves.
"

I am not sure this is really a bullet proof argumentation but it should
definitely be part of the changelog.

Besides that you inherently assume that the user would do mlock because
you do not try to wipe the swap content. Is this intentional?

Another question would be regarding the targeted user API. There are
some attempts to make all the freed memory to be zeroed/poisoned. Are
users who would like to use this feature also be interested in using
system wide setting as well?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH V2] mm: Allow userland to request that the kernel clear memory on release
From: Michal Hocko @ 2019-04-25 12:14 UTC (permalink / raw)
  To: Matthew Garrett; +Cc: linux-mm, linux-kernel, Matthew Garrett, linux-api
In-Reply-To: <20190424211038.204001-1-matthewgarrett@google.com>

Please cc linux-api for user visible API proposals (now done). Keep the
rest of the email intact for reference.

On Wed 24-04-19 14:10:39, Matthew Garrett wrote:
> From: Matthew Garrett <mjg59@google.com>
> 
> Applications that hold secrets and wish to avoid them leaking can use
> mlock() to prevent the page from being pushed out to swap and
> MADV_DONTDUMP to prevent it from being included in core dumps. Applications
> can also use atexit() handlers to overwrite secrets on application exit.
> However, if an attacker can reboot the system into another OS, they can
> dump the contents of RAM and extract secrets. We can avoid this by setting
> CONFIG_RESET_ATTACK_MITIGATION on UEFI systems in order to request that the
> firmware wipe the contents of RAM before booting another OS, but this means
> rebooting takes a *long* time - the expected behaviour is for a clean
> shutdown to remove the request after scrubbing secrets from RAM in order to
> avoid this.
> 
> Unfortunately, if an application exits uncleanly, its secrets may still be
> present in RAM. This can't be easily fixed in userland (eg, if the OOM
> killer decides to kill a process holding secrets, we're not going to be able
> to avoid that), so this patch adds a new flag to madvise() to allow userland
> to request that the kernel clear the covered pages whenever the page
> reference count hits zero. Since vm_flags is already full on 32-bit, it
> will only work on 64-bit systems.
> 
> Signed-off-by: Matthew Garrett <mjg59@google.com>
> ---
> 
> Modified to wipe when the VMA is released rather than on page freeing
> 
>  include/linux/mm.h                     |  6 ++++++
>  include/uapi/asm-generic/mman-common.h |  2 ++
>  mm/madvise.c                           | 21 +++++++++++++++++++++
>  mm/memory.c                            |  3 +++
>  4 files changed, 32 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6b10c21630f5..64bdab679275 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -257,6 +257,8 @@ extern unsigned int kobjsize(const void *objp);
>  #define VM_HIGH_ARCH_2	BIT(VM_HIGH_ARCH_BIT_2)
>  #define VM_HIGH_ARCH_3	BIT(VM_HIGH_ARCH_BIT_3)
>  #define VM_HIGH_ARCH_4	BIT(VM_HIGH_ARCH_BIT_4)
> +
> +#define VM_WIPEONRELEASE BIT(37)       /* Clear pages when releasing them */
>  #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>  
>  #ifdef CONFIG_ARCH_HAS_PKEYS
> @@ -298,6 +300,10 @@ extern unsigned int kobjsize(const void *objp);
>  # define VM_GROWSUP	VM_NONE
>  #endif
>  
> +#ifndef VM_WIPEONRELEASE
> +# define VM_WIPEONRELEASE VM_NONE
> +#endif
> +
>  /* Bits set in the VMA until the stack is in its final location */
>  #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
>  
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index abd238d0f7a4..82dfff4a8e3d 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -64,6 +64,8 @@
>  #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
>  #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
>  
> +#define MADV_WIPEONRELEASE 20
> +#define MADV_DONTWIPEONRELEASE 21
>  /* compatibility flags */
>  #define MAP_FILE	0
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 21a7881a2db4..989c2fde15cf 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -92,6 +92,22 @@ static long madvise_behavior(struct vm_area_struct *vma,
>  	case MADV_KEEPONFORK:
>  		new_flags &= ~VM_WIPEONFORK;
>  		break;
> +	case MADV_WIPEONRELEASE:
> +		/* MADV_WIPEONRELEASE is only supported on anonymous memory. */
> +		if (VM_WIPEONRELEASE == 0 || vma->vm_file ||
> +		    vma->vm_flags & VM_SHARED) {
> +			error = -EINVAL;
> +			goto out;
> +		}
> +		new_flags |= VM_WIPEONRELEASE;
> +		break;
> +	case MADV_DONTWIPEONRELEASE:
> +		if (VM_WIPEONRELEASE == 0) {
> +			error = -EINVAL;
> +			goto out;
> +		}
> +		new_flags &= ~VM_WIPEONRELEASE;
> +		break;
>  	case MADV_DONTDUMP:
>  		new_flags |= VM_DONTDUMP;
>  		break;
> @@ -727,6 +743,8 @@ madvise_behavior_valid(int behavior)
>  	case MADV_DODUMP:
>  	case MADV_WIPEONFORK:
>  	case MADV_KEEPONFORK:
> +	case MADV_WIPEONRELEASE:
> +	case MADV_DONTWIPEONRELEASE:
>  #ifdef CONFIG_MEMORY_FAILURE
>  	case MADV_SOFT_OFFLINE:
>  	case MADV_HWPOISON:
> @@ -785,6 +803,9 @@ madvise_behavior_valid(int behavior)
>   *  MADV_DONTDUMP - the application wants to prevent pages in the given range
>   *		from being included in its core dump.
>   *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
> + *  MADV_WIPEONRELEASE - clear the contents of the memory after the last
> + *		reference to it has been released
> + *  MADV_DONTWIPEONRELEASE - cancel MADV_WIPEONRELEASE
>   *
>   * return values:
>   *  zero    - success
> diff --git a/mm/memory.c b/mm/memory.c
> index ab650c21bccd..ff78b527660e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1091,6 +1091,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>  			page_remove_rmap(page, false);
>  			if (unlikely(page_mapcount(page) < 0))
>  				print_bad_pte(vma, addr, ptent, page);
> +			if (unlikely(vma->vm_flags & VM_WIPEONRELEASE) &&
> +			    page_mapcount(page) == 0)
> +				clear_highpage(page);
>  			if (unlikely(__tlb_remove_page(tlb, page))) {
>  				force_flush = 1;
>  				addr += PAGE_SIZE;
> -- 
> 2.21.0.593.g511ec345e18-goog

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox