* [PATCHv7 20/33] x86/vdso: Provide vdso_data offset on vvar_page
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Dmitry Safonov, Adrian Reber, Andrei Vagin,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
VDSO support for Time namespace needs to set up a page with the same
layout as VVAR. That timens page will be placed on position of VVAR page
inside namespace. That page has vdso_data->seq set to 1 to enforce
the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce
the time namespace handling path.
To prepare timens page kernel needs to know the vdso_data offset.
Provide arch_get_vdso_data() helper for locating vdso_data on VVAR page.
Co-developed-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
arch/x86/entry/vdso/vdso-layout.lds.S | 2 --
arch/x86/entry/vdso/vma.c | 11 +++++++++++
arch/x86/include/asm/vvar.h | 8 ++++----
arch/x86/kernel/vmlinux.lds.S | 4 +---
include/linux/time_namespace.h | 1 +
5 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index 93c6dc7812d0..2330daad67c3 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -21,9 +21,7 @@ SECTIONS
/* Place all vvars at the offsets in asm/vvar.h. */
#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
-#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
pvclock_page = vvar_start + PAGE_SIZE;
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 000db8282cc8..5dab706aca2e 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -24,6 +24,17 @@
#include <asm/cpufeature.h>
#include <clocksource/hyperv_timer.h>
+#undef _ASM_X86_VVAR_H
+#define EMIT_VVAR(name, offset) \
+ const size_t name ## _offset = offset;
+#include <asm/vvar.h>
+
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+ return (struct vdso_data *)(vvar_page + _vdso_data_offset);
+}
+#undef EMIT_VVAR
+
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 32f5d9a0b90e..ff2de3025388 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -19,10 +19,10 @@
#ifndef _ASM_X86_VVAR_H
#define _ASM_X86_VVAR_H
-#if defined(__VVAR_KERNEL_LDS)
-
-/* The kernel linker script defines its own magic to put vvars in the
- * right place.
+#ifdef EMIT_VVAR
+/*
+ * EMIT_VVAR() is used by the kernel linker script to put vvars in the
+ * right place. Also, it's used by kernel code to import offsets values.
*/
#define DECLARE_VVAR(offset, type, name) \
EMIT_VVAR(name, offset)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e2feacf921a0..ca02d0d301cd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -195,12 +195,10 @@ SECTIONS
__vvar_beginning_hack = .;
/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) \
+#define EMIT_VVAR(name, offset) \
. = __vvar_beginning_hack + offset; \
*(.vvar_ ## name)
-#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
/*
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 9a77d3854830..772911945944 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -37,6 +37,7 @@ extern struct time_namespace *copy_time_ns(unsigned long flags,
struct user_namespace *user_ns, struct time_namespace *old_ns);
extern void free_time_ns(struct kref *kref);
extern int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
+extern struct vdso_data *arch_get_vdso_data(void *vvar_page);
static inline void put_time_ns(struct time_namespace *ns)
{
--
2.23.0
^ permalink raw reply related
* [PATCHv7 19/33] lib/vdso: Prepare for time namespace support
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Thomas Gleixner, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Vincenzo Frascino, containers, criu, linux-api, x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Thomas Gleixner <tglx@linutronix.de>
To support time namespaces in the vdso with a minimal impact on regular non
time namespace affected tasks, the namespace handling needs to be hidden in
a slow path.
The most obvious place is vdso_seq_begin(). If a task belongs to a time
namespace then the VVAR page which contains the system wide vdso data is
replaced with a namespace specific page which has the same layout as the
VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path
and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time
namespace handling path.
The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
update of the vdso data is in progress, is not really affecting regular
tasks which are not part of a time namespace as the task is spin waiting
for the update to finish and vdso_data->seq to become even again.
If a time namespace task hits that code path, it invokes the corresponding
time getter function which retrieves the real VVAR page, reads host time
and then adds the offset for the requested clock which is stored in the
special VVAR page.
If VDSO time namespace support is disabled the whole magic is compiled out.
Initial testing shows that the disabled case is almost identical to the
host case which does not take the slow timens path. With the special timens
page installed the performance hit is constant time and in the range of
5-7%.
For the vdso functions which are not using the sequence count an
unconditional check for vdso_data->clock_mode is added which switches to
the real vdso when the clock_mode is VCLOCK_TIMENS.
Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/linux/time.h | 6 ++
include/vdso/datapage.h | 19 +++++-
lib/vdso/Kconfig | 6 ++
lib/vdso/gettimeofday.c | 128 +++++++++++++++++++++++++++++++++++++++-
4 files changed, 155 insertions(+), 4 deletions(-)
diff --git a/include/linux/time.h b/include/linux/time.h
index 27d83fd2ae61..b1a592638d7d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -96,4 +96,10 @@ static inline bool itimerspec64_valid(const struct itimerspec64 *its)
*/
#define time_after32(a, b) ((s32)((u32)(b) - (u32)(a)) < 0)
#define time_before32(b, a) time_after32(a, b)
+
+struct timens_offset {
+ s64 sec;
+ u64 nsec;
+};
+
#endif
diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 2e302c0f41f7..65a38acce27e 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -21,6 +21,8 @@
#define CS_RAW 1
#define CS_BASES (CS_RAW + 1)
+#define VCLOCK_TIMENS UINT_MAX
+
/**
* struct vdso_timestamp - basetime per clock_id
* @sec: seconds
@@ -48,6 +50,7 @@ struct vdso_timestamp {
* @mult: clocksource multiplier
* @shift: clocksource shift
* @basetime[clock_id]: basetime per clock_id
+ * @offset[clock_id]: time namespace offset per clock_id
* @tz_minuteswest: minutes west of Greenwich
* @tz_dsttime: type of DST correction
* @hrtimer_res: hrtimer resolution
@@ -55,6 +58,17 @@ struct vdso_timestamp {
*
* vdso_data will be accessed by 64 bit and compat code at the same time
* so we should be careful before modifying this structure.
+ *
+ * @basetime is used to store the base time for the system wide time getter
+ * VVAR page.
+ *
+ * @offset is used by the special time namespace VVAR pages which are
+ * installed instead of the real VVAR page. These namespace pages must set
+ * @seq to 1 and @clock_mode to VLOCK_TIMENS to force the code into the
+ * time namespace slow path. The namespace aware functions retrieve the
+ * real system wide VVAR page, read host time and add the per clock offset.
+ * For clocks which are not affected by time namespace adjustement the
+ * offset must be zero.
*/
struct vdso_data {
u32 seq;
@@ -65,7 +79,10 @@ struct vdso_data {
u32 mult;
u32 shift;
- struct vdso_timestamp basetime[VDSO_BASES];
+ union {
+ struct vdso_timestamp basetime[VDSO_BASES];
+ struct timens_offset offset[VDSO_BASES];
+ };
s32 tz_minuteswest;
s32 tz_dsttime;
diff --git a/lib/vdso/Kconfig b/lib/vdso/Kconfig
index 9fe698ff62ec..85276de70dba 100644
--- a/lib/vdso/Kconfig
+++ b/lib/vdso/Kconfig
@@ -24,4 +24,10 @@ config GENERIC_COMPAT_VDSO
help
This config option enables the compat VDSO layer.
+config VDSO_TIMENS
+ bool
+ help
+ Selected by architectures which support time namespaces in the
+ VDSO
+
endif
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index e630e7ff57f1..25244b677823 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,6 +38,51 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#endif
+#ifndef CONFIG_VDSO_TIMENS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(void)
+{
+ return NULL;
+}
+#endif
+
+static int do_hres_timens(const struct vdso_data *vdns, clockid_t clk,
+ struct __kernel_timespec *ts)
+{
+ const struct vdso_data *vd = __arch_get_timens_vdso_data();
+ const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+ const struct timens_offset *offs = &vdns->offset[clk];
+ u64 cycles, last, ns;
+ s64 sec;
+ u32 seq;
+
+ do {
+ seq = vdso_read_begin(vd);
+ cycles = __arch_get_hw_counter(vd->clock_mode);
+ ns = vdso_ts->nsec;
+ last = vd->cycle_last;
+ if (unlikely((s64)cycles < 0))
+ return -1;
+
+ ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
+ ns >>= vd->shift;
+ sec = vdso_ts->sec;
+ } while (unlikely(vdso_read_retry(vd, seq)));
+
+ /* Add the namespace offset */
+ sec += offs->sec;
+ ns += offs->nsec;
+
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return 0;
+}
+
static int do_hres(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
@@ -46,7 +91,28 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk,
u32 seq;
do {
- seq = vdso_read_begin(vd);
+ /*
+ * Open coded to handle VCLOCK_TIMENS. Time namespace
+ * enabled tasks have a special VVAR page installed which
+ * has vd->seq set to 1 and vd->clock_mode set to
+ * VCLOCK_TIMENS. For non time namespace affected tasks
+ * this does not affect performance because if vd->seq is
+ * odd, i.e. a concurrent update is in progress the extra
+ * check for vd->clock_mode is just a few extra
+ * instructions while spin waiting for vd->seq to become
+ * even again.
+ */
+ while (1) {
+ seq = READ_ONCE(vd->seq);
+ if (likely(!(seq & 1)))
+ break;
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS)
+ return do_hres_timens(vd, clk, ts);
+ cpu_relax();
+ }
+ smp_rmb();
+
cycles = __arch_get_hw_counter(vd->clock_mode);
ns = vdso_ts->nsec;
last = vd->cycle_last;
@@ -68,6 +134,34 @@ static int do_hres(const struct vdso_data *vd, clockid_t clk,
return 0;
}
+static void do_coarse_timens(const struct vdso_data *vdns, clockid_t clk,
+ struct __kernel_timespec *ts)
+{
+ const struct vdso_data *vd = __arch_get_timens_vdso_data();
+ const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
+ const struct timens_offset *offs = &vdns->offset[clk];
+ u64 nsec;
+ s64 sec;
+ s32 seq;
+
+ do {
+ seq = vdso_read_begin(vd);
+ sec = vdso_ts->sec;
+ nsec = vdso_ts->nsec;
+ } while (unlikely(vdso_read_retry(vd, seq)));
+
+ /* Add the namespace offset */
+ sec += offs->sec;
+ nsec += offs->nsec;
+
+ /*
+ * Do this outside the loop: a race inside the loop could result
+ * in __iter_div_u64_rem() being extremely slow.
+ */
+ ts->tv_sec = sec + __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ ts->tv_nsec = nsec;
+}
+
static void do_coarse(const struct vdso_data *vd, clockid_t clk,
struct __kernel_timespec *ts)
{
@@ -75,7 +169,23 @@ static void do_coarse(const struct vdso_data *vd, clockid_t clk,
u32 seq;
do {
- seq = vdso_read_begin(vd);
+ /*
+ * Open coded to handle VCLOCK_TIMENS. See comment in
+ * do_hres().
+ */
+ while (1) {
+ seq = READ_ONCE(vd->seq);
+ if (likely(!(seq & 1)))
+ break;
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS) {
+ do_coarse_timens(vd, clk, ts);
+ return;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+
ts->tv_sec = vdso_ts->sec;
ts->tv_nsec = vdso_ts->nsec;
} while (unlikely(vdso_read_retry(vd, seq)));
@@ -156,6 +266,10 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
}
if (unlikely(tz != NULL)) {
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) &&
+ vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data();
+
tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest;
tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime;
}
@@ -167,7 +281,12 @@ __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
static __maybe_unused time_t __cvdso_time(time_t *time)
{
const struct vdso_data *vd = __arch_get_vdso_data();
- time_t t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
+ time_t t;
+
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data();
+
+ t = READ_ONCE(vd[CS_HRES_COARSE].basetime[CLOCK_REALTIME].sec);
if (time)
*time = t;
@@ -189,6 +308,9 @@ int __cvdso_clock_getres_common(clockid_t clock, struct __kernel_timespec *res)
if (unlikely((u32) clock >= MAX_CLOCKS))
return -1;
+ if (IS_ENABLED(CONFIG_VDSO_TIMENS) && vd->clock_mode == VCLOCK_TIMENS)
+ vd = __arch_get_timens_vdso_data();
+
hrtimer_res = READ_ONCE(vd[CS_HRES_COARSE].hrtimer_res);
/*
* Convert the clockid to a bitmask and use it to check which
--
2.23.0
^ permalink raw reply related
* [PATCHv7 18/33] lib/vdso: Add unlikely() hint into vdso_read_begin()
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
Place the branch with no concurrent write before contended case.
Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
(more clock_gettime() cycles - the better):
| before | after
-----------------------------------
| 150252214 | 153242367
| 150301112 | 153324800
| 150392773 | 153125401
| 150373957 | 153399355
| 150303157 | 153489417
| 150365237 | 153494270
-----------------------------------
avg | 150331408 | 153345935
diff % | 2 | 0
-----------------------------------
stdev % | 0.3 | 0.1
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/vdso/helpers.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h
index 01641dbb68ef..9a2af9fca45e 100644
--- a/include/vdso/helpers.h
+++ b/include/vdso/helpers.h
@@ -10,7 +10,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_data *vd)
{
u32 seq;
- while ((seq = READ_ONCE(vd->seq)) & 1)
+ while (unlikely((seq = READ_ONCE(vd->seq)) & 1))
cpu_relax();
smp_rmb();
--
2.23.0
^ permalink raw reply related
* [PATCHv7 17/33] x86/vdso: Restrict splitting VVAR VMA
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Dmitry Safonov, Adrian Reber, Andrei Vagin,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
Forbid splitting VVAR resulting in stricter ABI and reducing amount
of corner-cases to consider while working further on VDSO.
As offset from timens to VVAR page is computed compile-time,
the pages in VVAR should stay together and not being partically
mremap()'ed.
Co-developed-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
arch/x86/entry/vdso/vma.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index f5937742b290..000db8282cc8 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -84,6 +84,18 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
+static int vvar_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+ const struct vdso_image *image = new_vma->vm_mm->context.vdso_image;
+
+ if (new_size != -image->sym_vvar_start)
+ return -EINVAL;
+
+ return 0;
+}
+
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
@@ -136,6 +148,7 @@ static const struct vm_special_mapping vdso_mapping = {
static const struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.fault = vvar_fault,
+ .mremap = vvar_mremap,
};
/*
--
2.23.0
^ permalink raw reply related
* [PATCHv7 16/33] fs/proc: Respect boottime inside time namespace for /proc/uptime
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Dmitry Safonov, Adrian Reber, Andrei Vagin,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
Co-developed-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
fs/proc/uptime.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index a4c2791ab70b..5a1b228964fb 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/time_namespace.h>
#include <linux/kernel_stat.h>
static int uptime_proc_show(struct seq_file *m, void *v)
@@ -20,6 +21,8 @@ static int uptime_proc_show(struct seq_file *m, void *v)
nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
ktime_get_boottime_ts64(&uptime);
+ timens_add_boottime(&uptime);
+
idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
idle.tv_nsec = rem;
seq_printf(m, "%lu.%02lu %lu.%02lu\n",
--
2.23.0
^ permalink raw reply related
* [PATCHv7 15/33] posix-timers: Make clock_nanosleep() time namespace aware
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@openvz.org>
clock_nanosleep() accepts absolute values of expiration time, if the
TIMER_ABSTIME flag is set. This value is in the task time namespace,
which has to be converted to the host time namespace.
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/posix-stubs.c | 12 ++++++++++--
kernel/time/posix-timers.c | 17 +++++++++++++++--
2 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 2ccefc9ce184..47ee2684d250 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct __kernel_timespec __user *, rmtp)
{
struct timespec64 t;
+ ktime_t texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -147,7 +148,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(clockid, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -215,6 +219,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
+ ktime texp;
switch (which_clock) {
case CLOCK_REALTIME:
@@ -233,7 +238,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ?
+ texp = timespec64_to_ktime(t);
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(clockid, texp);
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index c0ae1f6d2add..ccae99a645e1 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1228,6 +1228,19 @@ static int common_nsleep(const clockid_t which_clock, int flags,
which_clock);
}
+static int common_nsleep_timens(const clockid_t which_clock, int flags,
+ const struct timespec64 *rqtp)
+{
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ if (flags & TIMER_ABSTIME)
+ texp = timens_ktime_to_host(which_clock, texp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
+}
+
SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct __kernel_timespec __user *, rqtp,
struct __kernel_timespec __user *, rmtp)
@@ -1305,7 +1318,7 @@ static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_monotonic_timespec,
.clock_get_ktime = posix_get_monotonic_ktime,
- .nsleep = common_nsleep,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
@@ -1354,7 +1367,7 @@ static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_ktime = posix_get_boottime_ktime,
.clock_get_timespec = posix_get_boottime_timespec,
- .nsleep = common_nsleep,
+ .nsleep = common_nsleep_timens,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
.timer_get = common_timer_get,
--
2.23.0
^ permalink raw reply related
* [PATCHv7 14/33] hrtimers: Prepare hrtimer_nanosleep() for time namespaces
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
clock_nanosleep() accepts absolute values of expiration time when
TIMER_ABSTIME flag is set. This absolute value is inside the task's
time namespace, and has to be converted to the host's time.
There is timens_ktime_to_host() helper for converting time, but
it accepts ktime argument.
As a preparation, make hrtimer_nanosleep() accept a clock value in ktime
instead of timespec64.
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/linux/hrtimer.h | 2 +-
kernel/time/hrtimer.c | 8 ++++----
kernel/time/posix-stubs.c | 4 ++--
kernel/time/posix-timers.c | 4 +++-
tools/perf/examples/bpf/5sec.c | 6 ++++--
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 1b9a51a1bccb..cdcb2e9cd54a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -502,7 +502,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
/* Precise sleep: */
extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
-extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
+extern long hrtimer_nanosleep(ktime_t rqtp,
const enum hrtimer_mode mode,
const clockid_t clockid);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0d4dc241c0fb..19cc504bd7cc 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1907,7 +1907,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
return ret;
}
-long hrtimer_nanosleep(const struct timespec64 *rqtp,
+long hrtimer_nanosleep(ktime_t rqtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
@@ -1920,7 +1920,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
slack = 0;
hrtimer_init_sleeper_on_stack(&t, clockid, mode);
- hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
+ hrtimer_set_expires_range_ns(&t.timer, rqtp, slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
goto out;
@@ -1955,7 +1955,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -1975,7 +1975,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index edaf075d1ee4..2ccefc9ce184 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -147,7 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
@@ -233,7 +233,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
+ return hrtimer_nanosleep(timespec64_to_ktime(t), flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 6e350cc8f600..c0ae1f6d2add 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1221,7 +1221,9 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
static int common_nsleep(const clockid_t which_clock, int flags,
const struct timespec64 *rqtp)
{
- return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
+ ktime_t texp = timespec64_to_ktime(*rqtp);
+
+ return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ?
HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
which_clock);
}
diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c
index b9c203219691..e6b6181c6dc6 100644
--- a/tools/perf/examples/bpf/5sec.c
+++ b/tools/perf/examples/bpf/5sec.c
@@ -41,9 +41,11 @@
#include <bpf.h>
-int probe(hrtimer_nanosleep, rqtp->tv_sec)(void *ctx, int err, long sec)
+#define NSEC_PER_SEC 1000000000L
+
+int probe(hrtimer_nanosleep, rqtp)(void *ctx, int err, long long sec)
{
- return sec == 5;
+ return sec / NSEC_PER_SEC == 5ULL;
}
license(GPL);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 13/33] alarmtimer: Make nanosleep time namespace aware
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
clock_nanosleep() accepts absolute values of expiration time when
TIMER_ABSTIME flag is set. This absolute value is inside the task's
time namespace, and has to be converted to the host's time.
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c326427bb4cb..353e46f9acc2 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -838,6 +838,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
+ } else {
+ exp = timens_ktime_to_host(which_clock, exp);
}
ret = alarmtimer_do_nsleep(&alarm, exp, type);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 12/33] posix-timers: Make timer_settime() time namespace aware
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
Wire timer_settime() syscall into time namespace virtualization.
sys_timer_settime() calls the ktime->timer_set() callback. Right now,
common_timer_set() is the only implementation for the callback.
There user-supplied timer's value is converted from timespec64 to ktime
and then timens_ktime_to_host() can be used to convert namespace's time
to the host time.
Inside a time namespace kernel's time differ on a fixed offset from
a user-supplied, but only absolute values (TIMER_ABSTIME) must
be converted.
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/posix-timers.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index d8b5bd4cbae1..6e350cc8f600 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -885,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
+ if (flags & TIMER_ABSTIME)
+ expires = timens_ktime_to_host(timr->it_clock, expires);
sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 11/33] timerfd: Make timerfd_settime() time namespace aware
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
timerfd_settime() accepts an absolute value of the expiration time if
TFD_TIMER_ABSTIME is specified. This value is in task's time namespace
and has to be converted to the host's time namespace.
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
fs/timerfd.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 48305ba41e3c..f9da5752a79e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -26,6 +26,7 @@
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/rcupdate.h>
+#include <linux/time_namespace.h>
struct timerfd_ctx {
union {
@@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
}
if (texp != 0) {
+ if (flags & TFD_TIMER_ABSTIME)
+ texp = timens_ktime_to_host(clockid, texp);
if (isalarm(ctx)) {
if (flags & TFD_TIMER_ABSTIME)
alarm_start(&ctx->t.alarm, texp);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 10/33] kernel: Add do_timens_ktime_to_host() helper
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The helper subtracts namespace's clock offset from the given time
and checks that the result is in [0, KTIME_MAX].
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/linux/time_namespace.h | 14 +++++++++++++
kernel/time/namespace.c | 36 ++++++++++++++++++++++++++++++++++
2 files changed, 50 insertions(+)
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 3d429c7ecca5..9a77d3854830 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -57,6 +57,15 @@ static inline void timens_add_boottime(struct timespec64 *ts)
*ts = timespec64_add(*ts, ns_offsets->boottime);
}
+ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
+ struct timens_offsets *offsets);
+static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
+{
+ struct timens_offsets *offsets = ¤t->nsproxy->time_ns->offsets;
+
+ return do_timens_ktime_to_host(clockid, tim, offsets);
+}
+
#else
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
@@ -83,6 +92,11 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts
static inline void timens_add_monotonic(struct timespec64 *ts) {}
static inline void timens_add_boottime(struct timespec64 *ts) {}
+
+static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
+{
+ return tim;
+}
#endif
#endif /* _LINUX_TIMENS_H */
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index c2a58e45fc4b..1a0fbaa5d2d4 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -16,6 +16,42 @@
#include <linux/err.h>
#include <linux/mm.h>
+ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
+ struct timens_offsets *ns_offsets)
+{
+ ktime_t offset;
+
+ switch (clockid) {
+ case CLOCK_MONOTONIC:
+ offset = timespec64_to_ktime(ns_offsets->monotonic);
+ break;
+ case CLOCK_BOOTTIME:
+ case CLOCK_BOOTTIME_ALARM:
+ offset = timespec64_to_ktime(ns_offsets->boottime);
+ break;
+ default:
+ return tim;
+ }
+
+ /*
+ * Check that @tim value is in [offset, KTIME_MAX + offset]
+ * and subtract offset.
+ */
+ if (tim < offset) {
+ /*
+ * User can specify @tim *absolute* value - if it's lesser than
+ * the time namespace's offset - it's already expired.
+ */
+ tim = 0;
+ } else {
+ tim = ktime_sub(tim, offset);
+ if (unlikely(tim > KTIME_MAX))
+ tim = KTIME_MAX;
+ }
+
+ return tim;
+}
+
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 09/33] posix-clocks: Wire up clock_gettime() with timens offsets
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@openvz.org>
Adjust monotonic and boottime clocks with per-timens offsets.
As the result a process inside time namespace will see timers and clocks
corrected to offsets that were set on creating namespace.
Note that applications usually go through vDSO to get time, which is not
yet adjusted. Further changes complete time namespace virtualisation
with vDSO support.
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 1 +
kernel/time/posix-stubs.c | 3 +++
kernel/time/posix-timers.c | 5 +++++
3 files changed, 9 insertions(+)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9415c83f8cca..c326427bb4cb 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -26,6 +26,7 @@
#include <linux/freezer.h>
#include <linux/compat.h>
#include <linux/module.h>
+#include <linux/time_namespace.h>
#include "posix-timers.h"
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 67df65f887ac..edaf075d1ee4 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include <linux/timekeeping.h>
#include <linux/posix-timers.h>
+#include <linux/time_namespace.h>
#include <linux/compat.h>
#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
@@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
break;
case CLOCK_MONOTONIC:
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
break;
case CLOCK_BOOTTIME:
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
break;
default:
return -EINVAL;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 47a8d43fe1c6..d8b5bd4cbae1 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,6 +30,7 @@
#include <linux/hashtable.h>
#include <linux/compat.h>
#include <linux/nospec.h>
+#include <linux/time_namespace.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -195,6 +196,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -209,6 +211,7 @@ static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_raw_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -223,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
struct timespec64 *tp)
{
ktime_get_coarse_ts64(tp);
+ timens_add_monotonic(tp);
return 0;
}
@@ -235,6 +239,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
+ timens_add_boottime(tp);
return 0;
}
--
2.23.0
^ permalink raw reply related
* [PATCHv7 08/33] posix-timers: Use clock_get_ktime() in common_timer_get()
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
Now, when the clock_get_ktime() callback exists, the suboptimal
timespec64-based conversion can be removed from common_timer_get().
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/posix-timers.c | 8 +-------
1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1d7329e8425f..47a8d43fe1c6 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -665,7 +665,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
{
const struct k_clock *kc = timr->kclock;
ktime_t now, remaining, iv;
- struct timespec64 ts64;
bool sig_none;
sig_none = timr->it_sigev_notify == SIGEV_NONE;
@@ -683,12 +682,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
return;
}
- /*
- * The timespec64 based conversion is suboptimal, but it's not
- * worth to implement yet another callback.
- */
- kc->clock_get_timespec(timr->it_clock, &ts64);
- now = timespec64_to_ktime(ts64);
+ now = kc->clock_get_ktime(timr->it_clock);
/*
* When a requeue is pending or this is a SIGEV_NONE timer move the
--
2.23.0
^ permalink raw reply related
* [PATCHv7 07/33] posix-clocks: Introduce clock_get_ktime() callback
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The callsite in common_timer_get() has already a comment:
/*
* The timespec64 based conversion is suboptimal, but it's not
* worth to implement yet another callback.
*/
kc->clock_get(timr->it_clock, &ts64);
now = timespec64_to_ktime(ts64);
The upcoming support for time namespaces requires to have access to:
- The time in a task's time namespace for sys_clock_gettime()
- The time in the root name space for common_timer_get()
That adds a valid reason to finally implement a separate callback which
returns the time in ktime_t format.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 19 ++++++++++++++++++-
kernel/time/posix-timers.c | 26 +++++++++++++++++++++++++-
kernel/time/posix-timers.h | 3 +++
3 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 73a5458194c7..9415c83f8cca 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -664,7 +664,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
* @which_clock: clockid
* @tp: timespec to fill.
*
- * Provides the underlying alarm base time.
+ * Provides the underlying alarm base time in a tasks time namespace.
*/
static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
@@ -676,6 +676,22 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp
return base->get_timespec(base->base_clockid, tp);
}
+/**
+ * alarm_clock_get_ktime - posix clock_get_ktime interface
+ * @which_clock: clockid
+ *
+ * Provides the underlying alarm base time in the root namespace.
+ */
+static ktime_t alarm_clock_get_ktime(clockid_t which_clock)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ if (!alarmtimer_get_rtcdev())
+ return -EINVAL;
+
+ return base->get_ktime();
+}
+
/**
* alarm_timer_create - posix timer_create interface
* @new_timer: k_itimer pointer to manage
@@ -839,6 +855,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
+ .clock_get_ktime = alarm_clock_get_ktime,
.clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index e65241a46038..1d7329e8425f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -171,6 +171,11 @@ int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
+static ktime_t posix_get_realtime_ktime(clockid_t which_clock)
+{
+ return ktime_get_real();
+}
+
/* Set clock_realtime */
static int posix_clock_realtime_set(const clockid_t which_clock,
const struct timespec64 *tp)
@@ -193,6 +198,11 @@ static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64
return 0;
}
+static ktime_t posix_get_monotonic_ktime(clockid_t which_clock)
+{
+ return ktime_get();
+}
+
/*
* Get monotonic-raw time for posix timers
*/
@@ -228,12 +238,22 @@ int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *
return 0;
}
+static ktime_t posix_get_boottime_ktime(const clockid_t which_clock)
+{
+ return ktime_get_boottime();
+}
+
static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
}
+static ktime_t posix_get_tai_ktime(clockid_t which_clock)
+{
+ return ktime_get_clocktai();
+}
+
static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
{
tp->tv_sec = 0;
@@ -781,7 +801,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
- * functions which use timr->kclock->clock_get_timespec() work.
+ * functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
@@ -1262,6 +1282,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_realtime_timespec,
+ .clock_get_ktime = posix_get_realtime_ktime,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1280,6 +1301,7 @@ static const struct k_clock clock_realtime = {
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
.clock_get_timespec = posix_get_monotonic_timespec,
+ .clock_get_ktime = posix_get_monotonic_ktime,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1310,6 +1332,7 @@ static const struct k_clock clock_monotonic_coarse = {
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
+ .clock_get_ktime = posix_get_tai_ktime,
.clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
@@ -1326,6 +1349,7 @@ static const struct k_clock clock_tai = {
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
+ .clock_get_ktime = posix_get_boottime_ktime,
.clock_get_timespec = posix_get_boottime_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 070611b2c253..f32a2ebba9b8 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -6,8 +6,11 @@ struct k_clock {
struct timespec64 *tp);
int (*clock_set)(const clockid_t which_clock,
const struct timespec64 *tp);
+ /* Returns the clock value in the current time namespace. */
int (*clock_get_timespec)(const clockid_t which_clock,
struct timespec64 *tp);
+ /* Returns the clock value in the root time namespace. */
+ ktime_t (*clock_get_ktime)(const clockid_t which_clock);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
--
2.23.0
^ permalink raw reply related
* [PATCHv7 06/33] alarmtimer: Provide get_timespec() callback
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The upcoming support for time namespaces requires to have access to:
- The time in a task's time namespace for sys_clock_gettime()
- The time in the root name space for common_timer_get()
Wire up alarm bases with get_timespec().
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/linux/posix-timers.h | 3 +++
kernel/time/alarmtimer.c | 8 ++++++--
kernel/time/posix-timers.c | 4 ++--
3 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 3d10c84a97a9..d535d52eb3ca 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -230,4 +230,7 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
void posixtimer_rearm(struct kernel_siginfo *info);
+
+int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp);
+int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp);
#endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 22b6f9b133b2..73a5458194c7 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,12 +37,15 @@
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
+ * @get_timespec: Function to read the namespace time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
ktime_t (*get_ktime)(void);
+ int (*get_timespec)(const clockid_t which_clock,
+ struct timespec64 *tp);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -670,8 +673,7 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec64(base->get_ktime());
- return 0;
+ return base->get_timespec(base->base_clockid, tp);
}
/**
@@ -883,8 +885,10 @@ static int __init alarmtimer_init(void)
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_timespec = posix_get_realtime_timespec,
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_timespec = posix_get_boottime_timespec;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 68d4690cc225..e65241a46038 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
+int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
@@ -222,7 +222,7 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
+int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
return 0;
--
2.23.0
^ permalink raw reply related
* [PATCHv7 05/33] alarmtimer: Rename gettime() callback to get_ktime()
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The upcoming support for time namespaces requires to have access to:
- The time in a tasks time namespace for sys_clock_gettime()
- The time in the root name space for common_timer_get()
struct alarm_base needs to follow the same name convention, so rename
.gettime() callback into get_ktime() as a preparation for introducing
get_timespec().
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 34 +++++++++++++++++-----------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 62b06cfa710d..22b6f9b133b2 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -36,13 +36,13 @@
* struct alarm_base - Alarm timer bases
* @lock: Lock for syncrhonized access to the base
* @timerqueue: Timerqueue head managing the list of events
- * @gettime: Function to read the time correlating to the base
+ * @get_ktime: Function to read the time correlating to the base
* @base_clockid: clockid for the base
*/
static struct alarm_base {
spinlock_t lock;
struct timerqueue_head timerqueue;
- ktime_t (*gettime)(void);
+ ktime_t (*get_ktime)(void);
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
@@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
spin_unlock_irqrestore(&base->lock, flags);
if (alarm->function)
- restart = alarm->function(alarm, base->gettime());
+ restart = alarm->function(alarm, base->get_ktime());
spin_lock_irqsave(&base->lock, flags);
if (restart != ALARMTIMER_NORESTART) {
@@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
}
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_fired(alarm, base->gettime());
+ trace_alarmtimer_fired(alarm, base->get_ktime());
return ret;
}
@@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
ktime_t alarm_expires_remaining(const struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return ktime_sub(alarm->node.expires, base->gettime());
+ return ktime_sub(alarm->node.expires, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_expires_remaining);
@@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev)
spin_unlock_irqrestore(&base->lock, flags);
if (!next)
continue;
- delta = ktime_sub(next->expires, base->gettime());
+ delta = ktime_sub(next->expires, base->get_ktime());
if (!min || (delta < min)) {
expires = next->expires;
min = delta;
@@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start)
hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_start(alarm, base->gettime());
+ trace_alarmtimer_start(alarm, base->get_ktime());
}
EXPORT_SYMBOL_GPL(alarm_start);
@@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add_safe(start, base->gettime());
+ start = ktime_add_safe(start, base->get_ktime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
alarmtimer_dequeue(base, alarm);
spin_unlock_irqrestore(&base->lock, flags);
- trace_alarmtimer_cancel(alarm, base->gettime());
+ trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
}
EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
@@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- return alarm_forward(alarm, base->gettime(), interval);
+ return alarm_forward(alarm, base->get_ktime(), interval);
}
EXPORT_SYMBOL_GPL(alarm_forward_now);
@@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
return;
}
- delta = ktime_sub(absexp, base->gettime());
+ delta = ktime_sub(absexp, base->get_ktime());
spin_lock_irqsave(&freezer_delta_lock, flags);
if (!freezer_delta || (delta < freezer_delta)) {
@@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
struct alarm_base *base = &alarm_bases[alarm->type];
if (!absolute)
- expires = ktime_add_safe(expires, base->gettime());
+ expires = ktime_add_safe(expires, base->get_ktime());
if (sigev_none)
alarm->node.expires = expires;
else
@@ -670,7 +670,7 @@ static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp
if (!alarmtimer_get_rtcdev())
return -EINVAL;
- *tp = ktime_to_timespec64(base->gettime());
+ *tp = ktime_to_timespec64(base->get_ktime());
return 0;
}
@@ -747,7 +747,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
struct timespec64 rmt;
ktime_t rem;
- rem = ktime_sub(absexp, alarm_bases[type].gettime());
+ rem = ktime_sub(absexp, alarm_bases[type].get_ktime());
if (rem <= 0)
return 0;
@@ -816,7 +816,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
exp = timespec64_to_ktime(*tsreq);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
- ktime_t now = alarm_bases[type].gettime();
+ ktime_t now = alarm_bases[type].get_ktime();
exp = ktime_add_safe(now, exp);
}
@@ -882,9 +882,9 @@ static int __init alarmtimer_init(void)
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
- alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
- alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
for (i = 0; i < ALARM_NUMTYPE; i++) {
timerqueue_init_head(&alarm_bases[i].timerqueue);
spin_lock_init(&alarm_bases[i].lock);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 04/33] posix-clocks: Rename .clock_get_timespec() callbacks accordingly
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The upcoming support for time namespaces requires to have access to:
- The time in a task's time namespace for sys_clock_gettime()
- The time in the root name space for common_timer_get()
That adds a valid reason to finally implement a separate callback which
returns the time in ktime_t format in (struct k_clock).
As a preparation ground for introducing clock_get_ktime(), the original
callback clock_get() was renamed into clock_get_timespec().
Reflect the renaming into callbacks realizations.
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 6 +++---
kernel/time/posix-timers.c | 16 ++++++++--------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8523df726fee..62b06cfa710d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -657,13 +657,13 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
}
/**
- * alarm_clock_get - posix clock_get_timespec interface
+ * alarm_clock_get_timespec - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
* Provides the underlying alarm base time.
*/
-static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
+static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
@@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
- .clock_get_timespec = alarm_clock_get,
+ .clock_get_timespec = alarm_clock_get_timespec,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 44d4f9cb782d..68d4690cc225 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -165,7 +165,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
}
/* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_real_ts64(tp);
return 0;
@@ -187,7 +187,7 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
/*
* Get monotonic time for posix timers
*/
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_ts64(tp);
return 0;
@@ -222,13 +222,13 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *
return 0;
}
-static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_boottime_ts64(tp);
return 0;
}
-static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
+static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp)
{
ktime_get_clocktai_ts64(tp);
return 0;
@@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get_timespec = posix_clock_realtime_get,
+ .clock_get_timespec = posix_get_realtime_timespec,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = {
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get_timespec = posix_ktime_get_ts,
+ .clock_get_timespec = posix_get_monotonic_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1310,7 +1310,7 @@ static const struct k_clock clock_monotonic_coarse = {
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get_timespec = posix_get_tai,
+ .clock_get_timespec = posix_get_tai_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = {
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get_timespec = posix_get_boottime,
+ .clock_get_timespec = posix_get_boottime_timespec,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
--
2.23.0
^ permalink raw reply related
* [PATCHv7 03/33] posix-clocks: Rename the clock_get() callback to clock_get_timespec()
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andrei Vagin, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@gmail.com>
The upcoming support for time namespaces requires to have access to:
- The time in a task's time namespace for sys_clock_gettime()
- The time in the root name space for common_timer_get()
That adds a valid reason to finally implement a separate callback which
returns the time in ktime_t format, rather than in (struct timespec).
Rename the clock_get() callback to clock_get_timespec() as a preparation
for introducing clock_get_ktime().
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
kernel/time/alarmtimer.c | 4 ++--
kernel/time/posix-clock.c | 8 ++++----
kernel/time/posix-cpu-timers.c | 32 ++++++++++++++++----------------
kernel/time/posix-timers.c | 22 +++++++++++-----------
kernel/time/posix-timers.h | 4 ++--
5 files changed, 35 insertions(+), 35 deletions(-)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 451f9d05ccfe..8523df726fee 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -657,7 +657,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp
}
/**
- * alarm_clock_get - posix clock_get interface
+ * alarm_clock_get - posix clock_get_timespec interface
* @which_clock: clockid
* @tp: timespec to fill.
*
@@ -837,7 +837,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
const struct k_clock alarm_clock = {
.clock_getres = alarm_clock_getres,
- .clock_get = alarm_clock_get,
+ .clock_get_timespec = alarm_clock_get,
.timer_create = alarm_timer_create,
.timer_set = common_timer_set,
.timer_del = common_timer_del,
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index ec960bb939fd..c8f9c9b1cd82 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -315,8 +315,8 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
}
const struct k_clock clock_posix_dynamic = {
- .clock_getres = pc_clock_getres,
- .clock_set = pc_clock_settime,
- .clock_get = pc_clock_gettime,
- .clock_adj = pc_clock_adjtime,
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get_timespec = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
};
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 92a431981b1c..c84ee50e2eab 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
}
const struct k_clock clock_posix_cpu = {
- .clock_getres = posix_cpu_clock_getres,
- .clock_set = posix_cpu_clock_set,
- .clock_get = posix_cpu_clock_get,
- .timer_create = posix_cpu_timer_create,
- .nsleep = posix_cpu_nsleep,
- .timer_set = posix_cpu_timer_set,
- .timer_del = posix_cpu_timer_del,
- .timer_get = posix_cpu_timer_get,
- .timer_rearm = posix_cpu_timer_rearm,
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get_timespec = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+ .timer_rearm = posix_cpu_timer_rearm,
};
const struct k_clock clock_process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get_timespec = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
};
const struct k_clock clock_thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .timer_create = thread_cpu_timer_create,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get_timespec = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0ec5b7a1d769..44d4f9cb782d 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -667,7 +667,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
* The timespec64 based conversion is suboptimal, but it's not
* worth to implement yet another callback.
*/
- kc->clock_get(timr->it_clock, &ts64);
+ kc->clock_get_timespec(timr->it_clock, &ts64);
now = timespec64_to_ktime(ts64);
/*
@@ -781,7 +781,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
* hood. See hrtimer_init(). Update timr->kclock, so the generic
- * functions which use timr->kclock->clock_get() work.
+ * functions which use timr->kclock->clock_get_timespec() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
* use ABSTIME, so it needs to switch back.
@@ -1067,7 +1067,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp);
+ error = kc->clock_get_timespec(which_clock, &kernel_tp);
if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
@@ -1149,7 +1149,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
if (!kc)
return -EINVAL;
- err = kc->clock_get(which_clock, &ts);
+ err = kc->clock_get_timespec(which_clock, &ts);
if (!err && put_old_timespec32(&ts, tp))
err = -EFAULT;
@@ -1261,7 +1261,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
static const struct k_clock clock_realtime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_clock_realtime_get,
+ .clock_get_timespec = posix_clock_realtime_get,
.clock_set = posix_clock_realtime_set,
.clock_adj = posix_clock_realtime_adj,
.nsleep = common_nsleep,
@@ -1279,7 +1279,7 @@ static const struct k_clock clock_realtime = {
static const struct k_clock clock_monotonic = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_ktime_get_ts,
+ .clock_get_timespec = posix_ktime_get_ts,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1295,22 +1295,22 @@ static const struct k_clock clock_monotonic = {
static const struct k_clock clock_monotonic_raw = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_monotonic_raw,
+ .clock_get_timespec = posix_get_monotonic_raw,
};
static const struct k_clock clock_realtime_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
+ .clock_get_timespec = posix_get_realtime_coarse,
};
static const struct k_clock clock_monotonic_coarse = {
.clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
+ .clock_get_timespec = posix_get_monotonic_coarse,
};
static const struct k_clock clock_tai = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_tai,
+ .clock_get_timespec = posix_get_tai,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
@@ -1326,7 +1326,7 @@ static const struct k_clock clock_tai = {
static const struct k_clock clock_boottime = {
.clock_getres = posix_get_hrtimer_res,
- .clock_get = posix_get_boottime,
+ .clock_get_timespec = posix_get_boottime,
.nsleep = common_nsleep,
.timer_create = common_timer_create,
.timer_set = common_timer_set,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 897c29e162b9..070611b2c253 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -6,8 +6,8 @@ struct k_clock {
struct timespec64 *tp);
int (*clock_set)(const clockid_t which_clock,
const struct timespec64 *tp);
- int (*clock_get)(const clockid_t which_clock,
- struct timespec64 *tp);
+ int (*clock_get_timespec)(const clockid_t which_clock,
+ struct timespec64 *tp);
int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
--
2.23.0
^ permalink raw reply related
* [PATCHv7 02/33] time: Add timens_offsets to be used for tasks in timens
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@openvz.org>
Introduce offsets for time namespace. They will contain an adjustment
needed to convert clocks to/from host's.
A new namespace is created with the same offsets as the time namespace
of the current process.
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
include/linux/time_namespace.h | 22 ++++++++++++++++++++++
kernel/time/namespace.c | 2 ++
2 files changed, 24 insertions(+)
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
index 873b908c9ba8..3d429c7ecca5 100644
--- a/include/linux/time_namespace.h
+++ b/include/linux/time_namespace.h
@@ -12,11 +12,17 @@
struct user_namespace;
extern struct user_namespace init_user_ns;
+struct timens_offsets {
+ struct timespec64 monotonic;
+ struct timespec64 boottime;
+};
+
struct time_namespace {
struct kref kref;
struct user_namespace *user_ns;
struct ucounts *ucounts;
struct ns_common ns;
+ struct timens_offsets offsets;
} __randomize_layout;
extern struct time_namespace init_time_ns;
@@ -37,6 +43,20 @@ static inline void put_time_ns(struct time_namespace *ns)
kref_put(&ns->kref, free_time_ns);
}
+static inline void timens_add_monotonic(struct timespec64 *ts)
+{
+ struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets;
+
+ *ts = timespec64_add(*ts, ns_offsets->monotonic);
+}
+
+static inline void timens_add_boottime(struct timespec64 *ts)
+{
+ struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets;
+
+ *ts = timespec64_add(*ts, ns_offsets->boottime);
+}
+
#else
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
@@ -61,6 +81,8 @@ static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *ts
return 0;
}
+static inline void timens_add_monotonic(struct timespec64 *ts) {}
+static inline void timens_add_boottime(struct timespec64 *ts) {}
#endif
#endif /* _LINUX_TIMENS_H */
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 2662a69e0382..c2a58e45fc4b 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/err.h>
+#include <linux/mm.h>
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
@@ -60,6 +61,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
ns->ucounts = ucounts;
ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
+ ns->offsets = old_ns->offsets;
return ns;
fail_free:
--
2.23.0
^ permalink raw reply related
* [PATCHv7 01/33] ns: Introduce Time Namespace
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Andrei Vagin, Dmitry Safonov, Adrian Reber,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
In-Reply-To: <20191011012341.846266-1-dima@arista.com>
From: Andrei Vagin <avagin@openvz.org>
Time Namespace isolates clock values.
The kernel provides access to several clocks CLOCK_REALTIME,
CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc.
CLOCK_REALTIME
System-wide clock that measures real (i.e., wall-clock) time.
CLOCK_MONOTONIC
Clock that cannot be set and represents monotonic time since
some unspecified starting point.
CLOCK_BOOTTIME
Identical to CLOCK_MONOTONIC, except it also includes any time
that the system is suspended.
For many users, the time namespace means the ability to changes date and
time in a container (CLOCK_REALTIME).
But in a context of the checkpoint/restore functionality, monotonic and
bootime clocks become interesting. Both clocks are monotonic with
unspecified staring points. These clocks are widely used to measure time
slices and set timers. After restoring or migrating processes, we have to
guarantee that they never go backward. In an ideal case, the behavior of
these clocks should be the same as for a case when a whole system is
suspended. All this means that we need to be able to set CLOCK_MONOTONIC
and CLOCK_BOOTTIME clocks, what can be done by adding per-namespace
offsets for clocks.
A time namespace is similar to a pid namespace in a way how it is
created: unshare(CLONE_NEWTIME) system call creates a new time namespace,
but doesn't set it to the current process. Then all children of
the process will be born in the new time namespace, or a process can
use the setns() system call to join a namespace.
This scheme allows setting clock offsets for a namespace, before any
processes appear in it.
All available clone flags have been used, so CLONE_NEWTIME uses the
highest bit of CSIGNAL. It means that we can use it with the unshare()
system call only. Rith now, this works for us, because time namespace
offsets can be set only when a new time namespace is not populated. In a
future, we will have the clone3() system call [1] which will allow to use
the CSIGNAL mask for clone flags.
[1]: httmps://lkml.kernel.org/r/20190604160944.4058-1-christian@brauner.io
Link: https://criu.org/Time_namespace
Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Co-developed-by: Dmitry Safonov <dima@arista.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
MAINTAINERS | 2 +
fs/proc/namespaces.c | 4 +
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 3 +
include/linux/time_namespace.h | 66 ++++++++++
include/linux/user_namespace.h | 1 +
include/uapi/linux/sched.h | 6 +
init/Kconfig | 7 ++
kernel/fork.c | 16 ++-
kernel/nsproxy.c | 41 +++++--
kernel/time/Makefile | 1 +
kernel/time/namespace.c | 217 +++++++++++++++++++++++++++++++++
12 files changed, 356 insertions(+), 10 deletions(-)
create mode 100644 include/linux/time_namespace.h
create mode 100644 kernel/time/namespace.c
diff --git a/MAINTAINERS b/MAINTAINERS
index d44d6732510d..cabe7bddbf69 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13009,6 +13009,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Maintained
F: fs/timerfd.c
F: include/linux/timer*
+F: include/linux/time_namespace.h
+F: kernel/time_namespace.c
F: kernel/time/*timer*
POWER MANAGEMENT CORE
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..8b5c720fe5d7 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_TIME_NS
+ &timens_operations,
+ &timens_for_children_operations,
+#endif
};
static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 2ae1b1a4d84d..074f395b9ad2 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -35,6 +35,8 @@ struct nsproxy {
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
+ struct time_namespace *time_ns;
+ struct time_namespace *time_ns_for_children;
struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb6215905..d312e6281e69 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations timens_operations;
+extern const struct proc_ns_operations timens_for_children_operations;
/*
* We always define these enumerators
@@ -43,6 +45,7 @@ enum {
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
+ PROC_TIME_INIT_INO = 0xEFFFFFFAU,
};
#ifdef CONFIG_PROC_FS
diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
new file mode 100644
index 000000000000..873b908c9ba8
--- /dev/null
+++ b/include/linux/time_namespace.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TIMENS_H
+#define _LINUX_TIMENS_H
+
+
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/nsproxy.h>
+#include <linux/ns_common.h>
+#include <linux/err.h>
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct time_namespace {
+ struct kref kref;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct ns_common ns;
+} __randomize_layout;
+extern struct time_namespace init_time_ns;
+
+#ifdef CONFIG_TIME_NS
+static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
+{
+ kref_get(&ns->kref);
+ return ns;
+}
+
+extern struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns);
+extern void free_time_ns(struct kref *kref);
+extern int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
+
+static inline void put_time_ns(struct time_namespace *ns)
+{
+ kref_put(&ns->kref, free_time_ns);
+}
+
+#else
+static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
+{
+ return NULL;
+}
+
+static inline void put_time_ns(struct time_namespace *ns)
+{
+}
+
+static inline struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+ if (flags & CLONE_NEWTIME)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+
+static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+ return 0;
+}
+
+#endif
+
+#endif /* _LINUX_TIMENS_H */
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index fb9f4f799554..6ef1c7109fc4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -45,6 +45,7 @@ enum ucount_type {
UCOUNT_NET_NAMESPACES,
UCOUNT_MNT_NAMESPACES,
UCOUNT_CGROUP_NAMESPACES,
+ UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 99335e1f4a27..5b7511ac2005 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -33,6 +33,12 @@
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
+/*
+ * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
+ * syscalls only:
+ */
+#define CLONE_NEWTIME 0x00000080 /* New time namespace */
+
#ifndef __ASSEMBLY__
/**
* struct clone_args - arguments for the clone3 syscall
diff --git a/init/Kconfig b/init/Kconfig
index b4daad2bac23..bc2ee93408ad 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1096,6 +1096,13 @@ config UTS_NS
In this namespace tasks see different info provided with the
uname() system call
+config TIME_NS
+ bool "TIME namespace"
+ default y
+ help
+ In this namespace boottime and monotonic clocks can be set.
+ The time will keep going with the same pace.
+
config IPC_NS
bool "IPC namespace"
depends on (SYSVIPC || POSIX_MQUEUE)
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf53125210..aa65333dbc45 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1772,6 +1772,7 @@ static __latent_entropy struct task_struct *copy_process(
struct multiprocess_signals delayed;
struct file *pidfile = NULL;
u64 clone_flags = args->flags;
+ struct nsproxy *nsp = current->nsproxy;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1814,8 +1815,16 @@ static __latent_entropy struct task_struct *copy_process(
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
- (task_active_pid_ns(current) !=
- current->nsproxy->pid_ns_for_children))
+ (task_active_pid_ns(current) != nsp->pid_ns_for_children))
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * If the new process will be in a different time namespace
+ * do not allow it to share VM or a thread group with the forking task.
+ */
+ if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
+ if (nsp->time_ns != nsp->time_ns_for_children)
return ERR_PTR(-EINVAL);
}
@@ -2703,7 +2712,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWTIME))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c815f58e6bc0..ed9882108cd2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/time_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#ifdef CONFIG_TIME_NS
+ .time_ns = &init_time_ns,
+ .time_ns_for_children = &init_time_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_net;
}
+ new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
+ tsk->nsproxy->time_ns_for_children);
+ if (IS_ERR(new_nsp->time_ns_for_children)) {
+ err = PTR_ERR(new_nsp->time_ns_for_children);
+ goto out_time;
+ }
+ new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+
return new_nsp;
+out_time:
+ put_net(new_nsp->net_ns);
out_net:
put_cgroup_ns(new_nsp->cgroup_ns);
out_cgroup:
@@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
+ int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
- get_nsproxy(old_ns);
- return 0;
- }
-
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ get_nsproxy(old_ns);
+ return 0;
+ }
+ } else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
/*
@@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
+ ret = timens_on_fork(new_ns, tsk);
+ if (ret) {
+ free_nsproxy(new_ns);
+ return ret;
+ }
+
tsk->nsproxy = new_ns;
return 0;
}
@@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->time_ns)
+ put_time_ns(ns->time_ns);
+ if (ns->time_ns_for_children)
+ put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
@@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 1867044800bb..c8f00168afe8 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
+obj-$(CONFIG_TIME_NS) += namespace.o
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
new file mode 100644
index 000000000000..2662a69e0382
--- /dev/null
+++ b/kernel/time/namespace.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/time_namespace.h>
+#include <linux/user_namespace.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/proc_ns.h>
+#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/cred.h>
+#include <linux/err.h>
+
+static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
+}
+
+static void dec_time_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
+}
+
+/**
+ * clone_time_ns - Clone a time namespace
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * Clone @old_ns and set the clone refcount to 1
+ *
+ * Return: The new namespace or ERR_PTR.
+ */
+static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
+ struct time_namespace *old_ns)
+{
+ struct time_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ err = -ENOSPC;
+ ucounts = inc_time_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
+ ns = kmalloc(sizeof(*ns), GFP_KERNEL);
+ if (!ns)
+ goto fail_dec;
+
+ kref_init(&ns->kref);
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto fail_free;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &timens_operations;
+ ns->user_ns = get_user_ns(user_ns);
+ return ns;
+
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_time_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
+}
+
+/**
+ * copy_time_ns - Create timens_for_children from @old_ns
+ * @flags: Cloning flags
+ * @user_ns: User namespace which owns a new namespace.
+ * @old_ns: Namespace to clone
+ *
+ * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
+ * adds a refcounter to @old_ns otherwise.
+ *
+ * Return: timens_for_children namespace or ERR_PTR.
+ */
+struct time_namespace *copy_time_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct time_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWTIME))
+ return get_time_ns(old_ns);
+
+ return clone_time_ns(user_ns, old_ns);
+}
+
+void free_time_ns(struct kref *kref)
+{
+ struct time_namespace *ns;
+
+ ns = container_of(kref, struct time_namespace, kref);
+ dec_time_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+
+static struct time_namespace *to_time_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct time_namespace, ns);
+}
+
+static struct ns_common *timens_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static struct ns_common *timens_for_children_get(struct task_struct *task)
+{
+ struct time_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void timens_put(struct ns_common *ns)
+{
+ put_time_ns(to_time_ns(ns));
+}
+
+static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct time_namespace *ns = to_time_ns(new);
+
+ if (!current_is_single_threaded())
+ return -EUSERS;
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns_for_children);
+ nsproxy->time_ns_for_children = ns;
+ return 0;
+}
+
+int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+{
+ struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
+ struct time_namespace *ns = to_time_ns(nsc);
+
+ /* create_new_namespaces() already incremented the ref counter */
+ if (nsproxy->time_ns == nsproxy->time_ns_for_children)
+ return 0;
+
+ get_time_ns(ns);
+ put_time_ns(nsproxy->time_ns);
+ nsproxy->time_ns = ns;
+
+ return 0;
+}
+
+static struct user_namespace *timens_owner(struct ns_common *ns)
+{
+ return to_time_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations timens_operations = {
+ .name = "time",
+ .type = CLONE_NEWTIME,
+ .get = timens_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+const struct proc_ns_operations timens_for_children_operations = {
+ .name = "time_for_children",
+ .type = CLONE_NEWTIME,
+ .get = timens_for_children_get,
+ .put = timens_put,
+ .install = timens_install,
+ .owner = timens_owner,
+};
+
+struct time_namespace init_time_ns = {
+ .kref = KREF_INIT(3),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_TIME_INIT_INO,
+ .ns.ops = &timens_operations,
+};
+
+static int __init time_ns_init(void)
+{
+ return 0;
+}
+subsys_initcall(time_ns_init);
--
2.23.0
^ permalink raw reply related
* [PATCHv7 00/33] kernel: Introduce Time Namespace
From: Dmitry Safonov @ 2019-10-11 1:23 UTC (permalink / raw)
To: linux-kernel
Cc: Dmitry Safonov, Dmitry Safonov, Adrian Reber, Andrei Vagin,
Andy Lutomirski, Arnd Bergmann, Christian Brauner,
Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
Thomas Gleixner, Vincenzo Frascino, containers, criu, linux-api,
x86
Discussions around time namespace are there for a long time. The first
attempt to implement it was in 2006 by Jeff Dike. From that time, the
topic appears on and off in various discussions.
There are two main use cases for time namespaces:
1. change date and time inside a container;
2. adjust clocks for a container restored from a checkpoint.
“It seems like this might be one of the last major obstacles keeping
migration from being used in production systems, given that not all
containers and connections can be migrated as long as a time dependency
is capable of messing it up.” (by github.com/dav-ell)
The kernel provides access to several clocks: CLOCK_REALTIME,
CLOCK_MONOTONIC, CLOCK_BOOTTIME. Last two clocks are monotonous, but the
start points for them are not defined and are different for each
system. When a container is migrated from one node to another, all
clocks have to be restored into consistent states; in other words, they
have to continue running from the same points where they have been
dumped.
The main idea of this patch set is adding per-namespace offsets for
system clocks. When a process in a non-root time namespace requests
time of a clock, a namespace offset is added to the current value of
this clock and the sum is returned.
All offsets are placed on a separate page, this allows us to map it as
part of VVAR into user processes and use offsets from VDSO calls.
Now offsets are implemented for CLOCK_MONOTONIC and CLOCK_BOOTTIME
clocks.
v6..v7 Changes:
* Based on Andy & Thomas suggestions and the patches that Thomas kindely
sent, reworked from two VDSO code images into trick with odd seq
number for timens page that goes on the place of vvar page inside ns.
* Moved kernel/time_namespace.c => kernel/time/namespace.c
* Fixed bpf 5sec example
* Added selftests outputs
* By Thomas's suggestion simplified overflow check as ktime_sub(tim, offset)
* Other Thomas's review notes: stylistic, simplifications and
clearifications (Thanks!)
* Split VDSO patches on generic/x86 parts
* Fixed kernel-doc warnings
* Added checks in selftests for capabilities
* Fixed bisectability issues
v5..v6 Changes:
* Used current_is_single_threaded() instead of thread_group_empty()
(Thanks for the review, Andy).
* Changed errno code when there are threads on timens joining to
something more grepabble (EUSERS).
* posix_get_timespec() should have been posix_get_monotonic_timespec()
(Thanks, Thomas)
* timens_add_monotonic() & timens_add_boottime() were relocated to
the patch that introduces (struct timens_offsets) (Thomas)
* Avoid breaking alarmtimer for ALARM_REALTIME (Thanks, Thomas)
* Nested namespace inherits father's offsets now
(Andrei while working on CRIU side for time namespace)
* A minor conflict with commit dbc1625fc9de ("hrtimer: Consolidate
hrtimer_init() + hrtimer_init_sleeper() calls") in linux-next
[Sending against next-20190814]
[v1..v5 Changelogs is at the very bottom here]
Our performance measurements show that the price of VDSO's clock_gettime()
in a child time namespace is about 8% with a hot CPU cache and about 90%
with a cold CPU cache. There is no performance regression for host
processes outside time namespace on those tests.
We wrote two small benchmarks. The first one gettime_perf.c calls
clock_gettime() in a loop for 3 seconds. It shows us performance with
a hot CPU cache (more clock_gettime() cycles - the better):
| before | CONFIG_TIME_NS=n | host | inside timens
--------------------------------------------------------------
| 153242367 | 153567617 | 150933203 | 139310914
| 153324800 | 153115132 | 150919828 | 139299761
| 153125401 | 153686868 | 150930471 | 139273917
| 153399355 | 153694866 | 151083410 | 139286081
| 153489417 | 153739716 | 150997262 | 139146403
| 153494270 | 153724332 | 150035651 | 138835612
-----------------------------------------------------------
avg | 153345935 | 153588088 | 150816637 | 139192114
diff % | 100 | 100.1 | 98.3 | 90.7
-------------------------------------------------------------
stdev % | 0.09 | 0.15 |0.25 | 0.13
The gettime_perf_cold test does 10K iterations. In each iteration, it
drops cpu caches for vdso pages, clflush() is used for this, then it runs
rdtsc(); clock_gettime; rdtsc(); and prints the number of tsc cycles.
Cold CPU cache (lesser tsc per cycle - the better):
| before | CONFIG_TIME_NS=n | host | inside timens
--------------------------------------------------------------
tsc | 476 | 480 | 487 | 531
stdev(tsc) | 0.6 | 1.3 | 4.3 | 5.7
diff (%) | 100 | 100.9 | 102 | 112
vdsotest results: https://gist.github.com/avagin/f290afb8b721ae0522a561d585f34de0
The numbers gathered on Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz.
Cc: Adrian Reber <adrian@lisas.de>
Cc: Andrei Vagin <avagin@openvz.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Dmitry Safonov <0x7f454c46@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: containers@lists.linux-foundation.org
Cc: criu@openvz.org
Cc: linux-api@vger.kernel.org
Cc: x86@kernel.org
v7 on github (if someone prefers `git pull` to `git am`):
https://github.com/0x7f454c46/linux/tree/timens-v7
v6: https://lkml.kernel.org/r/20190815163836.2927-1-dima@arista.com
v5: https://lkml.kernel.org/r/20190729215758.28405-1-dima@arista.com
v4: https://lkml.kernel.org/r/20190612192628.23797-1-dima@arista.com
v3: https://lkml.kernel.org/r/20190425161416.26600-1-dima@arista.com
v2: https://lore.kernel.org/lkml/20190206001107.16488-1-dima@arista.com/
RFC: https://lkml.kernel.org/r/20180919205037.9574-1-dima@arista.com/
v4..v5 Changes:
* Rebased over generic vdso (already in master)
* Addressing review comments by Thomas Gleixner
(thanks much for your time and patience):
- Dropping `timens` prefix from subjects (it's not a subsystem)
- Keeping commit messages in a neutral technical form
- Splitting unreasonably large patches
- Document code with missing comments
- Dropped dead code that's not compiled with !CONFIG_TIME_NS
* Updated performance results [here, at the bottom]
* Split vdso jump tables patch
* Allow unshare() with many threads: it's safe until fork()/clone(),
where we check for CLONE_THREADS
* Add missed check in setns() for CLONE_VM | CLONE_THREADS
* Fixed compilation with !CONFIG_UTS_NS
* Add a plan in selftests (prevents new warning "Planned tests != run tests")
* Set jump table section address & size to (-1UL) just in case if there
is no such section while running vdso2c (and WARN() on boot in such
case)
v3..v4 Changes:
* CLOCKE_NEWTIME is unshare()-only flag now (CLON_PIDFD took previous value)
* Addressing Jann Horn's feedback - we don't allow CLONE_THREAD or
CLONE_VM together with CLONE_NEWTIME (thanks for spotting!)
* Addressing issues found by Thomas - removed unmaintainable CLOCK_TIMENS
and introduced another call back into k_clock to get ktime instead
of getting timespec and converting it (Patch 03)
* Renaming timens_offsets members to omit _offset postfix
(thanks Cyrill for the suggestion)
* Suggestions, renaming and making code more maintainable from Thomas's
feedback (thanks much!)
* Fixing out-of-bounds and other issues in procfs file (kudos Jann Horn)
* vdso_fault() can be called on a remote task by /proc/$pid/mem or
process_vm_readv() - addressed by adding a slow-path with searching
for owner's namespace (thanks for spotting this unobvious issue, Jann)
* Other nits by Jann Horn
v2..v3: Major changes:
* Simplify two VDSO images by using static_branch() in vclock_gettime()
Removes unwanted conflicts with generic VDSO movement patches and
simplifies things by dropping too invasive linker magic.
As an alternative to static_branch() we tested an attempt to introduce
home-made dynamic patching called retcalls:
https://github.com/0x7f454c46/linux/commit/4cc0180f6d65
Considering some theoretical problems with toolchains, we decided to go
with long well-tested nop-patching in static_branch(). Though, it was
needed to provide backend for relative code.
* address Thomas' comments.
* add sanity checks for offsets:
- the current clock time in a namespace has to be in [0, KTIME_MAX / 2).
KTIME_MAX is divided by two here to be sure that the KTIME_MAX limit
is still unreachable.
Link: https://lkml.org/lkml/2018/9/19/950
Link: https://lkml.org/lkml/2019/2/5/867
v1..v2: There are two major changes:
* Two versions of the VDSO library to avoid a performance penalty for
host tasks outside time namespace (as suggested by Andy and Thomas).
As it has been discussed on timens RFC, adding a new conditional branch
`if (inside_time_ns)` on VDSO for all processes is undesirable.
It will add a penalty for everybody as branch predictor may mispredict
the jump. Also there are instruction cache lines wasted on cmp/jmp.
Those effects of introducing time namespace are very much unwanted
having in mind how much work have been spent on micro-optimisation
VDSO code.
Addressing those problems, there are two versions of VDSO's .so:
for host tasks (without any penalty) and for processes inside of time
namespace with clk_to_ns() that subtracts offsets from host's time.
* Allow to set clock offsets for a namespace only before any processes
appear in it.
Now a time namespace looks similar to a pid namespace in a way how it is
created: unshare(CLONE_NEWTIME) system call creates a new time namespace,
but doesn't set it to the current process. Then all children of
the process will be born in the new time namespace, or a process can
use the setns() system call to join a namespace.
This scheme allows to create a new time namespaces, set clock offsets
and then populate the namespace with processes.
Andrei Vagin (22):
ns: Introduce Time Namespace
time: Add timens_offsets to be used for tasks in timens
posix-clocks: Rename the clock_get() callback to clock_get_timespec()
posix-clocks: Rename .clock_get_timespec() callbacks accordingly
alarmtimer: Rename gettime() callback to get_ktime()
alarmtimer: Provide get_timespec() callback
posix-clocks: Introduce clock_get_ktime() callback
posix-timers: Use clock_get_ktime() in common_timer_get()
posix-clocks: Wire up clock_gettime() with timens offsets
kernel: Add do_timens_ktime_to_host() helper
timerfd: Make timerfd_settime() time namespace aware
posix-timers: Make timer_settime() time namespace aware
alarmtimer: Make nanosleep time namespace aware
hrtimers: Prepare hrtimer_nanosleep() for time namespaces
posix-timers: Make clock_nanosleep() time namespace aware
lib/vdso: Add unlikely() hint into vdso_read_begin()
fs/proc: Introduce /proc/pid/timens_offsets
selftests/timens: Add a test for timerfd
selftests/timens: Add a test for clock_nanosleep()
selftests/timens: Add timer offsets test
selftests/timens: Add a simple perf test for clock_gettime()
selftests/timens: Check for right timens offsets after fork and exec
Dmitry Safonov (10):
fs/proc: Respect boottime inside time namespace for /proc/uptime
x86/vdso: Restrict splitting VVAR VMA
x86/vdso: Provide vdso_data offset on vvar_page
x86/vdso: Add timens page
time: Allocate per-timens vvar page
x86/vdso: Handle faults on timens page
x86/vdso: On timens page fault prefault also VVAR page
x86/vdso: Zap vvar pages on switch a time namspace
selftests/timens: Add Time Namespace test for supported clocks
selftests/timens: Add procfs selftest
Thomas Gleixner (1):
lib/vdso: Prepare for time namespace support
MAINTAINERS | 2 +
arch/x86/Kconfig | 1 +
arch/x86/entry/vdso/vdso-layout.lds.S | 13 +-
arch/x86/entry/vdso/vdso2c.c | 3 +
arch/x86/entry/vdso/vma.c | 119 ++++-
arch/x86/include/asm/vdso.h | 1 +
arch/x86/include/asm/vdso/gettimeofday.h | 9 +
arch/x86/include/asm/vvar.h | 13 +-
arch/x86/kernel/vmlinux.lds.S | 4 +-
fs/proc/base.c | 95 ++++
fs/proc/namespaces.c | 4 +
fs/proc/uptime.c | 3 +
fs/timerfd.c | 3 +
include/linux/hrtimer.h | 2 +-
include/linux/nsproxy.h | 2 +
include/linux/posix-timers.h | 3 +
include/linux/proc_ns.h | 3 +
include/linux/time.h | 6 +
include/linux/time_namespace.h | 119 +++++
include/linux/user_namespace.h | 1 +
include/uapi/linux/sched.h | 6 +
include/vdso/datapage.h | 19 +-
include/vdso/helpers.h | 2 +-
init/Kconfig | 7 +
kernel/fork.c | 16 +-
kernel/nsproxy.c | 41 +-
kernel/time/Makefile | 1 +
kernel/time/alarmtimer.c | 68 ++-
kernel/time/hrtimer.c | 8 +-
kernel/time/namespace.c | 466 ++++++++++++++++++
kernel/time/posix-clock.c | 8 +-
kernel/time/posix-cpu-timers.c | 32 +-
kernel/time/posix-stubs.c | 15 +-
kernel/time/posix-timers.c | 88 +++-
kernel/time/posix-timers.h | 7 +-
lib/vdso/Kconfig | 6 +
lib/vdso/gettimeofday.c | 128 ++++-
mm/mmap.c | 2 +
tools/perf/examples/bpf/5sec.c | 6 +-
tools/testing/selftests/Makefile | 1 +
tools/testing/selftests/timens/.gitignore | 8 +
tools/testing/selftests/timens/Makefile | 7 +
.../selftests/timens/clock_nanosleep.c | 143 ++++++
tools/testing/selftests/timens/config | 1 +
tools/testing/selftests/timens/exec.c | 94 ++++
tools/testing/selftests/timens/gettime_perf.c | 91 ++++
tools/testing/selftests/timens/log.h | 26 +
tools/testing/selftests/timens/procfs.c | 144 ++++++
tools/testing/selftests/timens/timens.c | 185 +++++++
tools/testing/selftests/timens/timens.h | 73 +++
tools/testing/selftests/timens/timer.c | 118 +++++
tools/testing/selftests/timens/timerfd.c | 129 +++++
52 files changed, 2246 insertions(+), 106 deletions(-)
create mode 100644 include/linux/time_namespace.h
create mode 100644 kernel/time/namespace.c
create mode 100644 tools/testing/selftests/timens/.gitignore
create mode 100644 tools/testing/selftests/timens/Makefile
create mode 100644 tools/testing/selftests/timens/clock_nanosleep.c
create mode 100644 tools/testing/selftests/timens/config
create mode 100644 tools/testing/selftests/timens/exec.c
create mode 100644 tools/testing/selftests/timens/gettime_perf.c
create mode 100644 tools/testing/selftests/timens/log.h
create mode 100644 tools/testing/selftests/timens/procfs.c
create mode 100644 tools/testing/selftests/timens/timens.c
create mode 100644 tools/testing/selftests/timens/timens.h
create mode 100644 tools/testing/selftests/timens/timer.c
create mode 100644 tools/testing/selftests/timens/timerfd.c
--
2.23.0
^ permalink raw reply
* Re: [PATCH ghak90 V7 18/21] audit: track container nesting
From: Paul Moore @ 2019-10-11 0:40 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: nhorman, linux-api, containers, LKML, dhowells,
Linux-Audit Mailing List, netfilter-devel, ebiederm, simo, netdev,
linux-fsdevel, Eric Paris, mpatel, Serge Hallyn
In-Reply-To: <a6b00624ac746bc0df9dd0044311b8364374b25b.1568834525.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 9:27 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Track the parent container of a container to be able to filter and
> report nesting.
>
> Now that we have a way to track and check the parent container of a
> container, fixup other patches, or squash all nesting fixes together.
>
> fixup! audit: add container id
> fixup! audit: log drop of contid on exit of last task
> fixup! audit: log container info of syscalls
> fixup! audit: add containerid filtering
> fixup! audit: NETFILTER_PKT: record each container ID associated with a netNS
> fixup! audit: convert to contid list to check for orch/engine ownership softirq (for netfilter) audit: protect contid list lock from softirq
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> include/linux/audit.h | 1 +
> kernel/audit.c | 67 ++++++++++++++++++++++++++++++++++++++++++---------
> kernel/audit.h | 3 +++
> kernel/auditfilter.c | 20 ++++++++++++++-
> kernel/auditsc.c | 2 +-
> 5 files changed, 79 insertions(+), 14 deletions(-)
This is my last comment of the patchset because this is where it
starts to get a little weird. I know we've talked about fixup!
patches some in the past, but perhaps I didn't do a very good job
communicating my poin; let me try again.
Submitting a fixup patch is okay if you've already posted a (lengthy)
patchset and there was a small nit that someone uncovered that needed
to be fixed prior to merging, assuming everyone (this includes the
reviewer, the patch author, and the maintainer) is okay with the
author posting the fix as fixup! patch then go for it. Done this way,
fixup patches can save a lot of development, testing, and review time.
However, in my opinion it is wrong to submit a patchset that has fixup
patches as part of the original posting. In this case fixup patches
have the opposite effect: the patchset becomes more complicated,
reviews take longer, and the likelihood of missing important details
increases.
When in doubt, don't submit separate fixup patches, fold them into the
original patches instead.
--
paul moore
www.paul-moore.com
^ permalink raw reply
* Re: [PATCH ghak90 V7 17/21] audit: add support for loginuid/sessionid set/get by netlink
From: Paul Moore @ 2019-10-11 0:40 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <6cef16c2a019e61e49f4d62497b5ca8dab79b45f.1568834525.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 9:27 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Add the ability to get and set the login uid and to get the session id
> using an audit netlink message using message types AUDIT_GET_LOGINUID
> 1024, AUDIT_SET_LOGINUID 1025 and AUDIT_GET_SESSIONID 1026 in addition
> to using the proc filesystem.
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> include/uapi/linux/audit.h | 3 +++
> kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 65 insertions(+)
This is completely independent of the audit container ID work, yes?
If so, it shouldn't be part of this patchset.
--
paul moore
www.paul-moore.com
^ permalink raw reply
* Re: [PATCH ghak90 V7 16/21] audit: add support for contid set/get by netlink
From: Paul Moore @ 2019-10-11 0:40 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <ea4e8352fd1671f91d1b015a15abee785ea17136.1568834525.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 9:26 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Add the ability to get and set the audit container identifier using an
> audit netlink message using message types AUDIT_SET_CONTID 1023 and
> AUDIT_GET_CONTID 1022 in addition to using the proc filesystem. The
> message format includes the data structure:
>
> struct audit_contid_status {
> pid_t pid;
> u64 id;
> };
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> include/uapi/linux/audit.h | 2 ++
> kernel/audit.c | 40 ++++++++++++++++++++++++++++++++++++++++
> kernel/audit.h | 5 +++++
> 3 files changed, 47 insertions(+)
I'm not a fan of having multiple interfaces to do one thing if it can
be avoided. Presumably the argument for the netlink API is the
container folks don't want to have to mount /proc inside containers
which are going to host nested orchestrators? Can you reasonably run
a fully fledged orchestrator without a valid /proc?
^ permalink raw reply
* Re: [PATCH ghak90 V7 15/21] sched: pull task_is_descendant into kernel/sched/core.c
From: Paul Moore @ 2019-10-11 0:40 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <ff8a73c7841ef788c60f13f90d036b321af0e431.1568834524.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 9:26 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Since the task_is_descendant() function is used in YAMA and in audit,
> pull the function into kernel/core/sched.c
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> include/linux/sched.h | 3 +++
> kernel/audit.c | 33 ---------------------------------
> kernel/sched/core.c | 33 +++++++++++++++++++++++++++++++++
> security/yama/yama_lsm.c | 33 ---------------------------------
> 4 files changed, 36 insertions(+), 66 deletions(-)
I'm not really reviewing this as I'm still a little confused from
patch 14/21, but if 14/21 works out as correct this patch should be
squashed into that one.
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a936d162513a..b251f018f4db 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1988,4 +1988,7 @@ static inline void rseq_syscall(struct pt_regs *regs)
>
> const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
>
> +extern int task_is_descendant(struct task_struct *parent,
> + struct task_struct *child);
> +
> #endif
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 69fe1e9af7cb..4fe7678304dd 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -2560,39 +2560,6 @@ static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> }
>
> /*
> - * task_is_descendant - walk up a process family tree looking for a match
> - * @parent: the process to compare against while walking up from child
> - * @child: the process to start from while looking upwards for parent
> - *
> - * Returns 1 if child is a descendant of parent, 0 if not.
> - */
> -static int task_is_descendant(struct task_struct *parent,
> - struct task_struct *child)
> -{
> - int rc = 0;
> - struct task_struct *walker = child;
> -
> - if (!parent || !child)
> - return 0;
> -
> - rcu_read_lock();
> - if (!thread_group_leader(parent))
> - parent = rcu_dereference(parent->group_leader);
> - while (walker->pid > 0) {
> - if (!thread_group_leader(walker))
> - walker = rcu_dereference(walker->group_leader);
> - if (walker == parent) {
> - rc = 1;
> - break;
> - }
> - walker = rcu_dereference(walker->real_parent);
> - }
> - rcu_read_unlock();
> -
> - return rc;
> -}
> -
> -/*
> * audit_set_contid - set current task's audit contid
> * @task: target task
> * @contid: contid value
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 2b037f195473..7ba9e07381fa 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7509,6 +7509,39 @@ void dump_cpu_task(int cpu)
> }
>
> /*
> + * task_is_descendant - walk up a process family tree looking for a match
> + * @parent: the process to compare against while walking up from child
> + * @child: the process to start from while looking upwards for parent
> + *
> + * Returns 1 if child is a descendant of parent, 0 if not.
> + */
> +int task_is_descendant(struct task_struct *parent,
> + struct task_struct *child)
> +{
> + int rc = 0;
> + struct task_struct *walker = child;
> +
> + if (!parent || !child)
> + return 0;
> +
> + rcu_read_lock();
> + if (!thread_group_leader(parent))
> + parent = rcu_dereference(parent->group_leader);
> + while (walker->pid > 0) {
> + if (!thread_group_leader(walker))
> + walker = rcu_dereference(walker->group_leader);
> + if (walker == parent) {
> + rc = 1;
> + break;
> + }
> + walker = rcu_dereference(walker->real_parent);
> + }
> + rcu_read_unlock();
> +
> + return rc;
> +}
> +
> +/*
> * Nice levels are multiplicative, with a gentle 10% change for every
> * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
> * nice 1, it will get ~10% less CPU time than another CPU-bound task
> diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
> index 94dc346370b1..25eae205eae8 100644
> --- a/security/yama/yama_lsm.c
> +++ b/security/yama/yama_lsm.c
> @@ -263,39 +263,6 @@ static int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3,
> }
>
> /**
> - * task_is_descendant - walk up a process family tree looking for a match
> - * @parent: the process to compare against while walking up from child
> - * @child: the process to start from while looking upwards for parent
> - *
> - * Returns 1 if child is a descendant of parent, 0 if not.
> - */
> -static int task_is_descendant(struct task_struct *parent,
> - struct task_struct *child)
> -{
> - int rc = 0;
> - struct task_struct *walker = child;
> -
> - if (!parent || !child)
> - return 0;
> -
> - rcu_read_lock();
> - if (!thread_group_leader(parent))
> - parent = rcu_dereference(parent->group_leader);
> - while (walker->pid > 0) {
> - if (!thread_group_leader(walker))
> - walker = rcu_dereference(walker->group_leader);
> - if (walker == parent) {
> - rc = 1;
> - break;
> - }
> - walker = rcu_dereference(walker->real_parent);
> - }
> - rcu_read_unlock();
> -
> - return rc;
> -}
> -
> -/**
> * ptracer_exception_found - tracer registered as exception for this tracee
> * @tracer: the task_struct of the process attempting ptrace
> * @tracee: the task_struct of the process to be ptraced
--
paul moore
www.paul-moore.com
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox