From: Dmitry Safonov <dima@arista.com>
To: linux-kernel@vger.kernel.org
Cc: Dmitry Safonov <dima@arista.com>, Adrian Reber <adrian@lisas.de>,
Andrei Vagin <avagin@openvz.org>, Andrei Vagin <avagin@gmail.com>,
Andy Lutomirski <luto@kernel.org>,
Andy Tucker <agtucker@google.com>, Arnd Bergmann <arnd@arndb.de>,
Christian Brauner <christian.brauner@ubuntu.com>,
Cyrill Gorcunov <gorcunov@openvz.org>,
Dmitry Safonov <0x7f454c46@gmail.com>,
"Eric W. Biederman" <ebiederm@xmission.com>,
"H. Peter Anvin" <hpa@zytor.com>, Ingo Molnar <mingo@redhat.com>,
Jeff Dike <jdike@addtoit.com>, Oleg Nesterov <oleg@redhat.com>,
Pavel Emelyanov <xemul@virtuozzo.com>,
Shuah Khan <shuah@kernel.org>,
Thomas Gleixner <tglx@linutronix.de>,
containers@lists.linux-foundation.org, criu@openvz.org,
linux-api@vger.kernel.org, x86@kernel.org
Subject: [PATCH 21/32] x86/vdso: Switch image on setns()/unshare()/clone()
Date: Wed, 6 Feb 2019 00:10:55 +0000 [thread overview]
Message-ID: <20190206001107.16488-22-dima@arista.com> (raw)
In-Reply-To: <20190206001107.16488-1-dima@arista.com>
As discussed on the timens RFC, adding a new conditional branch
`if (inside_time_ns)` to the VDSO for all processes is undesirable.
It would add a penalty for everybody, as the branch predictor may mispredict
the jump. Also, instruction cache lines would be wasted on cmp/jmp.
Those effects of introducing the time namespace are very much unwanted,
bearing in mind how much work has been spent on micro-optimising the
vdso code.
To address those problems, there are two versions of the VDSO's .so:
one for host tasks (without any penalty) and one for processes inside a time
namespace, with clk_to_ns() that subtracts offsets from the host's time.
Whenever a user does setns()/unshare() or clone() with CLONE_TIMENS,
change the VDSO image in the mm and zap the existing VVAR/VDSO page tables.
They will be re-faulted with the corresponding image and VVAR offsets.
Co-developed-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
arch/x86/entry/vdso/vma.c | 81 +++++++++++++++++++++++++++++++++++++
arch/x86/include/asm/vdso.h | 1 +
kernel/time_namespace.c | 11 +++++
3 files changed, 93 insertions(+)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 56a62076a320..52c1e4c24455 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -25,6 +25,7 @@
#include <asm/cpufeature.h>
#include <asm/mshyperv.h>
#include <asm/page.h>
+#include <asm/tlb.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
@@ -150,6 +151,84 @@ static const struct vm_special_mapping vvar_mapping = {
.fault = vvar_fault,
};
+#ifdef CONFIG_TIME_NS
+static const struct vdso_image *timens_vdso(const struct vdso_image *old_img,
+ bool in_ns)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (old_img == &vdso_image_x32)
+ return NULL;
+#endif
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ if (old_img == &vdso_image_32 || old_img == &vdso_image_32_timens)
+ return in_ns ? &vdso_image_32_timens : &vdso_image_32;
+#endif
+#ifdef CONFIG_X86_64
+ if (old_img == &vdso_image_64 || old_img == &vdso_image_64_timens)
+ return in_ns ? &vdso_image_64_timens : &vdso_image_64;
+#endif
+ return NULL;
+}
+
+static const struct vdso_image *image_to_timens(const struct vdso_image *img)
+{
+ bool in_ns = (current->nsproxy->time_ns != &init_time_ns);
+ const struct vdso_image *ns;
+
+ ns = timens_vdso(img, in_ns);
+
+ return ns ?: img;
+}
+
+int vdso_join_timens(struct task_struct *task, bool inside_ns)
+{
+ const struct vdso_image *new_image, *old_image;
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ int ret = 0;
+
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
+ old_image = mm->context.vdso_image;
+ new_image = timens_vdso(old_image, inside_ns);
+ if (!new_image) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* Sanity checks, shouldn't happen */
+ if (unlikely(old_image->size != new_image->size)) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ mm->context.vdso_image = new_image;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (vma_is_special_mapping(vma, &vvar_mapping))
+ zap_page_range(vma, vma->vm_start, size);
+ if (vma_is_special_mapping(vma, &vdso_mapping))
+ zap_page_range(vma, vma->vm_start, size);
+ }
+
+out:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+#else /* CONFIG_TIME_NS */
+static const struct vdso_image *image_to_timens(const struct vdso_image *img)
+{
+ return img;
+}
+int vdso_join_timens(struct task_struct *task, bool inside_ns)
+{
+ return -ENXIO;
+}
+#endif
+
/*
* Add vdso and vvar mappings to current process.
* @image - blob to map
@@ -165,6 +244,8 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
+ image = image_to_timens(image);
+
addr = get_unmapped_area(NULL, addr,
image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index b6a1a028ac62..c8db853344a0 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -51,6 +51,7 @@ extern const struct vdso_image vdso_image_32_timens;
extern void __init init_vdso_image(const struct vdso_image *image);
extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+extern int vdso_join_timens(struct task_struct *task, bool inside_ns);
#endif /* __ASSEMBLER__ */
diff --git a/kernel/time_namespace.c b/kernel/time_namespace.c
index 36b31f234472..1d1d1c023ec1 100644
--- a/kernel/time_namespace.c
+++ b/kernel/time_namespace.c
@@ -14,6 +14,7 @@
#include <linux/proc_ns.h>
#include <linux/sched/task.h>
#include <linux/mm.h>
+#include <asm/vdso.h>
static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
{
@@ -155,11 +156,16 @@ static void timens_put(struct ns_common *ns)
static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
{
struct time_namespace *ns = to_time_ns(new);
+ int ret;
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
+ ret = vdso_join_timens(current, ns != &init_time_ns);
+ if (ret)
+ return ret;
+
get_time_ns(ns);
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
@@ -174,10 +180,15 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
+ int ret;
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
return 0;
+ ret = vdso_join_timens(tsk, ns != &init_time_ns);
+ if (ret)
+ return ret;
+
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
--
2.20.1
next prev parent reply other threads:[~2019-02-06 0:10 UTC|newest]
Thread overview: 52+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-06 0:10 [PATCH 00/32] kernel: Introduce Time Namespace Dmitry Safonov
2019-02-06 0:10 ` [PATCH 01/32] ns: " Dmitry Safonov
2019-02-06 0:10 ` [PATCH 02/32] timens: Add timens_offsets Dmitry Safonov
2019-02-06 0:10 ` [PATCH 03/32] timens: Introduce CLOCK_MONOTONIC offsets Dmitry Safonov
2019-02-07 21:40 ` Thomas Gleixner
2019-02-08 9:02 ` Andrei Vagin
2019-02-08 9:46 ` Thomas Gleixner
2019-02-06 0:10 ` [PATCH 04/32] timens: Introduce CLOCK_BOOTTIME offset Dmitry Safonov
2019-02-06 0:10 ` [PATCH 05/32] timerfd/timens: Take into account ns clock offsets Dmitry Safonov
2019-02-06 8:52 ` Cyrill Gorcunov
2019-02-06 8:55 ` Cyrill Gorcunov
2019-02-07 6:38 ` Andrei Vagin
2019-02-06 0:10 ` [PATCH 06/32] posix-timers/timens: Take into account " Dmitry Safonov
2019-02-06 0:10 ` [PATCH 07/32] timens/kernel: Take into account timens clock offsets in clock_nanosleep Dmitry Safonov
2019-02-08 7:56 ` Thomas Gleixner
2019-02-06 0:10 ` [PATCH 08/32] timens: Shift /proc/uptime Dmitry Safonov
2019-02-06 0:10 ` [PATCH 09/32] x86/vdso2c: Correct err messages on file opening Dmitry Safonov
2019-02-06 0:10 ` [PATCH 10/32] x86/vdso2c: Convert iterator to unsigned Dmitry Safonov
2019-02-06 0:10 ` [PATCH 11/32] x86/vdso/Makefile: Add vobjs32 Dmitry Safonov
2019-02-06 0:10 ` [PATCH 12/32] x86/vdso/timens: Add offsets page in vvar Dmitry Safonov
2019-02-06 0:10 ` [PATCH 13/32] x86/vdso: Build timens .so(s) Dmitry Safonov
2019-02-06 0:10 ` [PATCH 14/32] x86/VDSO: Build VDSO with -ffunction-sections Dmitry Safonov
2019-02-06 0:10 ` [PATCH 15/32] x86/vdso2c: Optionally produce linker script for vdso entries Dmitry Safonov
2019-02-06 0:10 ` [PATCH 16/32] x86/vdso: Generate vdso{,32}-timens.lds Dmitry Safonov
2019-02-07 8:31 ` Rasmus Villemoes
2019-02-07 16:11 ` Dmitry Safonov
2019-02-08 9:57 ` Thomas Gleixner
2019-02-08 9:57 ` Thomas Gleixner
2019-02-08 15:18 ` Dmitry Safonov
2019-02-08 15:18 ` Dmitry Safonov
2019-03-27 18:00 ` Andrei Vagin
2019-03-27 18:00 ` Andrei Vagin
2019-03-27 18:06 ` [PATCH RFC] x86/asm: Introduce static_retcall(s) Andrei Vagin
2019-03-27 18:06 ` Andrei Vagin
2019-03-27 18:06 ` [PATCH RFC] vdso: introduce timens_static_branch Andrei Vagin
2019-03-27 18:06 ` Andrei Vagin
2019-02-06 0:10 ` [PATCH 17/32] x86/vdso2c: Sort vdso entries by addresses for linker script Dmitry Safonov
2019-02-06 0:10 ` [PATCH 18/32] x86/vdso.lds: Align !timens (host's) vdso.so entries Dmitry Safonov
2019-02-06 0:10 ` [PATCH 19/32] x86/vdso2c: Align LOCAL symbols between vdso{-timens,}.so Dmitry Safonov
2019-02-06 0:10 ` [PATCH 20/32] x86/vdso: Initialize timens 64-bit vdso Dmitry Safonov
2019-02-06 0:10 ` Dmitry Safonov [this message]
2019-02-06 0:10 ` [PATCH 22/32] timens: Add align for timens_offsets Dmitry Safonov
2019-02-06 0:10 ` [PATCH 23/32] timens/fs/proc: Introduce /proc/pid/timens_offsets Dmitry Safonov
2019-02-06 0:10 ` [PATCH 24/32] selftest/timens: Add Time Namespace test for supported clocks Dmitry Safonov
2019-02-06 0:10 ` [PATCH 25/32] selftest/timens: Add a test for timerfd Dmitry Safonov
2019-02-06 0:11 ` [PATCH 26/32] selftest/timens: Add a test for clock_nanosleep() Dmitry Safonov
2019-02-06 0:11 ` [PATCH 27/32] selftest/timens: Add procfs selftest Dmitry Safonov
2019-02-06 0:11 ` [PATCH 28/32] selftest/timens: Add timer offsets test Dmitry Safonov
2019-02-06 0:11 ` [PATCH 29/32] selftests: Add a simple perf test for clock_gettime() Dmitry Safonov
2019-02-06 0:11 ` [PATCH 30/32] selftest/timens: Check that a right vdso is mapped after fork and exec Dmitry Safonov
2019-02-06 0:11 ` [PATCH 31/32] x86/vdso: Align VDSO functions by CPU L1 cache line Dmitry Safonov
2019-02-06 0:11 ` [PATCH 32/32] x86/vdso: Restrict splitting VVAR VMA Dmitry Safonov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190206001107.16488-22-dima@arista.com \
--to=dima@arista.com \
--cc=0x7f454c46@gmail.com \
--cc=adrian@lisas.de \
--cc=agtucker@google.com \
--cc=arnd@arndb.de \
--cc=avagin@gmail.com \
--cc=avagin@openvz.org \
--cc=christian.brauner@ubuntu.com \
--cc=containers@lists.linux-foundation.org \
--cc=criu@openvz.org \
--cc=ebiederm@xmission.com \
--cc=gorcunov@openvz.org \
--cc=hpa@zytor.com \
--cc=jdike@addtoit.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=luto@kernel.org \
--cc=mingo@redhat.com \
--cc=oleg@redhat.com \
--cc=shuah@kernel.org \
--cc=tglx@linutronix.de \
--cc=x86@kernel.org \
--cc=xemul@virtuozzo.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.