public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Willy Tarreau <w@1wt.eu>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Andy Lutomirski <luto@amacapital.net>,
	Andi Kleen <andi@firstfloor.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Ingo Molnar <mingo@kernel.org>,
	Ben Hutchings <ben@decadent.org.uk>, Willy Tarreau <w@1wt.eu>
Subject: [ 05/48] x86_64, switch_to(): Load TLS descriptors before switching DS and ES
Date: Fri, 15 May 2015 10:05:35 +0200	[thread overview]
Message-ID: <20150515080530.510828652@1wt.eu> (raw)
In-Reply-To: <9c2783dfae10ef2d1e9b08bcc1e562c5@local>

2.6.32-longterm review patch.  If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <luto@amacapital.net>

commit f647d7c155f069c1a068030255c300663516420e upstream.

Otherwise, if buggy user code points DS or ES into the TLS
array, they would be corrupted after a context switch.

This also significantly improves the comments and documents some
gotchas in the code.

Before this patch, both tests below failed.  With this
patch, the es test passes, although the gsbase test still fails.

 ----- begin es test -----

/*
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

/*
 * Turn a descriptor-table index into a selector value: the index is
 * shifted left by three bits and the low two bits are set to 3.
 * The result is truncated to 16 bits by the return type.
 */
static unsigned short GDT3(int idx)
{
	const int low_bits = 3;
	int selector = idx << 3;

	selector |= low_bits;
	return selector;
}

/*
 * Install a TLS descriptor through the set_thread_area syscall.
 *
 * idx:  requested entry number (main() passes -1 here).
 * base: base address to place in the descriptor.
 *
 * The descriptor describes a present, writable, 32-bit, grow-up data
 * segment whose limit is 0xfffff pages.  On syscall failure the
 * program exits via err().  Returns the entry_number field as it
 * stands after the syscall.
 */
static int create_tls(int idx, unsigned int base)
{
	struct user_desc tls = { 0 };
	long rc;

	tls.entry_number    = idx;
	tls.base_addr       = base;
	tls.limit           = 0xfffff;
	tls.limit_in_pages  = 1;
	tls.seg_32bit       = 1;
	tls.contents        = 0;	/* Data, grow-up */
	tls.read_exec_only  = 0;
	tls.seg_not_present = 0;
	tls.useable         = 0;

	rc = syscall(SYS_set_thread_area, &tls);
	if (rc != 0)
		err(1, "set_thread_area");

	return tls.entry_number;
}

/*
 * ES test driver.
 *
 * Creates a TLS descriptor, then 1000 times: loads its selector into
 * ES, sleeps briefly (presumably so a context switch can happen in
 * between), reads ES back and restores the original ES value.  Any
 * round in which the value read back differs from the selector that
 * was loaded counts as a failure; only the first one is printed in
 * detail.
 *
 * Returns 0 if ES was preserved in every round, 1 otherwise.
 */
int main()
{
	const int total = 1000;
	int failures = 0;
	unsigned short saved_es, tls_sel;
	int idx;
	int round;

	idx = create_tls(-1, 0);
	printf("Allocated GDT index %d\n", idx);
	tls_sel = GDT3(idx);

	/* Remember the ES value we started with so each round can restore it. */
	asm volatile ("mov %%es,%[sel]" : [sel] "=rm" (saved_es));

	for (round = 0; round < total; round++) {
		unsigned short seen;

		asm volatile ("mov %[sel],%%es" : : [sel] "rm" (tls_sel));
		usleep(100);

		/* Sample ES, then put the original selector back before reporting. */
		asm volatile ("mov %%es,%[sel]" : [sel] "=rm" (seen));
		asm volatile ("mov %[sel],%%es" : : [sel] "rm" (saved_es));

		if (seen == tls_sel)
			continue;

		if (!failures)
			printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
			       tls_sel, seen);
		failures++;
	}

	if (!failures) {
		printf("[OK]\tES was preserved\n");
		return 0;
	}

	printf("[FAIL]\tES was corrupted %d/%d times\n", failures, total);
	return 1;
}

 ----- end es test -----

 ----- begin gsbase test -----

/*
 * gsbase.c, a gsbase test
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

/* Byte that read_gs_testvals() addresses relative to GS. */
static unsigned char *testptr;
/* Second byte; main() sets the GS base to (testptr2 - testptr). */
static unsigned char *testptr2;

/*
 * Read one byte through a GS-relative access to *testptr and return
 * it.  Which byte is actually read therefore depends on the current
 * GS base.
 */
static unsigned char read_gs_testvals(void)
{
	unsigned char val;

	asm volatile ("movb %%gs:%[src], %[dst]"
		      : [dst] "=r" (val)
		      : [src] "m" (*testptr));
	return val;
}

/*
 * gsbase test driver.
 *
 * Maps one byte at each of two fixed addresses, stores 0 in the first
 * and 1 in the second, and sets the GS base (ARCH_SET_GS) to the
 * distance between them, so a GS-relative read of *testptr should
 * yield 1.  It then writes selector 0 to GS and expects GS-relative
 * reads of *testptr to yield 0, both immediately and after a short
 * sleep (presumably long enough for a context switch to occur).
 *
 * Returns 0 if all three checks pass, 1 otherwise; exits via err() if
 * mmap or arch_prctl fails.
 */
int main()
{
	const int prot = PROT_READ | PROT_WRITE;
	const int flags = MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS;
	unsigned long gs_base;
	int failures = 0;

	testptr = mmap((void *)0x200000000UL, 1, prot, flags, -1, 0);
	if (testptr == MAP_FAILED)
		err(1, "mmap");

	testptr2 = mmap((void *)0x300000000UL, 1, prot, flags, -1, 0);
	if (testptr2 == MAP_FAILED)
		err(1, "mmap");

	*testptr = 0;
	*testptr2 = 1;

	gs_base = (unsigned long)testptr2 - (unsigned long)testptr;
	if (syscall(SYS_arch_prctl, ARCH_SET_GS, gs_base) != 0)
		err(1, "ARCH_SET_GS");

	usleep(100);

	/* With the base applied, the GS-relative read should hit *testptr2. */
	if (read_gs_testvals() != 1) {
		printf("[FAIL]\tARCH_SET_GS failed\n");
		failures++;
	} else {
		printf("[OK]\tARCH_SET_GS worked\n");
	}

	asm volatile ("mov %0,%%gs" : : "r" (0));

	/* After loading selector 0 the read should hit *testptr itself. */
	if (read_gs_testvals() != 0) {
		printf("[FAIL]\tWriting 0 to gs failed\n");
		failures++;
	} else {
		printf("[OK]\tWriting 0 to gs worked\n");
	}

	usleep(100);

	/* The sleep must not bring the old base back. */
	if (read_gs_testvals() != 0) {
		printf("[FAIL]\tgsbase was corrupted\n");
		failures++;
	} else {
		printf("[OK]\tgsbase is still zero\n");
	}

	return failures ? 1 : 0;
}

 ----- end gsbase test -----

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
(cherry picked from commit cca3e6170e186ad88c11ee91cfd37d400dcaa9b0)
Signed-off-by: Willy Tarreau <w@1wt.eu>
---
 arch/x86/kernel/process_64.c | 101 +++++++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 39493bc..0b3d98b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -394,24 +394,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (preload_fpu)
 		prefetch(next->xstate);
 
-	/*
-	 * Reload esp0, LDT and the page table pointer:
-	 */
+	/* Reload esp0 and ss1. */
 	load_sp0(tss, next);
 
-	/*
-	 * Switch DS and ES.
-	 * This won't pick up thread selector changes, but I guess that is ok.
-	 */
-	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
-		loadsegment(es, next->es);
-
-	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
-		loadsegment(ds, next->ds);
-
-
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -420,6 +405,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	savesegment(fs, fsindex);
 	savesegment(gs, gsindex);
 
+	/*
+	 * Load TLS before restoring any segments so that segment loads
+	 * reference the correct GDT entries.
+	 */
 	load_TLS(next, cpu);
 
 	/* Must be after DS reload */
@@ -430,38 +419,94 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		clts();
 
 	/*
-	 * Leave lazy mode, flushing any hypercalls made here.
-	 * This must be done before restoring TLS segments so
-	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
-	 * to date.
+	 * Leave lazy mode, flushing any hypercalls made here.  This
+	 * must be done after loading TLS entries in the GDT but before
+	 * loading segments that might reference them, and it must
+	 * be done before math_state_restore, so the TS bit is up to
+	 * date.
 	 */
 	arch_end_context_switch(next_p);
 
+	/* Switch DS and ES.
+	 *
+	 * Reading them only returns the selectors, but writing them (if
+	 * nonzero) loads the full descriptor from the GDT or LDT.  The
+	 * LDT for next is loaded in switch_mm, and the GDT is loaded
+	 * above.
+	 *
+	 * We therefore need to write new values to the segment
+	 * registers on every context switch unless both the new and old
+	 * values are zero.
+	 *
+	 * Note that we don't need to do anything for CS and SS, as
+	 * those are saved and restored as part of pt_regs.
+	 */
+	savesegment(es, prev->es);
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es);
+
+	savesegment(ds, prev->ds);
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
 	/*
 	 * Switch FS and GS.
 	 *
-	 * Segment register != 0 always requires a reload.  Also
-	 * reload when it has changed.  When prev process used 64bit
-	 * base always reload to avoid an information leak.
+	 * These are even more complicated than DS and ES: they have
+	 * 64-bit bases that are controlled by arch_prctl.  Those bases
+	 * only differ from the values in the GDT or LDT if the selector
+	 * is 0.
+	 *
+	 * Loading the segment register resets the hidden base part of
+	 * the register to 0 or the value from the GDT / LDT.  If the
+	 * next base address is zero, writing 0 to the segment register is
+	 * much faster than using wrmsr to explicitly zero the base.
+	 *
+	 * The thread_struct.fs and thread_struct.gs values are 0
+	 * if the fs and gs bases respectively are not overridden
+	 * from the values implied by fsindex and gsindex.  They
+	 * are nonzero, and store the nonzero base addresses, if
+	 * the bases are overridden.
+	 *
+	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+	 * be impossible.
+	 *
+	 * Therefore we need to reload the segment registers if either
+	 * the old or new selector is nonzero, and we need to override
+	 * the base address if next thread expects it to be overridden.
+	 *
+	 * This code is unnecessarily slow in the case where the old and
+	 * new indexes are zero and the new base is nonzero -- it will
+	 * unnecessarily write 0 to the selector before writing the new
+	 * base address.
+	 *
+	 * Note: This all depends on arch_prctl being the only way that
+	 * user code can override the segment base.  Once wrfsbase and
+	 * wrgsbase are enabled, most of this code will need to change.
 	 */
 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
 		loadsegment(fs, next->fsindex);
+
 		/*
-		 * Check if the user used a selector != 0; if yes
-		 *  clear 64bit base, since overloaded base is always
-		 *  mapped to the Null selector
+		 * If user code wrote a nonzero value to FS, then it also
+		 * cleared the overridden base address.
+		 *
+		 * XXX: if user code wrote 0 to FS and cleared the base
+		 * address itself, we won't notice and we'll incorrectly
+		 * restore the prior base address next time we reschedule
+		 * the process.
 		 */
 		if (fsindex)
 			prev->fs = 0;
 	}
-	/* when next process has a 64bit base use it */
 	if (next->fs)
 		wrmsrl(MSR_FS_BASE, next->fs);
 	prev->fsindex = fsindex;
 
 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
 		load_gs_index(next->gsindex);
+
+		/* This works (and fails) the same way as fsindex above. */
 		if (gsindex)
 			prev->gs = 0;
 	}
-- 
1.7.12.2.21.g234cd45.dirty




  parent reply	other threads:[~2015-05-15  8:27 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <9c2783dfae10ef2d1e9b08bcc1e562c5@local>
2015-05-15  8:05 ` [ 00/48] 2.6.32.66-longterm review Willy Tarreau
2015-05-15  8:05 ` [ 01/48] x86/asm/traps: Disable tracing and kprobes in fixup_bad_iret and sync_regs Willy Tarreau
2015-05-15  8:05 ` [ 02/48] x86/tls: Validate TLS entries to protect espfix Willy Tarreau
2015-05-15  8:05 ` [ 03/48] x86, tls, ldt: Stop checking lm in LDT_empty Willy Tarreau
2015-05-15  8:05 ` [ 04/48] x86, tls: Interpret an all-zero struct user_desc as "no segment" Willy Tarreau
2015-05-15  8:05 ` Willy Tarreau [this message]
2015-05-15 12:32   ` [ 05/48] x86_64, switch_to(): Load TLS descriptors before switching DS and ES Ben Hutchings
2015-05-15 13:38     ` Willy Tarreau
2015-05-15 14:25       ` Ben Hutchings
2015-05-15 14:31         ` Ben Hutchings
2015-05-15 14:37         ` Willy Tarreau
2015-05-15 15:53         ` Andi Kleen
2015-05-15 16:48           ` Willy Tarreau
2015-05-15 20:53           ` Ben Hutchings
2015-05-15 22:15             ` Andi Kleen
2015-05-15  8:05 ` [ 06/48] x86/tls: Disallow unusual TLS segments Willy Tarreau
2015-05-15  8:05 ` [ 07/48] x86/tls: Dont validate lm in set_thread_area() after all Willy Tarreau
2015-05-15  8:05 ` [ 08/48] x86, kvm: Clear paravirt_enabled on KVM guests for espfix32s benefit Willy Tarreau
2015-05-15  8:05 ` [ 09/48] x86_64, vdso: Fix the vdso address randomization algorithm Willy Tarreau
2015-05-15 21:02   ` Ben Hutchings
2015-05-15  8:05 ` [ 10/48] ASLR: fix stack randomization on 64-bit systems Willy Tarreau
2015-05-15  8:05 ` [ 11/48] x86, cpu, amd: Add workaround for family 16h, erratum 793 Willy Tarreau
2015-05-15  8:05 ` [ 12/48] x86/asm/entry/64: Remove a bogus ret_from_fork optimization Willy Tarreau
2015-05-15  8:05 ` [ 13/48] x86: Conditionally update time when ack-ing pending irqs Willy Tarreau
2015-05-15  8:05 ` [ 14/48] serial: samsung: wait for transfer completion before clock disable Willy Tarreau
2015-05-15  8:05 ` [ 15/48] splice: Apply generic position and size checks to each write Willy Tarreau
2015-05-15  8:05 ` [ 16/48] netfilter: conntrack: disable generic tracking for known protocols Willy Tarreau
2015-05-15 21:05   ` Ben Hutchings
2015-05-15  8:05 ` [ 17/48] isofs: Fix infinite looping over CE entries Willy Tarreau
2015-05-15  8:05 ` [ 18/48] isofs: Fix unchecked printing of ER records Willy Tarreau
2015-05-15  8:05 ` [ 19/48] net: sctp: fix memory leak in auth key management Willy Tarreau
2015-05-15  8:05 ` [ 20/48] net: sctp: fix slab corruption from use after free on INIT collisions Willy Tarreau
2015-05-15  8:05 ` [ 21/48] IB/uverbs: Prevent integer overflow in ib_umem_get address arithmetic Willy Tarreau
2015-05-15  8:05 ` [ 22/48] net: llc: use correct size for sysctl timeout entries Willy Tarreau
2015-05-15  8:05 ` [ 23/48] net: rds: use correct size for max unacked packets and bytes Willy Tarreau
2015-05-15  8:05 ` [ 24/48] ipv6: Dont reduce hop limit for an interface Willy Tarreau
2015-05-15  8:05 ` [ 25/48] fs: take i_mutex during prepare_binprm for set[ug]id executables Willy Tarreau
2015-05-15  8:05 ` [ 26/48] net:socket: set msg_namelen to 0 if msg_name is passed as NULL in msghdr struct from userland Willy Tarreau
2015-05-15 21:08   ` Ben Hutchings
2015-05-16  5:31     ` Willy Tarreau
2015-05-15  8:05 ` [ 27/48] ppp: deflate: never return len larger than output buffer Willy Tarreau
2015-05-15  8:05 ` [ 29/48] net: reject creation of netdev names with colons Willy Tarreau
2015-05-15  8:06 ` [ 30/48] ipv4: Dont use ufo handling on later transformed packets Willy Tarreau
2015-05-15  8:06 ` [ 31/48] udp: only allow UFO for packets from SOCK_DGRAM sockets Willy Tarreau
2015-05-15  8:06 ` [ 32/48] net: avoid to hang up on sending due to sysctl configuration overflow Willy Tarreau
2015-05-15  8:06 ` [ 33/48] net: sysctl_net_core: check SNDBUF and RCVBUF for min length Willy Tarreau
2015-05-15  8:06 ` [ 34/48] rds: avoid potential stack overflow Willy Tarreau
2015-05-15  8:06 ` [ 35/48] rxrpc: bogus MSG_PEEK test in rxrpc_recvmsg() Willy Tarreau
2015-05-15  8:06 ` [ 36/48] tcp: make connect() mem charging friendly Willy Tarreau
2015-05-15  8:06 ` [ 37/48] ip_forward: Drop frames with attached skb->sk Willy Tarreau
2015-05-15  8:06 ` [ 38/48] tcp: avoid looping in tcp_send_fin() Willy Tarreau
2015-05-15  8:06 ` [ 39/48] spi: spidev: fix possible arithmetic overflow for multi-transfer message Willy Tarreau
2015-05-15  8:06 ` [ 40/48] IB/core: Avoid leakage from kernel to user space Willy Tarreau
2015-05-15  8:06 ` [ 41/48] ipvs: uninitialized data with IP_VS_IPV6 Willy Tarreau
2015-05-15  8:06 ` [ 42/48] ipv4: fix nexthop attlen check in fib_nh_match Willy Tarreau
2015-05-15  8:06 ` [ 43/48] pagemap: do not leak physical addresses to non-privileged userspace Willy Tarreau
2015-05-15  8:06 ` [ 44/48] lockd: Try to reconnect if statd has moved Willy Tarreau
2015-05-15  8:06 ` [ 45/48] scsi: Fix error handling in SCSI_IOCTL_SEND_COMMAND Willy Tarreau
2015-05-15  8:06 ` [ 46/48] posix-timers: Fix stack info leak in timer_create() Willy Tarreau
2015-05-15  8:06 ` [ 47/48] hfsplus: fix B-tree corruption after insertion at position 0 Willy Tarreau
2015-05-15  8:06 ` [ 48/48] sound/oss: fix deadlock in sequencer_ioctl(SNDCTL_SEQ_OUTOFBAND) Willy Tarreau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150515080530.510828652@1wt.eu \
    --to=w@1wt.eu \
    --cc=andi@firstfloor.org \
    --cc=ben@decadent.org.uk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox