linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Andy Lutomirski <luto@amacapital.net>,
	Andi Kleen <andi@firstfloor.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Ingo Molnar <mingo@kernel.org>
Subject: [PATCH 3.10 04/38] x86_64, switch_to(): Load TLS descriptors before switching DS and ES
Date: Tue,  6 Jan 2015 17:50:16 -0800	[thread overview]
Message-ID: <20150107014953.189717451@linuxfoundation.org> (raw)
In-Reply-To: <20150107014952.440109372@linuxfoundation.org>

3.10-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <luto@amacapital.net>

commit f647d7c155f069c1a068030255c300663516420e upstream.

Otherwise, if buggy user code points DS or ES into the TLS
array, they would be corrupted after a context switch.

This also significantly improves the comments and documents some
gotchas in the code.

Before this patch, the both tests below failed.  With this
patch, the es test passes, although the gsbase test still fails.

 ----- begin es test -----

/*
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

static unsigned short GDT3(int idx)
{
	return (idx << 3) | 3;
}

static int create_tls(int idx, unsigned int base)
{
	struct user_desc desc = {
		.entry_number    = idx,
		.base_addr       = base,
		.limit           = 0xfffff,
		.seg_32bit       = 1,
		.contents        = 0, /* Data, grow-up */
		.read_exec_only  = 0,
		.limit_in_pages  = 1,
		.seg_not_present = 0,
		.useable         = 0,
	};

	if (syscall(SYS_set_thread_area, &desc) != 0)
		err(1, "set_thread_area");

	return desc.entry_number;
}

int main()
{
	int idx = create_tls(-1, 0);
	printf("Allocated GDT index %d\n", idx);

	unsigned short orig_es;
	asm volatile ("mov %%es,%0" : "=rm" (orig_es));

	int errors = 0;
	int total = 1000;
	for (int i = 0; i < total; i++) {
		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
		usleep(100);

		unsigned short es;
		asm volatile ("mov %%es,%0" : "=rm" (es));
		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
		if (es != GDT3(idx)) {
			if (errors == 0)
				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
				       GDT3(idx), es);
			errors++;
		}
	}

	if (errors) {
		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
		return 1;
	} else {
		printf("[OK]\tES was preserved\n");
		return 0;
	}
}

 ----- end es test -----

 ----- begin gsbase test -----

/*
 * gsbase.c, a gsbase test
 * Copyright (c) 2014 Andy Lutomirski
 * GPL v2
 */

static unsigned char *testptr, *testptr2;

static unsigned char read_gs_testvals(void)
{
	unsigned char ret;
	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
	return ret;
}

int main()
{
	int errors = 0;

	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr == MAP_FAILED)
		err(1, "mmap");

	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
	if (testptr2 == MAP_FAILED)
		err(1, "mmap");

	*testptr = 0;
	*testptr2 = 1;

	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
		err(1, "ARCH_SET_GS");

	usleep(100);

	if (read_gs_testvals() == 1) {
		printf("[OK]\tARCH_SET_GS worked\n");
	} else {
		printf("[FAIL]\tARCH_SET_GS failed\n");
		errors++;
	}

	asm volatile ("mov %0,%%gs" : : "r" (0));

	if (read_gs_testvals() == 0) {
		printf("[OK]\tWriting 0 to gs worked\n");
	} else {
		printf("[FAIL]\tWriting 0 to gs failed\n");
		errors++;
	}

	usleep(100);

	if (read_gs_testvals() == 0) {
		printf("[OK]\tgsbase is still zero\n");
	} else {
		printf("[FAIL]\tgsbase was corrupted\n");
		errors++;
	}

	return errors == 0 ? 0 : 1;
}

 ----- end gsbase test -----

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/x86/kernel/process_64.c |  101 +++++++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 28 deletions(-)

--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -279,24 +279,9 @@ __switch_to(struct task_struct *prev_p,
 
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-	/*
-	 * Reload esp0, LDT and the page table pointer:
-	 */
+	/* Reload esp0 and ss1. */
 	load_sp0(tss, next);
 
-	/*
-	 * Switch DS and ES.
-	 * This won't pick up thread selector changes, but I guess that is ok.
-	 */
-	savesegment(es, prev->es);
-	if (unlikely(next->es | prev->es))
-		loadsegment(es, next->es);
-
-	savesegment(ds, prev->ds);
-	if (unlikely(next->ds | prev->ds))
-		loadsegment(ds, next->ds);
-
-
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -305,41 +290,101 @@ __switch_to(struct task_struct *prev_p,
 	savesegment(fs, fsindex);
 	savesegment(gs, gsindex);
 
+	/*
+	 * Load TLS before restoring any segments so that segment loads
+	 * reference the correct GDT entries.
+	 */
 	load_TLS(next, cpu);
 
 	/*
-	 * Leave lazy mode, flushing any hypercalls made here.
-	 * This must be done before restoring TLS segments so
-	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
-	 * to date.
+	 * Leave lazy mode, flushing any hypercalls made here.  This
+	 * must be done after loading TLS entries in the GDT but before
+	 * loading segments that might reference them, and and it must
+	 * be done before math_state_restore, so the TS bit is up to
+	 * date.
 	 */
 	arch_end_context_switch(next_p);
 
+	/* Switch DS and ES.
+	 *
+	 * Reading them only returns the selectors, but writing them (if
+	 * nonzero) loads the full descriptor from the GDT or LDT.  The
+	 * LDT for next is loaded in switch_mm, and the GDT is loaded
+	 * above.
+	 *
+	 * We therefore need to write new values to the segment
+	 * registers on every context switch unless both the new and old
+	 * values are zero.
+	 *
+	 * Note that we don't need to do anything for CS and SS, as
+	 * those are saved and restored as part of pt_regs.
+	 */
+	savesegment(es, prev->es);
+	if (unlikely(next->es | prev->es))
+		loadsegment(es, next->es);
+
+	savesegment(ds, prev->ds);
+	if (unlikely(next->ds | prev->ds))
+		loadsegment(ds, next->ds);
+
 	/*
 	 * Switch FS and GS.
 	 *
-	 * Segment register != 0 always requires a reload.  Also
-	 * reload when it has changed.  When prev process used 64bit
-	 * base always reload to avoid an information leak.
+	 * These are even more complicated than FS and GS: they have
+	 * 64-bit bases are that controlled by arch_prctl.  Those bases
+	 * only differ from the values in the GDT or LDT if the selector
+	 * is 0.
+	 *
+	 * Loading the segment register resets the hidden base part of
+	 * the register to 0 or the value from the GDT / LDT.  If the
+	 * next base address zero, writing 0 to the segment register is
+	 * much faster than using wrmsr to explicitly zero the base.
+	 *
+	 * The thread_struct.fs and thread_struct.gs values are 0
+	 * if the fs and gs bases respectively are not overridden
+	 * from the values implied by fsindex and gsindex.  They
+	 * are nonzero, and store the nonzero base addresses, if
+	 * the bases are overridden.
+	 *
+	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
+	 * be impossible.
+	 *
+	 * Therefore we need to reload the segment registers if either
+	 * the old or new selector is nonzero, and we need to override
+	 * the base address if next thread expects it to be overridden.
+	 *
+	 * This code is unnecessarily slow in the case where the old and
+	 * new indexes are zero and the new base is nonzero -- it will
+	 * unnecessarily write 0 to the selector before writing the new
+	 * base address.
+	 *
+	 * Note: This all depends on arch_prctl being the only way that
+	 * user code can override the segment base.  Once wrfsbase and
+	 * wrgsbase are enabled, most of this code will need to change.
 	 */
 	if (unlikely(fsindex | next->fsindex | prev->fs)) {
 		loadsegment(fs, next->fsindex);
+
 		/*
-		 * Check if the user used a selector != 0; if yes
-		 *  clear 64bit base, since overloaded base is always
-		 *  mapped to the Null selector
+		 * If user code wrote a nonzero value to FS, then it also
+		 * cleared the overridden base address.
+		 *
+		 * XXX: if user code wrote 0 to FS and cleared the base
+		 * address itself, we won't notice and we'll incorrectly
+		 * restore the prior base address next time we reschdule
+		 * the process.
 		 */
 		if (fsindex)
 			prev->fs = 0;
 	}
-	/* when next process has a 64bit base use it */
 	if (next->fs)
 		wrmsrl(MSR_FS_BASE, next->fs);
 	prev->fsindex = fsindex;
 
 	if (unlikely(gsindex | next->gsindex | prev->gs)) {
 		load_gs_index(next->gsindex);
+
+		/* This works (and fails) the same way as fsindex above. */
 		if (gsindex)
 			prev->gs = 0;
 	}



  parent reply	other threads:[~2015-01-07  1:52 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-07  1:50 [PATCH 3.10 00/38] 3.10.64-stable review Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 01/38] isofs: Fix infinite looping over CE entries Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 02/38] x86/tls: Validate TLS entries to protect espfix Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 03/38] x86/tls: Disallow unusual TLS segments Greg Kroah-Hartman
2015-01-07  1:50 ` Greg Kroah-Hartman [this message]
2015-01-07  1:50 ` [PATCH 3.10 05/38] x86, kvm: Clear paravirt_enabled on KVM guests for espfix32s benefit Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 06/38] md/bitmap: always wait for writes on unplug Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 07/38] mfd: tc6393xb: Fail ohci suspend if full state restore is required Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 08/38] mmc: block: add newline to sysfs display of force_ro Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 09/38] megaraid_sas: corrected return of wait_event from abort frame path Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 10/38] nfs41: fix nfs4_proc_layoutget error handling Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 11/38] dm bufio: fix memleak when using a dm_buffers inline bio Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 12/38] dm space map metadata: fix sm_bootstrap_get_nr_blocks() Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 13/38] x86/tls: Dont validate lm in set_thread_area() after all Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 14/38] isofs: Fix unchecked printing of ER records Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 15/38] KEYS: Fix stale key registration at error path Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 17/38] mac80211: free management frame keys when removing station Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 18/38] mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 19/38] mnt: Update unprivileged remount test Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 20/38] umount: Disallow unprivileged mount force Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 21/38] groups: Consolidate the setgroups permission checks Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 22/38] userns: Document what the invariant required for safe unprivileged mappings Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 23/38] userns: Dont allow setgroups until a gid mapping has been setablished Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 24/38] userns: Dont allow unprivileged creation of gid mappings Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 25/38] userns: Check euid no fsuid when establishing an unprivileged uid mapping Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 26/38] userns: Only allow the creator of the userns unprivileged mappings Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 27/38] userns: Rename id_map_mutex to userns_state_mutex Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 28/38] userns: Add a knob to disable setgroups on a per user namespace basis Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 29/38] userns: Allow setting gid_maps without privilege when setgroups is disabled Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 30/38] userns: Unbreak the unprivileged remount tests Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 31/38] crypto: af_alg - fix backlog handling Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 32/38] ncpfs: return proper error from NCP_IOC_SETROOT ioctl Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 33/38] exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 34/38] udf: Verify symlink size before loading it Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 35/38] eCryptfs: Force RO mount when encrypted view is enabled Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 36/38] eCryptfs: Remove buggy and unnecessary write in file name decode routine Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 37/38] Btrfs: do not move em to modified list when unpinning Greg Kroah-Hartman
2015-01-07  1:50 ` [PATCH 3.10 38/38] Btrfs: fix fs corruption on transaction abort if device supports discard Greg Kroah-Hartman
2015-01-07 13:38 ` [PATCH 3.10 00/38] 3.10.64-stable review Guenter Roeck
2015-01-07 23:33 ` Shuah Khan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150107014953.189717451@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=andi@firstfloor.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@kernel.org \
    --cc=stable@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).