* [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
@ 2004-01-29 2:46 john stultz
2004-01-29 5:06 ` Ulrich Drepper
2004-01-31 0:17 ` Eric W. Biederman
0 siblings, 2 replies; 42+ messages in thread
From: john stultz @ 2004-01-29 2:46 UTC (permalink / raw)
To: lkml; +Cc: Andi Kleen, andrea, Joel Becker, Wim Coekaerts, Chris McDermott
[-- Attachment #1: Type: text/plain, Size: 25856 bytes --]
All,
This is my port of the x86-64 vsyscall gettimeofday code to
i386. This patch moves gettimeofday into userspace, so it can be called
without the syscall overhead, greatly improving performance. This is
important for any application, like a database, which heavily uses
gettimeofday for timestamping. It supports both the TSC and IBM x44X
cyclone time source.
Example performance gain: (vs. int80)
Normal gettimeofday
gettimeofday ( 1665576us / 1000000runs ) = 1.665574us
vsyscall LD_PRELOAD gettimeofday
gettimeofday ( 868378us / 1000000runs ) = 0.868377us
This patch becomes especially important with the introduction of the
4G/4G split, as there the syscall overhead is greatly increased.
Example gain w/ 4/4g split: (vs. int80)
Normal gettimeofday
gettimeofday ( 7210630us / 1000000runs ) = 7.210623us
vsyscall LD_PRELOAD gettimeofday
gettimeofday ( 844855us / 1000000runs ) = 0.844854us
Also attached is an example test program which generated the numbers
above, and shows how to use vsyscall-gtod via LD_PRELOAD. Ideally glibc
would support this, as it does vsyscall-sysenter.
Please let me know if you have any comments or suggestions.
New in this patch (B0 -> B1):
o Cleaned up 4/4 split code, so no additional patch is needed.
o Fixed permissions on fixmapped cyclone page
o Improved alternate_instruction workaround
o Use NTP variables to avoid related time inconsistencies
o minor code cleanups
thanks
-john
diff -Nru a/arch/i386/Kconfig b/arch/i386/Kconfig
--- a/arch/i386/Kconfig Tue Jan 27 19:26:21 2004
+++ b/arch/i386/Kconfig Tue Jan 27 19:26:21 2004
@@ -416,6 +416,10 @@
config HPET_EMULATE_RTC
def_bool HPET_TIMER && RTC=y
+config VSYSCALL_GTOD
+ depends on EXPERIMENTAL
+ bool "VSYSCALL gettimeofday() interface"
+
config SMP
bool "Symmetric multi-processing support"
---help---
diff -Nru a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/Makefile Tue Jan 27 19:26:21 2004
@@ -31,6 +31,7 @@
obj-$(CONFIG_ACPI_SRAT) += srat.o
obj-$(CONFIG_HPET_TIMER) += time_hpet.o
obj-$(CONFIG_EFI) += efi.o efi_stub.o
+obj-$(CONFIG_VSYSCALL_GTOD) += vsyscall-gtod.o
EXTRA_AFLAGS := -traditional
diff -Nru a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
--- a/arch/i386/kernel/setup.c Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/setup.c Tue Jan 27 19:26:21 2004
@@ -47,6 +47,7 @@
#include <asm/sections.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
+#include <asm/vsyscall-gtod.h>
#include "setup_arch_pre.h"
#include "mach_resources.h"
@@ -1142,6 +1143,7 @@
conswitchp = &dummy_con;
#endif
#endif
+ vsyscall_init();
}
#include "setup_arch_post.h"
diff -Nru a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
--- a/arch/i386/kernel/time.c Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/time.c Tue Jan 27 19:26:21 2004
@@ -393,5 +393,8 @@
cur_timer = select_timer();
printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+ /* set vsyscall to use selected time source */
+ vsyscall_set_timesource(cur_timer->name);
+
time_init_hook();
}
diff -Nru a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c
--- a/arch/i386/kernel/timers/timer.c Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/timers/timer.c Tue Jan 27 19:26:21 2004
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <asm/timer.h>
+#include <asm/vsyscall-gtod.h>
#ifdef CONFIG_HPET_TIMER
/*
@@ -41,6 +42,9 @@
void clock_fallback(void)
{
cur_timer = &timer_pit;
+
+ /* set vsyscall to use selected time source */
+ vsyscall_set_timesource(cur_timer->name);
}
/* iterates through the list of timers, returning the first
diff -Nru a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c
--- a/arch/i386/kernel/timers/timer_cyclone.c Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/timers/timer_cyclone.c Tue Jan 27 19:26:21 2004
@@ -21,18 +21,24 @@
extern spinlock_t i8253_lock;
/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
+int cyclone_delay_at_last_interrupt;
+
+/* FIXMAP flag */
+#ifdef CONFIG_VSYSCALL_GTOD
+#define PAGE_CYCLONE PAGE_KERNEL_VSYSCALL_NOCACHE
+#else
+#define PAGE_CYCLONE PAGE_KERNEL_NOCACHE
+#endif
#define CYCLONE_CBAR_ADDR 0xFEB00CD0
#define CYCLONE_PMCC_OFFSET 0x51A0
#define CYCLONE_MPMC_OFFSET 0x51D0
#define CYCLONE_MPCS_OFFSET 0x51A8
-#define CYCLONE_TIMER_FREQ 100000000
#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
int use_cyclone = 0;
-static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
-static u32 last_cyclone_low;
+u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
+u32 last_cyclone_low;
static u32 last_cyclone_high;
static unsigned long long monotonic_base;
static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
@@ -57,7 +63,7 @@
spin_lock(&i8253_lock);
read_cyclone_counter(last_cyclone_low,last_cyclone_high);
- /* read values for delay_at_last_interrupt */
+ /* read values for cyclone_delay_at_last_interrupt */
outb_p(0x00, 0x43); /* latch the count ASAP */
count = inb_p(0x40); /* read the latched count */
@@ -67,7 +73,7 @@
/* lost tick compensation */
delta = last_cyclone_low - delta;
delta /= (CYCLONE_TIMER_FREQ/1000000);
- delta += delay_at_last_interrupt;
+ delta += cyclone_delay_at_last_interrupt;
lost = delta/(1000000/HZ);
delay = delta%(1000000/HZ);
if (lost >= 2)
@@ -78,16 +84,16 @@
monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
write_sequnlock(&monotonic_lock);
- /* calculate delay_at_last_interrupt */
+ /* calculate cyclone_delay_at_last_interrupt */
count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
+ cyclone_delay_at_last_interrupt = (count + LATCH/2) / LATCH;
/* catch corner case where tick rollover occured
* between cyclone and pit reads (as noted when
* usec delta is > 90% # of usecs/tick)
*/
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
+ if (lost && abs(delay - cyclone_delay_at_last_interrupt) > (900000/HZ))
jiffies_64++;
}
@@ -96,7 +102,7 @@
u32 offset;
if(!cyclone_timer)
- return delay_at_last_interrupt;
+ return cyclone_delay_at_last_interrupt;
/* Read the cyclone timer */
offset = cyclone_timer[0];
@@ -109,7 +115,7 @@
offset = offset/(CYCLONE_TIMER_FREQ/1000000);
/* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + offset;
+ return cyclone_delay_at_last_interrupt + offset;
}
static unsigned long long monotonic_clock_cyclone(void)
@@ -193,7 +199,7 @@
/* map in cyclone_timer */
pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
- set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
+ __set_fixmap(FIX_CYCLONE_TIMER, pageaddr, PAGE_CYCLONE);
cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
if(!cyclone_timer){
printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
diff -Nru a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
--- a/arch/i386/kernel/timers/timer_tsc.c Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/timers/timer_tsc.c Tue Jan 27 19:26:21 2004
@@ -33,7 +33,7 @@
static int use_tsc;
/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
+int tsc_delay_at_last_interrupt;
static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
@@ -104,7 +104,7 @@
"0" (eax));
/* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + edx;
+ return tsc_delay_at_last_interrupt + edx;
}
static unsigned long long monotonic_clock_tsc(void)
@@ -223,7 +223,7 @@
"0" (eax));
delta = edx;
}
- delta += delay_at_last_interrupt;
+ delta += tsc_delay_at_last_interrupt;
lost = delta/(1000000/HZ);
delay = delta%(1000000/HZ);
if (lost >= 2) {
@@ -244,15 +244,15 @@
monotonic_base += cycles_2_ns(this_offset - last_offset);
write_sequnlock(&monotonic_lock);
- /* calculate delay_at_last_interrupt */
+ /* calculate tsc_delay_at_last_interrupt */
count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
+ tsc_delay_at_last_interrupt = (count + LATCH/2) / LATCH;
/* catch corner case where tick rollover occured
* between tsc and pit reads (as noted when
* usec delta is > 90% # of usecs/tick)
*/
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
+ if (lost && abs(delay - tsc_delay_at_last_interrupt) > (900000/HZ))
jiffies_64++;
}
@@ -304,7 +304,7 @@
monotonic_base += cycles_2_ns(this_offset - last_offset);
write_sequnlock(&monotonic_lock);
- /* calculate delay_at_last_interrupt */
+ /* calculate tsc_delay_at_last_interrupt */
/*
* Time offset = (hpet delta) * ( usecs per HPET clock )
* = (hpet delta) * ( usecs per tick / HPET clocks per tick)
@@ -312,9 +312,9 @@
* Where,
* hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
*/
- delay_at_last_interrupt = hpet_current - offset;
- ASM_MUL64_REG(temp, delay_at_last_interrupt,
- hpet_usec_quotient, delay_at_last_interrupt);
+ tsc_delay_at_last_interrupt = hpet_current - offset;
+ ASM_MUL64_REG(temp, tsc_delay_at_last_interrupt,
+ hpet_usec_quotient, tsc_delay_at_last_interrupt);
}
#endif
diff -Nru a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
--- a/arch/i386/kernel/vmlinux.lds.S Tue Jan 27 19:26:21 2004
+++ b/arch/i386/kernel/vmlinux.lds.S Tue Jan 27 19:26:21 2004
@@ -3,11 +3,11 @@
*/
#include <asm-generic/vmlinux.lds.h>
+#include <linux/config.h>
OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(startup_32)
-jiffies = jiffies_64;
SECTIONS
{
. = 0xC0000000 + 0x100000;
@@ -48,6 +48,79 @@
_edata = .; /* End of data section */
+/* VSYSCALL_GTOD data */
+#ifdef CONFIG_VSYSCALL_GTOD
+
+ /* vsyscall entry */
+ . = ALIGN(64);
+ .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ /* Must be the same as VSYSCALL_GTOD_START */
+ .vsyscall_0 0xffffc000: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
+ __vsyscall_0 = LOADADDR(.vsyscall_0);
+
+
+ /* generic gtod variables */
+ . = ALIGN(64);
+ .vsyscall_timesource : AT ((LOADADDR(.vsyscall_0) + SIZEOF(.vsyscall_0) + 63) & ~(63)) { *(.vsyscall_timesource) }
+ vsyscall_timesource = LOADADDR(.vsyscall_timesource);
+
+ . = ALIGN(16);
+ .xtime_lock : AT ((LOADADDR(.vsyscall_timesource) + SIZEOF(.vsyscall_timesource) + 15) & ~(15)) { *(.xtime_lock) }
+ xtime_lock = LOADADDR(.xtime_lock);
+
+ . = ALIGN(16);
+ .xtime : AT ((LOADADDR(.xtime_lock) + SIZEOF(.xtime_lock) + 15) & ~(15)) { *(.xtime) }
+ xtime = LOADADDR(.xtime);
+
+ . = ALIGN(16);
+ .jiffies : AT ((LOADADDR(.xtime) + SIZEOF(.xtime) + 15) & ~(15)) { *(.jiffies) }
+ jiffies = LOADADDR(.jiffies);
+
+ . = ALIGN(16);
+ .wall_jiffies : AT ((LOADADDR(.jiffies) + SIZEOF(.jiffies) + 15) & ~(15)) { *(.wall_jiffies) }
+ wall_jiffies = LOADADDR(.wall_jiffies);
+
+ .sys_tz : AT (LOADADDR(.wall_jiffies) + SIZEOF(.wall_jiffies)) { *(.sys_tz) }
+ sys_tz = LOADADDR(.sys_tz);
+
+ /* NTP variables */
+ .tickadj : AT (LOADADDR(.sys_tz) + SIZEOF(.sys_tz)) { *(.tickadj) }
+ tickadj = LOADADDR(.tickadj);
+
+ .time_adjust : AT (LOADADDR(.tickadj) + SIZEOF(.tickadj)) { *(.time_adjust) }
+ time_adjust = LOADADDR(.time_adjust);
+
+ /* TSC variables*/
+ .last_tsc_low : AT (LOADADDR(.time_adjust) + SIZEOF(.time_adjust)) { *(.last_tsc_low) }
+ last_tsc_low = LOADADDR(.last_tsc_low);
+
+ .tsc_delay_at_last_interrupt : AT (LOADADDR(.last_tsc_low) + SIZEOF(.last_tsc_low)) { *(.tsc_delay_at_last_interrupt) }
+ tsc_delay_at_last_interrupt = LOADADDR(.tsc_delay_at_last_interrupt);
+
+ .fast_gettimeoffset_quotient : AT (LOADADDR(.tsc_delay_at_last_interrupt) + SIZEOF(.tsc_delay_at_last_interrupt)) { *(.fast_gettimeoffset_quotient) }
+ fast_gettimeoffset_quotient = LOADADDR(.fast_gettimeoffset_quotient);
+
+
+ /*cyclone values*/
+ .cyclone_timer : AT (LOADADDR(.fast_gettimeoffset_quotient) + SIZEOF(.fast_gettimeoffset_quotient)) { *(.cyclone_timer) }
+ cyclone_timer = LOADADDR(.cyclone_timer);
+
+ .last_cyclone_low : AT (LOADADDR(.cyclone_timer) + SIZEOF(.cyclone_timer)) { *(.last_cyclone_low) }
+ last_cyclone_low = LOADADDR(.last_cyclone_low);
+
+ .cyclone_delay_at_last_interrupt : AT (LOADADDR(.last_cyclone_low) + SIZEOF(.last_cyclone_low)) { *(.cyclone_delay_at_last_interrupt) }
+ cyclone_delay_at_last_interrupt = LOADADDR(.cyclone_delay_at_last_interrupt);
+
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
+ . = LOADADDR(.vsyscall_0) + 4096;
+
+ jiffies_64 = jiffies;
+#else
+ jiffies = jiffies_64;
+#endif
+/* END of VSYSCALL_GTOD data*/
+
. = ALIGN(8192); /* init_task */
.data.init_task : { *(.data.init_task) }
diff -Nru a/arch/i386/kernel/vsyscall-gtod.c b/arch/i386/kernel/vsyscall-gtod.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/arch/i386/kernel/vsyscall-gtod.c Tue Jan 27 19:26:21 2004
@@ -0,0 +1,275 @@
+/*
+ * linux/arch/i386/kernel/vsyscall-gtod.c
+ *
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2003,2004 John Stultz <johnstul@us.ibm.com> IBM
+ *
+ * Thanks to hpa@transmeta.com for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ * vsyscall 0 is located at VSYSCALL_START, vsyscall 1 is located
+ * at virtual address VSYSCALL_START+1024bytes etc...
+ *
+ * Originally written for x86-64 by Andrea Arcangeli <andrea@suse.de>
+ * Ported to i386 by John Stultz <johnstul@us.ibm.com>
+ */
+
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+
+#include <asm/vsyscall-gtod.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/fixmap.h>
+#include <asm/msr.h>
+#include <asm/timer.h>
+#include <asm/system.h>
+#include <asm/unistd.h>
+#include <asm/errno.h>
+
+int errno;
+static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz);
+static int vsyscall_mapped = 0; /* flag variable for remap_vsyscall() */
+
+enum vsyscall_timesource_e vsyscall_timesource;
+enum vsyscall_timesource_e __vsyscall_timesource __section_vsyscall_timesource;
+
+/* readonly clones of generic time values */
+seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+struct timespec __xtime __section_xtime;
+volatile unsigned long __jiffies __section_jiffies;
+unsigned long __wall_jiffies __section_wall_jiffies;
+struct timezone __sys_tz __section_sys_tz;
+/* readonly clones of ntp time variables */
+int __tickadj __section_tickadj;
+long __time_adjust __section_time_adjust;
+
+/* readonly clones of TSC timesource values*/
+unsigned long __last_tsc_low __section_last_tsc_low;
+int __tsc_delay_at_last_interrupt __section_tsc_delay_at_last_interrupt;
+unsigned long __fast_gettimeoffset_quotient __section_fast_gettimeoffset_quotient;
+
+/* readonly clones of cyclone timesource values*/
+u32* __cyclone_timer __section_cyclone_timer; /* Cyclone MPMC0 register */
+u32 __last_cyclone_low __section_last_cyclone_low;
+int __cyclone_delay_at_last_interrupt __section_cyclone_delay_at_last_interrupt;
+
+
+static inline unsigned long vgettimeoffset_tsc(void)
+{
+ unsigned long eax, edx;
+
+ /* Read the Time Stamp Counter */
+ rdtsc(eax,edx);
+
+ /* .. relative to previous jiffy (32 bits is enough) */
+ eax -= __last_tsc_low; /* tsc_low delta */
+
+ /*
+ * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
+ * = (tsc_low delta) * (usecs_per_clock)
+ * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
+ *
+ * Using a mull instead of a divl saves up to 31 clock cycles
+ * in the critical path.
+ */
+
+
+ __asm__("mull %2"
+ :"=a" (eax), "=d" (edx)
+ :"rm" (__fast_gettimeoffset_quotient),
+ "0" (eax));
+
+ /* our adjusted time offset in microseconds */
+ return __tsc_delay_at_last_interrupt + edx;
+
+}
+
+static inline unsigned long vgettimeoffset_cyclone(void)
+{
+ u32 offset;
+
+ if (!__cyclone_timer)
+ return 0;
+
+ /* Read the cyclone timer */
+ offset = __cyclone_timer[0];
+
+ /* .. relative to previous jiffy */
+ offset = offset - __last_cyclone_low;
+
+ /* convert cyclone ticks to microseconds */
+ offset = offset/(CYCLONE_TIMER_FREQ/1000000);
+
+ /* our adjusted time offset in microseconds */
+ return __cyclone_delay_at_last_interrupt + offset;
+}
+
+static inline void do_vgettimeofday(struct timeval * tv)
+{
+ long sequence;
+ unsigned long usec, sec;
+ unsigned long lost;
+ unsigned long max_ntp_tick;
+
+ /* If we don't have a valid vsyscall time source,
+ * just call gettimeofday()
+ */
+ if (__vsyscall_timesource == VSYSCALL_GTOD_NONE) {
+ gettimeofday(tv, NULL);
+ return;
+ }
+
+
+ do {
+ sequence = read_seqbegin(&__xtime_lock);
+
+ /* Get the high-res offset */
+ if (__vsyscall_timesource == VSYSCALL_GTOD_CYCLONE)
+ usec = vgettimeoffset_cyclone();
+ else
+ usec = vgettimeoffset_tsc();
+
+ lost = __jiffies - __wall_jiffies;
+
+ /*
+ * If time_adjust is negative then NTP is slowing the clock
+ * so make sure not to go into next possible interval.
+ * Better to lose some accuracy than have time go backwards..
+ */
+ if (unlikely(__time_adjust < 0)) {
+ max_ntp_tick = (USEC_PER_SEC / HZ) - __tickadj;
+ usec = min(usec, max_ntp_tick);
+
+ if (lost)
+ usec += lost * max_ntp_tick;
+ }
+ else if (unlikely(lost))
+ usec += lost * (USEC_PER_SEC / HZ);
+
+ sec = __xtime.tv_sec;
+ usec += (__xtime.tv_nsec / 1000);
+
+ } while (read_seqretry(&__xtime_lock, sequence));
+
+ tv->tv_sec = sec + usec / 1000000;
+ tv->tv_usec = usec % 1000000;
+}
+
+static inline void do_get_tz(struct timezone * tz)
+{
+ long sequence;
+
+ do {
+ sequence = read_seqbegin(&__xtime_lock);
+
+ *tz = __sys_tz;
+
+ } while (read_seqretry(&__xtime_lock, sequence));
+}
+
+static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+{
+ if (tv)
+ do_vgettimeofday(tv);
+ if (tz)
+ do_get_tz(tz);
+ return 0;
+}
+
+static time_t __vsyscall(1) vtime(time_t * t)
+{
+ struct timeval tv;
+ vgettimeofday(&tv,NULL);
+ if (t)
+ *t = tv.tv_sec;
+ return tv.tv_sec;
+}
+
+static long __vsyscall(2) venosys_0(void)
+{
+ return -ENOSYS;
+}
+
+static long __vsyscall(3) venosys_1(void)
+{
+ return -ENOSYS;
+}
+
+
+void vsyscall_set_timesource(char* name)
+{
+ if (!strncmp(name, "tsc", 3))
+ vsyscall_timesource = VSYSCALL_GTOD_TSC;
+ else if (!strncmp(name, "cyclone", 7))
+ vsyscall_timesource = VSYSCALL_GTOD_CYCLONE;
+ else
+ vsyscall_timesource = VSYSCALL_GTOD_NONE;
+}
+
+
+static void __init map_vsyscall(void)
+{
+ unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET;
+
+ /* Initially we map the VSYSCALL page w/ PAGE_KERNEL permissions to
+ * keep the alternate_instruction code from bombing out when it
+ * changes the seq_lock memory barriers in vgettimeofday()
+ */
+ __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL);
+}
+
+static int __init remap_vsyscall(void)
+{
+ unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET;
+
+ if (!vsyscall_mapped)
+ return 0;
+
+ /* Remap the VSYSCALL page w/ PAGE_KERNEL_VSYSCALL permissions
+ * after the alternate_instruction code has run
+ */
+ clear_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE);
+ __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+
+ return 0;
+}
+
+int __init vsyscall_init(void)
+{
+ printk("VSYSCALL: consistency checks...");
+ if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday)) {
+ printk("vgettimeofday link addr broken\n");
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+ if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)) {
+ printk("vtime link addr broken\n");
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+ if (VSYSCALL_ADDR(0) != __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)) {
+ printk("fixmap first vsyscall 0x%lx should be 0x%x\n",
+ __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE),
+ VSYSCALL_ADDR(0));
+ printk("VSYSCALL: vsyscall_init failed!\n");
+ return -EFAULT;
+ }
+
+
+ printk("passed...mapping...");
+ map_vsyscall();
+ printk("done.\n");
+ vsyscall_mapped = 1;
+ printk("VSYSCALL: fixmap virt addr: 0x%lx\n",
+ __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE));
+
+ return 0;
+}
+
+__initcall(remap_vsyscall);
diff -Nru a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
--- a/include/asm-i386/fixmap.h Tue Jan 27 19:26:21 2004
+++ b/include/asm-i386/fixmap.h Tue Jan 27 19:26:21 2004
@@ -18,6 +18,7 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/vsyscall-gtod.h>
#ifdef CONFIG_HIGHMEM
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -44,6 +45,17 @@
enum fixed_addresses {
FIX_HOLE,
FIX_VSYSCALL,
+#ifdef CONFIG_VSYSCALL_GTOD
+#ifndef CONFIG_X86_4G
+ FIX_VSYSCALL_GTOD_PAD,
+#endif /* !CONFIG_X86_4G */
+ FIX_VSYSCALL_GTOD_LAST_PAGE,
+ FIX_VSYSCALL_GTOD_FIRST_PAGE = FIX_VSYSCALL_GTOD_LAST_PAGE
+ + VSYSCALL_GTOD_NUMPAGES - 1,
+#ifdef CONFIG_X86_4G
+ FIX_VSYSCALL_GTOD_4GALIGN,
+#endif /* CONFIG_X86_4G */
+#endif /* CONFIG_VSYSCALL_GTOD */
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
diff -Nru a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
--- a/include/asm-i386/pgtable.h Tue Jan 27 19:26:21 2004
+++ b/include/asm-i386/pgtable.h Tue Jan 27 19:26:21 2004
@@ -137,11 +137,15 @@
#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-
+#define __PAGE_KERNEL_VSYSCALL \
+ (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
+
#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL|(__PAGE_KERNEL_RO | _PAGE_PCD))
/*
* The i386 can't do page protection for execute, and considers that
diff -Nru a/include/asm-i386/timer.h b/include/asm-i386/timer.h
--- a/include/asm-i386/timer.h Tue Jan 27 19:26:21 2004
+++ b/include/asm-i386/timer.h Tue Jan 27 19:26:21 2004
@@ -20,6 +20,7 @@
};
#define TICK_SIZE (tick_nsec / 1000)
+#define CYCLONE_TIMER_FREQ 100000000
extern struct timer_opts* select_timer(void);
extern void clock_fallback(void);
diff -Nru a/include/asm-i386/vsyscall-gtod.h b/include/asm-i386/vsyscall-gtod.h
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/include/asm-i386/vsyscall-gtod.h Tue Jan 27 19:26:21 2004
@@ -0,0 +1,68 @@
+#ifndef _ASM_i386_VSYSCALL_GTOD_H_
+#define _ASM_i386_VSYSCALL_GTOD_H_
+#include <linux/seqlock.h>
+
+#ifdef CONFIG_VSYSCALL_GTOD
+
+/* VSYSCALL_GTOD_START must be the same as
+ * __fix_to_virt(FIX_VSYSCALL_GTOD FIRST_PAGE)
+ * and must also be same as addr in vmlinux.lds.S */
+#define VSYSCALL_GTOD_START 0xffffc000
+#define VSYSCALL_GTOD_SIZE 1024
+#define VSYSCALL_GTOD_END (VSYSCALL_GTOD_START + PAGE_SIZE)
+#define VSYSCALL_GTOD_NUMPAGES \
+ ((VSYSCALL_GTOD_END-VSYSCALL_GTOD_START) >> PAGE_SHIFT)
+#define VSYSCALL_ADDR(vsyscall_nr) \
+ (VSYSCALL_GTOD_START+VSYSCALL_GTOD_SIZE*(vsyscall_nr))
+
+#ifdef __KERNEL__
+
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+
+/* ReadOnly generic time value attributes*/
+#define __section_vsyscall_timesource __attribute__ ((unused, __section__ (".vsyscall_timesource")))
+#define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock")))
+#define __section_xtime __attribute__ ((unused, __section__ (".xtime")))
+#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies")))
+#define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies")))
+#define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz")))
+
+/* ReadOnly NTP variables */
+#define __section_tickadj __attribute__ ((unused, __section__ (".tickadj")))
+#define __section_time_adjust __attribute__ ((unused, __section__ (".time_adjust")))
+
+
+/* ReadOnly TSC time value attributes*/
+#define __section_last_tsc_low __attribute__ ((unused, __section__ (".last_tsc_low")))
+#define __section_tsc_delay_at_last_interrupt __attribute__ ((unused, __section__ (".tsc_delay_at_last_interrupt")))
+#define __section_fast_gettimeoffset_quotient __attribute__ ((unused, __section__ (".fast_gettimeoffset_quotient")))
+
+/* ReadOnly Cyclone time value attributes*/
+#define __section_cyclone_timer __attribute__ ((unused, __section__ (".cyclone_timer")))
+#define __section_last_cyclone_low __attribute__ ((unused, __section__ (".last_cyclone_low")))
+#define __section_cyclone_delay_at_last_interrupt __attribute__ ((unused, __section__ (".cyclone_delay_at_last_interrupt")))
+
+enum vsyscall_num {
+ __NR_vgettimeofday,
+ __NR_vtime,
+};
+
+enum vsyscall_timesource_e {
+ VSYSCALL_GTOD_NONE,
+ VSYSCALL_GTOD_TSC,
+ VSYSCALL_GTOD_CYCLONE,
+};
+
+int vsyscall_init(void);
+void vsyscall_set_timesource(char* name);
+
+extern char __vsyscall_0;
+
+
+#endif /* __KERNEL__ */
+#else /* CONFIG_VSYSCALL_GTOD */
+#define vsyscall_init()
+#define vsyscall_set_timesource(x)
+#endif /* CONFIG_VSYSCALL_GTOD */
+#endif /* _ASM_i386_VSYSCALL_GTOD_H_ */
+
[-- Attachment #2: vsyscall-gtod_test_B1.tar.gz --]
[-- Type: application/x-compressed-tar, Size: 818 bytes --]
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 2:46 [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch john stultz
@ 2004-01-29 5:06 ` Ulrich Drepper
2004-01-29 13:26 ` Jamie Lokier
2004-02-01 1:28 ` Andrea Arcangeli
2004-01-31 0:17 ` Eric W. Biederman
1 sibling, 2 replies; 42+ messages in thread
From: Ulrich Drepper @ 2004-01-29 5:06 UTC (permalink / raw)
To: john stultz; +Cc: lkml
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
john stultz wrote:
> Please let me know if you have any comments or suggestions.
I really don't like this special address in the vdso approach. Yes,
it's unfortunately done for x86-64 as well but this doesn't mean the
mistakes have to be repeated.
Ideally there will be a couple more syscalls which over time can at
least partially be handled at userlevel in the vdso. Do you want to add
a new special address for each of them?
There are two ways to avoid this which are easy to support in the
current framework:
~ to transparently invoke the optimized syscalls change the DSO entry
code to do a table lookup. The table content are pointers to code. By
default, it points to the syscall code we now use. If there is a
special version of the syscall point to that code and see it magically
called. No need for libc changes, old libcs automatically take
advantage of the optimizations. No information about the optimizations
is spilled out to userlevel.
~ alternatively use the symbol table the vdso has. Export the new code
only via the symbol table. No fixed address for the function, the
runtime gets it from the symbol table. glibc will use weak symbol
references; if the symbol isn't there, the old code is used. This will
require that every single optimized syscall needs to be handled special.
I personally like the first approach better. The indirection table can
be maintained in sync with the syscall table inside the kernel. It all
comes at all times from the same source. The overhead of the memory
load should be neglectable.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQFAGJTa2ijCOnn/RHQRArL2AJ9ULsq2xl3m8TNLNkJydPzrmhQXbACgrlhe
uYIrFlankjw1TIU5W/AdvBA=
=yP4a
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 5:06 ` Ulrich Drepper
@ 2004-01-29 13:26 ` Jamie Lokier
2004-01-29 18:05 ` Ulrich Drepper
2004-01-31 0:10 ` Eric W. Biederman
2004-02-01 1:28 ` Andrea Arcangeli
1 sibling, 2 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-01-29 13:26 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
Ulrich Drepper wrote:
> ~ alternatively use the symbol table the vdso has. Export the new code
> only via the symbol table. No fixed address for the function, the
> runtime gets it from the symbol table. glibc will use weak symbol
> references; if the symbol isn't there, the old code is used. This will
> require that every single optimized syscall needs to be handled special.
>
>
> I personally like the first approach better. The indirection table can
> maintained in sync with the syscall table inside the kernel. It all
> comes at all times from the same source. The overhead of the memory
> load should be neglectable.
I like the second approach more. You can change glibc to look up the
weak symbol for _all_ syscalls, then none of them are special and it
will work with future kernel optimisations.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 13:26 ` Jamie Lokier
@ 2004-01-29 18:05 ` Ulrich Drepper
2004-01-29 19:15 ` Jamie Lokier
2004-01-31 0:10 ` Eric W. Biederman
1 sibling, 1 reply; 42+ messages in thread
From: Ulrich Drepper @ 2004-01-29 18:05 UTC (permalink / raw)
To: Jamie Lokier; +Cc: john stultz, lkml
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Jamie Lokier wrote:
> I like the second approach more. You can change glibc to look up the
> weak symbol for _all_ syscalls, then none of them are special and it
> will work with future kernel optimisations.
Symbol lookups are slow. And they require the syscall stubs to suddenly
set up the usual PIC infrastructure since a jump through the PLT is
used. This is much slower than the extra indirection the vdso could do.
The vdso is just one of the DSOs in the search path and usually the very
last. So there would possibly be many objects which are looked at
first, unsuccessfully.
And another problem I should have mentioned last night: in statically
linked applications the vDSO isn't used this way. So no dynamic linker
functionality is available. We find the vDSO through the auxiliary
vector and use the absolute address, not the symbol table of the vDSO.
If the syscall entry in the vDSO would do the dispatch automatically,
statically linked apps would benefit from the optimizations, too.
Otherwise they are left out.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQFAGUtt2ijCOnn/RHQRAgLwAKCcvvzg/FB8/8C+Jo1I6wfWBju25gCeKr4z
kErg4cvJuxBvmRltLF4AxEE=
=f2aR
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 18:05 ` Ulrich Drepper
@ 2004-01-29 19:15 ` Jamie Lokier
2004-01-29 23:59 ` john stultz
2004-01-30 0:31 ` Ulrich Drepper
0 siblings, 2 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-01-29 19:15 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
Ulrich Drepper wrote:
> And they require the syscall stubs to suddenly set up the usual PIC
> infrastructure since a jump through the PLT is used.
As this is x86, can't the syscall routines in Glibc call directly
without a PLT entry? With prelinking, because the vdso is always
located at the same address, there isn't even a dirty page overhead to
using non-PIC in this case.
> This is much slower than the extra indirection the vdso could do.
If you have to use a PLT entry it is. If you can do it without a PLT,
direct jump to the optimised syscall address is fastest.
> The vdso is just one of the DSOs in the search path and usually the very
> last. So there would be possibly many objects which are looked at
> first, unsuccessfully.
Being Glibc, you could always tweak ld.so to only look at the last one
if this were really a performance issue. Btw, every syscall used by
the program requires at least one symbol lookup, usually over the
whole search path, anyway.
> And another problem I should have mentioned last night: in statically
> linked applications the vDSO isn't used this way. No dynamic linker
> functionality is available. We find the vDSO through the auxiliary
> vector and use the absolute address, not the symbol table of the vDSO.
> If the syscall entry in the vDSO would do the dispatch automatically,
> statically linked apps would benefit from the optimizations, too.
> Otherwise they are left out.
I hear what you're saying. These are the things which bother me:
1. There are already three indirect jumps to make a syscall.
(PLT to libc function, indirect jump to vsyscall entry, indirect
jump inside kernel). Another is not necessary (in fact two of
those aren't necessary either), why add more?
2. Table makes the stub for all syscalls slower.
All this is moot, though, because in reality only very few syscalls
will be optimised, and it doesn't really matter if an older Glibc
doesn't take advantage of a newer kernel's optimised version. If
someone would like the performance, installing an up to date Glibc is
no big deal.
So pragmatically John's solution, with Glibc looking in the vdso just
for syscalls it knows have an optimised implementation (i.e. just
gettimeofday so far), is best IMHO.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 19:15 ` Jamie Lokier
@ 2004-01-29 23:59 ` john stultz
2004-01-30 0:40 ` Ulrich Drepper
2004-01-30 0:31 ` Ulrich Drepper
1 sibling, 1 reply; 42+ messages in thread
From: john stultz @ 2004-01-29 23:59 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, lkml
On Thu, 2004-01-29 at 11:15, Jamie Lokier wrote:
> Ulrich Drepper wrote:
> > And they require the syscall stubs to suddenly set up the usual PIC
> > infrastructure since a jump through the PLT is used.
>
> As this is x86, can't the syscall routines in Glibc call directly
> without a PLT entry? With prelinking, because the vdso is always
> located at the same address, there isn't even a dirty page overhead to
> using non-PIC in this case.
>
> > This is much slower than the extra indirection the vdso could do.
>
> If you have to use a PLT entry it is. If you can do it without a PLT,
> direct jump to the optimised syscall address is fastest.
>
> > The vdso is just one of the DSOs in the search path and usually the very
> > last. So there would be possibly many objects which are looked at
> > first, unsuccessfully.
>
> Being Glibc, you could always tweak ld.so to only look at the last one
> if this were really a performance issue. Btw, every syscall used by
> the program requires at least one symbol lookup, usually over the
> whole search path, anyway.
>
> > And another problem I should have mentioned last night: in statically
> > linked applications the vDSO isn't used this way. No dynamic linker
> > functionality is available. We find the vDSO through the auxiliary
> > vector and use the absolute address, not the symbol table of the vDSO.
> > If the syscall entry in the vDSO would do the dispatch automatically,
> > statically linked apps would benefit from the optimizations, too.
> > Otherwise they are left out.
>
> I hear what you're saying. These are the things which bother me:
>
> 1. There are already three indirect jumps to make a syscall.
> (PLT to libc function, indirect jump to vsyscall entry, indirect
> jump inside kernel). Another is not necessary (in fact two of
> those aren't necessary either), why add more?
>
> 2. Table makes the stub for all syscalls slower.
>
> All this is moot, though, because in reality only very few syscalls
> will be optimised, and it doesn't really matter if an older Glibc
> doesn't take advantage of a newer kernel's optimised version. If
> someone would like the performance, installing an up to date Glibc is
> no big deal.
>
> So pragmatically John's solution, with Glibc looking in the vdso just
> for syscalls it knows have an optimised implementation (i.e. just
> gettimeofday so far), is best IMHO.
[Head spins] Forgive me, but my glibc/linker knowledge is minimal, so
I'm mostly guessing at the meaning of your comments above.
The picture in my mind is:
Ulrich is suggesting the vsyscall-sysenter code be extended such that
it switches on the syscall number and jumps to the vsyscall-gettimeofday
code when appropriate. This avoids having to change glibc.
Jamie is suggesting that the extra indirection
glibc->sysenter->vgettimeofday could be simply cut down to
glibc->vgettimeofday. This requires changing glibc, but would be faster.
Personally I like Ulrich's suggestion, as it requires no change to
userspace. I had even considered it in developing the patch, but with
the current vsyscall-sysenter being all asm, I figured I could implement
the LD_PRELOAD code much faster.
Another issue with having a separate entry point for vgettimeofday is
that I don't quite understand how glibc detects if vsyscall is
available, and how it deals with the vsyscall page moving around. It seems
the i386 4/4g split patch moved the vsyscall-sysenter page to 0xffffd000
(instead of 0xffffe000). I've conditionally padded the fixmap table so
vgettimeofday() is at 0xffffc000 regardless, but clearly this isn't the
best thing to do.
Thoughts?
thanks
-john
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 19:15 ` Jamie Lokier
2004-01-29 23:59 ` john stultz
@ 2004-01-30 0:31 ` Ulrich Drepper
2004-01-30 4:17 ` Jamie Lokier
1 sibling, 1 reply; 42+ messages in thread
From: Ulrich Drepper @ 2004-01-30 0:31 UTC (permalink / raw)
To: Jamie Lokier; +Cc: john stultz, lkml
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Jamie Lokier wrote:
> As this is x86, can't the syscall routines in Glibc call directly
> without a PLT entry?
No, since this is just an ordinary jump through the PLT. That the
target DSO is synthesized is irrelevant. It's ld.so which needs the PIC
setup, not the called DSO.
> With prelinking, because the vdso is always
> located at the same address, there isn't even a dirty page overhead to
> using non-PIC in this case.
But this isn't true. The address can change. There are already two
flavors (normal and 4G/4G) and there will probably be more. Ingo would
have to comment on that.
> If you have to use a PLT entry it is. If you can do it without a PLT,
> direct jump to the optimised syscall address is fastest.
A direct jump is hardcoding the information which is exactly what should
be avoided.
> Being Glibc, you could always tweak ld.so to only look at the last one
> if this were really a performance issue. Btw, every syscall used by
> the program requires at least one symbol lookup, usually over the
> whole search path, anyway.
The symbol for the syscall stub in libc is looked up, yes, but nothing
else. I fail to see the relevance. You cannot say that since a call
already requires N ns it is OK for it to take 2*N ns.
> I hear what you're saying. These are the things which bother me:
>
> 1. There are already three indirect jumps to make a syscall.
> (PLT to libc function, indirect jump to vsyscall entry, indirect
> jump inside kernel). Another is not necessary (in fact two of
> those aren't necessary either), why add more?
Because they are all at different levels and to abstract out different
things.
> 2. Table makes the stub for all syscalls slower.
Not as much as any other acceptable solution. The vdso code is compiled
for a given address and therefore the memory loads can use absolute
addresses.
> All this is moot, though, because in reality only very few syscalls
> will be optimised, and it doesn't really matter if an older Glibc
> doesn't take advantage of a newer kernel's optimised version. If
> someone would like the performance, installing an up to date Glibc is
> no big deal.
This is certainly not what many people think. In general, every
dependency on the runtime for programs to take advantage of new kernel
features is bad and should be avoided. If I'd write kernel code I'd
want to control the use as much as possible. And this trivial jump
table can do this very efficiently.
> So pragmatically John's solution, with Glibc looking in the vdso just
> for syscalls it knows have an optimised implementation (i.e. just
> gettimeofday so far), is best IMHO.
Pragmatically? How about "practically"? Mind you, for x86 the code
wouldn't be as simple as the vgettimeofday call on x86-64. For x86-64
all acceptable kernels had the vsyscall and therefore glibc doesn't have
to worry about it not being available. And there is currently no second
or third location for the vDSO.
For x86 we have to handle in the same binary old kernels and kernels
where the vDSO is at a different address than the stock kernel. This
means the computation of the address consists of several steps. Get the
vDSO address (passed up in the auxiliary vector), adding the magic
offset, and then jumping.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQFAGaXS2ijCOnn/RHQRAqITAJ98+xMjIQInUqOZjVo52xOM3IFqZwCdHbFJ
O1poE0GkZx/75yGEDuNBz7o=
=GvFA
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 23:59 ` john stultz
@ 2004-01-30 0:40 ` Ulrich Drepper
0 siblings, 0 replies; 42+ messages in thread
From: Ulrich Drepper @ 2004-01-30 0:40 UTC (permalink / raw)
To: john stultz; +Cc: Jamie Lokier, lkml
john stultz wrote:
> Another issue with having a separate entry point for vgettimeofday is
> that I don't quite understand how glibc detects if vsyscall is
> available, and how it deals with the vsyscall page moving around.
Well, this is indeed a problem which needs an additional solution. If
we'd look for the symbol, it's automatically handled. Likewise if the
normal syscall handler does it magically.
If a call to the magic address is needed we'd need some kind of version
information in the vDSO to check for the existence of the extension.
This check needs to be done for every call unless a new-enough kernel is
assumed outright. One could add a function to the vDSO which is called
via the symbol table and which returns the necessary information to make
the decision. The result would have to be stored in a local variable to
avoid making that call over and over again. This all requires the PIC
setup which makes the whole thing once again more expensive than the
simple implicit table lookup.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 0:31 ` Ulrich Drepper
@ 2004-01-30 4:17 ` Jamie Lokier
2004-01-30 5:09 ` Ulrich Drepper
2004-01-30 8:33 ` Jakub Jelinek
0 siblings, 2 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-01-30 4:17 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
Ulrich Drepper wrote:
> > As this is x86, can't the syscall routines in Glibc call directly
> > without a PLT entry?
>
> No, since this is just an ordinary jump through the PLT. That the
> target DSO is synthesized is irrelevant. It's ld.so which needs the PIC
> setup, not the called DSO.
I have not explained well. Please read carefully, as I am certain no
indirect jumps on the userspace side are needed, including the one
currently in libc.
It is possible to compile, assemble and link a shared library with
-fno-PIC on x86, and this does work. I just tested it to make sure.
Furthermore, the "prelink" program is effective on these libraries.
If you assemble calls into the kernel in Glibc with the instruction
"call __kernel_vsyscall", i.e. NOT "call __kernel_vsyscall@PLT", then
the address in the instruction is patched at run time to be a direct
(not indirect) jump to the kernel entry point. The address is
retrieved from the kernel vDSO.
Generally you do not use the non-PIC form of "call" in a shared
library, because it causes the page containing the instruction to be
dirtied in each instance of the program. If done for all "call"
instructions in a whole library this is wasteful in time and memory.
However, for the syscall stubs, if they are placed close together,
then they may fit in one or two pages, and that keeps the dirtying to
a minimum. Even better is to place the stubs just after libc's PLT,
so the dirty pages are the same.
This converts the indirect jump in libc to the kernel entry point to a
direct jump.
> > With prelinking, because the vdso is always
> > located at the same address, there isn't even a dirty page overhead to
> > using non-PIC in this case.
>
> But this isn't true. The address can change. There are already two
> flavors (normal and 4G/4G) and there will probably be more. Ingo would
> have to comment on that.
I'm talking about the "prelink" program. When you run "prelink" on a
libc.so which has direct jump instructions as described above, it
patches the libc.so file to contain the address of the kernel entry
point at the time "prelink" was run.
If this libc.so is loaded onto a kernel with the same address for
__kernel_vsyscall, then the run time linker does not need to alter the
address, and does not dirty the pages containing direct jumps to that
address.
If this libc.so is loaded onto a kernel with a different vsyscall
address, the run time linker patches the jumps as described above. So
it always works, but loads faster on the kernel it is optimised for.
> > If you have to use a PLT entry it is. If you can do it without a PLT,
> > direct jump to the optimised syscall address is fastest.
>
> A direct jump is hardcoding the information which is exactly what should
> be avoided.
I think you misunderstood. The direct jump is to an address resolved
at load time by the run time linker, or at preprocessing time by
"prelink". On x86 both of these work.
> > Being Glibc, you could always tweak ld.so to only look at the last one
> > if this were really a performance issue. Btw, every syscall used by
> > the program requires at least one symbol lookup, usually over the
> > whole search path, anyway.
>
> The symbol for the syscall stub in libc is looked up, yes, but nothing
> else. I fail to see the relevance. You cannot say that since a call
> already requires N ns it is OK for it to take 2*N ns.
If you run "prelink" it takes 0 ns: these addresses are fixed into
ld.so and the symbols are not looked up at load time.
However if you don't care to depend on that, other optimisations to
ld.so are possible too.
> > I hear what you're saying. These are the things which bother me:
> >
> > 1. There are already three indirect jumps to make a syscall.
> > (PLT to libc function, indirect jump to vsyscall entry, indirect
> > jump inside kernel). Another is not necessary (in fact two of
> > those aren't necessary either), why add more?
>
> Because they are all at different level and to abstract out different
> things.
The abstractions are good. However indirect jumps are not required
for three out of four of those abstractions, because ld.so and prelink
can both resolve addresses in direct jumps; ld.so at load time, and
prelink at preprocessing time. This is nothing fancy.
> > 2. Table makes the stub for all syscalls slower.
>
> Not as much as any other acceptable solution. The vdso code is compiled
> for a given address and therefore the memory loads can use absolute
> addresses.
> For x86 we have to handle in the same binary old kernels and kernels
> where the vDSO is at a different address than the stock kernel. This
> means the computation of the address consists of several steps. Get the
> vDSO address (passed up in the auxiliary vector), adding the magic
> offset, and then jumping.
This is my thesis: system calls do not require _any_ indirect jumps in
libc or in the user space part of the kernel stub, with no dirty
pages, and the symbol table lookups can be eliminated.
Because I am sure you don't agree :) this is how to implement it:
1. ld.so gets the vDSO address from the auxiliary vector, and then
includes the vDSO in the list of symbol tables to search. If
there is no vDSO (due to running on an old kernel without one), then
it is simply omitted.
2. The vDSO offers __kernel_vsyscall, the general syscall entry point,
and may offer any other __kernel_* symbols for optimised
syscalls. E.g. __kernel_gettimeofday is defined.
3. After the vDSO in the search list are weak aliases from
__kernel_* to __kernel_vsyscall (for each syscall mentioned in libc).
4. After the vDSO is a definition of __kernel_vsyscall, which
does int80. It's exactly the same as the kernel's int80 stub.
This is the code which will be run if we're running on an old kernel
without a vDSO.
One way to place the symbols from 3. and 4. after the vDSO in the
search list is to arrange that ld.so places the vDSO before libc.so.
There are a few other ways to do it.
5. Glibc's syscall stubs should look a lot like this example:
movl 0x10(%esp),%edx
movl 0x0c(%esp),%ecx
movl 0x08(%esp),%ebx
movl $3,%eax
call __kernel_read
1-5 together implement system calls using direct call instructions (a
minor run time improvement over current Glibc, at the cost of some
load time overhead) and also supports optimised system calls in a
future compatible way. Add to this:
6. Group all small routines in libc.so which call __kernel_*
together, and locate them as close as possible to the PLT section.
That minimises the run time footprint.
7. Make "prelink" aware of the vDSO dependency.
8. Add the prelink signature to the kernel's vDSO object.
9. Run "prelink", even if only on libc.so or on a sub-library which
contains the system call stubs.
That totally eliminates the load time symbol lookups for these kernel
functions.
Once these changes are made to Glibc, it will automatically take
advantage of any future vsyscall optimisations in the kernel _and_
system calls will be a little bit faster than they are now.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 4:17 ` Jamie Lokier
@ 2004-01-30 5:09 ` Ulrich Drepper
2004-01-30 9:29 ` Ingo Molnar
2004-01-30 17:34 ` Jamie Lokier
2004-01-30 8:33 ` Jakub Jelinek
1 sibling, 2 replies; 42+ messages in thread
From: Ulrich Drepper @ 2004-01-30 5:09 UTC (permalink / raw)
To: Jamie Lokier; +Cc: john stultz, lkml
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Jamie Lokier wrote:
> Because I am sure you don't agree :) this is how to implement it:
You are right, nothing like this is in the least acceptable. Text
relocations are completely off-limits. Depending on prelinking being
available is not acceptable.
Your entire scheme is based on this and therefore not worth the bits
used to store it. Your understanding of how and where syscalls are made
and how ELF works is flawed. There is no "group the syscalls nicely
together", they are all over the place, inlined in many places. There
is no concept of weak aliases in dynamic linking. Finding all the
"aliases" requires lookups by name for now more than 200 syscall names
and growing. Prelinking can help if it is wanted, but if the vDSO
address is changed or randomized (this is crucial I'd say) or the vDSO
is not set up for a process, the full price has to be paid. With
every kernel change the whole prelinking has to be redone. We kept the
dependencies with the vDSO minimal exactly because it is not a normal DSO.
This proposed method is many times more expensive for all processes
which do not call the same syscalls many many times over. Every single
name lookup costs dearly, the larger the application the more expensive.
The startup times will probably increase ten-fold or more if prelinking
isn't available or disabled because one or more of the linked in objects
changed. You cannot say that it's OK the system becomes unusable if
prelinking isn't used. There are always programs which are not
prelinked because they cannot be prelinked, they are newly
installed/updated, or because prelinking isn't done at all to increase
security. No method must perform measurably worse than normal,
non-prelinked code does now.
And I am not in the least convinced that this one direct jump from a
tainted page is faster than the indirect jump from read-only memory.
You increase the memory usage of the system. You'd need a couple of
additional pages in each process' glibc which are not shared.
If gettimeofday() is the only optimized syscall, just add a simple
cmp $__NR_gettimeofday, %eax
je __vsyscall_gettimeofday
to the __kernel_vsyscall code. With correct static branch prediction
you'll not be able to measure the effect. The correct way is IMO to
completely hide the optimizations since otherwise the increased
dependencies between kernel and libc only create more friction and cost
and loss of speed.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQFAGecT2ijCOnn/RHQRAkBwAJ9f4gKLdVeUpA4kbfxwb1Y4oiJmdQCghg7e
JK8NvNy1GyEJXtE5pGJB1IU=
=D0yc
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 4:17 ` Jamie Lokier
2004-01-30 5:09 ` Ulrich Drepper
@ 2004-01-30 8:33 ` Jakub Jelinek
2004-01-30 17:21 ` Jamie Lokier
1 sibling, 1 reply; 42+ messages in thread
From: Jakub Jelinek @ 2004-01-30 8:33 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
On Fri, Jan 30, 2004 at 04:17:08AM +0000, Jamie Lokier wrote:
> Ulrich Drepper wrote:
> > > As this is x86, can't the syscall routines in Glibc call directly
> > > without a PLT entry?
> >
> > No, since this is just an ordinary jump through the PLT. That the
> > target DSO is synthesized is irrelevant. It's ld.so which needs the PIC
> > setup, not the called DSO.
>
> I have not explained well. Please read carefully, as I am certain no
> indirect jumps on the userspace side are needed, including the one
> currently in libc.
>
> It is possible to compile, assemble and link a shared library with
> -fno-PIC on x86, and this does work. I just tested it to make sure.
> Furthermore, the "prelink" program is effective on these libraries.
Only if there are no prelink conflicts in the read-only sections.
Furthermore, there is additional overhead of remapping RW and back RX
and wasted page. You can get around that by making a RWX section (which
ends up in the RW segment which gets a PF_X bit set as well), but that means
all the data in that segment is executable, which is obviously not
desirable, especially for libc.so.
> I'm talking about the "prelink" program. When you run "prelink" on a
> libc.so which has direct jump instructions as described above, is
> patches the libc.so file to contain the address of the kernel entry
> point at the time "prelink" was run.
Prelink ATM doesn't take VDSO into account at all and surely it would
be best if it did not have to. For example if VDSO is randomized, userspace
has no control over its placement like it has for shared libraries
(if DSO base is NULL, kernel randomizes, if non-NULL (usually means
prelinked), then does not randomize unless the binary is PIE).
Jakub
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 5:09 ` Ulrich Drepper
@ 2004-01-30 9:29 ` Ingo Molnar
2004-02-03 4:38 ` Ulrich Drepper
2004-01-30 17:34 ` Jamie Lokier
1 sibling, 1 reply; 42+ messages in thread
From: Ingo Molnar @ 2004-01-30 9:29 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: Jamie Lokier, john stultz, lkml
* Ulrich Drepper <drepper@redhat.com> wrote:
> If gettimeofday() is the only optimized syscall, just add a simple
>
> cmp $__NR_gettimeofday, %eax
> je __vsyscall_gettimeofday
>
> to the __kernel_vsyscall code. With correct static branch prediction
> you'll not be able to measure the effect. The correct way is IMO to
> completely hide the optimizations since otherwise the increased
> dependencies between kernel and libc only create more friction and
> cost and loss of speed.
agreed 100%. Once the # of vsyscalls grows to above a certain threshold,
a table can be used just like we do in kernel-mode.
but i'm a bit worried about the apparent fact that adding 200 more
symbols (and making the vDSO a real DSO in essence) to abstract the
kernel syscalls is apparently unacceptable performance-wise. If this is
true then the whole dynamic linking architecture is much slower than it
should be, isnt it? Why cannot the same argument be made about the ~1400
symbols libc itself provides? Wouldnt a tighter libc API avoid all the
overhead (in fact 7x overhead) you described wrt. adding 200+ kernel
symbols? Why is the kernel vDSO so special [assuming, for the sake of
argument, a clean, versioned function API between libc and the kernel
vDSO]?
Ingo
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 8:33 ` Jakub Jelinek
@ 2004-01-30 17:21 ` Jamie Lokier
0 siblings, 0 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-01-30 17:21 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Ulrich Drepper, john stultz, lkml
Jakub Jelinek wrote:
> Prelink ATM doesn't take VDSO into account at all and surely it would
> be best if it did not have to. For example if VDSO is randomized, userspace
> has no control over its placement like it has for shared libraries
> (if DSO base is NULL, kernel randomizes, if non-NULL (usually means
> prelinked), then does not randomize unless the binary is PIE).
Randomisation of vDSO and randomisation of PIE, or non-PIE objects
which don't get mapped where you intended all break prelinking. In
this regard vDSO is no different to any other library.
I agree that any uses of randomisation tend to break prelinking, and
so it's not reasonable to depend solely on prelinking.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 5:09 ` Ulrich Drepper
2004-01-30 9:29 ` Ingo Molnar
@ 2004-01-30 17:34 ` Jamie Lokier
1 sibling, 0 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-01-30 17:34 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
Ulrich Drepper wrote:
> Your entire scheme is based on this and therefore not worth the bits
> used to store it. Your understanding of how and where syscalls are made
> and how ELF works is flawed. There is no "group the syscalls nicely
> together", they are all over the place, inlined in many places.
That's a choice. There is no reason why you cannot put the entry path
of all the stub functions called "read", "write" etc. in a special
section. For syscalls inlined in larger functions, then it's
reasonable to avoid text relocation in those places and use an
indirect call as done now. Surely there aren't too many of those,
though, because LD_PRELOAD libraries which override syscall stubs
rather depend on all normal calls to a syscall going through the stubs?
> there is no concept of weak aliases in dynamic linking. Finding all
> the "aliases" requires lookups by name for now more than 200 syscall
> names and growing.
See Ingo's post.
> Prelinking can help if it is wanted, but if the vDSO address is
> changed or randomized (this is crucial I'd say) or the vDSO is not
> set up for a process, the full price has to be paid.
I agree; it is not reasonable to depend on prelinking.
> With every kernel change the whole prelinking has to be redone.
Not really, that's an implementation limitation, there's no reason to
prelink the entire system just to alter the jumps in libc.so on those
occasions when a new kernel is run. If vDSO randomisation is per boot
rather than per task (because the latter implies an MSR write per
context switch), then a libsyscall.so can be patched at boot time.
Yes I know, extravagant ideas, just wanted to write them for folk to
be aware of the possibilities.
> If gettimeofday() is the only optimized syscall, just add a simple
>
> cmp $__NR_gettimeofday, %eax
> je __vsyscall_gettimeofday
>
> to the __kernel_vsyscall code.
That does seem to be a very practical answer for now :)
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 13:26 ` Jamie Lokier
2004-01-29 18:05 ` Ulrich Drepper
@ 2004-01-31 0:10 ` Eric W. Biederman
2004-01-31 2:41 ` Jamie Lokier
1 sibling, 1 reply; 42+ messages in thread
From: Eric W. Biederman @ 2004-01-31 0:10 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
Jamie Lokier <jamie@shareable.org> writes:
> Ulrich Drepper wrote:
> > ~ alternatively use the symbol table the vdso has. Export the new code
> > only via the symbol table. No fixed address for the function, the
> > runtime gets it from the symbol table. glibc will use weak symbol
> > references; if the symbol isn't there, the old code is used. This will
> > require that every single optimized syscall needs to be handled special.
> >
> >
> > I personally like the first approach better. The indirection table can
> > maintained in sync with the syscall table inside the kernel. It all
> > comes at all times from the same source. The overhead of the memory
> > load should be neglectable.
>
> I like the second approach more. You can change glibc to look up the
> weak symbol for _all_ syscalls, then none of them are special and it
> will work with future kernel optimisations.
There is one more piece to consider with either approach. The
calling conventions.
With the x86-64 optimized vsyscall the syscall number does
not need to be placed into a register, because you have used
the proper entry point. For any syscall worth tuning in
user space I suspect that level of optimization would be
beneficial. A fast call path that does not waste a register.
Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 2:46 [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch john stultz
2004-01-29 5:06 ` Ulrich Drepper
@ 2004-01-31 0:17 ` Eric W. Biederman
2004-01-31 2:20 ` john stultz
1 sibling, 1 reply; 42+ messages in thread
From: Eric W. Biederman @ 2004-01-31 0:17 UTC (permalink / raw)
To: john stultz
Cc: lkml, Andi Kleen, andrea, Joel Becker, Wim Coekaerts,
Chris McDermott
john stultz <johnstul@us.ibm.com> writes:
> All,
> This is my port of the x86-64 vsyscall gettimeofday code to
> i386. This patch moves gettimeofday into userspace, so it can be called
> without the syscall overhead, greatly improving performance. This is
> important for any application, like a database, which heavily uses
> gettimeofday for timestamping. It supports both the TSC and IBM x44X
> cyclone time source.
>
> Example performance gain: (vs. int80)
> Normal gettimeofday
> gettimeofday ( 1665576us / 1000000runs ) = 1.665574us
> vsyscall LD_PRELOAD gettimeofday
> gettimeofday ( 868378us / 1000000runs ) = 0.868377us
And what is the performance gain over using the kernel sysenter
implementation?
> This patch becomes especially important with the introduction of the
> 4G/4G split, as there the syscall overhead is greatly increased.
>
> Example gain w/ 4/4g split: (vs. int80)
> Normal gettimeofday
> gettimeofday ( 7210630us / 1000000runs ) = 7.210623us
> vsyscall LD_PRELOAD gettimeofday
> gettimeofday ( 844855us / 1000000runs ) = 0.844854us
This is clear evidence that the 4g/4g kernel has significant overhead,
suggesting that a 64bit kernel should be used if you care about
syscall performance with gobs of RAM.
Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-31 0:17 ` Eric W. Biederman
@ 2004-01-31 2:20 ` john stultz
0 siblings, 0 replies; 42+ messages in thread
From: john stultz @ 2004-01-31 2:20 UTC (permalink / raw)
To: Eric W. Biederman
Cc: lkml, Andi Kleen, andrea, Joel Becker, Wim Coekaerts,
Chris McDermott
On Fri, 2004-01-30 at 16:17, Eric W. Biederman wrote:
> john stultz <johnstul@us.ibm.com> writes:
>
> > All,
> > This is my port of the x86-64 vsyscall gettimeofday code to
> > i386. This patch moves gettimeofday into userspace, so it can be called
> > without the syscall overhead, greatly improving performance. This is
> > important for any application, like a database, which heavily uses
> > gettimeofday for timestamping. It supports both the TSC and IBM x44X
> > cyclone time source.
>
> >
> > Example performance gain: (vs. int80)
> > Normal gettimeofday
> > gettimeofday ( 1665576us / 1000000runs ) = 1.665574us
> > vsyscall LD_PRELOAD gettimeofday
> > gettimeofday ( 868378us / 1000000runs ) = 0.868377us
>
> And what is the performance gain over using the kernel sysenter
> implementation?
Sorry, I hadn't gotten around to upgrading the glibc on my dev box.
Here's the sysenter comparison:
Normal gettimeofday
gettimeofday ( 1239215us / 1000000runs ) = 1.239214us
vsyscall LD_PRELOAD gettimeofday
gettimeofday ( 805117us / 1000000runs ) = 0.805116us
It should be noted that all the numbers I've posted so far are using the
cyclone timesource. Here's the test running using the TSC timesource
(sysenter as well):
Normal gettimeofday
gettimeofday ( 586046us / 1000000runs ) = 0.586045us
vsyscall LD_PRELOAD gettimeofday
gettimeofday ( 179972us / 1000000runs ) = 0.179972us
thanks
-john
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-31 0:10 ` Eric W. Biederman
@ 2004-01-31 2:41 ` Jamie Lokier
2004-01-31 5:54 ` Eric W. Biederman
0 siblings, 1 reply; 42+ messages in thread
From: Jamie Lokier @ 2004-01-31 2:41 UTC (permalink / raw)
To: Eric W. Biederman; +Cc: Ulrich Drepper, john stultz, lkml
Eric W. Biederman wrote:
> With the x86-64 optimized vsyscall the syscall number does
> not need to be placed into a register, because you have used
> the proper entry point. For any syscall worth tuning in
> user space I suspect that level of optimization would be
> beneficial. A fast call path that does not waste a register.
The cost of loading a constant into a register is _much_ lower than
the cost of indirect jumps which we have been discussing.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-31 2:41 ` Jamie Lokier
@ 2004-01-31 5:54 ` Eric W. Biederman
0 siblings, 0 replies; 42+ messages in thread
From: Eric W. Biederman @ 2004-01-31 5:54 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
Jamie Lokier <jamie@shareable.org> writes:
> Eric W. Biederman wrote:
> > With the x86-64 optimized vsyscall the syscall number does
> > not need to be placed into a register, because you have used
> > the proper entry point. For any syscall worth tuning in
> > user space I suspect that level of optimization would be
> > beneficial. A fast call path that does not waste a register.
>
> The cost of loading a constant into a register is _much_ lower than
> the cost of indirect jumps which we have been discussing.
I was thinking more of the register pressure in the load.
But in the case of gettimeofday I think it makes sense to do a kernel
implementation that is argument compatible with libc and then linker
magic could just short circuit the calls to the vsyscall page and libc
would not need to get involved at all, which removes one of the
indirect calls.
We could probably do that today by just renaming the function
gettimeofday. But that is rude and has name space pollution issues.
Eric
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-29 5:06 ` Ulrich Drepper
2004-01-29 13:26 ` Jamie Lokier
@ 2004-02-01 1:28 ` Andrea Arcangeli
2004-02-03 4:35 ` Ulrich Drepper
1 sibling, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-01 1:28 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
On Wed, Jan 28, 2004 at 09:06:34PM -0800, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> john stultz wrote:
>
> > Please let me know if you have any comments or suggestions.
>
> I really don't like this special address in the vdso approach. Yes,
> it's unfortunately done for x86-64 as well but this doesn't mean the
> mistakes have to be repeated.
we investigated all possible implementations and we chose for x86-64
the most efficient possible one. I think the current api was suggested
originally by hpa. Any other implementation would be running slower
period, so I wouldn't call it a mistake, I definitely call it a great
success, in fact it is the best performing one for x86 too (this is why I
think john is using it). I know it's not nice from a computer science
theoretical standpoint compared to other much slower implementations, but
when I run gettimeofday I want it to run as fast as possible, and I
don't care about anything else (well, besides the result being correct
of course ;), and I think the industry at large has my same needs. So I
definitely wouldn't trade it with anything else.
I'm unsure if we took care of implementing the backwards compatibility
-ENOSYS in the kernel at the next offsets of the vsyscalls, for making
it trivially extensible, if they're still missing we may want to add
them (there's no need to waste physical ram to do that btw). I had them
in my todo list for a while but at least from my part I never
implemented it, I'm sure I mentioned this had to be implemented a few
times though. Not sure if Andi or somebody else added the compatibility
-ENOSYS in the meantime. This is the sort of thing that nobody will
care about until it's too late. Well, it's not too bad anyways, the
current upgrade path would simply force an upgrade of kernel after
adding a glibc that has knowledge of the new vsyscalls, and overall
there would be no risk of any silent malfunction, it could only segfault
apps "safely". Also there is already space for at least two more
vsyscalls that currently are returning -ENOSYS. So overall even if we
don't add it, it probably won't matter much.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-01 1:28 ` Andrea Arcangeli
@ 2004-02-03 4:35 ` Ulrich Drepper
2004-02-03 5:34 ` Andrea Arcangeli
2004-02-03 8:52 ` Jamie Lokier
0 siblings, 2 replies; 42+ messages in thread
From: Ulrich Drepper @ 2004-02-03 4:35 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: john stultz, lkml
Andrea Arcangeli wrote:
> I definitely call it a great success,
You got to be kidding. Some object fixed in the address space which can
perform system calls. Nothing is more welcome to somebody trying to
exploit some bugs.
The vdso must be randomized. This is completely impossible with this
stupid fixed address scheme and it must be changed as soon as possible.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-01-30 9:29 ` Ingo Molnar
@ 2004-02-03 4:38 ` Ulrich Drepper
0 siblings, 0 replies; 42+ messages in thread
From: Ulrich Drepper @ 2004-02-03 4:38 UTC (permalink / raw)
To: Ingo Molnar; +Cc: john stultz, lkml
Ingo Molnar wrote:
> but i'm a bit worried about the apparent fact that adding 200 more
> symbols (and making the vDSO a real DSO in essence) to abstract the
> kernel syscalls is apparently unacceptable performance-wise. If this is
> true then the whole dynamic linking architecture is much slower than it
> should be, isnt it?
If the syscall entry code in the vdso takes care of the multiplexing we
do not have to add any symbols to the vdso's symbol table. There are no
additional costs associated with adding more optimized syscalls other
than the multiplexer and that cost would be constant if a jump table is
used.
For now, the simple single cmpl is sufficient.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 4:35 ` Ulrich Drepper
@ 2004-02-03 5:34 ` Andrea Arcangeli
2004-02-03 8:52 ` Jamie Lokier
1 sibling, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-03 5:34 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: john stultz, lkml
On Mon, Feb 02, 2004 at 08:35:40PM -0800, Ulrich Drepper wrote:
> Andrea Arcangeli wrote:
> > I definitely call it a great success,
>
> You got to be kidding. Some object fixed in the address space which can
> perform system calls. Nothing is more welcome to somebody trying to
> exploit some bugs.
>
> The vdso must be randomized. This is completely impossible with this
> stupid fixed address scheme and it must be changed as soon as possible.
sorry, no idea what you're talking about. I can't see any valid single
reason to randomize the addresses. (the only effect is that it will hurt
performance)
Whatever problem you found, feel free to post an exploit so I will
certainly be able to understand your problem, if you can't to me it
means there's no problem.
the closest thing your statements remind me of is the discussion
about the reentrancy of the gettimeofday, basically to allow
virtualization, if that's what you meant that can be addressed just fine
with a modification to the ptes with a syscall, no valid reason to
slow down the production fast path with an inefficient API just for the
re-virtualization of the vsyscalls.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 4:35 ` Ulrich Drepper
2004-02-03 5:34 ` Andrea Arcangeli
@ 2004-02-03 8:52 ` Jamie Lokier
2004-02-03 16:25 ` Andrea Arcangeli
1 sibling, 1 reply; 42+ messages in thread
From: Jamie Lokier @ 2004-02-03 8:52 UTC (permalink / raw)
To: Ulrich Drepper; +Cc: Andrea Arcangeli, john stultz, lkml
Ulrich Drepper wrote:
> You got to be kidding. Some object fixed in the address space which can
> perform system calls. Nothing is more welcome to somebody trying to
> exploit some bugs.
Two approaches to randomising the vdso address:
1. Selecting a random address at boot time. All tasks have the same
vdso for that run of the kernel. Advantages: no MSR write at
each context switch; could patch libsyscall.so at boot time with
address if we were fanatical about optimisation (i.e. other
libcs, not Glibc :) Disadvantages: the attacker may eventually
learn the address.
2. Select a random address for every new task. Advantages: harder
to guess from studying a machine for a long time. Disadvantages:
slower context switches; the gain from randomising each task is
nothing if all the tasks are very long lived anyway.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 8:52 ` Jamie Lokier
@ 2004-02-03 16:25 ` Andrea Arcangeli
2004-02-03 17:37 ` Jamie Lokier
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-03 16:25 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
On Tue, Feb 03, 2004 at 08:52:24AM +0000, Jamie Lokier wrote:
> Ulrich Drepper wrote:
> > You got to be kidding. Some object fixed in the address space which can
> > perform system calls. Nothing is more welcome to somebody trying to
> > exploit some bugs.
>
> Two approaches to randomising the vdso address:
>
> 1. Selecting a random address at boot time. All tasks have the same
> vdso for that run of the kernel. Advantages: no MSR write at
> each context switch; could patch libsyscall.so at boot time with
> address if we were fanatical about optimisation (i.e. other
> libcs, not Glibc :) Disadvantages: the attacker may eventually
> learn the address.
>
> 2. Select a random address for every new task. Advantages: harder
> to guess from studying a machine for a long time. Disadvantages:
> slower context switches; the gain from randomising each task is
> nothing if all the tasks are very long lived anyway.
could you please explain what's the point of this randomising thing — what
is this attacker trying to do?
nothing can be randomized, as far as the vsyscall can be executed it
means its address in the address space is known and not random. If the
address is random you can't execute it. The whole vsyscall space is
readonly, the attacker can do nothing on it, no way to touch it with
put_user either.
on x86-64 whatever is executable is readable too (readable non
executable is possible but that's another issue)
whatever the API you'll always be able to find the vsyscall address or
it means you can't execute it in the first place.
so in short, either we have vsyscalls non-randomized, or we don't have
them at all, period.
especially having a fixed address per-kernel makes no sense at all since
it's trivial to find out by all other tasks anyways.
the current API was presented around two years ago at UKUUG, and it was
developed in the open in the x86-64 mailing list (archives should be
online), so if there's really a fundamental problem it would have been much
better if you would send your complains to those lists at that time,
instead of coming out of the blue years later when the code runs in
production just fine for years (and it's in glibc for a long time too I
think).
Still I'm struggling to understand what's your point about
randomization, your request makes no sense at all to me and I cannot
imagine any remote security issue related to the current API of the
vsyscalls, furthermore I cannot remotely imagine any difference in terms
of security by using a vsyscall table, the only difference to the end
user would be that its userspace would be running slower, while right
now it's running as fast as the hardware can.
I would appreciate a more detailed explanation rather than "address must
randomized and the api must be changed".
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 16:25 ` Andrea Arcangeli
@ 2004-02-03 17:37 ` Jamie Lokier
2004-02-03 18:10 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Jamie Lokier @ 2004-02-03 17:37 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Ulrich Drepper, john stultz, lkml
Andrea Arcangeli wrote:
> could you please explain what's the point of this randomising thing what
> this attacker is trying to do?
Most buffer overflow attacks work by overwriting the return address of
a function to make it jump to a known fixed address.
The simplest form of that is where the stack is at a known address, so
the attack overwrites a return address to jump to the stack, to
instructions which are directly controlled by the attacker (part of
the same buffer overflow).
When the stack is not executable or randomised, more complex attacks
are used that take advantage of code sequences in the library or
executable itself.
To counter that, if everything executable is mapped at a random
address, there is no fixed address that can be jumped to for this kind
of attack. More complex attacks which trick code into behaving
wrongly are required.
> nothing can be randomized, as far as the vsyscall can be executed it
> means its address in the address space is known and not random. If the
> address is random you can't execute it. The whole vsyscall space is
> readonly, the attacker can do nothing on it, no way to touch it with
> put_user either.
In this context, random means that the process knows the address and
the (remote) attacker does not.
> especially having a fixed address per-kernel makes no sense at all since
> it's trivial to find out by all other tasks anyways.
To put it another way, that protects against some kinds of remote
attack but it doesn't protect at all against local ones.
> the current API was presented around two years ago at UKUUG, and it was
> developed in the open in the x86-64 mailing list (archives should be
> online), so if there's really a fundamental problem it would have been much
> better if you would send your complains to those lists at that time,
> instead of coming out of the blue years later when the code runs in
> production just fine for years (and it's in glibc for a long time too I
> think).
> Still I'm struggling to understand what's your point about
> randomization, your request makes no sense at all to me
I presume you mean Ulrich's request? I couldn't care less :)
Also you'll notice that randomised executables & libraries is a
relatively new feature, nobody was doing it 2 years ago.
> and I cannot imagine any remote security issue related to the
> current API of the vsyscalls,
Simple: Attack finds buffer overflow, uses it to overwrite a
function's return address to make the CPU jump to the vsyscall code.
There's a good chance the function will have popped some registers
before returning, to values also set by the overflow. If that
function isn't quite convenient enough, the overflow could overwrite
the parent function's registers and return address instead.
By making the CPU jump to the vsyscall code and with some register
values settable, the attack can perform a syscall. This is the remote
security issue: it allows a buffer overflow to escalate easily to
making a syscall.
All systems with non-randomised libc address have this problem at the
moment, i.e. virtually all systems. There's just a few that have been
hardened with the randomised executable and library stuff, and Ulrich
would like that to be complete, which means the vsyscall page as well.
> furthermore I cannot remotely imagine any difference in terms
> of security by using a vsyscall table, the only difference to the end
> user would be that its userspace would be running slower, while right
> now it's running as fast as the hardware can.
The vsyscall table discussion has nothing to do with security.
At the moment, Glibc is not running as fast as the hardware can on
i386, but the cost of making it do so which includes some program
startup time and memory cost is considered not worth the minor speed change.
> I would appreciate a more detailed explanation rather than "address must
> randomized and the api must be changed".
If there's something I missed feel free to ask.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 17:37 ` Jamie Lokier
@ 2004-02-03 18:10 ` Andrea Arcangeli
2004-02-03 18:23 ` Jamie Lokier
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-03 18:10 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
On Tue, Feb 03, 2004 at 05:37:16PM +0000, Jamie Lokier wrote:
> Andrea Arcangeli wrote:
> > could you please explain what's the point of this randomising thing what
> > this attacker is trying to do?
>
> Most buffer overflow attacks work by overwriting the return address of
> a function to make it jump to a known fixed address.
>
> The simplest form of that is where the stack is at a known address, so
> the attack overwrites a return address to jump to the stack, to
> instructions which are directly controlled by the attacker (part of
> the same buffer overflow).
>
> When the stack is not executable or randomised, more complex attacks
> are used that take advantage of code sequences in the library or
> executable itself.
>
> To counter that, if everything executable is mapped at a random
> address, there is no fixed address that can be jumped to for this kind
> of attack. More complex attacks which trick code into behaving
> wrongly are required.
so you're talking about using vgettimeofday to exploit an insecure
exploitable buggy application.
so you mean, people could jump to the vsyscall address, this is true. so
what, they will execute gettimeofday, then what? how bad is to execute
vgettimeofday? when vgettimeofday returns it will pop the next address
on the stack and then what? if you can change the stack with the
parameter of vgettimeofday then you know the stack address too.
I don't see any decrease of security in allowing an attacker to execute
vgettimeofday.
vsyscalls will never execute anything like execve. They can at most
modify userspace memory a fixed address, so if the userspace isn't
fixed, then nothing can be done with a vsyscall.
Please elaborate how can you use vsyscalls to write an exploit for a
buggy application. I don't see it.
> I presume you mean Ulrich's request? I couldn't care less :)
yes ;)
> Also you'll notice that randomised executables & libraries is a
> relatively new feature, nobody was doing it 2 years ago.
with vsyscalls at worst you can modify memory at a fixed address passed
as parameter, so as far as the interesting userspace parts are
randomized (i.e. system()) there's no way to exploit anything jumping in
a vsyscall.
> > and I cannot imagine any remote security issue related to the
> > current API of the vsyscalls,
>
> Simple: Attack finds buffer overflow, uses it to overwrite a
> function's return address to make the CPU jump to the vsyscall code.
> There's a good chance the function will have popped some registers
> before returning, to values also set by the overflow. If that
> function isn't quite convenient enough, the overflow could overwrite
> the parent function's registers and return address instead.
>
> By making the CPU jump to the vsyscall code and with some register
> values settable, the attack can perform a syscall. This is the remote
how can the attacker perform a syscall? at worst it can do a
gettimeofday syscall anyways which is equivalent to the vgettimeofday.
> security issue: it allows a buffer overflow to escalate easily to
> making a syscall.
so what, how can a gettimeofday syscall can ever help the attacker?
we're talking about x86-64, only purely readonly syscalls are exported
as vsyscalls, we're not using vsyscalls to run random syscalls, never.
> All systems with non-randomised libc address have this problem at the
> moment, i.e. virtually all systems. There's just a few that have been
> hardened with the randomised executable and library stuff, and Ulrich
> would like that to be complete, which means the vsyscall page as well.
relocating the vsyscall page is doable anyways, this is not the problem
and it doesn't require any change of API, it doesn't impact the current
API at all.
You simply want a syscall that can relocate the vsyscall page in a
random place of the unused kernel address space. This is perfectly
doable. But it slows down the kernel if it has to be per-process as it
will require a tlb flush for every mm switch. So I'd rather do it only
if you can demonstrate that vgettimeofday vtime can ever help an
attacker and I don't see it, allowing an attacker to execute one
specific system
call is not bad, allowing an attacker to execute _any_ system call is
bad instead.
> > furthermore I cannot remotely imagine any difference in terms
> > of security by using a vsyscall table, the only difference to the end
> > user would be that its userspace would be running slower, while right
> > now it's running as fast as the hardware can.
>
> The vsyscall table discussion has nothing to do with security.
agreed.
> If there's something I missed feel free to ask.
I don't see how can you exploit an application by allowing the attacker
to run vgettimeofday or even gettimeofday if the rest of the userspace
is randomized as claimed. Sure you can try to modify a fixed address,
that you could not reach, this is a slight decrease in security iff the
userspace is not randomized (as otherwise claimed), but no way to run a
execve or similar bad thing. And the syscall table wouldn't change
anything. you're simply asking for a syscall for relocation of the
vsyscall page, this is similar to what uml needs for revirtualization
too, and it can be implemented, and it has nothing to do with the actual
API (that would be an extension to the API) or the lack of syscall table
that would only slow down the userspace.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 18:10 ` Andrea Arcangeli
@ 2004-02-03 18:23 ` Jamie Lokier
2004-02-03 18:34 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Jamie Lokier @ 2004-02-03 18:23 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Ulrich Drepper, john stultz, lkml
Andrea Arcangeli wrote:
> vsyscalls will never execute anything like execve. They can at most
> modify userspace memory a fixed address, so if the userspace isn't
> fixed, then nothing can be done with a vsyscall.
Are we talking about the same x86_64?
I see this in arch/x86_64/vsyscall.S:
__kernel_vsyscall:
.LSTART_vsyscall:
push %ebp
.Lpush_ebp:
movl %ecx, %ebp
syscall
Is that page not mapped into userspace?
If the answer is no, then btw we were talking about i386 until you joined in. :)
The "sysenter" instruction is definitely mapped into userspace there.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-03 18:23 ` Jamie Lokier
@ 2004-02-03 18:34 ` Andrea Arcangeli
0 siblings, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-03 18:34 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Ulrich Drepper, john stultz, lkml
On Tue, Feb 03, 2004 at 06:23:10PM +0000, Jamie Lokier wrote:
> Andrea Arcangeli wrote:
> > vsyscalls will never execute anything like execve. They can at most
> > modify userspace memory a fixed address, so if the userspace isn't
> > fixed, then nothing can be done with a vsyscall.
>
> Are we talking about the same x86_64?
I did; I don't think it's worth backporting to i386, btw.
>
> I see this in arch/x86_64/vsyscall.S:
>
> __kernel_vsyscall:
> .LSTART_vsyscall:
> push %ebp
> .Lpush_ebp:
> movl %ecx, %ebp
> syscall
>
> Is that page not mapped into userspace?
this code wasn't there last time I worked on it, it's not in 2.4 either.
I assume it's mapped in userspace, but I'm unsure why it's necessary. I
need to think more about it to understand why such code is there and how
can it be removed. I was talking about the .c file, not this new .S one.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
[not found] ` <20040203182310.GA18326@mail.shareable.org.suse.lists.linux.kernel>
@ 2004-02-04 2:27 ` Andi Kleen
2004-02-04 2:40 ` Andrea Arcangeli
2004-02-04 4:21 ` Jamie Lokier
0 siblings, 2 replies; 42+ messages in thread
From: Andi Kleen @ 2004-02-04 2:27 UTC (permalink / raw)
To: Jamie Lokier; +Cc: johnstul, drepper, linux-kernel, andrea
Jamie Lokier <jamie@shareable.org> writes:
> Andrea Arcangeli wrote:
> > vsyscalls will never execute anything like execve. They can at most
> > modify userspace memory a fixed address, so if the userspace isn't
> > fixed, then nothing can be done with a vsyscall.
>
> Are we talking about the same x86_64?
>
> I see this in arch/x86_64/vsyscall.S:
>
> __kernel_vsyscall:
> .LSTART_vsyscall:
> push %ebp
> .Lpush_ebp:
> movl %ecx, %ebp
> syscall
>
> Is that page not mapped into userspace?
It is. It is needed for the vsyscall fallback for UML (UML cannot
support fixed address vsyscalls) and when we have to disable user
space vgettimeofday for other reasons (e.g. to use alternative time
sources that cannot be mapped to user space or doing time workarounds
that require real locks)
But any security advantages of not having it are at best illusionary.
If you don't believe me just grep any random executable for
0x0f 0x05 (= syscall) or 0xcd 0x80 (= int $0x80). Even if it wasn't
in the vsyscall page you just have to find these two bytes somewhere
(doesn't have to be an own instruction, they occur commonly as part
of other instructions or data) and jump to them. Executables are
at fixed addresses.
-Andi
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-04 2:27 ` Andi Kleen
@ 2004-02-04 2:40 ` Andrea Arcangeli
2004-02-04 4:21 ` Jamie Lokier
1 sibling, 0 replies; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-04 2:40 UTC (permalink / raw)
To: Andi Kleen; +Cc: Jamie Lokier, johnstul, drepper, linux-kernel
On Wed, Feb 04, 2004 at 03:27:16AM +0100, Andi Kleen wrote:
> Jamie Lokier <jamie@shareable.org> writes:
>
> > Andrea Arcangeli wrote:
> > > vsyscalls will never execute anything like execve. They can at most
> > > modify userspace memory a fixed address, so if the userspace isn't
> > > fixed, then nothing can be done with a vsyscall.
> >
> > Are we talking about the same x86_64?
> >
> > I see this in arch/x86_64/vsyscall.S:
> >
> > __kernel_vsyscall:
> > .LSTART_vsyscall:
> > push %ebp
> > .Lpush_ebp:
> > movl %ecx, %ebp
> > syscall
> >
> > Is that page not mapped into userspace?
>
> It is. It is needed for the vsyscall fallback for UML (UML cannot
> support fixed address vsyscalls) and when we have to disable user
> space vgettimeofday for other reasons (e.g. to use alternative time
> sources that cannot be mapped to user space or doing time workarounds
> that require real locks)
the fallback in gettimeofday which may be needed in some system would
require a syscall instruction at fixed address too indeed (however in
most systems that is not necessary so the uml fallback seems to be the
one inserting the syscall instruction in common hardware).
> But any security advantages of not having it are at best illusionary.
> If you don't believe me just grep any random executable for
> 0xf 0x05 (= syscall) or 0xcd 0x80 (= int $0x80). Even if it wasn't
> in the vsyscall page you just have to find these two bytes somewhere
> (doesn't have to be an own instruction, they occur commonly as part
> of other instructions or data) and jump to them. Executables are
> at fixed addresses.
agreed.
And if they really want to relocate the vsyscall page, it's possible to
implement with a new syscall without having to slow down or change the
API. We simply need to change the pte during context switch, but it will
force some invlpg at every context switch. I agree it isn't worth it as
long as the executable is the same on all systems.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-04 2:27 ` Andi Kleen
2004-02-04 2:40 ` Andrea Arcangeli
@ 2004-02-04 4:21 ` Jamie Lokier
2004-02-05 21:43 ` Andrea Arcangeli
1 sibling, 1 reply; 42+ messages in thread
From: Jamie Lokier @ 2004-02-04 4:21 UTC (permalink / raw)
To: Andi Kleen; +Cc: johnstul, drepper, linux-kernel, andrea
Andi Kleen wrote:
> Executables are at fixed addresses.
No, they are not.
Look up PIE - Position Independent Executable.
That's the point: on a hardened system _all_ objects, executable and
libraries, are mapped at randomised addresses. Therefore the simple
overwrite-return-address exploit is no longer reliable and tends to
crash the program.
That's what this desire for randomised VDSO address is all about. The
executable and all the libraries are at random addresses in
security-hardened PIE systems.
(Actually even when executables are at fixed addresses, they can be
mapped at an address which is harder to exploit because the address
contains a zero byte - something which is harder to get into a buffer
overflow - but only a little harder).
[ Ulrich: I see randomised prelinking with PIE mentioned, to give
per-box random addresses instead of per process. I guess I wasn't far
wrong in suggesting prelinked random VDSO positions :) ]
If you are not running PIE and randomised executable and library
positions, then I agree there is nothing to gain from varying the VDSO
position, and it is a slight performance loss so should be disabled.
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-04 4:21 ` Jamie Lokier
@ 2004-02-05 21:43 ` Andrea Arcangeli
2004-02-06 4:15 ` Rik van Riel
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-05 21:43 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Andi Kleen, johnstul, drepper, linux-kernel
On Wed, Feb 04, 2004 at 04:21:34AM +0000, Jamie Lokier wrote:
> Andi Kleen wrote:
> > Executables are at fixed addresses.
>
> No, they are not.
this won't happen without some cost, the vsyscalls relocation syscall
(the current API extension) as well won't happen without some cost. And
the idea of having the vsyscall fixed per-system makes little sense
since it doesn't protect against the local exploits, so if we add it, it
has to be relocated per-task, so it will have some real cost (doing it
fixed per-system would have zerocost instead).
However I'm unsure if you want all applications to be relocated
randomly, and in turn if you want the vsyscalls relocated for all apps,
exactly because this carries a cost. I think it should be optional. I
don't think I want the slowdown of having all my applications relocated.
And really before you can ever care about the relocation, for the
security-critical-apps you should recompile the app with stackguard
immediately so that you will trap when functions return and pop an
address, rendering the vsyscall relocation useless too since it'll never
jump there. So before I can ever care about the vsyscalls relocation I
want all security related apps compiled with stackguard, and secondly I
want the ELF executable binary image relocated as well at runtime
randomly. Only then I will bother to add the vsyscall relocation syscall
that will simply allow userspace to define the address where to move the
vsyscall and it'll flush the tlb and allocate a new pte to map the
vsyscall page in there and it'll do the tlb flush and pte update during
context switch. So in short there are a lot higher prio things to take
care of IMHO, before going down to the vsyscall address level.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-05 21:43 ` Andrea Arcangeli
@ 2004-02-06 4:15 ` Rik van Riel
2004-02-06 4:28 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Rik van Riel @ 2004-02-06 4:15 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Jamie Lokier, Andi Kleen, johnstul, drepper, linux-kernel
On Thu, 5 Feb 2004, Andrea Arcangeli wrote:
> However I'm unsure if you want all applications to be relocated
> ranodmly, and in turn if you want the vsyscalls relocated for all apps,
> exactly because this carry a cost. I think it should be optional.
If you think extra security should be optional, please don't
argue against it completely.
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-06 4:15 ` Rik van Riel
@ 2004-02-06 4:28 ` Andrea Arcangeli
2004-02-06 9:23 ` Ulrich Drepper
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-06 4:28 UTC (permalink / raw)
To: Rik van Riel; +Cc: Jamie Lokier, Andi Kleen, johnstul, drepper, linux-kernel
On Thu, Feb 05, 2004 at 11:15:00PM -0500, Rik van Riel wrote:
> On Thu, 5 Feb 2004, Andrea Arcangeli wrote:
>
> > However I'm unsure if you want all applications to be relocated
> > ranodmly, and in turn if you want the vsyscalls relocated for all apps,
> > exactly because this carry a cost. I think it should be optional.
>
> If you think extra security should be optional, please don't
> argue against it completely.
I don't think I was arguing against it completely, exactly because I'm
just saying it should be optional.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-06 4:28 ` Andrea Arcangeli
@ 2004-02-06 9:23 ` Ulrich Drepper
2004-02-06 15:49 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Ulrich Drepper @ 2004-02-06 9:23 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Rik van Riel, Jamie Lokier, Andi Kleen, johnstul, linux-kernel
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Andrea Arcangeli wrote:
> I don't think I was arguing against it completely, exactly because I'm
> just saying it should be optional.
And the result is that the current fast syscall handling on x86-64 is
completely unacceptable. If it's not changed, security enhancements are
not possible since the libc has to hardcode the address.
- --
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.3 (GNU/Linux)
iD8DBQFAI10P2ijCOnn/RHQRAuegAKCtk8W1cXWKlTWkDrmfJfykzvqATQCfRX4Q
cUVAR4+yIue/MFRL2xNbwfQ=
=VHoF
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-06 9:23 ` Ulrich Drepper
@ 2004-02-06 15:49 ` Andrea Arcangeli
2004-02-07 0:37 ` Ulrich Drepper
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-06 15:49 UTC (permalink / raw)
To: Ulrich Drepper
Cc: Rik van Riel, Jamie Lokier, Andi Kleen, johnstul, linux-kernel
On Fri, Feb 06, 2004 at 01:23:23AM -0800, Ulrich Drepper wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> Andrea Arcangeli wrote:
>
> > I don't think I was arguing against it completely, exactly because I'm
> > just saying it should be optional.
>
> And the result is that the current fast syscall handling on x86-64 is
> completely unacceptable. If it's not change security enhancements are
> not possible since the libc has to hardcode the address.
by the same argument the 2.6 i386 vsyscall is not acceptable too since
it has a hardcoded address too that is the same for all binary kernels
that you ship, and furthermore it has the sysenter or int 0x80 hardcoded
at a fixed address to jump into. In short either you claim the 2.6.2
i386 code as broken the way glibc calls into it, or x86-64 is perfectly
fine too. So your claims make very little sense to me.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-06 15:49 ` Andrea Arcangeli
@ 2004-02-07 0:37 ` Ulrich Drepper
2004-02-07 2:19 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Ulrich Drepper @ 2004-02-07 0:37 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Rik van Riel, Jamie Lokier, Andi Kleen, johnstul, linux-kernel
Andrea Arcangeli wrote:
> by the same argument the 2.6 i386 vsyscall is not acceptable too since
> it has an hardcoded address too that is the same for all binary kernels
> that you ship, and furthmore it has the sysenter or int 0x80 hardcoded
> at a fixed address to jump into.
You don't read what I write.
The official kernel might have the vdso at a fixed address, but no part
of the ABI requires this address and so anybody with some security
conscience can change the kernel to randomize the vdso address. It's
not my or Ingo's fault that Linus doesn't like the exec-shield code
which would introduce the randomization. The important aspect is that
we can add vdso randomization and nothing else needs changing. The same
libc will run on a stock kernel and the one with the randomized vdso.
This is not the case on x86-64 where the absolute address for the
gettimeofday is used.
--
➧ Ulrich Drepper ➧ Red Hat, Inc. ➧ 444 Castro St ➧ Mountain View, CA ❖
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-07 0:37 ` Ulrich Drepper
@ 2004-02-07 2:19 ` Andrea Arcangeli
2004-02-07 3:37 ` Daniel Jacobowitz
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-07 2:19 UTC (permalink / raw)
To: Ulrich Drepper
Cc: Rik van Riel, Jamie Lokier, Andi Kleen, johnstul, linux-kernel
On Fri, Feb 06, 2004 at 04:37:15PM -0800, Ulrich Drepper wrote:
> Andrea Arcangeli wrote:
>
> > by the same argument the 2.6 i386 vsyscall is not acceptable too since
> > it has an hardcoded address too that is the same for all binary kernels
> > that you ship, and furthmore it has the sysenter or int 0x80 hardcoded
> > at a fixed address to jump into.
>
> You don't read what I write.
not sure if it's just me, and if what you write is obvious to everyone
on the list.
> The official kernel might have the vdso at a fixed address part no part
> of the ABI requires this address and so anybody with some security
> conscience can change the kernel to randomize the vdso address. It's
> not my or Ingo's fault that Linus doesn't like the exec-shield code
> which would introduce the randomization. The important aspect is that
> we can add vdso randomization and nothing else needs changing. The same
> libc will run6 on a stock kernel and the one with the randomized vdso.
> This is not the case on x86-64 where the absolute address for the
> gettimeofday is used.
I don't know exactly what your "randomization exec-shield" code is doing
either. the way I understand what you wrote is that you want to relocate
the vsyscall transparently without glibc knowledge, so in short you're
saying that you don't care to randomize everything in the userspace
executable address space, you only care to relocate the vgettimeofday
bytecode, not the rest of the vsyscall pieces. So with your solution
you'll still have "fixed" addresses in the address space that will allow
an attacker to execute vgettimeofday, just like glibc can execute it
without noticing the actual function was relocated. As far as glibc
won't notice that vgettimeofday has been relocated by your
"exec-shield", it means the attacker as well can execute it just fine.
So your solution that doesn't randomize everything look less secure
because you still have a "fixed address executable vgettimeofday", and
most important it's less efficient in terms of performance too since the
cpu will have to lookup the address at runtime every time you run a
vsyscall (or even a syscall like in john's example), instead of doing it
only once for all at execve time in glibc.
in short the problem with what you're proposing is that as far as you
refuse to choose the "random" address in glibc instead of in kernel, it
means you're not randomizing all the vsyscall code (less secure), and it
also means you'll have to find out what the kernel has chosen at
runtime at every vsyscall (and in turn potentially at every syscall on
x86 where kernel chooses between sysenter and int 0x80).
If you take my approch of choosing the address in glibc (possibly only
when a certain environment variable is set, to avoid randomizing for
performance-critical apps in non-networked trusted environments) and
passing it to the kernel with a new syscall "mremap_vsyscall", you'll
solve those problems, and there will be no difference at all between x86
and x86-64. We may even use mremap by teaching it the potential
vsyscall space.
BTW, on both x86 and x86-64 brute forcing won't be too hard, there's
only a few mbytes on x86 and 2G on x86-64 to randomize, that means on
x86-64 you'll break it in mean after 524288 tries, that means 8 minutes
if each try takes 1msec or 6 days if each try takes 1sec. On x86 instead
(due the too short address space already allocated for the direct
mapping and vmalloc areas) it will take next to nothing to brute force
it; in fact I'm unsure if it's even worth trying to randomize it on x86. It
probably makes sense for the network daemons that after a non successful
exploit, will segfault and won't be reachable anymore (hoping it's not a
service restarted automatically by xinetd, in such case the brute force
will take a few minutes ;). So by having say 16 chances you'll decrease
of 15/16 the possibility of being exploited, and you'll turn the exploit
in a segfault.
Comments welcome.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-07 2:19 ` Andrea Arcangeli
@ 2004-02-07 3:37 ` Daniel Jacobowitz
2004-02-07 4:36 ` Andrea Arcangeli
0 siblings, 1 reply; 42+ messages in thread
From: Daniel Jacobowitz @ 2004-02-07 3:37 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Ulrich Drepper, Rik van Riel, Jamie Lokier, Andi Kleen, johnstul,
linux-kernel
On Sat, Feb 07, 2004 at 03:19:55AM +0100, Andrea Arcangeli wrote:
> > The official kernel might have the vdso at a fixed address part no part
> > of the ABI requires this address and so anybody with some security
> > conscience can change the kernel to randomize the vdso address. It's
> > not my or Ingo's fault that Linus doesn't like the exec-shield code
> > which would introduce the randomization. The important aspect is that
> > we can add vdso randomization and nothing else needs changing. The same
> > libc will run6 on a stock kernel and the one with the randomized vdso.
> > This is not the case on x86-64 where the absolute address for the
> > gettimeofday is used.
>
> I don't know exactly what your "randomization exec-shield" code is doing
> either. the way I understand what you wrote is that you want to relocate
> the vsyscall trasparently without glibc knowledge, so in short you're
> saying that you don't care to randomize everything in the userspace
> executable address space, you only care to relocate the vgettimeofday
> bytecode, not the rest of the vsyscall pieces. So with your solution
> you'll still have "fixed" addresses in the address space that will allow
> an attacker to execute vgettimeofday, just like glibc can execute it
> without noticing the actual function was relocated. As far as glibc
> won't notice that vgettimeofday has been relocated by your
> "exec-shield", it means the attacker as well can execute it just fine.
You might want to stop and take a look at the way this works on i386
before you argue with Ulrich any more about it.
Specifically, the vsyscall DSO is constructed as a normal ELF image,
and its base address is passed to glibc as an AT_SYSINFO tag in the
application's auxv vector. Glibc source code has absolutely no
knowledge of the base address, which in fact has changed at least three
times since it was created.
--
Daniel Jacobowitz
MontaVista Software Debian GNU/Linux Developer
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-07 3:37 ` Daniel Jacobowitz
@ 2004-02-07 4:36 ` Andrea Arcangeli
2004-02-07 4:53 ` Jamie Lokier
0 siblings, 1 reply; 42+ messages in thread
From: Andrea Arcangeli @ 2004-02-07 4:36 UTC (permalink / raw)
To: Ulrich Drepper, Rik van Riel, Jamie Lokier, Andi Kleen, johnstul,
linux-kernel
On Fri, Feb 06, 2004 at 10:37:59PM -0500, Daniel Jacobowitz wrote:
> On Sat, Feb 07, 2004 at 03:19:55AM +0100, Andrea Arcangeli wrote:
> > > The official kernel might have the vdso at a fixed address part no part
> > > of the ABI requires this address and so anybody with some security
> > > conscience can change the kernel to randomize the vdso address. It's
> > > not my or Ingo's fault that Linus doesn't like the exec-shield code
> > > which would introduce the randomization. The important aspect is that
> > > we can add vdso randomization and nothing else needs changing. The same
> > > libc will run6 on a stock kernel and the one with the randomized vdso.
> > > This is not the case on x86-64 where the absolute address for the
> > > gettimeofday is used.
> >
> > I don't know exactly what your "randomization exec-shield" code is doing
> > either. the way I understand what you wrote is that you want to relocate
> > the vsyscall trasparently without glibc knowledge, so in short you're
> > saying that you don't care to randomize everything in the userspace
> > executable address space, you only care to relocate the vgettimeofday
> > bytecode, not the rest of the vsyscall pieces. So with your solution
> > you'll still have "fixed" addresses in the address space that will allow
> > an attacker to execute vgettimeofday, just like glibc can execute it
> > without noticing the actual function was relocated. As far as glibc
> > won't notice that vgettimeofday has been relocated by your
> > "exec-shield", it means the attacker as well can execute it just fine.
>
> You might want to stop and take a look at the way this works on i386
> before you argue with Ulrich any more about it.
>
> Specifically, the vsyscall DSO is constructed as a normal ELF image,
> and its base address is passed to glibc as an AT_SYSINFO tag in the
> application's auxv vector. Glibc source code has absolutely no
> knowledge of the base address, which in fact has changed at least three
> times since it was created.
(changing three times is worthless in terms of security, all computers
runs the same bzImage so it's not changing, anyways as Ulrich said this
can be fixed transparently in "their" kernel)
The idea of randomizing the base address in kernel is not different from
generating it in glibc and asking the kernel to relocate. This means it
probably won't be an environment variable but a root system wide sysctl
to control the randomization (so I find it less flexible, though it's
probably simpler to implement). But regardless my point is that the last
patch posted by john is not the way to go. glibc should call into a
fixed _offset_, the base address after all doesn't matter much if it's
generated by kernel or glibc, so I don't care if it's the kernel
randomizing, my whole point is that glibc must keep calling
vgettimeofday _direct_ without passing through the vsyscall wrapper
where all the other syscalls are passing through. there is not point to
pass through a wrapper when you can speed it up using a _fixed_ offset.
Glibc needs a hardcoded, _fixed_, set-in-stone table of offsets (yeah
they're not "addresses" like in x86-64, but still they're fixed and
glibc knows about each vsyscall). If we want to implement it the same
way in x86-64 too (randomizing in kernel and passing down the random
base address from kernel instead of generating it in glibc), it maybe
troublesome for compatibility, however I don't care that much about it,
as far as there is a table of hardcoded like in a stone offsets for each
vsyscall, to call it directly, without a table or wrapper. I preferred
the generation of the address in glibc (where the randomizing code must
already exist to randomize everything else including the .text of the
normally non relocatable binary) that was also backwards compatible, but
this is not the bit I care about. The bit I care about is that glibc
should know about the vsyscall to be efficient, and that the offsets
should be fixed.
And as said, the randomization on x86 will be a joke to brute force, so
this randomization issue mostly matters for x86-64.
I'd like to know if people dislikes the mremap for the vsyscalls, and if
they prefer the randomization code duplicated in kernel breaking
backwards compatibility with current production x86-64 glibc.
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch
2004-02-07 4:36 ` Andrea Arcangeli
@ 2004-02-07 4:53 ` Jamie Lokier
0 siblings, 0 replies; 42+ messages in thread
From: Jamie Lokier @ 2004-02-07 4:53 UTC (permalink / raw)
To: Andrea Arcangeli
Cc: Ulrich Drepper, Rik van Riel, Andi Kleen, johnstul, linux-kernel
Andrea Arcangeli wrote:
> (changing three times is worthless in terms of security, all computers
> runs the same bzImage so it's not changing, anyways as Ulrich said this
> can be fixed transparently in "their" kernel)
Andrea, please stop mixing the different arguments. The three changes
were for technical reasons, not security.
> The bit I care about is that glibc should know about the vsyscall to
> be efficient,
This I agree with,
> and that the offsets should be fixed.
If the vdso position can vary between kernels, there is no real
technical reason why the offsets have to be fixed. At the simplest,
just like there is AT_SYSINFO to get the generic syscall entry point,
pass an AT_SYSINFO_GTOD for the gettimeofday syscall. Glibc can use
that with no significant changes to its existing mechanism.
(Although I prefer to use symbols in the vdso because it's cleaner,
AT_SYSINFO_GTOD works. Another alternative is to have a table of
offsets in the vsyscall page which Glibc can read - future extensible
without the overhead of symbol lookups which Ulrich doesn't like).
-- Jamie
^ permalink raw reply [flat|nested] 42+ messages in thread
end of thread, other threads:[~2004-02-07 4:53 UTC | newest]
Thread overview: 42+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-01-29 2:46 [RFC][PATCH] linux-2.6.2-rc2_vsyscall-gtod_B1.patch john stultz
2004-01-29 5:06 ` Ulrich Drepper
2004-01-29 13:26 ` Jamie Lokier
2004-01-29 18:05 ` Ulrich Drepper
2004-01-29 19:15 ` Jamie Lokier
2004-01-29 23:59 ` john stultz
2004-01-30 0:40 ` Ulrich Drepper
2004-01-30 0:31 ` Ulrich Drepper
2004-01-30 4:17 ` Jamie Lokier
2004-01-30 5:09 ` Ulrich Drepper
2004-01-30 9:29 ` Ingo Molnar
2004-02-03 4:38 ` Ulrich Drepper
2004-01-30 17:34 ` Jamie Lokier
2004-01-30 8:33 ` Jakub Jelinek
2004-01-30 17:21 ` Jamie Lokier
2004-01-31 0:10 ` Eric W. Biederman
2004-01-31 2:41 ` Jamie Lokier
2004-01-31 5:54 ` Eric W. Biederman
2004-02-01 1:28 ` Andrea Arcangeli
2004-02-03 4:35 ` Ulrich Drepper
2004-02-03 5:34 ` Andrea Arcangeli
2004-02-03 8:52 ` Jamie Lokier
2004-02-03 16:25 ` Andrea Arcangeli
2004-02-03 17:37 ` Jamie Lokier
2004-02-03 18:10 ` Andrea Arcangeli
2004-02-03 18:23 ` Jamie Lokier
2004-02-03 18:34 ` Andrea Arcangeli
2004-01-31 0:17 ` Eric W. Biederman
2004-01-31 2:20 ` john stultz
[not found] <1075344395.1592.87.camel@cog.beaverton.ibm.com.suse.lists.linux.kernel>
[not found] ` <401894DA.7000609@redhat.com.suse.lists.linux.kernel>
[not found] ` <20040201012803.GN26076@dualathlon.random.suse.lists.linux.kernel>
[not found] ` <401F251C.2090300@redhat.com.suse.lists.linux.kernel>
[not found] ` <20040203085224.GA15738@mail.shareable.org.suse.lists.linux.kernel>
[not found] ` <20040203162515.GY26076@dualathlon.random.suse.lists.linux.kernel>
[not found] ` <20040203173716.GC17895@mail.shareable.org.suse.lists.linux.kernel>
[not found] ` <20040203181001.GA26076@dualathlon.random.suse.lists.linux.kernel>
[not found] ` <20040203182310.GA18326@mail.shareable.org.suse.lists.linux.kernel>
2004-02-04 2:27 ` Andi Kleen
2004-02-04 2:40 ` Andrea Arcangeli
2004-02-04 4:21 ` Jamie Lokier
2004-02-05 21:43 ` Andrea Arcangeli
2004-02-06 4:15 ` Rik van Riel
2004-02-06 4:28 ` Andrea Arcangeli
2004-02-06 9:23 ` Ulrich Drepper
2004-02-06 15:49 ` Andrea Arcangeli
2004-02-07 0:37 ` Ulrich Drepper
2004-02-07 2:19 ` Andrea Arcangeli
2004-02-07 3:37 ` Daniel Jacobowitz
2004-02-07 4:36 ` Andrea Arcangeli
2004-02-07 4:53 ` Jamie Lokier
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox