Linux MIPS Architecture development
 help / color / mirror / Atom feed
* [PATCH] Improve o32 syscall handling
@ 2004-11-21 16:45 Thiemo Seufer
  2004-11-22  6:18 ` Ralf Baechle
  2004-11-23 12:47 ` Gleb O. Raiko
  0 siblings, 2 replies; 7+ messages in thread
From: Thiemo Seufer @ 2004-11-21 16:45 UTC (permalink / raw)
  To: linux-mips; +Cc: ralf

Hello All,

This is a major cleanup of the o32 syscall handling.
For the 32bit kernel, it
 - uses a more efficient syscall table layout, and reduces its size
 - also handles stack arguments more efficiently, and allows for up
   to 8 arguments. This gives an indirect fadvise64_64 syscall a
   chance to work.
 - fixes several flaws in the indirect syscall path, like duplicated
   user stack handling, and incomplete argument handling.

For the 64bit kernel, it
 - checks for an unaligned user stack
 - now also allows up to 8 arguments
 - removes unused stack handling cruft from the indirect syscall path
   and does complete argument handling there.


Thiemo


Index: arch/mips/kernel/scall32-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall32-o32.S,v
retrieving revision 1.15
diff -u -p -r1.15 scall32-o32.S
--- arch/mips/kernel/scall32-o32.S	15 Nov 2004 11:49:19 -0000	1.15
+++ arch/mips/kernel/scall32-o32.S	20 Nov 2004 16:46:39 -0000
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 1995, 96, 97, 98, 99, 2000, 01, 02 by Ralf Baechle
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  */
 #include <linux/config.h>
 #include <linux/errno.h>
@@ -32,26 +33,30 @@ NESTED(handle_sys, PT_SIZE, sp)
 
 	lw	t1, PT_EPC(sp)		# skip syscall on return
 
+#if defined(CONFIG_BINFMT_IRIX)
 	sltiu	t0, v0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, v0, __NR_O32_Linux	# check syscall number
+	sltiu	t0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	addiu	t1, 4			# skip to next instruction
 	sw	t1, PT_EPC(sp)
 	beqz	t0, illegal_syscall
 
-	/* XXX Put both in one cacheline, should save a bit. */
-	sll	t0, v0, 2
-	lw	t2, sys_call_table(t0)	# syscall routine
-	lbu	t3, sys_narg_table(v0)	# number of arguments
-	beqz	t2, illegal_syscall;
+	sll	t0, v0, 3
+	la	t1, sys_call_table
+	addu	t1, t0
+	lw	t2, (t1)		# syscall routine
+	lw	t3, 4(t1)		# >= 0 if we need stack arguments
+	beqz	t2, illegal_syscall
 
-	subu	t0, t3, 5		# 5 or more arguments?
 	sw	a3, PT_R26(sp)		# save a3 for syscall restarting
-	bgez	t0, stackargs
+	bgez	t3, stackargs
 
 stack_done:
-	sw	a3, PT_R26(sp)          # save for syscall restart
-	LONG_L	t0, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	t0, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
-	and	t0, t1, t0
+	and	t0, t1
 	bnez	t0, syscall_trace_entry	# -> yes
 
 	jalr	t2			# Do The Real Thing (TM)
@@ -70,9 +75,9 @@ o32_syscall_exit:
 	local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
-	LONG_L	a2, TI_FLAGS($28)	# current->work
+	lw	a2, TI_FLAGS($28)	# current->work
 	li	t0, _TIF_ALLWORK_MASK
-	and	t0, a2, t0
+	and	t0, a2
 	bnez	t0, o32_syscall_exit_work
 
 	j	restore_partial
@@ -117,49 +122,50 @@ syscall_trace_entry:
 	 */
 stackargs:
 	lw	t0, PT_R29(sp)		# get old user stack pointer
-	subu	t3, 4
-	sll	t1, t3, 2		# stack valid?
-
-	addu	t1, t0			# end address
-	or	t0, t1
-	bltz	t0, bad_stack		# -> sp is bad
-
-	lw	t0, PT_R29(sp)		# get old user stack pointer
-	PTR_LA	t1, 4f			# copy 1 to 3 arguments
-	sll	t3, t3, 4
-	subu	t1, t3
-	jr	t1
 
-	/* Ok, copy the args from the luser stack to the kernel stack */
 	/*
-	 * I know Ralf doesn't like nops but this avoids code
-	 * duplication for R3000 targets (and this is the
-	 * only place where ".set reorder" doesn't help).
-	 * Harald.
+	 * We intentionally keep the kernel stack a little below the top of
+	 * userspace so we don't have to do a slower byte accurate check here.
 	 */
+	andi	t1, t0, 7
+	lw	t5, TI_ADDR_LIMIT($28)
+	bnez	t1, bad_stack
+	addu	t4, t0, 32
+	and	t5, t4
+	bltz	t5, bad_stack		# -> sp is bad
+
+	/* Ok, copy the args from the luser stack to the kernel stack.
+	 * t3 is the precomputed number of instruction bytes needed to
+	 * load or store arguments 6-8.
+	 */
+
+	la	t1, 5f			# load up to 3 arguments
+	subu	t1, t3
+1:	lw	t5, 16(t0)		# argument #5 from usp
 	.set    push
 	.set    noreorder
 	.set	nomacro
-1:	lw	t1, 24(t0)		# argument #7 from usp
-	nop
-	sw	t1, 24(sp)
-	nop
-2:	lw	t1, 20(t0)		# argument #5 from usp
-	nop
-	sw	t1, 20(sp)
-	nop
-3:	lw	t1, 16(t0)		# argument #5 from usp
-	nop
-	sw	t1, 16(sp)
-	nop
-4:	.set	pop
+	jr	t1
+	 addiu	t1, 6f - 5f
 
-	j	stack_done		# go back
+2:	lw	t8, 28(t0)		# argument #8 from usp
+3:	lw	t7, 24(t0)		# argument #7 from usp
+4:	lw	t6, 20(t0)		# argument #6 from usp
+5:	jr	t1
+	 sw	t5, 16(sp)		# argument #5 to ksp
+
+	sw	t8, 28(sp)		# argument #8 to ksp
+	sw	t7, 24(sp)		# argument #7 to ksp
+	sw	t6, 20(sp)		# argument #6 to ksp
+6:	j	stack_done		# go back
+	 nop
+	.set	pop
 
 	.section __ex_table,"a"
 	PTR	1b,bad_stack
 	PTR	2b,bad_stack
 	PTR	3b,bad_stack
+	PTR	4b,bad_stack
 	.previous
 
 	/*
@@ -239,12 +245,12 @@ illegal_syscall:
 	sw	v0, PT_R2(sp)		# result
 
 	/* Success, so skip usual error handling garbage.  */
-	LONG_L	a2, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	a2, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
 	and	t0, a2, t0
 	bnez	t0, 1f
 
-	b	o32_syscall_exit
+	j	o32_syscall_exit
 
 1:	SAVE_STATIC
 	move	a0, sp
@@ -270,67 +276,47 @@ bad_alignment:
 	END(sys_sysmips)
 
 	LEAF(sys_syscall)
-	lw	t0, PT_R29(sp)			# user sp
-
-	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
+#if defined(CONFIG_BINFMT_IRIX)
+	sltiu	v0, a0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, a0, __NR_O32_Linux	# check syscall number
+	sltiu	v0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	beqz	v0, enosys
 
-	sll	v0, a0, 2
-	la	v1, sys_syscall
-	lw	t2, sys_call_table(v0)		# function pointer
-	lbu	t4, sys_narg_table(a0)		# number of arguments
-
-	li	v0, -EINVAL
-	beq	t2, v1, out			# do not recurse
+	sll	t0, v0, 3
+	lw	t2, sys_call_table(t0)		# syscall routine
 
+	li	v1, 4000			# nr of sys_syscall
 	beqz	t2, enosys			# null function pointer?
 
-	andi	v0, t0, 0x3			# unaligned stack pointer?
-	bnez	v0, sigsegv
+	li	v0, -EINVAL
+	beq	a0, v1, out			# do not recurse
 
-	addu	v0, t0, 16			# v0 = usp + 16
-	addu	t1, v0, 12			# 3 32-bit arguments
-	lw	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bltz	v1, efault
+	/* Some syscalls like execve get their arguments from struct pt_regs
+	   and claim zero arguments in the syscall table. Thus we have to
+	   assume the worst case and shuffle around all potential arguments.
+	   If you want performance, don't use indirect syscalls. */
 
 	move	a0, a1				# shift argument registers
 	move	a1, a2
 	move	a2, a3
-
-1:	lw	a3, 16(t0)
-2:	lw	t3, 20(t0)
-3:	lw	t4, 24(t0)
-
-	.section	__ex_table, "a"
-	.word	1b, efault
-	.word	2b, efault
-	.word	3b, efault
-	.previous
-
-	sw	t3, 16(sp)			# put into new stackframe
-	sw	t4, 20(sp)
-
-	bnez	t4, 1f				# zero arguments?
-	addu	a0, sp, 32			# then pass sp in a0
-1:
-
-	sw	t3, 16(sp)
-	sw	v1, 20(sp)
+	lw	a3, 16(sp)
+	lw	t4, 20(sp)
+	lw	t5, 24(sp)
+	lw	t6, 28(sp)
+	sw	t4, 16(sp)
+	sw	t5, 20(sp)
+	sw	t6, 24(sp)
+	sw	a0, PT_R4(sp)			# .. and push back a0 - a3, some
+	sw	a1, PT_R5(sp)			# syscalls expect them there
+	sw	a2, PT_R6(sp)
+	sw	a3, PT_R7(sp)
+	sw	a3, PT_R26(sp)			# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys_syscall)
@@ -350,12 +336,14 @@ out:	jr	ra
 	.endm
 
 	.macro	syscalltable
+#if defined(CONFIG_BINFMT_IRIX)
 	mille	sys_ni_syscall		0	/*    0 -  999 SVR4 flavour */
-	#include "irix5sys.h"			/* 1000 - 1999 32-bit IRIX */
+# include "irix5sys.h"				/* 1000 - 1999 32-bit IRIX */
 	mille	sys_ni_syscall		0	/* 2000 - 2999 BSD43 flavour */
 	mille	sys_ni_syscall		0	/* 3000 - 3999 POSIX flavour */
+#endif
 
-	sys	sys_syscall		0	/* 4000 */
+	sys	sys_syscall		8	/* 4000 */
 	sys	sys_exit		1
 	sys	sys_fork		0
 	sys	sys_read		3
@@ -641,19 +629,16 @@ out:	jr	ra
 
 	.endm
 
+	/* We pre-compute the number of _instruction_ bytes needed to
+	   load or store the arguments 6-8. Negative values are ignored. */
+
 	.macro  sys function, nargs
 	PTR	\function
+	LONG	(\nargs << 2) - (5 << 2)
 	.endm
 
 	.align	3
+	.type	sys_call_table,@object
 sys_call_table:
 	syscalltable
 	.size	sys_call_table, . - sys_call_table
-
-	.macro	sys function, nargs
-	.byte	\nargs
-	.endm
-
-sys_narg_table:
-	syscalltable
-	.size	sys_narg_table, . - sys_narg_table
Index: arch/mips/kernel/scall64-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall64-o32.S,v
retrieving revision 1.22
diff -u -p -r1.22 scall64-o32.S
--- arch/mips/kernel/scall64-o32.S	15 Nov 2004 11:49:19 -0000	1.22
+++ arch/mips/kernel/scall64-o32.S	20 Nov 2004 16:46:39 -0000
@@ -6,6 +6,7 @@
  * Copyright (C) 1995 - 2000, 2001 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  *
  * Hairy, the userspace application uses a different argument passing
  * convention than the kernel, so we have to translate things from o32
@@ -43,6 +44,8 @@ NESTED(handle_sys, PT_SIZE, sp)
  RESTORE_ALL
 #endif
 
+	/* We don't want to stumble over broken sign extensions from
+	   userland. O32 does never use the upper half. */
 	sll	a0, a0, 0
 	sll	a1, a1, 0
 	sll	a2, a2, 0
@@ -62,17 +65,21 @@ NESTED(handle_sys, PT_SIZE, sp)
 	 * userspace so we don't have to do a slower byte accurate check here.
 	 */
 	ld	t0, PT_R29(sp)		# get old user stack pointer
+	andi	t3, t0, 7
+	bnez	t3, bad_stack
 	daddu	t1, t0, 32
 	bltz	t1, bad_stack
 
 1:	lw	a4, 16(t0)		# argument #5 from usp
 2:	lw	a5, 20(t0)		# argument #6 from usp
 3:	lw	a6, 24(t0)		# argument #7 from usp
+4:	lw	a7, 28(t0)		# argument #8 from usp (for indirect syscalls)
 
 	.section __ex_table,"a"
 	PTR	1b, bad_stack
 	PTR	2b, bad_stack
 	PTR	3b, bad_stack
+	PTR	4b, bad_stack
 	.previous
 
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
@@ -91,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp)
 	sd	v0, PT_R0(sp)		# flag for syscall restarting
 1:	sd	v0, PT_R2(sp)		# result
 
-FEXPORT(o32_syscall_exit)
+o32_syscall_exit:
 	local_irq_disable		# make need_resched and
 					# signals dont change between
 					# sampling and return
@@ -109,12 +116,11 @@ o32_syscall_exit_work:
 
 trace_a_syscall:
 	SAVE_STATIC
-	sd	a4, PT_R8(sp)
+	sd	t2, PT_R1(sp)
+	sd	a4, PT_R8(sp)		# Save argument registers
 	sd	a5, PT_R9(sp)
 	sd	a6, PT_R10(sp)
-	sd	a7, PT_R11(sp)
-
-	sd	t2,PT_R1(sp)
+	sd	a7, PT_R11(sp)		# For indirect syscalls
 	move	a0, sp
 	li	a1, 0
 	jal	do_syscall_trace
@@ -126,7 +132,8 @@ trace_a_syscall:
 	ld	a3, PT_R7(sp)
 	ld	a4, PT_R8(sp)
 	ld	a5, PT_R9(sp)
-	ld	a6, PT_R10(sp)		# For indirect syscalls
+	ld	a6, PT_R10(sp)
+	ld	a7, PT_R11(sp)		# For indirect syscalls
 	jalr	t2
 
 	li	t0, -EMAXERRNO - 1	# error?
@@ -174,55 +181,40 @@ illegal_syscall:
 	END(handle_sys)
 
 LEAF(sys32_syscall)
-	ld	t0, PT_R29(sp)		# user sp
-
 	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
 	beqz	v0, enosys
 
 	dsll	v0, a0, 3
-	dla	v1, sys32_syscall
 	ld	t2, (sys_call_table - (__NR_O32_Linux * 8))(v0)
 
+	li	v1, 4000		# indirect syscall number
 	li	v0, -EINVAL
-	beq	t2, v1, out		# do not recurse
+	beq	a0, v1, out		# do not recurse
 
 	beqz	t2, enosys		# null function pointer?
 
-	andi	v0, t0, 0x3		# unaligned stack pointer?
-	bnez	v0, sigsegv
-
-	daddiu	v0, t0, 16		# v0 = usp + 16
-	daddu	t1, v0, 12		# 3 32-bit arguments
-	ld	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bnez	v1, efault
-
 	move	a0, a1			# shift argument registers
 	move	a1, a2
 	move	a2, a3
 	move	a3, a4
 	move	a4, a5
 	move	a5, a6
+	move	a6, a7
+	sd	a0, PT_R4(sp)		# ... and push back a0 - a3, some
+	sd	a1, PT_R5(sp)		# syscalls expect them there
+	sd	a2, PT_R6(sp)
+	sd	a3, PT_R7(sp)
+	sd	a3, PT_R26(sp)		# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys32_syscall)
 
 	.align	3
-	.type	sys_call_table,@object;
+	.type	sys_call_table,@object
 sys_call_table:
 	PTR	sys32_syscall			/* 4000 */
 	PTR	sys_exit	

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-21 16:45 [PATCH] Improve o32 syscall handling Thiemo Seufer
@ 2004-11-22  6:18 ` Ralf Baechle
  2004-11-22  7:00   ` Thiemo Seufer
  2004-11-23 12:47 ` Gleb O. Raiko
  1 sibling, 1 reply; 7+ messages in thread
From: Ralf Baechle @ 2004-11-22  6:18 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: linux-mips

On Sun, Nov 21, 2004 at 05:45:57PM +0100, Thiemo Seufer wrote:

> For the 64bit Kernel, it
>  - checks for unaligned user stack

Why bother, the unaligned exception handler should take care of this.

>  - also allows now up to 8 arguments

Quite frankly I'd prefer to see this being handled in userspace.  For o32
it's too late to go for that but for N32 / N64 we still may have a chance.

> -	LONG_L	a2, TI_FLAGS($28)	# current->work
> +	lw	a2, TI_FLAGS($28)	# current->work

Flags is a long variable.

  Ralf

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-22  6:18 ` Ralf Baechle
@ 2004-11-22  7:00   ` Thiemo Seufer
  2004-11-22  7:13     ` Ralf Baechle
  0 siblings, 1 reply; 7+ messages in thread
From: Thiemo Seufer @ 2004-11-22  7:00 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ralf Baechle wrote:
> On Sun, Nov 21, 2004 at 05:45:57PM +0100, Thiemo Seufer wrote:
> 
> > For the 64bit Kernel, it
> >  - checks for unaligned user stack
> 
> Why bother, the unaligned exception handler should take care of this.

It really does so for unaligned accesses from kernel space?

> >  - also allows now up to 8 arguments
> 
> Quite frankly I'd prefer to see this being handled in userspace.  For o32
> it's too late to go for that but for N32 / N64 we still may have a chance.

My changes are for O32 only. N32/N64 doesn't need more than 6 arguments.

> > -	LONG_L	a2, TI_FLAGS($28)	# current->work
> > +	lw	a2, TI_FLAGS($28)	# current->work
> 
> Flags is a long variable.

"long" isn't a quantity the assembler knows about. :-)

The whole assembler file for O32 support in 32bit kernels only makes
sense when it is compiled as 32bit code. In that case, the C "long"
has 4 bytes and is loaded with lw. Using a macro which abstracts for
32/64bit compilation hides this needlessly, and can even lead to the
erroneous impression the code would be useful for 64bit, too.


Thiemo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-22  7:00   ` Thiemo Seufer
@ 2004-11-22  7:13     ` Ralf Baechle
  2004-11-22  9:37       ` Thiemo Seufer
  0 siblings, 1 reply; 7+ messages in thread
From: Ralf Baechle @ 2004-11-22  7:13 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: linux-mips

On Mon, Nov 22, 2004 at 08:00:04AM +0100, Thiemo Seufer wrote:

> > Why bother, the unaligned exception handler should take care of this.
> 
> It really does so for unaligned accesses from kernel space?

Yes.  In fact it's crucially important for this very case.  TCP for example
may result in misalignment.  And not everybody is using get_unaligned /
put_unaligned as they were intended.  Relying on the unaligned handler
is preferable where we expect pointers to be properly aligned almost
always.

The MIPS ABI mandates at least 8 byte stack alignment and funny things
happen if that assumption is violated.  So there is no motivation at all
to care about the performance of misalignment.  Aside from me defining this
to be verboten by punishment of signal 9 ;-)

> has 4 bytes and is loaded with lw. Using a macro which abstracts for
> 32/64bit compilation hides this needlessly, and can even lead to the
> erroneous impression the code would be useful for 64bit, too.

I'm more following the religion of using such abstractions everywhere
because code tends to be copied around mindlessly ...

  Ralf

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-22  7:13     ` Ralf Baechle
@ 2004-11-22  9:37       ` Thiemo Seufer
  2004-11-27  0:29         ` Thiemo Seufer
  0 siblings, 1 reply; 7+ messages in thread
From: Thiemo Seufer @ 2004-11-22  9:37 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ralf Baechle wrote:
> On Mon, Nov 22, 2004 at 08:00:04AM +0100, Thiemo Seufer wrote:
> 
> > > Why bother, the unaligned exception handler should take care of this.
> > 
> > It really does so for unaligned accesses from kernel space?
> 
> Yes.  In fact it's crucially important for this very case.

Ok, I'll update the patch accordingly when I'm back to better
connectivity than I have now.

[snip]
> > has 4 bytes and is loaded with lw. Using a macro which abstracts for
> > 32/64bit compilation hides this needlessly, and can even lead to the
> > erroneous impression the code would be useful for 64bit, too.
> 
> I'm more following the religion of using such abstractions everywhere
> because code tends to be copied around mindlessly ...

I would agree if there was a roughly similar 64bit version of the code.
But due to the differences between 32bit and 64bit kernel there will
never be one, so it's IMHO best to make them as distinct as reasonable
in this case.


Thiemo

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-21 16:45 [PATCH] Improve o32 syscall handling Thiemo Seufer
  2004-11-22  6:18 ` Ralf Baechle
@ 2004-11-23 12:47 ` Gleb O. Raiko
  1 sibling, 0 replies; 7+ messages in thread
From: Gleb O. Raiko @ 2004-11-23 12:47 UTC (permalink / raw)
  To: Thiemo Seufer; +Cc: linux-mips, ralf

Hello,

Thiemo Seufer wrote:
> this is a major cleanup for the o32 syscall handling.

While we're here, there is a ptrace exploit in the syscall handling.

The kernel parses arguments, gets the address of the syscall handling 
routine in t2, and goes to the process which ptraces. On return from 
this process, the kernel restores t2 from the user stack and jumps 
there. I've got an example that gets root from this.

Regards,
Gleb.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Improve o32 syscall handling
  2004-11-22  9:37       ` Thiemo Seufer
@ 2004-11-27  0:29         ` Thiemo Seufer
  0 siblings, 0 replies; 7+ messages in thread
From: Thiemo Seufer @ 2004-11-27  0:29 UTC (permalink / raw)
  To: linux-mips; +Cc: Ralf Baechle

Thiemo Seufer wrote:
> Ralf Baechle wrote:
> > On Mon, Nov 22, 2004 at 08:00:04AM +0100, Thiemo Seufer wrote:
> > 
> > > > Why bother, the unaligned exception handler should take care of this.
> > > 
> > > It really does so for unaligned accesses from kernel space?
> > 
> > Yes.  In fact it's crucially important for this very case.
> 
> Ok, I'll update the patch accordingly when I'm back to better
> connectivity than I have now.
> 
> [snip]
> > > has 4 bytes and is loaded with lw. Using a macro which abstracts for
> > > 32/64bit compilation hides this needlessly, and can even lead to the
> > > erroneous impression the code would be useful for 64bit, too.
> > 
> > I'm more following the religion of using such abstractions everywhere
> > because code tends to be copied around mindlessly ...
> 
> I would agree if there was a roughly similar 64bit version of the code.
> But due to the differences between 32bit and 64bit kernel there will
> never be one, so it's IMHO best to make them as distinct as reasonable
> in this case.

The appended patch leaves unaligned stack handling alone and is updated
to the newest CVS, including the recent ptrace fix.


Thiemo


Index: arch/mips/kernel/scall32-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall32-o32.S,v
retrieving revision 1.17
diff -u -p -r1.17 scall32-o32.S
--- arch/mips/kernel/scall32-o32.S	25 Nov 2004 13:40:10 -0000	1.17
+++ arch/mips/kernel/scall32-o32.S	26 Nov 2004 23:15:47 -0000
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 1995, 96, 97, 98, 99, 2000, 01, 02 by Ralf Baechle
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  */
 #include <linux/config.h>
 #include <linux/errno.h>
@@ -32,26 +33,30 @@ NESTED(handle_sys, PT_SIZE, sp)
 
 	lw	t1, PT_EPC(sp)		# skip syscall on return
 
+#if defined(CONFIG_BINFMT_IRIX)
 	sltiu	t0, v0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, v0, __NR_O32_Linux	# check syscall number
+	sltiu	t0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	addiu	t1, 4			# skip to next instruction
 	sw	t1, PT_EPC(sp)
 	beqz	t0, illegal_syscall
 
-	/* XXX Put both in one cacheline, should save a bit. */
-	sll	t0, v0, 2
-	lw	t2, sys_call_table(t0)	# syscall routine
-	lbu	t3, sys_narg_table(v0)	# number of arguments
-	beqz	t2, illegal_syscall;
+	sll	t0, v0, 3
+	la	t1, sys_call_table
+	addu	t1, t0
+	lw	t2, (t1)		# syscall routine
+	lw	t3, 4(t1)		# >= 0 if we need stack arguments
+	beqz	t2, illegal_syscall
 
-	subu	t0, t3, 5		# 5 or more arguments?
 	sw	a3, PT_R26(sp)		# save a3 for syscall restarting
-	bgez	t0, stackargs
+	bgez	t3, stackargs
 
 stack_done:
-	sw	a3, PT_R26(sp)          # save for syscall restart
-	LONG_L	t0, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	t0, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
-	and	t0, t1, t0
+	and	t0, t1
 	bnez	t0, syscall_trace_entry	# -> yes
 
 	jalr	t2			# Do The Real Thing (TM)
@@ -70,9 +75,9 @@ o32_syscall_exit:
 	local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
-	LONG_L	a2, TI_FLAGS($28)	# current->work
+	lw	a2, TI_FLAGS($28)	# current->work
 	li	t0, _TIF_ALLWORK_MASK
-	and	t0, a2, t0
+	and	t0, a2
 	bnez	t0, o32_syscall_exit_work
 
 	j	restore_partial
@@ -116,49 +121,48 @@ syscall_trace_entry:
 	 */
 stackargs:
 	lw	t0, PT_R29(sp)		# get old user stack pointer
-	subu	t3, 4
-	sll	t1, t3, 2		# stack valid?
-
-	addu	t1, t0			# end address
-	or	t0, t1
-	bltz	t0, bad_stack		# -> sp is bad
-
-	lw	t0, PT_R29(sp)		# get old user stack pointer
-	PTR_LA	t1, 4f			# copy 1 to 3 arguments
-	sll	t3, t3, 4
-	subu	t1, t3
-	jr	t1
 
-	/* Ok, copy the args from the luser stack to the kernel stack */
 	/*
-	 * I know Ralf doesn't like nops but this avoids code
-	 * duplication for R3000 targets (and this is the
-	 * only place where ".set reorder" doesn't help).
-	 * Harald.
+	 * We intentionally keep the kernel stack a little below the top of
+	 * userspace so we don't have to do a slower byte accurate check here.
 	 */
+	lw	t5, TI_ADDR_LIMIT($28)
+	addu	t4, t0, 32
+	and	t5, t4
+	bltz	t5, bad_stack		# -> sp is bad
+
+	/* Ok, copy the args from the luser stack to the kernel stack.
+	 * t3 is the precomputed number of instruction bytes needed to
+	 * load or store arguments 6-8.
+	 */
+
+	la	t1, 5f			# load up to 3 arguments
+	subu	t1, t3
+1:	lw	t5, 16(t0)		# argument #5 from usp
 	.set    push
 	.set    noreorder
 	.set	nomacro
-1:	lw	t1, 24(t0)		# argument #7 from usp
-	nop
-	sw	t1, 24(sp)
-	nop
-2:	lw	t1, 20(t0)		# argument #5 from usp
-	nop
-	sw	t1, 20(sp)
-	nop
-3:	lw	t1, 16(t0)		# argument #5 from usp
-	nop
-	sw	t1, 16(sp)
-	nop
-4:	.set	pop
+	jr	t1
+	 addiu	t1, 6f - 5f
 
-	j	stack_done		# go back
+2:	lw	t8, 28(t0)		# argument #8 from usp
+3:	lw	t7, 24(t0)		# argument #7 from usp
+4:	lw	t6, 20(t0)		# argument #6 from usp
+5:	jr	t1
+	 sw	t5, 16(sp)		# argument #5 to ksp
+
+	sw	t8, 28(sp)		# argument #8 to ksp
+	sw	t7, 24(sp)		# argument #7 to ksp
+	sw	t6, 20(sp)		# argument #6 to ksp
+6:	j	stack_done		# go back
+	 nop
+	.set	pop
 
 	.section __ex_table,"a"
 	PTR	1b,bad_stack
 	PTR	2b,bad_stack
 	PTR	3b,bad_stack
+	PTR	4b,bad_stack
 	.previous
 
 	/*
@@ -238,12 +242,12 @@ illegal_syscall:
 	sw	v0, PT_R2(sp)		# result
 
 	/* Success, so skip usual error handling garbage.  */
-	LONG_L	a2, TI_FLAGS($28)	# syscall tracing enabled?
+	lw	a2, TI_FLAGS($28)	# syscall tracing enabled?
 	li	t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
 	and	t0, a2, t0
 	bnez	t0, 1f
 
-	b	o32_syscall_exit
+	j	o32_syscall_exit
 
 1:	SAVE_STATIC
 	move	a0, sp
@@ -269,67 +273,47 @@ bad_alignment:
 	END(sys_sysmips)
 
 	LEAF(sys_syscall)
-	lw	t0, PT_R29(sp)			# user sp
-
-	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
+#if defined(CONFIG_BINFMT_IRIX)
+	sltiu	v0, a0, MAX_SYSCALL_NO + 1 # check syscall number
+#else
+	subu	v0, a0, __NR_O32_Linux	# check syscall number
+	sltiu	v0, v0, __NR_O32_Linux_syscalls + 1
+#endif
 	beqz	v0, enosys
 
-	sll	v0, a0, 2
-	la	v1, sys_syscall
-	lw	t2, sys_call_table(v0)		# function pointer
-	lbu	t4, sys_narg_table(a0)		# number of arguments
-
-	li	v0, -EINVAL
-	beq	t2, v1, out			# do not recurse
+	sll	t0, v0, 3
+	lw	t2, sys_call_table(t0)		# syscall routine
 
+	li	v1, 4000			# nr of sys_syscall
 	beqz	t2, enosys			# null function pointer?
 
-	andi	v0, t0, 0x3			# unaligned stack pointer?
-	bnez	v0, sigsegv
+	li	v0, -EINVAL
+	beq	a0, v1, out			# do not recurse
 
-	addu	v0, t0, 16			# v0 = usp + 16
-	addu	t1, v0, 12			# 3 32-bit arguments
-	lw	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bltz	v1, efault
+	/* Some syscalls like execve get their arguments from struct pt_regs
+	   and claim zero arguments in the syscall table. Thus we have to
+	   assume the worst case and shuffle around all potential arguments.
+	   If you want performance, don't use indirect syscalls. */
 
 	move	a0, a1				# shift argument registers
 	move	a1, a2
 	move	a2, a3
-
-1:	lw	a3, 16(t0)
-2:	lw	t3, 20(t0)
-3:	lw	t4, 24(t0)
-
-	.section	__ex_table, "a"
-	.word	1b, efault
-	.word	2b, efault
-	.word	3b, efault
-	.previous
-
-	sw	t3, 16(sp)			# put into new stackframe
-	sw	t4, 20(sp)
-
-	bnez	t4, 1f				# zero arguments?
-	addu	a0, sp, 32			# then pass sp in a0
-1:
-
-	sw	t3, 16(sp)
-	sw	v1, 20(sp)
+	lw	a3, 16(sp)
+	lw	t4, 20(sp)
+	lw	t5, 24(sp)
+	lw	t6, 28(sp)
+	sw	t4, 16(sp)
+	sw	t5, 20(sp)
+	sw	t6, 24(sp)
+	sw	a0, PT_R4(sp)			# .. and push back a0 - a3, some
+	sw	a1, PT_R5(sp)			# syscalls expect them there
+	sw	a2, PT_R6(sp)
+	sw	a3, PT_R7(sp)
+	sw	a3, PT_R26(sp)			# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys_syscall)
@@ -349,12 +333,14 @@ out:	jr	ra
 	.endm
 
 	.macro	syscalltable
+#if defined(CONFIG_BINFMT_IRIX)
 	mille	sys_ni_syscall		0	/*    0 -  999 SVR4 flavour */
-	#include "irix5sys.h"			/* 1000 - 1999 32-bit IRIX */
+# include "irix5sys.h"				/* 1000 - 1999 32-bit IRIX */
 	mille	sys_ni_syscall		0	/* 2000 - 2999 BSD43 flavour */
 	mille	sys_ni_syscall		0	/* 3000 - 3999 POSIX flavour */
+#endif
 
-	sys	sys_syscall		0	/* 4000 */
+	sys	sys_syscall		8	/* 4000 */
 	sys	sys_exit		1
 	sys	sys_fork		0
 	sys	sys_read		3
@@ -640,19 +626,16 @@ out:	jr	ra
 
 	.endm
 
+	/* We pre-compute the number of _instruction_ bytes needed to
+	   load or store the arguments 6-8. Negative values are ignored. */
+
 	.macro  sys function, nargs
 	PTR	\function
+	LONG	(\nargs << 2) - (5 << 2)
 	.endm
 
 	.align	3
+	.type	sys_call_table,@object
 sys_call_table:
 	syscalltable
 	.size	sys_call_table, . - sys_call_table
-
-	.macro	sys function, nargs
-	.byte	\nargs
-	.endm
-
-sys_narg_table:
-	syscalltable
-	.size	sys_narg_table, . - sys_narg_table
Index: arch/mips/kernel/scall64-o32.S
===================================================================
RCS file: /home/cvs/linux/arch/mips/kernel/scall64-o32.S,v
retrieving revision 1.24
diff -u -p -r1.24 scall64-o32.S
--- arch/mips/kernel/scall64-o32.S	25 Nov 2004 13:40:10 -0000	1.24
+++ arch/mips/kernel/scall64-o32.S	26 Nov 2004 23:15:47 -0000
@@ -6,6 +6,7 @@
  * Copyright (C) 1995 - 2000, 2001 by Ralf Baechle
  * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
  * Copyright (C) 2001 MIPS Technologies, Inc.
+ * Copyright (C) 2004 Thiemo Seufer
  *
  * Hairy, the userspace application uses a different argument passing
  * convention than the kernel, so we have to translate things from o32
@@ -43,6 +44,8 @@ NESTED(handle_sys, PT_SIZE, sp)
  RESTORE_ALL
 #endif
 
+	/* We don't want to stumble over broken sign extensions from
+	   userland. O32 does never use the upper half. */
 	sll	a0, a0, 0
 	sll	a1, a1, 0
 	sll	a2, a2, 0
@@ -68,11 +71,13 @@ NESTED(handle_sys, PT_SIZE, sp)
 1:	lw	a4, 16(t0)		# argument #5 from usp
 2:	lw	a5, 20(t0)		# argument #6 from usp
 3:	lw	a6, 24(t0)		# argument #7 from usp
+4:	lw	a7, 28(t0)		# argument #8 from usp (for indirect syscalls)
 
 	.section __ex_table,"a"
 	PTR	1b, bad_stack
 	PTR	2b, bad_stack
 	PTR	3b, bad_stack
+	PTR	4b, bad_stack
 	.previous
 
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
@@ -91,7 +96,7 @@ NESTED(handle_sys, PT_SIZE, sp)
 	sd	v0, PT_R0(sp)		# flag for syscall restarting
 1:	sd	v0, PT_R2(sp)		# result
 
-FEXPORT(o32_syscall_exit)
+o32_syscall_exit:
 	local_irq_disable		# make need_resched and
 					# signals dont change between
 					# sampling and return
@@ -109,12 +114,12 @@ o32_syscall_exit_work:
 
 trace_a_syscall:
 	SAVE_STATIC
-	sd	a4, PT_R8(sp)
+	sd	a4, PT_R8(sp)		# Save argument registers
 	sd	a5, PT_R9(sp)
 	sd	a6, PT_R10(sp)
-	sd	a7, PT_R11(sp)
+	sd	a7, PT_R11(sp)		# For indirect syscalls
 
-	move	s0, t2
+	move	s0, t2			# Save syscall pointer
 	move	a0, sp
 	li	a1, 0
 	jal	do_syscall_trace
@@ -125,7 +130,8 @@ trace_a_syscall:
 	ld	a3, PT_R7(sp)
 	ld	a4, PT_R8(sp)
 	ld	a5, PT_R9(sp)
-	ld	a6, PT_R10(sp)		# For indirect syscalls
+	ld	a6, PT_R10(sp)
+	ld	a7, PT_R11(sp)		# For indirect syscalls
 	jalr	s0
 
 	li	t0, -EMAXERRNO - 1	# error?
@@ -173,55 +179,40 @@ illegal_syscall:
 	END(handle_sys)
 
 LEAF(sys32_syscall)
-	ld	t0, PT_R29(sp)		# user sp
-
 	sltu	v0, a0, __NR_O32_Linux + __NR_O32_Linux_syscalls + 1
 	beqz	v0, enosys
 
 	dsll	v0, a0, 3
-	dla	v1, sys32_syscall
 	ld	t2, (sys_call_table - (__NR_O32_Linux * 8))(v0)
 
+	li	v1, 4000		# indirect syscall number
 	li	v0, -EINVAL
-	beq	t2, v1, out		# do not recurse
+	beq	a0, v1, out		# do not recurse
 
 	beqz	t2, enosys		# null function pointer?
 
-	andi	v0, t0, 0x3		# unaligned stack pointer?
-	bnez	v0, sigsegv
-
-	daddiu	v0, t0, 16		# v0 = usp + 16
-	daddu	t1, v0, 12		# 3 32-bit arguments
-	ld	v1, TI_ADDR_LIMIT($28)
-	or	v0, v0, t1
-	and	v1, v1, v0
-	bnez	v1, efault
-
 	move	a0, a1			# shift argument registers
 	move	a1, a2
 	move	a2, a3
 	move	a3, a4
 	move	a4, a5
 	move	a5, a6
+	move	a6, a7
+	sd	a0, PT_R4(sp)		# ... and push back a0 - a3, some
+	sd	a1, PT_R5(sp)		# syscalls expect them there
+	sd	a2, PT_R6(sp)
+	sd	a3, PT_R7(sp)
+	sd	a3, PT_R26(sp)		# update a3 for syscall restarting
 	jr	t2
 	/* Unreached */
 
 enosys:	li	v0, -ENOSYS
-	b	out
-
-sigsegv:
-	li	a0, _SIGSEGV
-	move	a1, $28
-	jal	force_sig
-	/* Fall through */
-
-efault:	li	v0, -EFAULT
 
 out:	jr	ra
 	END(sys32_syscall)
 
 	.align	3
-	.type	sys_call_table,@object;
+	.type	sys_call_table,@object
 sys_call_table:
 	PTR	sys32_syscall			/* 4000 */
 	PTR	sys_exit	

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2004-11-27  0:29 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-11-21 16:45 [PATCH] Improve o32 syscall handling Thiemo Seufer
2004-11-22  6:18 ` Ralf Baechle
2004-11-22  7:00   ` Thiemo Seufer
2004-11-22  7:13     ` Ralf Baechle
2004-11-22  9:37       ` Thiemo Seufer
2004-11-27  0:29         ` Thiemo Seufer
2004-11-23 12:47 ` Gleb O. Raiko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox