public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 3/3] Fix copy_user on x86_64
@ 2008-06-27 21:52 Vitaly Mayatskikh
  2008-06-28 18:26 ` Linus Torvalds
  0 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-06-27 21:52 UTC (permalink / raw)
  To: linux-kernel; +Cc: Linus Torvalds, Andi Kleen, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 75 bytes --]

Added copy_user_64.c instead of copy_user_64.S and
copy_user_nocache_64.S


[-- Attachment #2: add copy_user_64.c --]
[-- Type: text/plain, Size: 7902 bytes --]

diff --git a/arch/x86/lib/copy_user_64.c b/arch/x86/lib/copy_user_64.c
new file mode 100644
index 0000000..7317bf5
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+/*
+ * Try to copy last bytes and clear rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling .
+ */
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+{
+	char c;
+	unsigned zero_len;
+
+	for (; len; --len) {
+		if (__get_user_nocheck(c, from++, sizeof(char)))
+			break;
+		if (__put_user_nocheck(c, to++, sizeof(char)))
+			break;
+	}
+
+	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
+		if (__put_user_nocheck(c, to++, sizeof(char)))
+			break;
+	return len;
+}
+
+/* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ */
+inline unsigned long
+copy_user_generic_string(void *to, const void *from, unsigned len)
+{
+	unsigned long ret;
+	asm volatile (
+		"	movl %%ecx,%%edx\n"
+		"	shrl $3,%%ecx\n"
+		"	andl $7,%%edx\n"
+		"1:	rep; movsq\n"
+		"	movl %%edx,%%ecx\n"
+		"2:	rep; movsb\n"
+		"3:\n"
+		".section .fixup,\"ax\"\n"
+		"12:	xorl %%ecx,%%ecx\n"
+		"11:	leal (%%edx,%%ecx,8),%%ecx\n"
+		"	movl %%ecx,%%edx\n"	/* ecx is zerorest also */
+		"	call copy_user_handle_tail\n"
+		"	movl %%eax,%%ecx\n"
+		"	jmp 3b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.quad 1b,11b\n"
+		"	.quad 2b,12b\n"
+		".previous"
+		: "=c"(ret)
+		: "D"(to), "S"(from), "c"(len)
+		: "eax", "edx", "memory"
+		);
+	return ret;
+}
+
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro
+ * code for rep movsq
+ */
+inline unsigned long
+copy_user_generic_unrolled(void *to, const void *from, unsigned len)
+{
+	unsigned long ret;
+	asm volatile (
+		"	movl %%ecx,%%edx\n"
+		"	andl $63,%%edx\n"
+		"	shrl $6,%%ecx\n"
+		"	jz 17f\n"
+		"1:	movq (%%rsi),%%r8\n"
+		"2:	movq 1*8(%%rsi),%%r9\n"
+		"3:	movq 2*8(%%rsi),%%r10\n"
+		"4:	movq 3*8(%%rsi),%%r11\n"
+		"5:	movq %%r8,(%%rdi)\n"
+		"6:	movq %%r9,1*8(%%rdi)\n"
+		"7:	movq %%r10,2*8(%%rdi)\n"
+		"8:	movq %%r11,3*8(%%rdi)\n"
+		"9:	movq 4*8(%%rsi),%%r8\n"
+		"10:	movq 5*8(%%rsi),%%r9\n"
+		"11:	movq 6*8(%%rsi),%%r10\n"
+		"12:	movq 7*8(%%rsi),%%r11\n"
+		"13:	movq %%r8,4*8(%%rdi)\n"
+		"14:	movq %%r9,5*8(%%rdi)\n"
+		"15:	movq %%r10,6*8(%%rdi)\n"
+		"16:	movq %%r11,7*8(%%rdi)\n"
+		"	leaq 64(%%rsi),%%rsi\n"
+		"	leaq 64(%%rdi),%%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 1b\n"
+		"17:	movl %%edx,%%ecx\n"
+		"	andl $7,%%edx\n"
+		"	shrl $3,%%ecx\n"
+		"	jz 20f\n"
+		"18:	movq (%%rsi),%%r8\n"
+		"19:	movq %%r8,(%%rdi)\n"
+		"	leaq 8(%%rsi),%%rsi\n"
+		"	leaq 8(%%rdi),%%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 18b\n"
+		"20:	andl %%edx,%%edx\n"
+		"	jz 23f\n"
+		"	movl %%edx,%%ecx\n"
+		"21:	movb (%%rsi),%%al\n"
+		"22:	movb %%al,(%%rdi)\n"
+		"	incq %%rsi\n"
+		"	incq %%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 21b\n"
+		"23:\n"
+		".section .fixup,\"ax\"\n"
+		"30:	shll $6,%%ecx\n"
+		"	addl %%ecx,%%edx\n"
+		"	jmp 60f\n"
+		"40:	leal (%%edx,%%ecx,8),%%edx\n"
+		"	jmp 60f\n"
+		"50:	movl %%ecx,%%edx\n"
+		"60:\n"				/* ecx is zerorest also */
+		"	call copy_user_handle_tail\n"
+		"	movl %%eax,%%ecx\n"
+		"	jmp  23b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.quad 1b,30b\n"
+		"	.quad 2b,30b\n"
+		"	.quad 3b,30b\n"
+		"	.quad 4b,30b\n"
+		"	.quad 5b,30b\n"
+		"	.quad 6b,30b\n"
+		"	.quad 7b,30b\n"
+		"	.quad 8b,30b\n"
+		"	.quad 9b,30b\n"
+		"	.quad 10b,30b\n"
+		"	.quad 11b,30b\n"
+		"	.quad 12b,30b\n"
+		"	.quad 13b,30b\n"
+		"	.quad 14b,30b\n"
+		"	.quad 15b,30b\n"
+		"	.quad 16b,30b\n"
+		"	.quad 18b,40b\n"
+		"	.quad 19b,40b\n"
+		"	.quad 21b,50b\n"
+		"	.quad 22b,50b\n"
+		".previous"
+		: "=c"(ret)
+		: "D"(to), "S"(from), "c"(len)
+		: "eax", "edx", "r8", "r9", "r10", "r11", "memory"
+		);
+	return ret;
+}
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+long __copy_user_nocache(void *to, const void *from, unsigned len, int zerorest)
+{
+	unsigned long ret;
+	asm volatile (
+		"	movl %%ecx,%%edx\n"
+		"	andl $63,%%edx\n"
+		"	shrl $6,%%ecx\n"
+		"	jz 17f\n"
+		"1:	movq (%%rsi),%%r8\n"
+		"2:	movq 1*8(%%rsi),%%r9\n"
+		"3:	movq 2*8(%%rsi),%%r10\n"
+		"4:	movq 3*8(%%rsi),%%r11\n"
+		"5:	movnti %%r8,(%%rdi)\n"
+		"6:	movnti %%r9,1*8(%%rdi)\n"
+		"7:	movnti %%r10,2*8(%%rdi)\n"
+		"8:	movnti %%r11,3*8(%%rdi)\n"
+		"9:	movq 4*8(%%rsi),%%r8\n"
+		"10:	movq 5*8(%%rsi),%%r9\n"
+		"11:	movq 6*8(%%rsi),%%r10\n"
+		"12:	movq 7*8(%%rsi),%%r11\n"
+		"13:	movnti %%r8,4*8(%%rdi)\n"
+		"14:	movnti %%r9,5*8(%%rdi)\n"
+		"15:	movnti %%r10,6*8(%%rdi)\n"
+		"16:	movnti %%r11,7*8(%%rdi)\n"
+		"	leaq 64(%%rsi),%%rsi\n"
+		"	leaq 64(%%rdi),%%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 1b\n"
+		"17:	movl %%edx,%%ecx\n"
+		"	andl $7,%%edx\n"
+		"	shrl $3,%%ecx\n"
+		"	jz 20f\n"
+		"18:	movq (%%rsi),%%r8\n"
+		"19:	movnti %%r8,(%%rdi)\n"
+		"	leaq 8(%%rsi),%%rsi\n"
+		"	leaq 8(%%rdi),%%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 18b\n"
+		"20:	andl %%edx,%%edx\n"
+		"	jz 23f\n"
+		"	movl %%edx,%%ecx\n"
+		"21:	movb (%%rsi),%%al\n"
+		"22:	movb %%al,(%%rdi)\n"
+		"	incq %%rsi\n"
+		"	incq %%rdi\n"
+		"	decl %%ecx\n"
+		"	jnz 21b\n"
+		"23:	sfence\n"
+		".section .fixup,\"ax\"\n"
+		"30:	shll $6,%%ecx\n"
+		"	addl %%ecx,%%edx\n"
+		"	jmp 60f\n"
+		"40:	leal (%%edx,%%ecx,8),%%edx\n"
+		"	jmp 60f\n"
+		"50:	movl %%ecx,%%edx\n"
+		"60:	sfence\n"
+		"	movl %%ebx,%%ecx\n"
+		"	call copy_user_handle_tail\n"
+		"	movl %%eax,%%ecx\n"
+		"	jmp  23b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		"	.quad 1b,30b\n"
+		"	.quad 2b,30b\n"
+		"	.quad 3b,30b\n"
+		"	.quad 4b,30b\n"
+		"	.quad 5b,30b\n"
+		"	.quad 6b,30b\n"
+		"	.quad 7b,30b\n"
+		"	.quad 8b,30b\n"
+		"	.quad 9b,30b\n"
+		"	.quad 10b,30b\n"
+		"	.quad 11b,30b\n"
+		"	.quad 12b,30b\n"
+		"	.quad 13b,30b\n"
+		"	.quad 14b,30b\n"
+		"	.quad 15b,30b\n"
+		"	.quad 16b,30b\n"
+		"	.quad 18b,40b\n"
+		"	.quad 19b,40b\n"
+		"	.quad 21b,50b\n"
+		"	.quad 22b,50b\n"
+		".previous"
+		: "=c"(ret)
+		: "D"(to), "S"(from), "c"(len), "b"(zerorest)
+		: "eax", "edx", "r8", "r9", "r10", "r11", "memory"
+		);
+	return ret;
+}
+
+unsigned long copy_user_generic(void *to, const void *from, unsigned len)
+{
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_REP_GOOD))
+		return copy_user_generic_string(to, from, len);
+	else
+		return copy_user_generic_unrolled(to, from, len);
+}
+
+/* Standard copy_to_user with segment limit checking */
+unsigned long copy_to_user(void __user *to, const void *from, unsigned len)
+{
+	if (access_ok(VERIFY_WRITE, to, len))
+		return copy_user_generic(to, from, len);
+	return len;
+}
+
+/* Standard copy_from_user with segment limit checking */
+unsigned long copy_from_user(void *to, const void __user *from, unsigned len)
+{
+	if (access_ok(VERIFY_READ, from, len))
+		return copy_user_generic(to, from, len);
+	else
+		memset(to, 0, len);
+	return len;
+}
+
+long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
+{
+	return copy_user_generic(dst, src, size);
+}

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>

[-- Attachment #3: Type: text/plain, Size: 17 bytes --]


-- 
wbr, Vitaly

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] Fix copy_user on x86_64
  2008-06-27 21:52 [PATCH 3/3] Fix copy_user on x86_64 Vitaly Mayatskikh
@ 2008-06-28 18:26 ` Linus Torvalds
  2008-06-30 15:12   ` Vitaly Mayatskikh
  0 siblings, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2008-06-28 18:26 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Andi Kleen, Andrew Morton



On Fri, 27 Jun 2008, Vitaly Mayatskikh wrote:
>
> Added copy_user_64.c instead of copy_user_64.S and
> copy_user_nocache_64.S

Grr, your patches are as attachments, which means that answering to 
them makes quoting them much harder. Oh well. Anyway, a few more comments:

 - I don't think it's worth it to move what is effectively 100% asm code 
   into a C function. It doesn't make things easier to 
   read (often quite the reverse), nor to maintain.

   Using inline asm from C is great if you can actually make the compiler 
   do a noticeable part of the job, like register allocation and a fair 
   amount of setup, but here there is really room for neither.

   The part I wanted to be in C was the copy_user_handle_tail() thing 
   (which you did), nothing more.

   That said - you seem to have done this C conversion correctly, and if 
   you have some inherent reason why you want to use the "asm within C" 
   model, then I don't think this is _fundamentally_ bad. Moving it to 
   within C *can* have advantages (for doing things like adding debugging 
   hooks etc). So if you can explain the thinking a bit more...

 - You've introduced a new pessimization: the original asm code did the 
   jump to the "rep string" vs "the hand-unrolled" loop with

	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string

   which while a nasty and fairly complex macro was actually more clever 
   than the code you replaced it with:

	if (cpu_has(&boot_cpu_data, X86_FEATURE_REP_GOOD))
		return copy_user_generic_string(to, from, len);
	else
		return copy_user_generic_unrolled(to, from, len);

   because the "altinstruction" thing actually does a one-time (boot-time) 
   choice of the code sequence to run, so it _never_ does any conditional 
   jumps at run-time.

   From C code you can use the altinstruction infrastructure to do this, 
   but in fact it would probably be best to keep it in asm (again, the 
   only thing that C function would contain would be the jump one way or 
   the other - there's nothing really helping it being in C, although the 
   same thing can really be done with the C macro

	alternative(non-rep-version, rep-version, X86_FEATURE_REP_GOOD);

   but since you cannot do jumps out of inline asm (because the compiler 
   may have set up a stackframe that needs to be undone etc), you cannot 
   actually do as well in C code as in asm.

Hmm? Sorry for being such a stickler. This code does end up being fairly 
critical, both for correctness and for performance. A lot of user copies 
are actually short (smallish structures, or just small reads and writes), 
and I'd like to make sure that really basic infrastructure like this is 
just basically "known to be optimal", so that we can totally forget about 
it for a few more years.

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] Fix copy_user on x86_64
  2008-06-28 18:26 ` Linus Torvalds
@ 2008-06-30 15:12   ` Vitaly Mayatskikh
  2008-06-30 15:55     ` Linus Torvalds
  0 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-06-30 15:12 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, linux-kernel, Andi Kleen, Andrew Morton

Linus Torvalds <torvalds@linux-foundation.org> writes:

>> Added copy_user_64.c instead of copy_user_64.S and
>> copy_user_nocache_64.S
>
> Grr, your patches are as attachments, which means that answering to 
> them makes quoting them much harder. 

Sorry... But what was mentioned in Documentation/SubmittingPatches with:

"For this reason, all patches should be submitted by e-mail "inline".
WARNING:  Be wary of your editor's word-wrap corrupting your patch,
if you choose to cut-n-paste your patch."

My first thought was "should be attached inline".

> Hmm? Sorry for being such a stickler. This code does end up being fairly 
> critical, both for correctness and for performance. A lot of user copies 
> are actually short (smallish structures, or just small reads and writes), 
> and I'd like to make sure that really basic infrastructure like this is 
> just basically "known to be optimal", so that we can totally forget about 
> it for a few more years.

Agreed. Code was reworked again, will test it a bit more. Two more
questions to you and Andi:

1. Do you see any reasons to do fix alignment for destination as it was
done in copy_user_generic_unrolled (yes, I know, access to unaligned
address is slower)? It tries to byte-copy unaligned bytes first and then
to do a normal copy. I think, most times destination addresses will be
aligned and this check is not so necessary. If it is necessary, then
copy_user_generic_string should do the same.

2. What is the purpose of "minor optimization" in commit
3022d734a54cbd2b65eea9a024564821101b4a9a?

ENTRY(copy_user_generic_string)
        CFI_STARTPROC
        movl %ecx,%r8d          /* save zero flag */
        movl %edx,%ecx
        shrl $3,%ecx
        andl $7,%edx
        jz   10f
1:      rep
        movsq
        movl %edx,%ecx
2:      rep
        movsb
9:      movl %ecx,%eax
        ret
        
        /* multiple of 8 byte */
10:     rep
        movsq
        xor %eax,%eax
        ret

I don't think CPU is able to speculate with 'rep movs*' in both
branches, and I'm not sure if conditional jump is cheaper than empty
'rep movsb' (when ecx is 0). I want to eliminate it if you don't have
any objections.

Thanks.
-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] Fix copy_user on x86_64
  2008-06-30 15:12   ` Vitaly Mayatskikh
@ 2008-06-30 15:55     ` Linus Torvalds
  2008-06-30 16:16       ` Andi Kleen
                         ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread
From: Linus Torvalds @ 2008-06-30 15:55 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Andi Kleen, Andrew Morton



On Mon, 30 Jun 2008, Vitaly Mayatskikh wrote:
> 
> "For this reason, all patches should be submitted by e-mail "inline".
> WARNING:  Be wary of your editor's word-wrap corrupting your patch,
> if you choose to cut-n-paste your patch."
> 
> My first thought was "should be attached inline".

Yeah, no, the "inline" there means literally as no attachment at all, but 
inline in the normal mail.

Sometimes it's not possible (known broken MUA's/MTA's), and for really big 
patches it's usually not all that useful anyway, since nobody is going to 
review or comment on really big patches in the first place (but because of 
that, nobody should ever even _send_ such patches, because they are 
pointless). But in general, if you don't have a crappy MUA/MTA setup, 
putting the patch at the end of the email as normal inline text, no 
attachment, means that every form of emailer known to man will have no 
problem quoting it for commentary or showing it by default etc.

> Agreed. Code was reworked again, will test it a bit more. Two more
> questions to you and Andi:
> 
> 1. Do you see any reasons to do fix alignment for destination as it was
> done in copy_user_generic_unrolled (yes, I know, access to unaligned
> address is slower)? It tries to byte-copy unaligned bytes first and then
> to do a normal copy. I think, most times destination addresses will be
> aligned and this check is not so necessary. If it is necessary, then
> copy_user_generic_string should do the same.

Usually the cost of alignment is higher for writes than for reads (eg you 
may be able to do two cache reads per cycle but only one cache write), so 
aligning the destination preferentially is always a good idea.

Also, if the source and destination are actualy mutually aligned, and the 
_start_ is just not aligned, then aligning the destination will align the 
source too (if they aren't mutually aligned, one or the other will always 
be an unaligned access, and as mentioned, it's _usually_ cheaper to do the 
load unaligned rather than the store).

So I suspect the alignment code is worth it. There are many situations 
where the kernel ends up having unaligned memory copies, sometimes big 
ones too: things like TCP packets aren't nice powers-of-two, so when you do 
per-packet copying, even if the user passed in a buffer that was 
originally aligned, by the time you've copied a few packets you may no 
longer be nicely aligned any more.

> 2. What is the purpose of "minor optimization" in commit
> 3022d734a54cbd2b65eea9a024564821101b4a9a?

I think that one was just a "since we're doing that 'and' operation, and 
since it sets the flags anyway, jumping to a special sequence is free".

Btw, for string instructions, it would probably be nice if we actually 
tried to trigger the "fast string" mode if possible. Intel CPU's (and 
maybe AMD ones too) have a special cache-line optimizing mode for "rep 
movs" that triggers in special circumstances:

  "In order for a fast string move to occur, five conditions must be met:

   1. The source and destination address must be 8-byte aligned.
   2. The string operation (rep movs) must operate on the data in 
      ascending order
   3. The initial count (ECX) must be at least 64
   4. The source and the destination can't overlap by less than a cache 
      line
   5. The memory types of both source and destination must either be write 
      back cacheable or write combining."

and we historically haven't cared much, because the above _naturally_ 
happens for the bulk of the important cases (copy whole pages, which 
happens not just in the VM for COW, but also when a user reads a regular 
file in aligned chunks). But again, for networking buffers, it _might_ 
make sense to try to help trigger this case explicitly.

			Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] Fix copy_user on x86_64
  2008-06-30 15:55     ` Linus Torvalds
@ 2008-06-30 16:16       ` Andi Kleen
  2008-06-30 18:22       ` Kari Hurtta
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2008-06-30 16:16 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, linux-kernel, Andrew Morton


One optimization (I haven't explored) is that the startup overhead
of rep ; movs might make it worthwhile to branch to a special version
for copies larger than 8 bytes (which are optimized in uaccess
for constant counts) and smaller than the threshold.

Best heuristic unfortunately varies per CPU core (and possibly
even per stepping)

> 
> and we historically haven't cared much, because the above _naturally_ 
> happens for the bulk of the important cases (copy whole pages, which 
> happens not just in the VM for COW, but also when a user reads a regular 
> file in aligned chunks). But again, for networking buffers, it _might_ 
> make sense to try to help trigger this case explicitly.

The problem (I did some measurements on that a long time ago) is that
often the source and destination alignments are different and you can't
really fix that up in copy_*_user.

That is why I dropped the original "fix up alignments" code.

One way to do that would be use a alignment hint from the user address
while allocating the destination buffer (or rather adding an offset),
so that source and destination are (mis)aligned  in the same way
and that can be fixed up then.

But that is not trivial for networking. Not sure it would be really
worth the effort.

For the file system is not feasible at all because the page cache
offsets are fixed.

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/3] Fix copy_user on x86_64
  2008-06-30 15:55     ` Linus Torvalds
  2008-06-30 16:16       ` Andi Kleen
@ 2008-06-30 18:22       ` Kari Hurtta
  2008-07-02 13:48       ` [PATCH 1/2] Introduce copy_user_handle_tail routine Vitaly Mayatskikh
  2008-07-02 13:53       ` [PATCH 2/2] Fix copy_user on x86 Vitaly Mayatskikh
  3 siblings, 0 replies; 26+ messages in thread
From: Kari Hurtta @ 2008-06-30 18:22 UTC (permalink / raw)
  To: linux-kernel

Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Mon, 30 Jun 2008, Vitaly Mayatskikh wrote:
> > 
> > "For this reason, all patches should be submitted by e-mail "inline".
> > WARNING:  Be wary of your editor's word-wrap corrupting your patch,
> > if you choose to cut-n-paste your patch."
> > 
> > My first thought was "should be attached inline".
> 
> Yeah, no, the "inline" there means literally as no attachment at all, but 
> inline in the normal mail.

And not as own mime body part with

Content-Disposition: inline


?


( As you see "inline" have several meanings. 

  yes. Some mail readers treats only first body part as 'inline'
  and other have 'attachments'.
)

/ Kari Hurtta



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-06-30 15:55     ` Linus Torvalds
  2008-06-30 16:16       ` Andi Kleen
  2008-06-30 18:22       ` Kari Hurtta
@ 2008-07-02 13:48       ` Vitaly Mayatskikh
  2008-07-02 14:06         ` Andi Kleen
  2008-07-02 13:53       ` [PATCH 2/2] Fix copy_user on x86 Vitaly Mayatskikh
  3 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 13:48 UTC (permalink / raw)
  To: linux-kernel; +Cc: Linus Torvalds, Andi Kleen, Andrew Morton

Introduce generic C routine for handling necessary tail operations after
protection fault in copy_*_user on x86.

diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0c89d1b..3ab4ec8 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -158,3 +158,26 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
 }
 EXPORT_SYMBOL(copy_in_user);
 
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ */
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+{
+	char c;
+	unsigned zero_len;
+
+	for (; len; --len) {
+		if (__get_user_nocheck(c, from++, sizeof(char)))
+			break;
+		if (__put_user_nocheck(c, to++, sizeof(char)))
+			break;
+	}
+
+	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
+		if (__put_user_nocheck(c, to++, sizeof(char)))
+			break;
+	return len;
+}
diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h
index b8a2f43..1dc2003 100644
--- a/include/asm-x86/uaccess_64.h
+++ b/include/asm-x86/uaccess_64.h
@@ -455,4 +455,7 @@ static inline int __copy_from_user_inatomic_nocache(void *dst,
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
+unsigned long
+copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+
 #endif /* __X86_64_UACCESS_H */

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>

-- 
wbr, Vitaly

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/2] Fix copy_user on x86
  2008-06-30 15:55     ` Linus Torvalds
                         ` (2 preceding siblings ...)
  2008-07-02 13:48       ` [PATCH 1/2] Introduce copy_user_handle_tail routine Vitaly Mayatskikh
@ 2008-07-02 13:53       ` Vitaly Mayatskikh
  2008-07-02 14:08         ` Andi Kleen
  3 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 13:53 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, linux-kernel, Andi Kleen, Andrew Morton

Switch copy_user_generic_string(), copy_user_generic_unrolled() and
__copy_user_nocache() from custom tail handlers to generic
copy_user_handle_tail(). 

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index ee1c3f6..5181653 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
  * Subject to the GNU Public License v2.
- * 
- * Functions to copy from and to user space.		
- */		 
+ *
+ * Functions to copy from and to user space.
+ */
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
 	.long \orig-1f	/* by default jump to orig */
 1:
 	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9	             /* near jump with 32bit immediate */
+2:	.byte 0xe9			/* near jump with 32bit immediate */
 	.long \alt-1b /* offset */   /* or alternatively to alt */
 	.previous
 	.section .altinstructions,"a"
 	.align 8
 	.quad  0b
 	.quad  2b
-	.byte  \feature		     /* when feature is set */
+	.byte  \feature			/* when feature is set */
 	.byte  5
 	.byte  5
 	.previous
 	.endm
 
-/* Standard copy_to_user with segment limit checking */		
+	.macro ALIGN_DESTINATION
+#ifdef FIX_ALIGNMENT
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	.section .fixup,"ax"
+103:	addl %r8d,%edx			/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	.section __ex_table,"a"
+	.align 8
+	.quad 100b,103b
+	.quad 101b,103b
+	.previous
+#endif
+	.endm
+
+/* Standard copy_to_user with segment limit checking */
 ENTRY(copy_to_user)
 	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rdi,%rcx
 	addq %rdx,%rcx
-	jc  bad_to_user
+	jc bad_to_user
 	cmpq threadinfo_addr_limit(%rax),%rcx
 	jae bad_to_user
-	xorl %eax,%eax	/* clear zero flag */
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
 
-ENTRY(copy_user_generic)
+/* Standard copy_from_user with segment limit checking */
+ENTRY(copy_from_user)
 	CFI_STARTPROC
-	movl $1,%ecx	/* set zero flag */
+	GET_THREAD_INFO(%rax)
+	movq %rsi,%rcx
+	addq %rdx,%rcx
+	jc bad_from_user
+	cmpq threadinfo_addr_limit(%rax),%rcx
+	jae bad_from_user
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
+ENDPROC(copy_from_user)
 
-ENTRY(__copy_from_user_inatomic)
+ENTRY(copy_user_generic)
 	CFI_STARTPROC
-	xorl %ecx,%ecx	/* clear zero flag */
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
+ENDPROC(copy_user_generic)
 
-/* Standard copy_from_user with segment limit checking */	
-ENTRY(copy_from_user)
+ENTRY(__copy_from_user_inatomic)
 	CFI_STARTPROC
-	GET_THREAD_INFO(%rax)
-	movq %rsi,%rcx
-	addq %rdx,%rcx
-	jc  bad_from_user
-	cmpq threadinfo_addr_limit(%rax),%rcx
-	jae  bad_from_user
-	movl $1,%ecx	/* set zero flag */
 	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
-ENDPROC(copy_from_user)
-	
+ENDPROC(__copy_from_user_inatomic)
+
 	.section .fixup,"ax"
 	/* must zero dest */
+ENTRY(bad_from_user)
 bad_from_user:
 	CFI_STARTPROC
 	movl %edx,%ecx
@@ -81,271 +111,158 @@ bad_from_user:
 	rep
 	stosb
 bad_to_user:
-	movl	%edx,%eax
+	movl %edx,%eax
 	ret
 	CFI_ENDPROC
-END(bad_from_user)
+ENDPROC(bad_from_user)
 	.previous
-	
-		
+
 /*
  * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
- * 	
- * Input:	
+ * This version is for CPUs like P4 that don't have efficient micro
+ * code for rep movsq
+ *
+ * Input:
  * rdi destination
  * rsi source
  * rdx count
- * ecx zero flag -- if true zero destination on error
  *
- * Output:		
- * eax uncopied bytes or 0 if successful.
+ * Output:
+ * eax uncopied bytes or 0 if successfull.
  */
 ENTRY(copy_user_generic_unrolled)
 	CFI_STARTPROC
-	pushq %rbx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbx, 0
-	pushq %rcx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rcx, 0
-	xorl %eax,%eax		/*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
-	movq %rdx,%rcx
-
-	movl $64,%ebx
-	shrq $6,%rdx
-	decq %rdx
-	js   .Lhandle_tail
-
-	.p2align 4
-.Lloop:
-.Ls1:	movq (%rsi),%r11
-.Ls2:	movq 1*8(%rsi),%r8
-.Ls3:	movq 2*8(%rsi),%r9
-.Ls4:	movq 3*8(%rsi),%r10
-.Ld1:	movq %r11,(%rdi)
-.Ld2:	movq %r8,1*8(%rdi)
-.Ld3:	movq %r9,2*8(%rdi)
-.Ld4:	movq %r10,3*8(%rdi)
-
-.Ls5:	movq 4*8(%rsi),%r11
-.Ls6:	movq 5*8(%rsi),%r8
-.Ls7:	movq 6*8(%rsi),%r9
-.Ls8:	movq 7*8(%rsi),%r10
-.Ld5:	movq %r11,4*8(%rdi)
-.Ld6:	movq %r8,5*8(%rdi)
-.Ld7:	movq %r9,6*8(%rdi)
-.Ld8:	movq %r10,7*8(%rdi)
-
-	decq %rdx
-
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+5:	movq %r8,(%rdi)
+6:	movq %r9,1*8(%rdi)
+7:	movq %r10,2*8(%rdi)
+8:	movq %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+13:	movq %r8,4*8(%rdi)
+14:	movq %r9,5*8(%rdi)
+15:	movq %r10,6*8(%rdi)
+16:	movq %r11,7*8(%rdi)
 	leaq 64(%rsi),%rsi
 	leaq 64(%rdi),%rdi
-
-	jns  .Lloop
-
-	.p2align 4
-.Lhandle_tail:
-	movl %ecx,%edx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	movl $8,%ebx
-	.p2align 4
-.Lloop_8:
-.Ls9:	movq (%rsi),%r8
-.Ld9:	movq %r8,(%rdi)
 	decl %ecx
-	leaq 8(%rdi),%rdi
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+19:	movq %r8,(%rdi)
 	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
 	movl %edx,%ecx
-	andl $7,%ecx
-	jz   .Lende
-	.p2align 4
-.Lloop_1:
-.Ls10:	movb (%rsi),%bl
-.Ld10:	movb %bl,(%rdi)
-	incq %rdi
+21:	movb (%rsi),%al
+22:	movb %al,(%rdi)
 	incq %rsi
+	incq %rdi
 	decl %ecx
-	jnz .Lloop_1
-
-	CFI_REMEMBER_STATE
-.Lende:
-	popq %rcx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rcx
-	popq %rbx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rbx
+	jnz 21b
+23:	xor %eax,%eax
 	ret
-	CFI_RESTORE_STATE
 
-#ifdef FIX_ALIGNMENT
-	/* align destination */
-	.p2align 4
-.Lbad_alignment:
-	movl $8,%r9d
-	subl %ecx,%r9d
-	movl %r9d,%ecx
-	cmpq %r9,%rdx
-	jz   .Lhandle_7
-	js   .Lhandle_7
-.Lalign_1:
-.Ls11:	movb (%rsi),%bl
-.Ld11:	movb %bl,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .Lalign_1
-	subq %r9,%rdx
-	jmp .Lafter_bad_alignment
-#endif
+	.section .fixup,"ax"
+30:	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 60f
+40:	leal (%edx,%ecx,8),%edx
+	jmp 60f
+50:	movl %ecx,%edx
+60:	jmp copy_user_handle_tail /* ecx is zerorest also */
+	.previous
 
-	/* table sorted by exception address */
 	.section __ex_table,"a"
 	.align 8
-	.quad .Ls1,.Ls1e	/* Ls1-Ls4 have copied zero bytes */
-	.quad .Ls2,.Ls1e
-	.quad .Ls3,.Ls1e
-	.quad .Ls4,.Ls1e
-	.quad .Ld1,.Ls1e	/* Ld1-Ld4 have copied 0-24 bytes */
-	.quad .Ld2,.Ls2e
-	.quad .Ld3,.Ls3e
-	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e	/* Ls5-Ls8 have copied 32 bytes */
-	.quad .Ls6,.Ls5e
-	.quad .Ls7,.Ls5e
-	.quad .Ls8,.Ls5e
-	.quad .Ld5,.Ls5e	/* Ld5-Ld8 have copied 32-56 bytes */
-	.quad .Ld6,.Ls6e
-	.quad .Ld7,.Ls7e
-	.quad .Ld8,.Ls8e
-	.quad .Ls9,.Le_quad
-	.quad .Ld9,.Le_quad
-	.quad .Ls10,.Le_byte
-	.quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-	.quad .Ls11,.Lzero_rest
-	.quad .Ld11,.Lzero_rest
-#endif
-	.quad .Le5,.Le_zero
+	.quad 1b,30b
+	.quad 2b,30b
+	.quad 3b,30b
+	.quad 4b,30b
+	.quad 5b,30b
+	.quad 6b,30b
+	.quad 7b,30b
+	.quad 8b,30b
+	.quad 9b,30b
+	.quad 10b,30b
+	.quad 11b,30b
+	.quad 12b,30b
+	.quad 13b,30b
+	.quad 14b,30b
+	.quad 15b,30b
+	.quad 16b,30b
+	.quad 18b,40b
+	.quad 19b,40b
+	.quad 21b,50b
+	.quad 22b,50b
 	.previous
-
-	/* eax: zero, ebx: 64 */
-.Ls1e: 	addl $8,%eax		/* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
-.Ls2e: 	addl $8,%eax
-.Ls3e: 	addl $8,%eax
-.Ls4e: 	addl $8,%eax
-.Ls5e: 	addl $8,%eax
-.Ls6e: 	addl $8,%eax
-.Ls7e: 	addl $8,%eax
-.Ls8e: 	addl $8,%eax
-	addq %rbx,%rdi	/* +64 */
-	subq %rax,%rdi  /* correct destination with computed offset */
-
-	shlq $6,%rdx	/* loop counter * 64 (stride length) */
-	addq %rax,%rdx	/* add offset to loopcnt */
-	andl $63,%ecx	/* remaining bytes */
-	addq %rcx,%rdx	/* add them */
-	jmp .Lzero_rest
-
-	/* exception on quad word loop in tail handling */
-	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-	shll $3,%ecx
-	andl $7,%edx
-	addl %ecx,%edx
-	/* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-	cmpl $0,(%rsp)
-	jz   .Le_zero
-	movq %rdx,%rcx
-.Le_byte:
-	xorl %eax,%eax
-.Le5:	rep
-	stosb
-	/* when there is another exception while zeroing the rest just return */
-.Le_zero:
-	movq %rdx,%rax
-	jmp .Lende
 	CFI_ENDPROC
-ENDPROC(copy_user_generic)
+ENDPROC(copy_user_generic_unrolled)
 
-
-	/* Some CPUs run faster using the string copy instructions.
-	   This is also a lot simpler. Use them when possible.
-	   Patch in jmps to this code instead of copying it fully
-	   to avoid unwanted aliasing in the exception tables. */
-
- /* rdi	destination
-  * rsi source
-  * rdx count
-  * ecx zero flag
-  *
-  * Output:
-  * eax uncopied bytes or 0 if successfull.
-  *
-  * Only 4GB of copy is supported. This shouldn't be a problem
-  * because the kernel normally only writes from/to page sized chunks
-  * even if user space passed a longer buffer.
-  * And more would be dangerous because both Intel and AMD have
-  * errata with rep movsq > 4GB. If someone feels the need to fix
-  * this please consider this.
-  */
+/* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
 ENTRY(copy_user_generic_string)
 	CFI_STARTPROC
-	movl %ecx,%r8d		/* save zero flag */
+	andl %edx,%edx
+	jz 4f
+	cmpl $8,%edx
+	jb 2f		/* less than 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
 	movl %edx,%ecx
 	shrl $3,%ecx
-	andl $7,%edx	
-	jz   10f
-1:	rep 
-	movsq 
-	movl %edx,%ecx
-2:	rep
-	movsb
-9:	movl %ecx,%eax
-	ret
-
-	/* multiple of 8 byte */
-10:	rep
+	andl $7,%edx
+1:	rep
 	movsq
-	xor %eax,%eax
+2:	movl %edx,%ecx
+3:	rep
+	movsb
+4:	xorl %eax,%eax
 	ret
 
-	/* exception handling */
-3:      lea (%rdx,%rcx,8),%rax	/* exception on quad loop */
-	jmp 6f
-5:	movl %ecx,%eax		/* exception on byte loop */
-	/* eax: left over bytes */
-6:	testl %r8d,%r8d		/* zero flag set? */
-	jz 7f
-	movl %eax,%ecx		/* initialize x86 loop counter */
-	push %rax
-	xorl %eax,%eax
-8:	rep
-	stosb 			/* zero the rest */
-11:	pop %rax
-7:	ret
-	CFI_ENDPROC
-END(copy_user_generic_c)
+	.section .fixup,"ax"
+11:	leal (%edx,%ecx,8),%ecx
+12:	movl %ecx,%edx		/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
 
 	.section __ex_table,"a"
-	.quad 1b,3b
-	.quad 2b,5b
-	.quad 8b,11b
-	.quad 10b,3b
+	.align 8
+	.quad 1b,11b
+	.quad 3b,12b
 	.previous
+	CFI_ENDPROC
+ENDPROC(copy_user_generic_string)

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 9d3d1ab..93353d6 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -1,4 +1,6 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
  * Subject to the GNU Public License v2.
  *
  * Functions to copy from and to user space.
@@ -12,204 +14,125 @@
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpufeature.h>
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- * rcx zero flag	when 1 zero on exception
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-ENTRY(__copy_user_nocache)
-	CFI_STARTPROC
-	pushq %rbx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbx, 0
-	pushq %rcx		/* save zero flag */
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rcx, 0
-
-	xorl %eax,%eax		/* zero for the exception handler */
 
+	.macro ALIGN_DESTINATION
 #ifdef FIX_ALIGNMENT
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
 	andl $7,%ecx
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
-
-	movq %rdx,%rcx
-
-	movl $64,%ebx
-	shrq $6,%rdx
-	decq %rdx
-	js   .Lhandle_tail
-
-	.p2align 4
-.Lloop:
-.Ls1:	movq (%rsi),%r11
-.Ls2:	movq 1*8(%rsi),%r8
-.Ls3:	movq 2*8(%rsi),%r9
-.Ls4:	movq 3*8(%rsi),%r10
-.Ld1:	movnti %r11,(%rdi)
-.Ld2:	movnti %r8,1*8(%rdi)
-.Ld3:	movnti %r9,2*8(%rdi)
-.Ld4:	movnti %r10,3*8(%rdi)
-
-.Ls5:	movq 4*8(%rsi),%r11
-.Ls6:	movq 5*8(%rsi),%r8
-.Ls7:	movq 6*8(%rsi),%r9
-.Ls8:	movq 7*8(%rsi),%r10
-.Ld5:	movnti %r11,4*8(%rdi)
-.Ld6:	movnti %r8,5*8(%rdi)
-.Ld7:	movnti %r9,6*8(%rdi)
-.Ld8:	movnti %r10,7*8(%rdi)
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	.section .fixup,"ax"
+103:	addl %r8d,%edx			/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
 
-	dec  %rdx
+	.section __ex_table,"a"
+	.align 8
+	.quad 100b,103b
+	.quad 101b,103b
+	.previous
+#endif
+	.endm
 
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(__copy_user_nocache)
+	CFI_STARTPROC
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+5:	movnti %r8,(%rdi)
+6:	movnti %r9,1*8(%rdi)
+7:	movnti %r10,2*8(%rdi)
+8:	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+13:	movnti %r8,4*8(%rdi)
+14:	movnti %r9,5*8(%rdi)
+15:	movnti %r10,6*8(%rdi)
+16:	movnti %r11,7*8(%rdi)
 	leaq 64(%rsi),%rsi
 	leaq 64(%rdi),%rdi
-
-	jns  .Lloop
-
-	.p2align 4
-.Lhandle_tail:
-	movl %ecx,%edx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz   .Lhandle_7
-	movl $8,%ebx
-	.p2align 4
-.Lloop_8:
-.Ls9:	movq (%rsi),%r8
-.Ld9:	movnti %r8,(%rdi)
 	decl %ecx
-	leaq 8(%rdi),%rdi
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+19:	movnti %r8,(%rdi)
 	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
 	movl %edx,%ecx
-	andl $7,%ecx
-	jz   .Lende
-	.p2align 4
-.Lloop_1:
-.Ls10:	movb (%rsi),%bl
-.Ld10:	movb %bl,(%rdi)
-	incq %rdi
+21:	movb (%rsi),%al
+22:	movb %al,(%rdi)
 	incq %rsi
+	incq %rdi
 	decl %ecx
-	jnz .Lloop_1
-
-	CFI_REMEMBER_STATE
-.Lende:
-	popq %rcx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE %rcx
-	popq %rbx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rbx
+	jnz 21b
+23:	xorl %eax,%eax
 	sfence
 	ret
-	CFI_RESTORE_STATE
 
-#ifdef FIX_ALIGNMENT
-	/* align destination */
-	.p2align 4
-.Lbad_alignment:
-	movl $8,%r9d
-	subl %ecx,%r9d
-	movl %r9d,%ecx
-	cmpq %r9,%rdx
-	jz   .Lhandle_7
-	js   .Lhandle_7
-.Lalign_1:
-.Ls11:	movb (%rsi),%bl
-.Ld11:	movb %bl,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .Lalign_1
-	subq %r9,%rdx
-	jmp .Lafter_bad_alignment
-#endif
+	.section .fixup,"ax"
+30:	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 60f
+40:	leal (%edx,%ecx,8),%edx
+	jmp 60f
+50:	movl %ecx,%edx
+60:	sfence
+	movl %r8d,%ecx
+	jmp copy_user_handle_tail
+	.previous
 
-	/* table sorted by exception address */
 	.section __ex_table,"a"
-	.align 8
-	.quad .Ls1,.Ls1e	/* .Ls[1-4] - 0 bytes copied */
-	.quad .Ls2,.Ls1e
-	.quad .Ls3,.Ls1e
-	.quad .Ls4,.Ls1e
-	.quad .Ld1,.Ls1e	/* .Ld[1-4] - 0..24 bytes coped */
-	.quad .Ld2,.Ls2e
-	.quad .Ld3,.Ls3e
-	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e	/* .Ls[5-8] - 32 bytes copied */
-	.quad .Ls6,.Ls5e
-	.quad .Ls7,.Ls5e
-	.quad .Ls8,.Ls5e
-	.quad .Ld5,.Ls5e	/* .Ld[5-8] - 32..56 bytes copied */
-	.quad .Ld6,.Ls6e
-	.quad .Ld7,.Ls7e
-	.quad .Ld8,.Ls8e
-	.quad .Ls9,.Le_quad
-	.quad .Ld9,.Le_quad
-	.quad .Ls10,.Le_byte
-	.quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-	.quad .Ls11,.Lzero_rest
-	.quad .Ld11,.Lzero_rest
-#endif
-	.quad .Le5,.Le_zero
+	.quad 1b,30b
+	.quad 2b,30b
+	.quad 3b,30b
+	.quad 4b,30b
+	.quad 5b,30b
+	.quad 6b,30b
+	.quad 7b,30b
+	.quad 8b,30b
+	.quad 9b,30b
+	.quad 10b,30b
+	.quad 11b,30b
+	.quad 12b,30b
+	.quad 13b,30b
+	.quad 14b,30b
+	.quad 15b,30b
+	.quad 16b,30b
+	.quad 18b,40b
+	.quad 19b,40b
+	.quad 21b,50b
+	.quad 22b,50b
 	.previous
-
-	/* eax: zero, ebx: 64 */
-.Ls1e: 	addl $8,%eax	/* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
-.Ls2e: 	addl $8,%eax
-.Ls3e: 	addl $8,%eax
-.Ls4e: 	addl $8,%eax
-.Ls5e: 	addl $8,%eax
-.Ls6e: 	addl $8,%eax
-.Ls7e: 	addl $8,%eax
-.Ls8e: 	addl $8,%eax
-	addq %rbx,%rdi	/* +64 */
-	subq %rax,%rdi  /* correct destination with computed offset */
-
-	shlq $6,%rdx	/* loop counter * 64 (stride length) */
-	addq %rax,%rdx	/* add offset to loopcnt */
-	andl $63,%ecx	/* remaining bytes */
-	addq %rcx,%rdx	/* add them */
-	jmp .Lzero_rest
-
-	/* exception on quad word loop in tail handling */
-	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-	shll $3,%ecx
-	andl $7,%edx
-	addl %ecx,%edx
-	/* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-	cmpl $0,(%rsp)	/* zero flag set? */
-	jz   .Le_zero
-	movq %rdx,%rcx
-.Le_byte:
-	xorl %eax,%eax
-.Le5:	rep
-	stosb
-	/* when there is another exception while zeroing the rest just return */
-.Le_zero:
-	movq %rdx,%rax
-	jmp .Lende
 	CFI_ENDPROC
 ENDPROC(__copy_user_nocache)
-
-

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>

-- 
wbr, Vitaly

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 13:48       ` [PATCH 1/2] Introduce copy_user_handle_tail routine Vitaly Mayatskikh
@ 2008-07-02 14:06         ` Andi Kleen
  2008-07-02 14:31           ` Vitaly Mayatskikh
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2008-07-02 14:06 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Linus Torvalds, Andrew Morton


> +unsigned long
> +copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
> +{
> +	char c;
> +	unsigned zero_len;
> +
> +	for (; len; --len) {
> +		if (__get_user_nocheck(c, from++, sizeof(char)))
> +			break;
> +		if (__put_user_nocheck(c, to++, sizeof(char)))

get/put user are macros and it's normally not a good idea to use ++ in macro
arguments because they might expand multiple times.

sizeof(char) is always 1

Also hopefully there's no sign extension anywhere with the signed char

Overall you could write it much simpler with a rep ; movs I think,
like traditional linux did.

> +			break;
> +	}
> +
> +	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
> +		if (__put_user_nocheck(c, to++, sizeof(char)))
> +			break;

Similar problem with ++

If zerorest is ever 0 then retesting it on every iteration seems somewhat dumb.

I think a simple memset would be actually ok, i don't think we ever zero
anything that faults. That would be obviously racy anyways. If the zero
are supposed to override something  then a racing user thread could always
catch it.

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/2] Fix copy_user on x86
  2008-07-02 13:53       ` [PATCH 2/2] Fix copy_user on x86 Vitaly Mayatskikh
@ 2008-07-02 14:08         ` Andi Kleen
  2008-07-02 14:36           ` Vitaly Mayatskikh
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2008-07-02 14:08 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Linus Torvalds, linux-kernel, Andrew Morton


Haven't read the whole change, sorry. It's fairly large (perhaps split it up?)

But generally signed-off-by should be at the end of the description, not the end
of the patch.

> 
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
> 


-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 14:06         ` Andi Kleen
@ 2008-07-02 14:31           ` Vitaly Mayatskikh
  2008-07-02 15:06             ` Andi Kleen
  2008-07-03  2:35             ` Linus Torvalds
  0 siblings, 2 replies; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 14:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Vitaly Mayatskikh, linux-kernel, Linus Torvalds, Andrew Morton

Andi Kleen <andi@firstfloor.org> writes:

> get/put user are macros and it's normally not a good idea to use ++ in macro
> arguments because they might expand multiple times.
>
> sizeof(char) is always 1
>
> Also hopefully there's no sign extension anywhere with the signed char

I have tested it a lot. I don't know of any fail scenario at the moment.

> Overall you could write it much simpler with a rep ; movs I think,
> like traditional linux did.

rep movs can fail.

> Similar problem with ++
>
> If zerorest is ever 0 then retesting it on every iteration seems
> somewhat dumb.

If zerorest is 0, this cycle will never be executed.

>
> I think a simple memset would be actually ok, i don't think we ever zero
> anything that faults. That would be obviously racy anyways. If the zero
> are supposed to override something  then a racing user thread could always
> catch it.

Linus wanted this routine to be extremely dumb. This is the reason why tail
handling was moved from assembly to C. Yeah, my original patches were in
assembly and on the top of your realization.

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/2] Fix copy_user on x86
  2008-07-02 14:08         ` Andi Kleen
@ 2008-07-02 14:36           ` Vitaly Mayatskikh
  0 siblings, 0 replies; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 14:36 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Vitaly Mayatskikh, Linus Torvalds, linux-kernel, Andrew Morton

Andi Kleen <andi@firstfloor.org> writes:

> Haven't read the whole change, sorry. It's fairly large (perhaps split it up?)

Hmm... It will be hard to read changes in copy...unrolled routines even
in "small" chunks.

> But generally signed-off-by should be at the end of the description, not the end
> of the patch.

Thanks for guiding, I'm new here :)

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 14:31           ` Vitaly Mayatskikh
@ 2008-07-02 15:06             ` Andi Kleen
  2008-07-02 15:32               ` Vitaly Mayatskikh
  2008-07-03  2:35             ` Linus Torvalds
  1 sibling, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2008-07-02 15:06 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Linus Torvalds, Andrew Morton


[again with correct ccs sorry]

Vitaly Mayatskikh wrote:
> > Andi Kleen <andi@firstfloor.org> writes:
> >
>> >> get/put user are macros and it's normally not a good idea to use ++ in macro
>> >> arguments because they might expand multiple times.
>> >>
>> >> sizeof(char) is always 1
>> >>
>> >> Also hopefully there's no sign extension anywhere with the signed char
> >
> > I have tested it a lot. I don't know of any fail scenario at the moment.
> >
>> >> Overall you could write it much simpler with a rep ; movs I think,
>> >> like traditional linux did.
> >
> > rep movs can fail.

How? (if it's a byte copy?)

The old 2.4 copy_*_user always used to that and it worked just fine AFAIK.


>> >> Similar problem with ++
>> >>
>> >> If zerorest is ever 0 then retesting it on every iteration seems
>> >> somewhat dumb.
> >
> > If zerorest is 0, this cycle will never be executed.

Ok but when it's not then it will be executed on each iteration.

>> >> I think a simple memset would be actually ok, i don't think we ever zero
>> >> anything that faults. That would be obviously racy anyways. If the zero
>> >> are supposed to override something  then a racing user thread could always
>> >> catch it.
> >
> > Linus wanted this routine to be extremely dumb. This is the reason why tail
> > handling was moved from assembly to C. Yeah, my original patches were in
> > assembly and on the top of your realization.

My point was that it could be simpler because zeroing should not ever fault
(copy_in_user is not supposed to zero)

-Andi




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 15:06             ` Andi Kleen
@ 2008-07-02 15:32               ` Vitaly Mayatskikh
  2008-07-02 15:40                 ` Andi Kleen
  0 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 15:32 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Vitaly Mayatskikh, linux-kernel, Linus Torvalds, Andrew Morton

Andi Kleen <andi@firstfloor.org> writes:

>>> >> Overall you could write it much simpler with a rep ; movs I think,
>>> >> like traditional linux did.
>> >
>> > rep movs can fail.
>
> How? (if it's a byte copy?)

Parameter len is the number of uncopied bytes; it doesn't count bytes
which were loaded into registers before GPF in unrolled
loop. copy_user_handle_tail tries to do a byte copy for, possibly,
remaining bytes, but it can fail at the first read/write, or at the
second, etc. It doesn't know where it will fail.

> The old 2.4 copy_*_user always used to that and it worked just fine AFAIK.

Again, Linus wanted it to be simple plain C routine. rep movs is not in
C language ;)

>>> >> If zerorest is ever 0 then retesting it on every iteration seems
>>> >> somewhat dumb.
>> >
>> > If zerorest is 0, this cycle will never be executed.
>
> Ok but when it's not then it will be executed on each iteration.

Ok, that matters.

>>> >> I think a simple memset would be actually ok, i don't think we ever zero
>>> >> anything that faults. That would be obviously racy anyways. If the zero
>>> >> are supposed to override something  then a racing user thread could always
>>> >> catch it.
>> >
>> > Linus wanted this routine to be extremely dumb. This is the reason why tail
>> > handling was moved from assembly to C. Yeah, my original patches were in
>> > assembly and on the top of your realization.
>
> My point was that it could be simpler because zeroing should not ever fault
> (copy_in_user is not supposed to zero)

Why do you think that zeroing can never fail, even in userspace?
-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 15:32               ` Vitaly Mayatskikh
@ 2008-07-02 15:40                 ` Andi Kleen
  2008-07-02 15:58                   ` Vitaly Mayatskikh
  0 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2008-07-02 15:40 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Linus Torvalds, Andrew Morton

Vitaly Mayatskikh wrote:
> Andi Kleen <andi@firstfloor.org> writes:
> 
>>>>>> Overall you could write it much simpler with a rep ; movs I think,
>>>>>> like traditional linux did.
>>>> rep movs can fail.
>> How? (if it's a byte copy?)
> 
> Parameter len is a number of uncopied bytes,

But that is exactly what copy_*_user wants to return

 it doesn't count bytes
> which were loaded into registers before GPF in unrolled
> loop. copy_user_handle_tail tries to do a byte copy for, possibly,
> remaining bytes, but it can fail at the first read/write, or at the
> second, etc. It doesn't know where it will fail.

The original version I wrote returned "unfaulted bytes" which was wrong.
Correct is "uncopied" as fixed by Linus. rep ; movs returns uncopied.

> 
>>>>>> I think a simple memset would be actually ok, i don't think we ever zero
>>>>>> anything that faults. That would be obviously racy anyways. If the zero
>>>>>> are supposed to override something  then a racing user thread could always
>>>>>> catch it.
>>>> Linus wanted this routine to be extremely dumb. This is the reason why tail
>>>> handling was moved from assembly to C. Yeah, my original patches were in
>>>> assembly and on the top of your realization.
>> My point was that it could be simpler because zeroing should not ever fault
>> (copy_in_user is not supposed to zero)
> 
> Why do you think that zeroing can never fail, even in userspace?

There's no zeroing in user space, only in kernel space.

The only reason kernel does it is to avoid leaking uninitialized data,
but for user space it doesn't make sense (see above)

-Andi



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 15:40                 ` Andi Kleen
@ 2008-07-02 15:58                   ` Vitaly Mayatskikh
  2008-07-02 18:54                     ` Andi Kleen
  0 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-02 15:58 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Vitaly Mayatskikh, linux-kernel, Linus Torvalds, Andrew Morton

Andi Kleen <andi@firstfloor.org> writes:

>>>>> rep movs can fail.
>>> How? (if it's a byte copy?)
>> 
>> Parameter len is a number of uncopied bytes,
>
> But that is exactly what copy_*_user wants to return

Last experience showed, it doesn't.

Ok, when unrolled version fails on reading quad word at unaligned
address, it doesn't know where it was failed exactly. At this moment it
hasn't correct number of uncopied bytes, because some bytes can still
remain at the very end of the page. copy_user_handle_tail copies them
and return correct value on uncopied bytes. Complicated logic for
counting the number of these bytes is not necessary to optimize at
assembly level, because we already missed performance. It's hard to
complain against it.

> The original version I wrote returned "unfaulted bytes" which was wrong.
> Correct is "uncopied" as fixed by Linus. rep ; movs returns uncopied.

It's not in C. If you have the proposal why it should be written in
assembly, send it to Linus.

>> Why do you think that zeroing can never fail, even in userspace?
>
> There's no zeroing in user space, only in kernel space.

Agree.

> The only reason kernel does it is to avoid leaking uninitialized data,
> but for user space it doesn't make sense (see above)

Ok, copy_in_user can pass zerorest=0 to copy_user_handle_tail. Is it ok
for you?

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 15:58                   ` Vitaly Mayatskikh
@ 2008-07-02 18:54                     ` Andi Kleen
  0 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2008-07-02 18:54 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: linux-kernel, Linus Torvalds, Andrew Morton

Vitaly Mayatskikh wrote:
> Andi Kleen <andi@firstfloor.org> writes:
> 
>>>>>> rep movs can fail.
>>>> How? (if it's a byte copy?)
>>> Parameter len is a number of uncopied bytes,
>> But that is exactly what copy_*_user wants to return
> 
> Last experience showed, it doesn't.

?

> Ok, when unrolled version fails on reading quad word at unaligned
> address, it doesn't know where it was failed exactly. At this moment it
> hasn't correct number of uncopied bytes, because some bytes can still
> remain at the very end of the page. copy_user_handle_tail copies them
> and return correct value on uncopied bytes. Complicated logic for
> counting the number of these bytes is not necessary to optimize at
> assembly level, because we already missed performance. It's hard to
> complain against it.

Yes I'm talking about the "replay loop"

There's no complicated logic in a rep ; movs. And it's still
a byte copy. In fact it is far simpler than what you already have.

>> The original version I wrote returned "unfaulted bytes" which was wrong.
>> Correct is "uncopied" as fixed by Linus. rep ; movs returns uncopied.
> 
> It's not in C. If you have the proposal why it should be written in
> assembly, send it to Linus.

Well it would turn your 15+ lines C function in ~4 (well tested) lines or so.

> 
>>> Why do you think that zeroing can never fail, even in userspace?
>> There's no zeroing in user space, only in kernel space.
> 
> Agree.
> 
>> The only reason kernel does it is to avoid leaking uninitialized data,
>> but for user space it doesn't make sense (see above)
> 
> Ok, copy_in_user can pass zerorest=0 to copy_user_handle_tail. Is it ok
> for you?

My point was that for the zeroing you can just use memset(), there's
no need to support faulting there at all.

-Andi


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-02 14:31           ` Vitaly Mayatskikh
  2008-07-02 15:06             ` Andi Kleen
@ 2008-07-03  2:35             ` Linus Torvalds
  2008-07-07 12:09               ` Vitaly Mayatskikh
  1 sibling, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2008-07-03  2:35 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Andi Kleen, linux-kernel, Andrew Morton



On Wed, 2 Jul 2008, Vitaly Mayatskikh wrote:
> 
> Linus wanted this routine to be extremely dumb.

Well, I wanted it simple and dumb, but not dumber than _necessary_.

I think

	if (cleartail)
		memset(dst,0,len);
	return len;

is basically what we should have at the end. Simple and sweet.

Now, the stuff that comes *before* that point is the "try to fix up one 
byte at a time" thing, which I'd like to be simple and dumb. At least to 
start with.

Of course, I also suspect that *eventually* we might want to make it 
smarter and more complex. For example, while performance isn't a primary 
issue, we might want to eventually avoid having to do _two_ faults (once 
in the fast unrolled or word-at-a-time loop, and once in the byte-for-byte 
one), by limiting the byte-for-byte one to be within a page, but that 
would be a "future enhancement" thing.

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-03  2:35             ` Linus Torvalds
@ 2008-07-07 12:09               ` Vitaly Mayatskikh
  2008-07-07 12:12                 ` Vitaly Mayatskikh
  2008-07-07 16:21                 ` Linus Torvalds
  0 siblings, 2 replies; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-07 12:09 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, Andi Kleen, linux-kernel, Andrew Morton

Linus Torvalds <torvalds@linux-foundation.org> writes:

> Now, the stuff that comes *before* that point is the "try to fix up one 
> byte at a time" thing, which I'd like to be simple and dumb. At least to 
> start with.

Just to be clear: are these patches good enough now (to start with)?
Or, maybe, do they need to be further improved?

> Of course, I also suspect that *eventually* we might want to make it 
> smarter and more complex. For example, while performance isn't a primary 
> issue, we might want to eventually avoid having to do _two_ faults (once 
> in the fast unrolled or word-at-a-time loop, and once in the byte-for-byte 
> one), by limiting the byte-for-byte one to be within a page, but that 
> would be a "future enhancement" thing.

Btw, how much does it cost to CPU to do a fault? Can it be compared with
average time of find_vma()?

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-07 12:09               ` Vitaly Mayatskikh
@ 2008-07-07 12:12                 ` Vitaly Mayatskikh
  2008-07-07 16:43                   ` Andi Kleen
  2008-07-07 16:21                 ` Linus Torvalds
  1 sibling, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-07 12:12 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Linus Torvalds, Andi Kleen, linux-kernel, Andrew Morton

Vitaly Mayatskikh <v.mayatskih@gmail.com> writes:

>> one), by limiting the byte-for-byte one to be within a page, but that 
>> would be a "future enhancement" thing.
>
> Btw, how much does it cost to CPU to do a fault? Can it be compared with
> average time of find_vma()?

Remark: this is for preventing fault in memset, not byte-to-byte copy.

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-07 12:09               ` Vitaly Mayatskikh
  2008-07-07 12:12                 ` Vitaly Mayatskikh
@ 2008-07-07 16:21                 ` Linus Torvalds
  2008-07-07 17:05                   ` Vitaly Mayatskikh
  2008-07-09 13:03                   ` Ingo Molnar
  1 sibling, 2 replies; 26+ messages in thread
From: Linus Torvalds @ 2008-07-07 16:21 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Andi Kleen, linux-kernel, Andrew Morton



On Mon, 7 Jul 2008, Vitaly Mayatskikh wrote:

> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
> > Now, the stuff that comes *before* that point is the "try to fix up one 
> > byte at a time" thing, which I'd like to be simple and dumb. At least to 
> > start with.
> 
> Just to be clear: do these patches are good enough now (to start with)?
> Or, may be, it needs to be further improved?

I think they are getting there. I'm obviously not merging them in 2.6.26, 
but I'd be happy to do so for .27. 

Obviously, I'd be even happier if it also went through the normal x86 
review cycles (ie Ingo &co), but the current series is largely ack'ed by 
me.

> Btw, how much does it cost to CPU to do a fault? Can it be compared with
> average time of find_vma()?

It's *much* higher than a find_vma(). It's on the order of several 
thousand cycles, easy (well, it depends on uarch - on a P4, iirc any 
exception is something like 1500 cycles *minimum*, and that's just for the 
exception overhead, not the actual fault path).

But the thing is, it doesn't even need a find_vma(). We can avoid the 
extra trap 99.9% of the time by knowing that the trap happened at a page 
crosser (in *theory* a trap can happen in the middle of a page because 
another CPU did a munmap() in the middle, but that's not a case we need to 
even bother optimize for). In particular, we *know* we shouldn't even try 
to cross user pages. So the fixup routine can just do

	/* Think about it.. */
	#define BYTES_LEFT_IN_PAGE(ptr) \
		(unsigned int)((PAGE_SIZE-1) & -(long)(ptr))


	/* How much should we try to copy carefully byte-by-byte? */
	unsigned int max_copy = remaining;

	/* Don't even bother trying to cross a page in user space! */
	if (flags & DEST_IS_USERSPACE)
		max_copy = min(max_copy, BYTES_LEFT_IN_PAGE(dst));
	if (flags & SOURCE_IS_USERSPACE)
		max_copy = min(max_copy, BYTES_LEFT_IN_PAGE(src));

	/* Do the careful copy */
	while (max_copy--) {
		unsigned char c;
		if (__get_user(c,src))
			break;
		if (__put_user(c,dst))
			break;
		src++;
		dst++;
		remaining--;
	}

	if (flags & CLEAR_REMAINDER)
		memset(dst, 0, remaining);

	return remaining;

or similar. Note how this still uses the slow-and-careful byte-at-a-time 
approach to the final copy, but it avoids - on purpose - even trying to 
copy across page boundaries, and thus will never take a second trap in the 
common case.

See? We don't actually care about vma boundaries or anything like that. We 
just care about the only boundary that matters for faults: the page 
boundary.

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-07 12:12                 ` Vitaly Mayatskikh
@ 2008-07-07 16:43                   ` Andi Kleen
  0 siblings, 0 replies; 26+ messages in thread
From: Andi Kleen @ 2008-07-07 16:43 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Linus Torvalds, Andi Kleen, linux-kernel, Andrew Morton

On Mon, Jul 07, 2008 at 02:12:56PM +0200, Vitaly Mayatskikh wrote:
> Vitaly Mayatskikh <v.mayatskih@gmail.com> writes:
> 
> >> one), by limiting the byte-for-byte one to be within a page, but that 
> >> would be a "future enhancement" thing.
> >
> > Btw, how much does it cost to CPU to do a fault? Can it be compared with
> > average time of find_vma()?
> 
> Remark: this is for preventing fault in memset, not byte-to-byte copy.

memset never faults.

The current code doesn't handle it and it never happened.

-Andi

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-07 16:21                 ` Linus Torvalds
@ 2008-07-07 17:05                   ` Vitaly Mayatskikh
  2008-07-09 13:03                   ` Ingo Molnar
  1 sibling, 0 replies; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-07 17:05 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, Andi Kleen, linux-kernel, Andrew Morton

Linus Torvalds <torvalds@linux-foundation.org> writes:

> See? We don't actually care about vma boundaries or anything like that. We 
> just care about the only boundary that matters for faults: the page 
> boundary.

I was thinking about find_vma() for memset, but, yes, it should never
fail in kernel space (unless bug) and find_vma() is unnecessary.
-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-07 16:21                 ` Linus Torvalds
  2008-07-07 17:05                   ` Vitaly Mayatskikh
@ 2008-07-09 13:03                   ` Ingo Molnar
  2008-07-09 13:16                     ` Vitaly Mayatskikh
  1 sibling, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2008-07-09 13:03 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Vitaly Mayatskikh, Andi Kleen, linux-kernel, Andrew Morton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 7 Jul 2008, Vitaly Mayatskikh wrote:
> 
> > Linus Torvalds <torvalds@linux-foundation.org> writes:
> > 
> > > Now, the stuff that comes *before* that point is the "try to fix up one 
> > > byte at a time" thing, which I'd like to be simple and dumb. At least to 
> > > start with.
> > 
> > Just to be clear: are these patches good enough now (to start 
> > with)? Or, may be, it needs to be further improved?
> 
> I think they are getting there. I'm obviously not merging them in 
> 2.6.26, but I'd be happy to do so for .27.
> 
> Obviously, I'd be even happier if it also went through the normal x86 
> review cycles (ie Ingo &co), but the current series is largely ack'ed 
> by me.

applied them to tip/x86/core, for v2.6.27 merging, thanks everyone.

I've added Linus's Acked-by. Vitaly, could you please send your 
Signed-off-by line, it was missing from the patches. You can check the 
current form of the commits via:

 git-pull git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86/core

(the patches changed slightly, there was some merge fallout due to other 
changes in this area.)

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-09 13:03                   ` Ingo Molnar
@ 2008-07-09 13:16                     ` Vitaly Mayatskikh
  2008-07-09 13:52                       ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Vitaly Mayatskikh @ 2008-07-09 13:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Vitaly Mayatskikh, Andi Kleen, linux-kernel,
	Andrew Morton

Ingo Molnar <mingo@elte.hu> writes:

>> Obviously, I'd be even happier if it also went through the normal x86 
>> review cycles (ie Ingo &co), but the current series is largely ack'ed 
>> by me.
>
> applied them to tip/x86/core, for v2.6.27 merging, thanks everyone.
>
> I've added Linus's Acked-by. Vitaly, could you please send your 
> Signed-off-by line, it was missing from the patches. 

It was wrongly set at the very end of the post.

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>

> You can check the 
> current form of the commits via:
>
>  git-pull git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git x86/core
>
> (the patches changed slightly, there was some merge fallout due to other 
> changes in this area.)

I'll run it through test case and will let you know if something goes
wrong. Thanks.

-- 
wbr, Vitaly

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] Introduce copy_user_handle_tail routine
  2008-07-09 13:16                     ` Vitaly Mayatskikh
@ 2008-07-09 13:52                       ` Ingo Molnar
  0 siblings, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2008-07-09 13:52 UTC (permalink / raw)
  To: Vitaly Mayatskikh; +Cc: Linus Torvalds, Andi Kleen, linux-kernel, Andrew Morton


* Vitaly Mayatskikh <v.mayatskih@gmail.com> wrote:

> Ingo Molnar <mingo@elte.hu> writes:
> 
> >> Obviously, I'd be even happier if it also went through the normal x86 
> >> review cycles (ie Ingo &co), but the current series is largely ack'ed 
> >> by me.
> >
> > applied them to tip/x86/core, for v2.6.27 merging, thanks everyone.
> >
> > I've added Linus's Acked-by. Vitaly, could you please send your 
> > Signed-off-by line, it was missing from the patches. 
> 
> It was wrongly set at the very end of post.
> 
> Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>

ok, pushed it out with your signoffs added.

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2008-07-09 13:52 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-06-27 21:52 [PATCH 3/3] Fix copy_user on x86_64 Vitaly Mayatskikh
2008-06-28 18:26 ` Linus Torvalds
2008-06-30 15:12   ` Vitaly Mayatskikh
2008-06-30 15:55     ` Linus Torvalds
2008-06-30 16:16       ` Andi Kleen
2008-06-30 18:22       ` Kari Hurtta
2008-07-02 13:48       ` [PATCH 1/2] Introduce copy_user_handle_tail routine Vitaly Mayatskikh
2008-07-02 14:06         ` Andi Kleen
2008-07-02 14:31           ` Vitaly Mayatskikh
2008-07-02 15:06             ` Andi Kleen
2008-07-02 15:32               ` Vitaly Mayatskikh
2008-07-02 15:40                 ` Andi Kleen
2008-07-02 15:58                   ` Vitaly Mayatskikh
2008-07-02 18:54                     ` Andi Kleen
2008-07-03  2:35             ` Linus Torvalds
2008-07-07 12:09               ` Vitaly Mayatskikh
2008-07-07 12:12                 ` Vitaly Mayatskikh
2008-07-07 16:43                   ` Andi Kleen
2008-07-07 16:21                 ` Linus Torvalds
2008-07-07 17:05                   ` Vitaly Mayatskikh
2008-07-09 13:03                   ` Ingo Molnar
2008-07-09 13:16                     ` Vitaly Mayatskikh
2008-07-09 13:52                       ` Ingo Molnar
2008-07-02 13:53       ` [PATCH 2/2] Fix copy_user on x86 Vitaly Mayatskikh
2008-07-02 14:08         ` Andi Kleen
2008-07-02 14:36           ` Vitaly Mayatskikh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox