All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86
@ 2017-12-01  0:12 Yonghong Song
  2017-12-06  0:33 ` Yonghong Song
  2017-12-12 10:57 ` [tip:perf/core] uprobes/x86: Emulate " tip-bot for Yonghong Song
  0 siblings, 2 replies; 3+ messages in thread
From: Yonghong Song @ 2017-12-01  0:12 UTC (permalink / raw)
  To: mingo, peterz, oleg, linux-kernel, x86; +Cc: kernel-team

Uprobe is a tracing mechanism for userspace programs.
Typical uprobe will incur overhead of two traps.
First trap is caused by replaced trap insn, and
the second trap is to execute the original displaced
insn in user space.

To reduce the overhead, kernel provides hooks
for architectures to emulate the original insn
and skip the second trap. In x86, emulation
is done for certain branch insns.

This patch extends the emulation to "push <reg>"
insns. These insns are typical in the beginning
of the function. For example, bcc
in https://github.com/iovisor/bcc repo provides
tools to measure funclantency, detect memleak, etc.
The tools will place uprobes in the beginning of
function and possibly uretprobes at the end of function.
This patch is able to reduce the trap overhead for
uprobe from 2 to 1.

Without this patch, uretprobe will typically incur
three traps. With this patch, if the function starts
with "push" insn, the number of traps can be
reduced from 3 to 2.

An experiment was conducted on two local VMs,
fedora 26 64-bit VM and 32-bit VM, both 4 processors
and 4GB memory, booted with latest tip repo (and this patch).
The host is MacBook with intel i7 processor.

The test program looks like
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <sys/time.h>

  static void test() __attribute__((noinline));
  void test() {}
  int main() {
    struct timeval start, end;

    gettimeofday(&start, NULL);
    for (int i = 0; i < 1000000; i++) {
      test();
    }
    gettimeofday(&end, NULL);

    printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec)
                     - (start.tv_sec * 1000000 + start.tv_usec)));
    return 0;
  }

The program is compiled without optimization, and
the first insn for function "test" is "push %rbp".
The host is relatively idle.

Before the test run, the uprobe is inserted as below for uprobe:
  echo 'p <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
and for uretprobe:
  echo 'r <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable

Unit: microsecond(usec) per loop iteration

x86_64          W/ this patch   W/O this patch
uprobe          1.55            3.1
uretprobe       2.0             3.6

x86_32          W/ this patch   W/O this patch
uprobe          1.41            3.5
uretprobe       1.75            4.0

You can see that this patch significantly reduced the overhead,
50% for uprobe and 44% for uretprobe on x86_64, and even more
on x86_32.

Signed-off-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
 arch/x86/include/asm/uprobes.h |   4 ++
 arch/x86/kernel/uprobes.c      | 107 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 107 insertions(+), 4 deletions(-)

Changelogs:
v4 -> v5:
  . No code change from v4.
  . Rebased on top of tip and added Reviewed-by from Oleg.
v3 -> v4:
  . Revert most of v3 change as 32bit emulation is not really working
    on x86_64 platform as function emulate_push_stack() needs to account
    for 32bit app on 64bit platform. A separate effort is ongoing to
    address this issue.
v2 -> v3:
  . Do not emulate 32bit application on x86_64 platforms
v1 -> v2:
  . Address Oleg's comments

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 74f4c2f..d8bfa98 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -53,6 +53,10 @@ struct arch_uprobe {
 			u8	fixups;
 			u8	ilen;
 		} 			defparam;
+		struct {
+			u8	reg_offset;	/* to the start of pt_regs */
+			u8	ilen;
+		}			push;
 	};
 };
 
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a3755d2..85c7ef2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return 0;
 }
 
-static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
 {
 	unsigned long new_sp = regs->sp - sizeof_long();
 
-	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
 		return -EFAULT;
 
 	regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
 		regs->ip += correction;
 	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
 		regs->sp += sizeof_long(); /* Pop incorrect return address */
-		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
 			return -ERESTART;
 	}
 	/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		 *
 		 * But there is corner case, see the comment in ->post_xol().
 		 */
-		if (push_ret_address(regs, new_ip))
+		if (emulate_push_stack(regs, new_ip))
 			return false;
 	} else if (!check_jmp_cond(auprobe, regs)) {
 		offs = 0;
@@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return true;
 }
 
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+	if (emulate_push_stack(regs, *src_ptr))
+		return false;
+	regs->ip += auprobe->push.ilen;
+	return true;
+}
+
 static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	BUG_ON(!branch_is_call(auprobe));
@@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
 	.post_xol = branch_post_xol_op,
 };
 
+static const struct uprobe_xol_ops push_xol_ops = {
+	.emulate  = push_emulate_op,
+};
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
 static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	return 0;
 }
 
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+	u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+	if (opc1 < 0x50 || opc1 > 0x57)
+		return -ENOSYS;
+
+	if (insn->length > 2)
+		return -ENOSYS;
+	if (insn->length == 2) {
+		/* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+		if (insn->rex_prefix.nbytes != 1 ||
+		    insn->rex_prefix.bytes[0] != 0x41)
+			return -ENOSYS;
+
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, r8);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, r9);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, r10);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, r11);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, r12);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, r13);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, r14);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, r15);
+			break;
+		}
+#else
+		return -ENOSYS;
+#endif
+	} else {
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, ax);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, cx);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, dx);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, bx);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, sp);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, bp);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, si);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, di);
+			break;
+		}
+	}
+
+	auprobe->push.reg_offset = reg_offset;
+	auprobe->push.ilen = insn->length;
+	auprobe->ops = &push_xol_ops;
+	return 0;
+}
+
 /**
  * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
@@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret != -ENOSYS)
 		return ret;
 
+	ret = push_setup_xol_ops(auprobe, &insn);
+	if (ret != -ENOSYS)
+		return ret;
+
 	/*
 	 * Figure out which fixups default_post_xol_op() will need to perform,
 	 * and annotate defparam->fixups accordingly.
-- 
2.9.5

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86
  2017-12-01  0:12 [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86 Yonghong Song
@ 2017-12-06  0:33 ` Yonghong Song
  2017-12-12 10:57 ` [tip:perf/core] uprobes/x86: Emulate " tip-bot for Yonghong Song
  1 sibling, 0 replies; 3+ messages in thread
From: Yonghong Song @ 2017-12-06  0:33 UTC (permalink / raw)
  To: mingo, peterz, oleg, linux-kernel, x86; +Cc: kernel-team

Hi, Ingo and Peter,

Could you take a look at this patch and if no objection
merge it into tip? This patch has been reviewed by
Oleg Nesterov.

Thanks!

Yonghong

On 11/30/17 4:12 PM, Yonghong Song wrote:
> Uprobe is a tracing mechanism for userspace programs.
> Typical uprobe will incur overhead of two traps.
> First trap is caused by replaced trap insn, and
> the second trap is to execute the original displaced
> insn in user space.
> 
> To reduce the overhead, kernel provides hooks
> for architectures to emulate the original insn
> and skip the second trap. In x86, emulation
> is done for certain branch insns.
> 
> This patch extends the emulation to "push <reg>"
> insns. These insns are typical in the beginning
> of the function. For example, bcc
> in https://github.com/iovisor/bcc repo provides
> tools to measure funclantency, detect memleak, etc.
> The tools will place uprobes in the beginning of
> function and possibly uretprobes at the end of function.
> This patch is able to reduce the trap overhead for
> uprobe from 2 to 1.
> 
> Without this patch, uretprobe will typically incur
> three traps. With this patch, if the function starts
> with "push" insn, the number of traps can be
> reduced from 3 to 2.
> 
> An experiment was conducted on two local VMs,
> fedora 26 64-bit VM and 32-bit VM, both 4 processors
> and 4GB memory, booted with latest tip repo (and this patch).
> The host is MacBook with intel i7 processor.
> 
> The test program looks like
>    #include <stdio.h>
>    #include <stdlib.h>
>    #include <time.h>
>    #include <sys/time.h>
> 
>    static void test() __attribute__((noinline));
>    void test() {}
>    int main() {
>      struct timeval start, end;
> 
>      gettimeofday(&start, NULL);
>      for (int i = 0; i < 1000000; i++) {
>        test();
>      }
>      gettimeofday(&end, NULL);
> 
>      printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec)
>                       - (start.tv_sec * 1000000 + start.tv_usec)));
>      return 0;
>    }
> 
> The program is compiled without optimization, and
> the first insn for function "test" is "push %rbp".
> The host is relatively idle.
> 
> Before the test run, the uprobe is inserted as below for uprobe:
>    echo 'p <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
>    echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
> and for uretprobe:
>    echo 'r <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
>    echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
> 
> Unit: microsecond(usec) per loop iteration
> 
> x86_64          W/ this patch   W/O this patch
> uprobe          1.55            3.1
> uretprobe       2.0             3.6
> 
> x86_32          W/ this patch   W/O this patch
> uprobe          1.41            3.5
> uretprobe       1.75            4.0
> 
> You can see that this patch significantly reduced the overhead,
> 50% for uprobe and 44% for uretprobe on x86_64, and even more
> on x86_32.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> Reviewed-by: Oleg Nesterov <oleg@redhat.com>
> ---
>   arch/x86/include/asm/uprobes.h |   4 ++
>   arch/x86/kernel/uprobes.c      | 107 +++++++++++++++++++++++++++++++++++++++--
>   2 files changed, 107 insertions(+), 4 deletions(-)
> 
> Changelogs:
> v4 -> v5:
>    . No code change from v4.
>    . Rebased on top of tip and added Reviewed-by from Oleg.
> v3 -> v4:
>    . Revert most of v3 change as 32bit emulation is not really working
>      on x86_64 platform as function emulate_push_stack() needs to account
>      for 32bit app on 64bit platform. A separate effort is ongoing to
>      address this issue.
> v2 -> v3:
>    . Do not emulate 32bit application on x86_64 platforms
> v1 -> v2:
>    . Address Oleg's comments
> 
> diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
> index 74f4c2f..d8bfa98 100644
> --- a/arch/x86/include/asm/uprobes.h
> +++ b/arch/x86/include/asm/uprobes.h
> @@ -53,6 +53,10 @@ struct arch_uprobe {
>   			u8	fixups;
>   			u8	ilen;
>   		} 			defparam;
> +		struct {
> +			u8	reg_offset;	/* to the start of pt_regs */
> +			u8	ilen;
> +		}			push;
>   	};
>   };
>   
> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
> index a3755d2..85c7ef2 100644
> --- a/arch/x86/kernel/uprobes.c
> +++ b/arch/x86/kernel/uprobes.c
> @@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>   	return 0;
>   }
>   
> -static int push_ret_address(struct pt_regs *regs, unsigned long ip)
> +static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
>   {
>   	unsigned long new_sp = regs->sp - sizeof_long();
>   
> -	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
> +	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
>   		return -EFAULT;
>   
>   	regs->sp = new_sp;
> @@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
>   		regs->ip += correction;
>   	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
>   		regs->sp += sizeof_long(); /* Pop incorrect return address */
> -		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
> +		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
>   			return -ERESTART;
>   	}
>   	/* popf; tell the caller to not touch TF */
> @@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>   		 *
>   		 * But there is corner case, see the comment in ->post_xol().
>   		 */
> -		if (push_ret_address(regs, new_ip))
> +		if (emulate_push_stack(regs, new_ip))
>   			return false;
>   	} else if (!check_jmp_cond(auprobe, regs)) {
>   		offs = 0;
> @@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>   	return true;
>   }
>   
> +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
> +{
> +	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
> +
> +	if (emulate_push_stack(regs, *src_ptr))
> +		return false;
> +	regs->ip += auprobe->push.ilen;
> +	return true;
> +}
> +
>   static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
>   {
>   	BUG_ON(!branch_is_call(auprobe));
> @@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
>   	.post_xol = branch_post_xol_op,
>   };
>   
> +static const struct uprobe_xol_ops push_xol_ops = {
> +	.emulate  = push_emulate_op,
> +};
> +
>   /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
>   static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>   {
> @@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
>   	return 0;
>   }
>   
> +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
> +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
> +{
> +	u8 opc1 = OPCODE1(insn), reg_offset = 0;
> +
> +	if (opc1 < 0x50 || opc1 > 0x57)
> +		return -ENOSYS;
> +
> +	if (insn->length > 2)
> +		return -ENOSYS;
> +	if (insn->length == 2) {
> +		/* only support rex_prefix 0x41 (x64 only) */
> +#ifdef CONFIG_X86_64
> +		if (insn->rex_prefix.nbytes != 1 ||
> +		    insn->rex_prefix.bytes[0] != 0x41)
> +			return -ENOSYS;
> +
> +		switch (opc1) {
> +		case 0x50:
> +			reg_offset = offsetof(struct pt_regs, r8);
> +			break;
> +		case 0x51:
> +			reg_offset = offsetof(struct pt_regs, r9);
> +			break;
> +		case 0x52:
> +			reg_offset = offsetof(struct pt_regs, r10);
> +			break;
> +		case 0x53:
> +			reg_offset = offsetof(struct pt_regs, r11);
> +			break;
> +		case 0x54:
> +			reg_offset = offsetof(struct pt_regs, r12);
> +			break;
> +		case 0x55:
> +			reg_offset = offsetof(struct pt_regs, r13);
> +			break;
> +		case 0x56:
> +			reg_offset = offsetof(struct pt_regs, r14);
> +			break;
> +		case 0x57:
> +			reg_offset = offsetof(struct pt_regs, r15);
> +			break;
> +		}
> +#else
> +		return -ENOSYS;
> +#endif
> +	} else {
> +		switch (opc1) {
> +		case 0x50:
> +			reg_offset = offsetof(struct pt_regs, ax);
> +			break;
> +		case 0x51:
> +			reg_offset = offsetof(struct pt_regs, cx);
> +			break;
> +		case 0x52:
> +			reg_offset = offsetof(struct pt_regs, dx);
> +			break;
> +		case 0x53:
> +			reg_offset = offsetof(struct pt_regs, bx);
> +			break;
> +		case 0x54:
> +			reg_offset = offsetof(struct pt_regs, sp);
> +			break;
> +		case 0x55:
> +			reg_offset = offsetof(struct pt_regs, bp);
> +			break;
> +		case 0x56:
> +			reg_offset = offsetof(struct pt_regs, si);
> +			break;
> +		case 0x57:
> +			reg_offset = offsetof(struct pt_regs, di);
> +			break;
> +		}
> +	}
> +
> +	auprobe->push.reg_offset = reg_offset;
> +	auprobe->push.ilen = insn->length;
> +	auprobe->ops = &push_xol_ops;
> +	return 0;
> +}
> +
>   /**
>    * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
>    * @mm: the probed address space.
> @@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
>   	if (ret != -ENOSYS)
>   		return ret;
>   
> +	ret = push_setup_xol_ops(auprobe, &insn);
> +	if (ret != -ENOSYS)
> +		return ret;
> +
>   	/*
>   	 * Figure out which fixups default_post_xol_op() will need to perform,
>   	 * and annotate defparam->fixups accordingly.
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [tip:perf/core] uprobes/x86: Emulate push insns for uprobe on x86
  2017-12-01  0:12 [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86 Yonghong Song
  2017-12-06  0:33 ` Yonghong Song
@ 2017-12-12 10:57 ` tip-bot for Yonghong Song
  1 sibling, 0 replies; 3+ messages in thread
From: tip-bot for Yonghong Song @ 2017-12-12 10:57 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: mingo, linux-kernel, tglx, hpa, torvalds, peterz, yhs, oleg

Commit-ID:  e7ed9d9bd0375c74fe6e27d8bc73d3c6f4c8c3bc
Gitweb:     https://git.kernel.org/tip/e7ed9d9bd0375c74fe6e27d8bc73d3c6f4c8c3bc
Author:     Yonghong Song <yhs@fb.com>
AuthorDate: Thu, 30 Nov 2017 16:12:02 -0800
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 11 Dec 2017 18:42:11 +0100

uprobes/x86: Emulate push insns for uprobe on x86

Uprobe is a tracing mechanism for userspace programs.
Typical uprobe will incur overhead of two traps.
First trap is caused by replaced trap insn, and
the second trap is to execute the original displaced
insn in user space.

To reduce the overhead, kernel provides hooks
for architectures to emulate the original insn
and skip the second trap. In x86, emulation
is done for certain branch insns.

This patch extends the emulation to "push <reg>"
insns. These insns are typical in the beginning
of the function. For example, bcc
in https://github.com/iovisor/bcc repo provides
tools to measure funclantency, detect memleak, etc.
The tools will place uprobes in the beginning of
function and possibly uretprobes at the end of function.
This patch is able to reduce the trap overhead for
uprobe from 2 to 1.

Without this patch, uretprobe will typically incur
three traps. With this patch, if the function starts
with "push" insn, the number of traps can be
reduced from 3 to 2.

An experiment was conducted on two local VMs,
fedora 26 64-bit VM and 32-bit VM, both 4 processors
and 4GB memory, booted with latest tip repo (and this patch).
The host is MacBook with intel i7 processor.

The test program looks like:

  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <sys/time.h>

  static void test() __attribute__((noinline));
  void test() {}
  int main() {
    struct timeval start, end;

    gettimeofday(&start, NULL);
    for (int i = 0; i < 1000000; i++) {
      test();
    }
    gettimeofday(&end, NULL);

    printf("%ld\n", ((end.tv_sec * 1000000 + end.tv_usec)
                     - (start.tv_sec * 1000000 + start.tv_usec)));
    return 0;
  }

The program is compiled without optimization, and
the first insn for function "test" is "push %rbp".
The host is relatively idle.

Before the test run, the uprobe is inserted as below for uprobe:
  echo 'p <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable
and for uretprobe:
  echo 'r <binary>:<test_func_offset>' > /sys/kernel/debug/tracing/uprobe_events
  echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable

Unit: microsecond(usec) per loop iteration

x86_64          W/ this patch   W/O this patch
uprobe          1.55            3.1
uretprobe       2.0             3.6

x86_32          W/ this patch   W/O this patch
uprobe          1.41            3.5
uretprobe       1.75            4.0

You can see that this patch significantly reduced the overhead,
50% for uprobe and 44% for uretprobe on x86_64, and even more
on x86_32.

Signed-off-by: Yonghong Song <yhs@fb.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Link: http://lkml.kernel.org/r/20171201001202.3706564-1-yhs@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/uprobes.h |   4 ++
 arch/x86/kernel/uprobes.c      | 107 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 74f4c2f..d8bfa98 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -53,6 +53,10 @@ struct arch_uprobe {
 			u8	fixups;
 			u8	ilen;
 		} 			defparam;
+		struct {
+			u8	reg_offset;	/* to the start of pt_regs */
+			u8	ilen;
+		}			push;
 	};
 };
 
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a3755d2..85c7ef2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -528,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return 0;
 }
 
-static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+static int emulate_push_stack(struct pt_regs *regs, unsigned long val)
 {
 	unsigned long new_sp = regs->sp - sizeof_long();
 
-	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+	if (copy_to_user((void __user *)new_sp, &val, sizeof_long()))
 		return -EFAULT;
 
 	regs->sp = new_sp;
@@ -566,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs
 		regs->ip += correction;
 	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
 		regs->sp += sizeof_long(); /* Pop incorrect return address */
-		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+		if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen))
 			return -ERESTART;
 	}
 	/* popf; tell the caller to not touch TF */
@@ -655,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		 *
 		 * But there is corner case, see the comment in ->post_xol().
 		 */
-		if (push_ret_address(regs, new_ip))
+		if (emulate_push_stack(regs, new_ip))
 			return false;
 	} else if (!check_jmp_cond(auprobe, regs)) {
 		offs = 0;
@@ -665,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	return true;
 }
 
+static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset;
+
+	if (emulate_push_stack(regs, *src_ptr))
+		return false;
+	regs->ip += auprobe->push.ilen;
+	return true;
+}
+
 static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	BUG_ON(!branch_is_call(auprobe));
@@ -703,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = {
 	.post_xol = branch_post_xol_op,
 };
 
+static const struct uprobe_xol_ops push_xol_ops = {
+	.emulate  = push_emulate_op,
+};
+
 /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
 static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 {
@@ -750,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
 	return 0;
 }
 
+/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */
+static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
+{
+	u8 opc1 = OPCODE1(insn), reg_offset = 0;
+
+	if (opc1 < 0x50 || opc1 > 0x57)
+		return -ENOSYS;
+
+	if (insn->length > 2)
+		return -ENOSYS;
+	if (insn->length == 2) {
+		/* only support rex_prefix 0x41 (x64 only) */
+#ifdef CONFIG_X86_64
+		if (insn->rex_prefix.nbytes != 1 ||
+		    insn->rex_prefix.bytes[0] != 0x41)
+			return -ENOSYS;
+
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, r8);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, r9);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, r10);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, r11);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, r12);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, r13);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, r14);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, r15);
+			break;
+		}
+#else
+		return -ENOSYS;
+#endif
+	} else {
+		switch (opc1) {
+		case 0x50:
+			reg_offset = offsetof(struct pt_regs, ax);
+			break;
+		case 0x51:
+			reg_offset = offsetof(struct pt_regs, cx);
+			break;
+		case 0x52:
+			reg_offset = offsetof(struct pt_regs, dx);
+			break;
+		case 0x53:
+			reg_offset = offsetof(struct pt_regs, bx);
+			break;
+		case 0x54:
+			reg_offset = offsetof(struct pt_regs, sp);
+			break;
+		case 0x55:
+			reg_offset = offsetof(struct pt_regs, bp);
+			break;
+		case 0x56:
+			reg_offset = offsetof(struct pt_regs, si);
+			break;
+		case 0x57:
+			reg_offset = offsetof(struct pt_regs, di);
+			break;
+		}
+	}
+
+	auprobe->push.reg_offset = reg_offset;
+	auprobe->push.ilen = insn->length;
+	auprobe->ops = &push_xol_ops;
+	return 0;
+}
+
 /**
  * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
  * @mm: the probed address space.
@@ -771,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
 	if (ret != -ENOSYS)
 		return ret;
 
+	ret = push_setup_xol_ops(auprobe, &insn);
+	if (ret != -ENOSYS)
+		return ret;
+
 	/*
 	 * Figure out which fixups default_post_xol_op() will need to perform,
 	 * and annotate defparam->fixups accordingly.

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2017-12-12 11:00 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-12-01  0:12 [PATCH][v5] uprobes/x86: emulate push insns for uprobe on x86 Yonghong Song
2017-12-06  0:33 ` Yonghong Song
2017-12-12 10:57 ` [tip:perf/core] uprobes/x86: Emulate " tip-bot for Yonghong Song

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.