Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v10 tip 3/9] tracing: attach BPF programs to kprobes
From: Masami Hiramatsu @ 2015-03-23  2:14 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1427053150-32213-4-git-send-email-ast@plumgrid.com>

(2015/03/23 4:39), Alexei Starovoitov wrote:
> User interface:
> struct perf_event_attr attr = {.type = PERF_TYPE_TRACEPOINT, .config = event_id, ...};
> event_fd = perf_event_open(&attr,...);
> ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> 
> prog_fd is a file descriptor associated with BPF program previously loaded.
> event_id is an ID of created kprobe
> 
> close(event_fd) - automatically detaches BPF program from it
> 
> BPF programs can call in-kernel helper functions to:
> - lookup/update/delete elements in maps
> - probe_read - wraper of probe_kernel_read() used to access any kernel
>   data structures
> 
> BPF programs receive 'struct pt_regs *' as an input
> ('struct pt_regs' is architecture dependent)
> and return 0 to ignore the event and 1 to store kprobe event into ring buffer.
> 
> Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
> kprobes must be recompiled for every kernel version and user must supply correct
> LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.
> 

This looks OK to me :)

Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>

Thanks!

> Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
> Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
> ---
>  include/linux/ftrace_event.h    |   11 ++++
>  include/uapi/linux/bpf.h        |    3 +
>  include/uapi/linux/perf_event.h |    1 +
>  kernel/bpf/syscall.c            |    7 ++-
>  kernel/events/core.c            |   59 ++++++++++++++++++
>  kernel/trace/Makefile           |    1 +
>  kernel/trace/bpf_trace.c        |  130 +++++++++++++++++++++++++++++++++++++++
>  kernel/trace/trace_kprobe.c     |    8 +++
>  8 files changed, 219 insertions(+), 1 deletion(-)
>  create mode 100644 kernel/trace/bpf_trace.c
> 
> diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
> index 77325e1a1816..0aa535bc9f05 100644
> --- a/include/linux/ftrace_event.h
> +++ b/include/linux/ftrace_event.h
> @@ -13,6 +13,7 @@ struct trace_array;
>  struct trace_buffer;
>  struct tracer;
>  struct dentry;
> +struct bpf_prog;
>  
>  struct trace_print_flags {
>  	unsigned long		mask;
> @@ -306,6 +307,7 @@ struct ftrace_event_call {
>  #ifdef CONFIG_PERF_EVENTS
>  	int				perf_refcount;
>  	struct hlist_head __percpu	*perf_events;
> +	struct bpf_prog			*prog;
>  
>  	int	(*perf_perm)(struct ftrace_event_call *,
>  			     struct perf_event *);
> @@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
>  		event_triggers_post_call(file, tt);
>  }
>  
> +#ifdef CONFIG_BPF_SYSCALL
> +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
> +#else
> +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
> +{
> +	return 1;
> +}
> +#endif
> +
>  enum {
>  	FILTER_OTHER = 0,
>  	FILTER_STATIC_STRING,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 45da7ec7d274..b2948feeb70b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -118,6 +118,7 @@ enum bpf_map_type {
>  enum bpf_prog_type {
>  	BPF_PROG_TYPE_UNSPEC,
>  	BPF_PROG_TYPE_SOCKET_FILTER,
> +	BPF_PROG_TYPE_KPROBE,
>  };
>  
>  /* flags for BPF_MAP_UPDATE_ELEM command */
> @@ -151,6 +152,7 @@ union bpf_attr {
>  		__u32		log_level;	/* verbosity level of verifier */
>  		__u32		log_size;	/* size of user buffer */
>  		__aligned_u64	log_buf;	/* user supplied buffer */
> +		__u32		kern_version;	/* checked when prog_type=kprobe */
>  	};
>  } __attribute__((aligned(8)));
>  
> @@ -162,6 +164,7 @@ enum bpf_func_id {
>  	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
>  	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
>  	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
> +	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
>  	__BPF_FUNC_MAX_ID,
>  };
>  
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 3c8b45de57ec..ad4dade2a502 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -382,6 +382,7 @@ struct perf_event_attr {
>  #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
>  #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
>  #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
> +#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
>  
>  enum perf_event_ioc_flags {
>  	PERF_IOC_FLAG_GROUP		= 1U << 0,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 536edc2be307..504c10b990ef 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -16,6 +16,7 @@
>  #include <linux/file.h>
>  #include <linux/license.h>
>  #include <linux/filter.h>
> +#include <linux/version.h>
>  
>  static LIST_HEAD(bpf_map_types);
>  
> @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
>  }
>  
>  /* last field in 'union bpf_attr' used by this command */
> -#define	BPF_PROG_LOAD_LAST_FIELD log_buf
> +#define	BPF_PROG_LOAD_LAST_FIELD kern_version
>  
>  static int bpf_prog_load(union bpf_attr *attr)
>  {
> @@ -492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>  	if (attr->insn_cnt >= BPF_MAXINSNS)
>  		return -EINVAL;
>  
> +	if (type == BPF_PROG_TYPE_KPROBE &&
> +	    attr->kern_version != LINUX_VERSION_CODE)
> +		return -EINVAL;
> +
>  	/* plain bpf_prog allocation */
>  	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
>  	if (!prog)
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 2709063eb26b..3a45e7f6b2df 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -42,6 +42,8 @@
>  #include <linux/module.h>
>  #include <linux/mman.h>
>  #include <linux/compat.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
>  
>  #include "internal.h"
>  
> @@ -3402,6 +3404,7 @@ errout:
>  }
>  
>  static void perf_event_free_filter(struct perf_event *event);
> +static void perf_event_free_bpf_prog(struct perf_event *event);
>  
>  static void free_event_rcu(struct rcu_head *head)
>  {
> @@ -3411,6 +3414,7 @@ static void free_event_rcu(struct rcu_head *head)
>  	if (event->ns)
>  		put_pid_ns(event->ns);
>  	perf_event_free_filter(event);
> +	perf_event_free_bpf_prog(event);
>  	kfree(event);
>  }
>  
> @@ -3923,6 +3927,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
>  static int perf_event_set_output(struct perf_event *event,
>  				 struct perf_event *output_event);
>  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
> +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
>  
>  static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
>  {
> @@ -3976,6 +3981,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
>  	case PERF_EVENT_IOC_SET_FILTER:
>  		return perf_event_set_filter(event, (void __user *)arg);
>  
> +	case PERF_EVENT_IOC_SET_BPF:
> +		return perf_event_set_bpf_prog(event, arg);
> +
>  	default:
>  		return -ENOTTY;
>  	}
> @@ -6436,6 +6444,49 @@ static void perf_event_free_filter(struct perf_event *event)
>  	ftrace_profile_free_filter(event);
>  }
>  
> +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
> +{
> +	struct bpf_prog *prog;
> +
> +	if (event->attr.type != PERF_TYPE_TRACEPOINT)
> +		return -EINVAL;
> +
> +	if (event->tp_event->prog)
> +		return -EEXIST;
> +
> +	if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
> +		/* bpf programs can only be attached to kprobes */
> +		return -EINVAL;
> +
> +	prog = bpf_prog_get(prog_fd);
> +	if (IS_ERR(prog))
> +		return PTR_ERR(prog);
> +
> +	if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) {
> +		/* valid fd, but invalid bpf program type */
> +		bpf_prog_put(prog);
> +		return -EINVAL;
> +	}
> +
> +	event->tp_event->prog = prog;
> +
> +	return 0;
> +}
> +
> +static void perf_event_free_bpf_prog(struct perf_event *event)
> +{
> +	struct bpf_prog *prog;
> +
> +	if (!event->tp_event)
> +		return;
> +
> +	prog = event->tp_event->prog;
> +	if (prog) {
> +		event->tp_event->prog = NULL;
> +		bpf_prog_put(prog);
> +	}
> +}
> +
>  #else
>  
>  static inline void perf_tp_register(void)
> @@ -6451,6 +6502,14 @@ static void perf_event_free_filter(struct perf_event *event)
>  {
>  }
>  
> +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
> +{
> +	return -ENOENT;
> +}
> +
> +static void perf_event_free_bpf_prog(struct perf_event *event)
> +{
> +}
>  #endif /* CONFIG_EVENT_TRACING */
>  
>  #ifdef CONFIG_HAVE_HW_BREAKPOINT
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 98f26588255e..c575a300103b 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
>  endif
>  obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
>  obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
> +obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o
>  obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
>  obj-$(CONFIG_TRACEPOINTS) += power-traces.o
>  ifeq ($(CONFIG_PM),y)
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> new file mode 100644
> index 000000000000..f1e87da91da3
> --- /dev/null
> +++ b/kernel/trace/bpf_trace.c
> @@ -0,0 +1,130 @@
> +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of version 2 of the GNU General Public
> + * License as published by the Free Software Foundation.
> + */
> +#include <linux/kernel.h>
> +#include <linux/types.h>
> +#include <linux/slab.h>
> +#include <linux/bpf.h>
> +#include <linux/filter.h>
> +#include <linux/uaccess.h>
> +#include "trace.h"
> +
> +static DEFINE_PER_CPU(int, bpf_prog_active);
> +
> +/**
> + * trace_call_bpf - invoke BPF program
> + * @prog: BPF program
> + * @ctx: opaque context pointer
> + *
> + * kprobe handlers execute BPF programs via this helper.
> + * Can be used from static tracepoints in the future.
> + *
> + * Return: BPF programs always return an integer which is interpreted by
> + * kprobe handler as:
> + * 0 - return from kprobe (event is filtered out)
> + * 1 - store kprobe event into ring buffer
> + * Other values are reserved and currently alias to 1
> + */
> +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
> +{
> +	unsigned int ret;
> +
> +	if (in_nmi()) /* not supported yet */
> +		return 1;
> +
> +	preempt_disable();
> +
> +	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> +		/*
> +		 * since some bpf program is already running on this cpu,
> +		 * don't call into another bpf program (same or different)
> +		 * and don't send kprobe event into ring-buffer,
> +		 * so return zero here
> +		 */
> +		ret = 0;
> +		goto out;
> +	}
> +
> +	rcu_read_lock();
> +	ret = BPF_PROG_RUN(prog, ctx);
> +	rcu_read_unlock();
> +
> + out:
> +	__this_cpu_dec(bpf_prog_active);
> +	preempt_enable();
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(trace_call_bpf);
> +
> +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> +	void *dst = (void *) (long) r1;
> +	int size = (int) r2;
> +	void *unsafe_ptr = (void *) (long) r3;
> +
> +	return probe_kernel_read(dst, unsafe_ptr, size);
> +}
> +
> +static const struct bpf_func_proto bpf_probe_read_proto = {
> +	.func		= bpf_probe_read,
> +	.gpl_only	= true,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_STACK,
> +	.arg2_type	= ARG_CONST_STACK_SIZE,
> +	.arg3_type	= ARG_ANYTHING,
> +};
> +
> +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
> +{
> +	switch (func_id) {
> +	case BPF_FUNC_map_lookup_elem:
> +		return &bpf_map_lookup_elem_proto;
> +	case BPF_FUNC_map_update_elem:
> +		return &bpf_map_update_elem_proto;
> +	case BPF_FUNC_map_delete_elem:
> +		return &bpf_map_delete_elem_proto;
> +	case BPF_FUNC_probe_read:
> +		return &bpf_probe_read_proto;
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +/* bpf+kprobe programs can access fields of 'struct pt_regs' */
> +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
> +{
> +	/* check bounds */
> +	if (off < 0 || off >= sizeof(struct pt_regs))
> +		return false;
> +
> +	/* only read is allowed */
> +	if (type != BPF_READ)
> +		return false;
> +
> +	/* disallow misaligned access */
> +	if (off % size != 0)
> +		return false;
> +
> +	return true;
> +}
> +
> +static struct bpf_verifier_ops kprobe_prog_ops = {
> +	.get_func_proto  = kprobe_prog_func_proto,
> +	.is_valid_access = kprobe_prog_is_valid_access,
> +};
> +
> +static struct bpf_prog_type_list kprobe_tl = {
> +	.ops	= &kprobe_prog_ops,
> +	.type	= BPF_PROG_TYPE_KPROBE,
> +};
> +
> +static int __init register_kprobe_prog_ops(void)
> +{
> +	bpf_register_prog_type(&kprobe_tl);
> +	return 0;
> +}
> +late_initcall(register_kprobe_prog_ops);
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 8fa549f6f528..dc3462507d7c 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -1134,11 +1134,15 @@ static void
>  kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
>  {
>  	struct ftrace_event_call *call = &tk->tp.call;
> +	struct bpf_prog *prog = call->prog;
>  	struct kprobe_trace_entry_head *entry;
>  	struct hlist_head *head;
>  	int size, __size, dsize;
>  	int rctx;
>  
> +	if (prog && !trace_call_bpf(prog, regs))
> +		return;
> +
>  	head = this_cpu_ptr(call->perf_events);
>  	if (hlist_empty(head))
>  		return;
> @@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
>  		    struct pt_regs *regs)
>  {
>  	struct ftrace_event_call *call = &tk->tp.call;
> +	struct bpf_prog *prog = call->prog;
>  	struct kretprobe_trace_entry_head *entry;
>  	struct hlist_head *head;
>  	int size, __size, dsize;
>  	int rctx;
>  
> +	if (prog && !trace_call_bpf(prog, regs))
> +		return;
> +
>  	head = this_cpu_ptr(call->perf_events);
>  	if (hlist_empty(head))
>  		return;
> 


-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Research Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply

* Re: Re: [PATCH v9 tip 3/9] tracing: attach BPF programs to kprobes
From: Masami Hiramatsu @ 2015-03-23  2:17 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <550F0402.80900-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

(2015/03/23 3:03), Alexei Starovoitov wrote:
> On 3/22/15 3:06 AM, Masami Hiramatsu wrote:
>> (2015/03/22 1:02), Alexei Starovoitov wrote:
>>> On 3/21/15 5:14 AM, Masami Hiramatsu wrote:
>>>> (2015/03/21 8:30), Alexei Starovoitov wrote:
>>>>>
>>>>> Note, kprobes are _not_ a stable kernel ABI, so bpf programs attached to
>>>>> kprobes must be recompiled for every kernel version and user must supply correct
>>>>> LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call.
>>>>>
>>>>
>>>> Would you mean that the ABI of kprobe-based BPF programs? Kprobe API/ABIs
>>>> (register_kprobe() etc.) are stable, but the code who use kprobes certainly
>>>> depends the kernel binary by design. So, if you meant it, BPF programs must
>>>> be recompiled for every kernel binaries (including configuration changes,
>>>> not only its version).
>>>
>>> yes. I mainly meant that bpf+kprobe programs must be recompiled
>>> for every kernel binary.
>>
>> Hmm, if so, as we do in perf (and systemtap too), you'd better check
>> kernel's build-id instead of the kernel version when loading the
>> BPF program. It is safer than the KERNEL_VERSION_CODE.
> 
> It's not about safety. As I mentioned in cover letter:
> "version check is not used for safety, but for enforcing 'non-ABI-ness'"
> In other words it's like check-box next to 'terms and conditions'
> paragraph that the user has to click before he can continue.
> By providing 'kern_version' during loading the user accepts the fact
> that bpf+kprobe is not a stable ABI. Nothing more and nothing less.
> build-id cannot achieve that, because it cannot be checked from inside
> the kernel.

Ah, I see. Hmm, I think we'd better have such interface (kernel API).

> User space tools that will compile ktap/dtrace scripts into bpf might
> use build-id for their own purpose, but that's a different discussion.

Agreed.
I'd like to discuss it since kprobe event interface may also have same
issue.

Thank you,


-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Research Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org

^ permalink raw reply

* bpf+tracing next steps. Was: [PATCH v9 tip 3/9] tracing: attach BPF programs to kprobes
From: Alexei Starovoitov @ 2015-03-23  4:57 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <550F77B3.1020802@hitachi.com>

On 3/22/15 7:17 PM, Masami Hiramatsu wrote:
> (2015/03/23 3:03), Alexei Starovoitov wrote:
> 
>> User space tools that will compile ktap/dtrace scripts into bpf might
>> use build-id for their own purpose, but that's a different discussion.
> 
> Agreed.
> I'd like to discuss it since kprobe event interface may also have same
> issue.

I'm not sure what 'issue' you're seeing. My understanding is that
build-ids are used by perf to associate binaries with their debug info
and by systemtap to make sure that probes actually match the kernel
they were compiled for. In bpf case it probably will be perf way only.
Are you interested in doing something with bpf ? ;)
I know that Jovi is working on clang-based front-end, He Kuang is doing
something fancy and I'm going to focus on 'tcp instrumentation' once
bpf+kprobes is in. I think these efforts will help us make it
concrete and will establish a path towards bpf+tracepoints
(debug tracepoints or trace markers) and eventual integration with perf.
Here is the wish-list (for kernel and userspace) inspired by Brendan:
- access to pid, uid, tid, comm, etc
- access to kernel stack trace
- access to user-level stack trace
- kernel debuginfo for walking kernel structs, and accessing kprobe
entry args as variables
- tracing of uprobes
- tracing of user markers
- user debuginfo for user structs and args
- easy to use language
- library of scripting features
- nice one-liner syntax

I think there is a lot of interest in bpf+tracing and would be good to
align the efforts.

^ permalink raw reply

* Re: [PATCH] mremap: add MREMAP_NOHOLE flag --resend
From: Shaohua Li @ 2015-03-23  5:17 UTC (permalink / raw)
  To: Aliaksey Kandratsenka
  Cc: Daniel Micay, Andrew Morton, linux-mm, linux-api, Rik van Riel,
	Hugh Dickins, Mel Gorman, Johannes Weiner, Michal Hocko,
	Andy Lutomirski, google-perftools@googlegroups.com
In-Reply-To: <CADpJO7zBLhjecbiQeTubnTReiicVLr0-K43KbB4uCL5w_dyqJg@mail.gmail.com>

On Sat, Mar 21, 2015 at 11:06:14PM -0700, Aliaksey Kandratsenka wrote:
> On Wed, Mar 18, 2015 at 10:34 PM, Daniel Micay <danielmicay@gmail.com>
> wrote:
> >
> > On 18/03/15 06:31 PM, Andrew Morton wrote:
> > > On Tue, 17 Mar 2015 14:09:39 -0700 Shaohua Li <shli@fb.com> wrote:
> > >
> > >> There was a similar patch posted before, but it doesn't get merged.
> I'd like
> > >> to try again if there are more discussions.
> > >> http://marc.info/?l=linux-mm&m=141230769431688&w=2
> > >>
> > >> mremap can be used to accelerate realloc. The problem is mremap will
> > >> punch a hole in original VMA, which makes specific memory allocator
> > >> unable to utilize it. Jemalloc is an example. It manages memory in 4M
> > >> chunks. mremap a range of the chunk will punch a hole, which other
> > >> mmap() syscall can fill into. The 4M chunk is then fragmented, jemalloc
> > >> can't handle it.
> > >
> > > Daniel's changelog had additional details regarding the userspace
> > > allocators' behaviour.  It would be best to incorporate that into your
> > > changelog.
> > >
> > > Daniel also had microbenchmark testing results for glibc and jemalloc.
> > > Can you please do this?
> > >
> > > I'm not seeing any testing results for tcmalloc and I'm not seeing
> > > confirmation that this patch will be useful for tcmalloc.  Has anyone
> > > tried it, or sought input from tcmalloc developers?
> >
> > TCMalloc and jemalloc are currently equally slow in this benchmark, as
> > neither makes use of mremap. They're ~2-3x slower than glibc. I CC'ed
> > the currently most active TCMalloc developer so they can give input
> > into whether this patch would let them use it.
> 
> 
> Hi.
> 
> Thanks for looping us in for feedback (I'm CC-ing gperftools mailing list).
> 
> Yes, that might be useful feature. (Assuming I understood it correctly) I
> believe
> tcmalloc would likely use:
> 
> mremap(old_ptr, move_size, move_size,
>        MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_NOHOLE,
>        new_ptr);
> 
> as optimized equivalent of:
> 
> memcpy(new_ptr, old_ptr, move_size);
> madvise(old_ptr, move_size, MADV_DONTNEED);
> 
> And btw I find MREMAP_RETAIN name from original patch to be slightly more
> intuitive than MREMAP_NOHOLE. In my humble opinion the later name does not
> reflect semantic of this feature at all (assuming of course I correctly
> understood what the patch does).
> 
> I do have a couple of questions about this approach however. Please feel
> free to
> educate me on them.
> 
> a) what is the smallest size where mremap is going to be faster ?
> 
> My initial thinking was that we'd likely use mremap in all cases where we
> know
> that touching destination would cause minor page faults (i.e. when
> destination
> chunk was MADV_DONTNEED-ed or is brand new mapping). And then also always
> when
> size is large enough, i.e. because "teleporting" large count of pages is
> likely
> to be faster than copying them.
> 
> But now I realize that it is more interesting than that. I.e. because as
> Daniel
> pointed out, mremap holds mmap_sem exclusively, while page faults are
> holding it
> for read. That could be optimized of course. Either by separate "teleport
> ptes"
> syscall (again, as noted by Daniel), or by having mremap drop mmap_sem for
> write
> and retaking it for read for "moving pages" part of work. Being not really
> familiar with kernel code I have no idea if that's doable or not. But it
> looks
> like it might be quite important.

Does mmap_sem contend in your workload? Otherwise, there is no big
difference of read or write lock. memcpy to new allocation could trigger
page fault, new page allocation overhead and etc.
 
> Another aspect where I am similarly illiterate is performance effect of tlb
> flushes needed for such operation.

MADV_DONTNEED does tlb flush too.

> We can certainly experiment and find that limit. But if mremap threshold is
> going to be large, then perhaps this kernel feature is not as useful as we
> may
> hope.

There are a lot of factors here:
For mremap, the overhead:
-mmap sem write lock
-tlb flush

For memcpy + madvise, the overhead:
-memcpy
-new address triggers page fault (allocate new pages, handle page fault)
-is old address MADV_DONTNEED? (tlb flush)

I thought unless allocator only uses memcpy (without madvise, then
allocator will use more memory as necessary) for small size memory
(while memcpy for small size memory is faster than tlb flush), mremap
is a win. We probably can measure the size of memcpy which has smaller
overhead than tlb flush

> b) is that optimization worth having at all ?
> 
> After all, memcpy is actually known to be fast. I understand that copying
> memory
> in user space can be slowed down by minor page faults (results below seem to
> confirm that). But this is something where either allocator may retain
> populated
> pages a bit longer or where kernel could help. E.g. maybe by exposing
> something
> similar to MAP_POPULATE in madvise, or even doing some safe combination of
> madvise and MAP_UNINITIALIZED.

This option will make allocator use more memory than expected.
Eventually the memory must be reclaimed, which has big overhead too.
 
> I've played with Daniel's original benchmark (copied from
> http://marc.info/?l=linux-mm&m=141230769431688&w=2) with some tiny
> modifications:
> 
> #include <string.h>
> #include <stdlib.h>
> #include <stdio.h>
> #include <sys/mman.h>
> 
> int main(int argc, char **argv)
> {
>         if (argc > 1 && strcmp(argv[1], "--mlock") == 0) {
>                 int rv = mlockall(MCL_CURRENT | MCL_FUTURE);
>                 if (rv) {
>                         perror("mlockall");
>                         abort();
>                 }
>                 puts("mlocked!");
>         }
> 
>         for (size_t i = 0; i < 64; i++) {
>                 void *ptr = NULL;
>                 size_t old_size = 0;
>                 for (size_t size = 4; size < (1 << 30); size *= 2) {
>                         /*
>                          * void *hole = malloc(1 << 20);
>                          * if (!hole) {
>                          *      perror("malloc");
>                          *      abort();
>                          * }
>                          */
>                         ptr = realloc(ptr, size);
>                         if (!ptr) {
>                                 perror("realloc");
>                                 abort();
>                         }
>                         /* free(hole); */
>                         memset(ptr + old_size, 0xff, size - old_size);
>                         old_size = size;
>                 }
>                 free(ptr);
>         }
> }
> 
> I cannot say if this benchmark's vectors of up to 0.5 gigs are common in
> important applications or not. It can be argued that apps that care about
> such
> large vectors can do mremap themselves.
> 
> On the other hand, I believe that this micro benchmark could be plausibly
> changed to grow vector by smaller factor (i.e. see
> https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md#memory-handling).
> And
> with smaller growth factor, is seems reasonable to expect larger overhead
> from
> memcpy and smaller overhead from mremap. And thus favor mremap more.
> 
> And I confirm that with all default settings tcmalloc and jemalloc lose to
> glibc. Also, notably, recent dev build of jemalloc (what is going to be 4.0
> AFAIK) actually matches or exceeds glibc speed, despite still not doing
> mremap. Apparently it is smarter about avoiding moving allocation for those
> realloc-s. And it was even able to resist my attempt to force it to move
> allocation. I haven't investigated why. Note that I built it couple weeks
> or so
> ago from dev branch, so it might simply have bugs.
> 
> Results also vary greatly depending in transparent huge pages setting.
> Here's
> what I've got:
> 
> allocator |   mode    | time  | sys time | pgfaults |             extra
> ----------+-----------+-------+----------+----------+-------------------------------
> glibc     |           | 10.75 |     8.44 |  8388770 |
> glibc     |    thp    |  5.67 |     3.44 |   310882 |
> glibc     |   mlock   | 13.22 |     9.41 |  8388821 |
> glibc     | thp+mlock |  8.43 |     4.63 |   310933 |
> tcmalloc  |           | 11.46 |     2.00 |  2104826 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |    thp    | 10.61 |     0.89 |   386206 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |   mlock   | 10.11 |     0.27 |   264721 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  | thp+mlock | 10.28 |     0.17 |    46011 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=f
> tcmalloc  |           | 23.63 |    17.16 | 16770107 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  |    thp    | 11.82 |     5.14 |   352477 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  |   mlock   | 10.10 |     0.28 |   264724 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> tcmalloc  | thp+mlock | 10.30 |     0.17 |    49168 |
> TCMALLOC_AGGRESSIVE_DECOMMIT=t
> jemalloc1 |           | 23.71 |    17.33 | 16744572 |
> jemalloc1 |    thp    | 11.65 |     4.68 |    64988 |
> jemalloc1 |   mlock   | 10.13 |     0.29 |   263305 |
> jemalloc1 | thp+mlock | 10.05 |     0.17 |    50217 |
> jemalloc2 |           | 10.87 |     8.64 |  8521796 |
> jemalloc2 |    thp    |  4.64 |     2.32 |    56060 |
> jemalloc2 |   mlock   |  4.22 |     0.28 |   263181 |
> jemalloc2 | thp+mlock |  4.12 |     0.19 |    50411 |
> ----------+-----------+-------+----------+----------+-------------------------------
> 
> NOTE: usual disclaimer applies about possibility of screwing something up
> and
> getting invalid benchmark results without being able to see it. I apologize
> in
> advance.
> 
> NOTE: jemalloc1 is 3.6 as shipped by up-to-date Debian Sid. jemalloc2 is
> home-built snapshot of upcoming jemalloc 4.0.
> 
> NOTE: TCMALLOC_AGGRESSIVE_DECOMMIT=t (and default since 2.4) makes tcmalloc
> MADV_DONTNEED large free blocks immediately. As opposed to less rare with
> setting of "false". And it makes big difference on page faults counts and
> thus
> on runtime.
> 
> Another notable thing is how mlock effectively disables MADV_DONTNEED for
> jemalloc{1,2} and tcmalloc, lowers page faults count and thus improves
> runtime. It can be seen that tcmalloc+mlock on thp-less configuration is
> slightly better on runtime to glibc. The later spends a ton of time in
> kernel,
> probably handling minor page faults, and the former burns cpu in user space
> doing memcpy-s. So "tons of memcpys" seems to be competitive to what glibc
> is
> doing in this benchmark.

mlock disables MADV_DONTNEED, so this is an unfair comparsion. With it,
allocator will use more memory than expected.

I'm kind of confused why we talk about THP, mlock here. When application
uses allocator, it doesn't need to be forced to use THP or mlock. Can we
forcus on normal case?

Thanks,
Shaohua

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH V2 1/2] af_packet: make tpacket_rcv to not set status value before run_filter
From: Alexander Drozdov @ 2015-03-23  6:11 UTC (permalink / raw)
  To: David S. Miller, Jonathan Corbet
  Cc: Daniel Borkmann, Eric Dumazet, Al Viro, Willem de Bruijn,
	Michael S. Tsirkin, netdev, linux-kernel, Tobias Klauser,
	linux-doc, linux-api, Alexander Drozdov
In-Reply-To: <1426752102-12786-2-git-send-email-al.drozdov@gmail.com>

It is just an optimization. We don't need the value of status variable
if the packet is filtered.

Signed-off-by: Alexander Drozdov <al.drozdov@gmail.com>
---
 net/packet/af_packet.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f8db706..6ecf8dd 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1910,14 +1910,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL)
-		status |= TP_STATUS_CSUMNOTREADY;
-
 	snaplen = skb->len;
 
 	res = run_filter(skb, sk, snaplen);
 	if (!res)
 		goto drop_n_restore;
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL)
+		status |= TP_STATUS_CSUMNOTREADY;
+
 	if (snaplen > res)
 		snaplen = res;
 
-- 
1.9.1


^ permalink raw reply related

* [PATCH V2 2/2] af_packet: pass checksum validation status to the user
From: Alexander Drozdov @ 2015-03-23  6:11 UTC (permalink / raw)
  To: David S. Miller, Jonathan Corbet
  Cc: Daniel Borkmann, Eric Dumazet, Al Viro, Willem de Bruijn,
	Michael S. Tsirkin, netdev, linux-kernel, Tobias Klauser,
	linux-doc, linux-api, Alexander Drozdov
In-Reply-To: <1427091073-8129-1-git-send-email-al.drozdov@gmail.com>

Introduce TP_STATUS_CSUM_VALID tp_status flag to tell the
af_packet user that at least the transport header checksum
has been already validated.

For now, the flag may be set for incoming packets only.

Signed-off-by: Alexander Drozdov <al.drozdov@gmail.com>
Cc: Willem de Bruijn <willemb@google.com>
---
 Documentation/networking/packet_mmap.txt | 13 ++++++++++---
 include/uapi/linux/if_packet.h           |  1 +
 net/packet/af_packet.c                   |  9 +++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index a6d7cb9..daa015a 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -440,9 +440,10 @@ and the following flags apply:
 +++ Capture process:
      from include/linux/if_packet.h
 
-     #define TP_STATUS_COPY          2 
-     #define TP_STATUS_LOSING        4 
-     #define TP_STATUS_CSUMNOTREADY  8 
+     #define TP_STATUS_COPY          (1 << 1)
+     #define TP_STATUS_LOSING        (1 << 2)
+     #define TP_STATUS_CSUMNOTREADY  (1 << 3)
+     #define TP_STATUS_CSUM_VALID    (1 << 7)
 
 TP_STATUS_COPY        : This flag indicates that the frame (and associated
                         meta information) has been truncated because it's 
@@ -466,6 +467,12 @@ TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets which
                         reading the packet we should not try to check the 
                         checksum. 
 
+TP_STATUS_CSUM_VALID  : This flag indicates that at least the transport
+                        header checksum of the packet has been already
+                        validated on the kernel side. If the flag is not set
+                        then we are free to check the checksum by ourselves
+                        provided that TP_STATUS_CSUMNOTREADY is also not set.
+
 for convenience there are also the following defines:
 
      #define TP_STATUS_KERNEL        0
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index da2d668..053bd10 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -99,6 +99,7 @@ struct tpacket_auxdata {
 #define TP_STATUS_VLAN_VALID		(1 << 4) /* auxdata has valid tp_vlan_tci */
 #define TP_STATUS_BLK_TMO		(1 << 5)
 #define TP_STATUS_VLAN_TPID_VALID	(1 << 6) /* auxdata has valid tp_vlan_tpid */
+#define TP_STATUS_CSUM_VALID		(1 << 7)
 
 /* Tx ring - header status */
 #define TP_STATUS_AVAILABLE	      0
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 6ecf8dd..3f09dda 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1918,6 +1918,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
 		status |= TP_STATUS_CSUMNOTREADY;
+	else if (skb->pkt_type != PACKET_OUTGOING &&
+		 (skb->ip_summed == CHECKSUM_COMPLETE ||
+		  skb_csum_unnecessary(skb)))
+		status |= TP_STATUS_CSUM_VALID;
 
 	if (snaplen > res)
 		snaplen = res;
@@ -3015,6 +3019,11 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_status = TP_STATUS_USER;
 		if (skb->ip_summed == CHECKSUM_PARTIAL)
 			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
+		else if (skb->pkt_type != PACKET_OUTGOING &&
+			 (skb->ip_summed == CHECKSUM_COMPLETE ||
+			  skb_csum_unnecessary(skb)))
+			aux.tp_status |= TP_STATUS_CSUM_VALID;
+
 		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v9 tip 6/9] samples: bpf: simple non-portable kprobe filter example
From: Ingo Molnar @ 2015-03-23  7:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1426894210-27441-7-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>


* Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:

> tracex1_kern.c - C program compiled into BPF.
> It attaches to kprobe:netif_receive_skb
> When skb->dev->name == "lo", it prints sample debug message into trace_pipe
> via bpf_trace_printk() helper function.
> 
> tracex1_user.c - corresponding user space component that:
> - loads bpf program via bpf() syscall
> - opens kprobes:netif_receive_skb event via perf_event_open() syscall
> - attaches the program to event via ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> - prints from trace_pipe
> 
> Note, this bpf program is completely non-portable. It must be recompiled
> with current kernel headers. kprobe is not a stable ABI and bpf+kprobe scripts
> may stop working any time.
> 
> bpf verifier will detect that it's using bpf_trace_printk() and kernel will
> print warning banner:
> ** trace_printk() being used. Allocating extra memory.  **

Printing this might be OK.

> **                                                      **
> ** This means that this is a DEBUG kernel and it is     **
> ** unsafe for production use.                           **

But I think printing that it's unsafe for production use is over the 
top: it's up to the admin whether it's safe or unsafe, just like 
inserting a kprobe can be safe or unsafe.

Informing that something happened is enough.

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v9 tip 6/9] samples: bpf: simple non-portable kprobe filter example
From: Ingo Molnar @ 2015-03-23  7:35 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426894210-27441-7-git-send-email-ast@plumgrid.com>


* Alexei Starovoitov <ast@plumgrid.com> wrote:

> +void read_trace_pipe(void)
> +{
> +	int trace_fd;
> +
> +	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
> +	if (trace_fd < 0)
> +		return;
> +
> +	while (1) {
> +		static char buf[4096];
> +		ssize_t sz;
> +
> +		sz = read(trace_fd, buf, sizeof(buf));

read() will return -1 on failure ...

> +		if (sz) {

... this test passes ...

> +			buf[sz] = 0;

... and here we smash the stack?

> +			puts(buf);
> +		}
> +	}


Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v9 tip 0/9] tracing: attach eBPF programs to kprobes
From: Ingo Molnar @ 2015-03-23  7:36 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Alexei Starovoitov, Namhyung Kim, Arnaldo Carvalho de Melo,
	Jiri Olsa, Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150321000812.606ef8cb-2kNGR76GQU9OHLTnHDQRgA@public.gmane.org>


* Steven Rostedt <rostedt-nx8X9YLhiw1AfugRpC6u6w@public.gmane.org> wrote:

> On Fri, 20 Mar 2015 16:30:01 -0700
> Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org> wrote:
> 
> > Hi Ingo,
> > 
> > I think it's good to go.
> > Patch 1 is already in net-next. Patch 3 depends on it.
> > I'm assuming it's not going to be a problem during merge window.
> > Patch 3 will have a minor conflict in uapi/linux/bpf.h in linux-next,
> > since net-next has added new lines to the bpf_prog_type and bpf_func_id enums.
> > I'm assuming it's not a problem either.
> > 
> > V8->V9:
> > - fixed comment style and allowed ispunct after %p
> > - added Steven's Reviewed-by. Thanks Steven!
> 
> Hi Ingo,
> 
> I'm fine with this series, so don't let me hold it up from going into
> your tree.

Ok, thanks!

Looks like a very powerful instrumentation interface to me.

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v9 tip 8/9] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Ingo Molnar @ 2015-03-23  7:40 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api, netdev, linux-kernel
In-Reply-To: <1426894210-27441-9-git-send-email-ast@plumgrid.com>


* Alexei Starovoitov <ast@plumgrid.com> wrote:

> BPF C program attaches to blk_mq_start_request/blk_update_request kprobe events
> to calculate IO latency.

 ...

> +/* kprobe is NOT a stable ABI
> + * This bpf+kprobe example can stop working any time.
> + */
> +SEC("kprobe/blk_mq_start_request")
> +int bpf_prog1(struct pt_regs *ctx)
> +{
> +	long rq = ctx->di;
> +	u64 val = bpf_ktime_get_ns();
> +
> +	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
> +	return 0;
> +}

So just to make sure the original BPF instrumentation model is still 
upheld: no matter in what way the kernel changes, neither the kprobe, 
nor the BPF program can ever crash or corrupt the kernel, assuming the 
kprobes, perf and BPF subsystem has no bugs, correct?

So 'stops working' here means that the instrumentation data might not 
be reliable if kernel internal interfaces change - but it won't ever 
make the kernel unreliable in any fashion. Right?

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Gerd Hoffmann @ 2015-03-23  7:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: virtio-dev, open list, open list:ABI/API, virtualization
In-Reply-To: <20150321225356-mutt-send-email-mst@redhat.com>

  Hi,

> > > > +		if (cfg & (1 << (bit % 8)))
> > > > +			set_bit(bit, bits);
> > > 
> > > what if not set? does something clear the mask?
> > 
> > kzalloc?
> 
> So you are really just reading in array of bytes?
> All this set bit trickery is just to convert things from LE?

Trickery?  Just checking each bit from virtio config space, then set it
in the input layer bitmap.  It's a simple stupid loop.

Surely not the most efficient way, but hey, it's not in the hot path and
I'm sure I'm setting the bits correctly because this uses the standard
linux kernel bitops.

> At least, this needs a comment explaining what the function does,
> and maybe wrap it in a helper like virtio_input_bitmap_copy or
> virtio_bitmap_or.

Can do that, sure.

> > > > +	size = virtinput_cfg_select(vi, VIRTIO_INPUT_CFG_ABS_INFO, abs);
> > > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.min, &mi);
> > > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.max, &ma);
> > > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.fuzz, &fu);
> > > > +	virtio_cread(vi->vdev, struct virtio_input_config, u.abs.flat, &fl);
> > > 
> > > you read le field into u32 value.
> > > Please run sparse on this code. you will get a ton
> > > of warnings. Same error appears elsewhere.
> > 
> > Indeed.  IIRC that wasn't the case a while back.  Guess those bitwise
> > annotations have been added with the virtio 1.0 patches?
> > 
> > In any case I'll fix it up.
> 
> I see you still didn't in v2?

v2 builds fine without sparse warnings.  virtio_cread handles swapping
if needed and returns native endian, so I have to store this in normal
u32 variables and pass it on to the input layer as-is.

> You are doing leXXX everywhere, that's VERSION_1 dependency.
> virtio_cread will do byteswaps differently without VERSION_1.
> Just don't go there.

Changed that for v2, for the config space structs.  They have normal u32
in there now.  virtio_cread() wants it this way.

cheers,
  Gerd

^ permalink raw reply

* Re: bpf+tracing next steps. Was: [PATCH v9 tip 3/9] tracing: attach BPF programs to kprobes
From: Masami Hiramatsu @ 2015-03-23  9:27 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, David S. Miller,
	Daniel Borkmann, Peter Zijlstra, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <550F9D20.7070006-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

(2015/03/23 13:57), Alexei Starovoitov wrote:
> On 3/22/15 7:17 PM, Masami Hiramatsu wrote:
>> (2015/03/23 3:03), Alexei Starovoitov wrote:
>>
>>> User space tools that will compile ktap/dtrace scripts into bpf might
>>> use build-id for their own purpose, but that's a different discussion.
>>
>> Agreed.
>> I'd like to discuss it since kprobe event interface may also have same
>> issue.
> 
> I'm not sure what 'issue' you're seeing. My understanding is that
> build-ids are used by perf to associate binaries with their debug info
> and by systemtap to make sure that probes actually match the kernel
> they were compiled for. In bpf case it probably will be perf way only.

Ah, I see. So perftools can check the build-id if needed, right?

> Are you interested in doing something with bpf ? ;)

Of course :)

> I know that Jovi is working on clang-based front-end, He Kuang is doing
> something fancy and I'm going to focus on 'tcp instrumentation' once
> bpf+kprobes is in. I think these efforts will help us make it
> concrete and will establish a path towards bpf+tracepoints
> (debug tracepoints or trace markers) and eventual integration with perf.
> Here is the wish-list (for kernel and userspace) inspired by Brendan:
> - access to pid, uid, tid, comm, etc
> - access to kernel stack trace
> - access to user-level stack trace
> - kernel debuginfo for walking kernel structs, and accessing kprobe
> entry args as variables

perf probe can provide this to bpf.

> - tracing of uprobes
> - tracing of user markers

I'm working on the perf-cache which will also support SDT (based on Hemant Kumar's work).

> - user debuginfo for user structs and args

Ditto.

> - easy to use language
> - library of scripting features
> - nice one-liner syntax
> 
> I think there is a lot of interest in bpf+tracing and would be good to
> align the efforts.

Agreed :)

Thanks!

-- 
Masami HIRAMATSU
Software Platform Research Dept. Linux Technology Research Center
Hitachi, Ltd., Yokohama Research Laboratory
E-mail: masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ@public.gmane.org

^ permalink raw reply

* RE: [PATCH v10 tip 5/9] tracing: allow BPF programs to call bpf_trace_printk()
From: David Laight @ 2015-03-23 11:37 UTC (permalink / raw)
  To: 'Alexei Starovoitov', Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Peter Zijlstra, linux-api@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <1427053150-32213-6-git-send-email-ast@plumgrid.com>

From: Alexei Starovoitov
> Debugging of BPF programs needs some form of printk from the program,
> so let programs call limited trace_printk() with %d %u %x %p modifiers only.

Should anyone be allowed to use BPF programs to determine the kernel
addresses of any items?
Looks as though it is leaking kernel addresses to userspace.
Note that the problem is with the arguments, not the format string.

	David

^ permalink raw reply

* Re: [virtio-dev] Re: [PATCH 1/1] Add virtio-input driver.
From: Paolo Bonzini @ 2015-03-23 11:52 UTC (permalink / raw)
  To: Gerd Hoffmann, Michael S. Tsirkin
  Cc: virtio-dev, open list, ABI/API, virtualization
In-Reply-To: <1427097238.6365.27.camel@nilsson.home.kraxel.org>



On 23/03/2015 08:53, Gerd Hoffmann wrote:
>>>>> > > > > +		if (cfg & (1 << (bit % 8)))
>>>>> > > > > +			set_bit(bit, bits);
>>>> > > > 
>>>> > > > what if not set? does something clear the mask?
>>> > > 
>>> > > kzalloc?
>> > 
>> > So you are really just reading in array of bytes?
>> > All this set bit trickery is just to convert things from LE?
> Trickery?  Just checking each bit from virtio config space, then set it
> in the input layer bitmap.  It's a simple stupid loop.
> 
> Surely not the most efficient way, but hey, it's not in the hot path and
> I'm sure I'm setting the bits correctly because this uses the standard
> linux kernel bitops.

Use __set_bit though, because set_bit is an atomic operation.

Paolo

>> > At least, this needs a comment explaining what the function does,
>> > and maybe wrap it in a helper like virtio_input_bitmap_copy or
>> > virtio_bitmap_or.
> Can do that, sure.

^ permalink raw reply

* Re: [PATCH v10 tip 5/9] tracing: allow BPF programs to call bpf_trace_printk()
From: Ingo Molnar @ 2015-03-23 12:07 UTC (permalink / raw)
  To: David Laight
  Cc: 'Alexei Starovoitov', Steven Rostedt, Namhyung Kim,
	Arnaldo Carvalho de Melo, Jiri Olsa, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Peter Zijlstra,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6D1CB07731-VkEWCZq2GCInGFn1LkZF6NBPR1lH4CV8@public.gmane.org>


* David Laight <David.Laight-ZS65k/vG3HxXrIkS9f7CXA@public.gmane.org> wrote:

> From: Alexei Starovoitov
> > Debugging of BPF programs needs some form of printk from the program,
> > so let programs call limited trace_printk() with %d %u %x %p modifiers only.
> 
> Should anyone be allowed to use BPF programs to determine the kernel
> addresses of any items?
> Looks as though it is leaking kernel addresses to userspace.
> Note that the problem is with the arguments, not the format string.

All of these are privileged operations - inherent if you are trying to 
debug the kernel.

Thanks,

	Ingo

^ permalink raw reply

* Re: [Gta04-owner] [PATCH 08/14] twl4030_charger: allow max_current to be managed via sysfs.
From: jake42 @ 2015-03-23 12:14 UTC (permalink / raw)
  To: NeilBrown
  Cc: Sebastian Reichel, linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	inux-pm-u79uwXL29TY76Z2rM5mHXA, Pavel Machek,
	linux-omap-u79uwXL29TY76Z2rM5mHXA, Lee Jones,
	List for communicating with real GTA04 owners
In-Reply-To: <20150322232029.23789.13768.stgit-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>

Hello Neil,

some suggestions:

On 23.03.2015 00:20, NeilBrown wrote:
> From: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
> diff --git a/Documentation/ABI/testing/sysfs-class-power-twl4030
> b/Documentation/ABI/testing/sysfs-class-power-twl4030
> new file mode 100644
> index 000000000000..06092209d851
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-class-power-twl4030
> @@ -0,0 +1,15 @@
> +What: /sys/class/power_supply/twl4030_ac/max_current
> +      /sys/class/power_supply/twl4030_usb/max_current
> +Description:
> +	Read/Write limit on current which which may
one less which                        ^^
> +	be drawn from the ac (Accessory Charger) or
> +	USB port.
> +
> +	Value is in micro-Amps.
> +
> +	Value is set automatically to an appropriate
> +	value when a cable is plugged on unplugged.
s/on/or                               ^^
> +
> +	Value can the set by writing to the attribute.
                   ^^ be set?
> +	The change will only persist until the next
> +	plug event.  These event are reported via udev.

Regards
Jake

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Gerd Hoffmann @ 2015-03-23 13:44 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: virtio-dev, virtualization, Rusty Russell, open list,
	open list:ABI/API
In-Reply-To: <1427097238.6365.27.camel@nilsson.home.kraxel.org>

  Hi,

> > At least, this needs a comment explaining what the function does,
> > and maybe wrap it in a helper like virtio_input_bitmap_copy or
> > virtio_bitmap_or.
> 
> Can do that, sure.

Well, the function where this is in already cares about the bitmap copy
only.  Can add a comment though.

> > You are doing leXXX everywhere, that's VERSION_1 dependency.
> > virtio_cread will do byteswaps differently without VERSION_1.
> > Just don't go there.
> 
> Changed that for v2, for the config space structs.  They have normal u32
> in there now.  virtio_cread() wants it this way.

I liked the __le32 in the config space structs more though, so I've
waded through the virtio_config.h header file.

To me it looks like we need separate virtio_cread() versions for
non-transitional drivers, which do __le32 -> u32 translation instead of
__virtio32 -> u32 translation, so I can have __le32 types in the config
space structs.

Or I could use vdev->config->get() directly instead of virtio_cread, but
I'll loose sparse checking that way.

Hmm.  Recommendations?  Better ideas?

cheers,
  Gerd

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Michael S. Tsirkin @ 2015-03-23 13:51 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: virtio-dev, virtualization, Rusty Russell, open list,
	open list:ABI/API
In-Reply-To: <1427118292.27137.39.camel@nilsson.home.kraxel.org>

On Mon, Mar 23, 2015 at 02:44:52PM +0100, Gerd Hoffmann wrote:
>   Hi,
> 
> > > At least, this needs a comment explaining what the function does,
> > > and maybe wrap it in a helper like virtio_input_bitmap_copy or
> > > virtio_bitmap_or.
> > 
> > Can do that, sure.
> 
> Well, the function where this is in already cares about the bitmap copy
> only.  Can add a comment though.

OK, I think that will be enough for now.

> > > You are doing leXXX everywhere, that's VERSION_1 dependency.
> > > virtio_cread will do byteswaps differently without VERSION_1.
> > > Just don't go there.
> > 
> > Changed that for v2, for the config space structs.  They have normal u32
> > in there now.  virtio_cread() wants it this way.
> 
> I liked the __le32 in the config space structs more though, so I've
> waded through the virtio_config.h header file.
> 
> To me it looks like we need separate virtio_cread() versions for
> non-transitional drivers, which do __le32 -> u32 translation instead of
> __virtio32 -> u32 translation, so I can have __le32 types in the config
> space structs.
> 
> Or I could use vdev->config->get() directly instead of virtio_cread, but
> I'll loose sparse checking that way.
> 
> Hmm.  Recommendations?  Better ideas?
> 
> cheers,
>   Gerd

So to clarify, you dislike using __virtio32 in virtio input header?

-- 
MST

^ permalink raw reply

* Re: [PATCH v2 3/7] Introduce a new clone4 syscall with more flag bits and extensible arguments
From: David Drysdale @ 2015-03-23 14:11 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar, Kees Cook,
	Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
	Thomas Gleixner, Michael Kerrisk, Thiago Macieira,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Linux API,
	Linux FS Devel, X86 ML
In-Reply-To: <367b888ef58831b6812c3cf80ca973c65edc67f5.1426376419.git.josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org>

On Sun, Mar 15, 2015 at 7:59 AM, Josh Triplett <josh-iaAMLnmF4UmaiuxdJuQwMA@public.gmane.org> wrote:
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 0286735..ba28306 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -483,6 +483,7 @@ GLOBAL(\label)
>         PTREGSCALL stub32_execveat, compat_sys_execveat
>         PTREGSCALL stub32_fork, sys_fork
>         PTREGSCALL stub32_vfork, sys_vfork
> +       PTREGSCALL stub32_clone4, compat_sys_clone4
>
>         ALIGN
>  GLOBAL(stub32_clone)
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 1d74d16..ead143f 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -520,6 +520,7 @@ END(\label)
>         FORK_LIKE  clone
>         FORK_LIKE  fork
>         FORK_LIKE  vfork
> +       FORK_LIKE  clone4
>         FIXED_FRAME stub_iopl, sys_iopl
>
>  ENTRY(stub_execve)
> diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> index b3560ec..56fcc90 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -365,3 +365,4 @@
>  356    i386    memfd_create            sys_memfd_create
>  357    i386    bpf                     sys_bpf
>  358    i386    execveat                sys_execveat                    stub32_execveat
> +359    i386    clone4                  sys_clone4                      stub32_clone4
> diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> index 8d656fb..af15b0f 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -329,6 +329,7 @@
>  320    common  kexec_file_load         sys_kexec_file_load
>  321    common  bpf                     sys_bpf
>  322    64      execveat                stub_execveat
> +323    64      clone4                  stub_clone4
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> @@ -368,3 +369,4 @@
>  543    x32     io_setup                compat_sys_io_setup
>  544    x32     io_submit               compat_sys_io_submit
>  545    x32     execveat                stub_x32_execveat
> +546    x32     clone4                  stub32_clone4

Doesn't this need an x32 specific wrapper (to ensure the full
set of registers are saved)?

^ permalink raw reply

* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: David Drysdale @ 2015-03-23 14:12 UTC (permalink / raw)
  To: Josh Triplett
  Cc: Thiago Macieira, Kees Cook, Al Viro, Andrew Morton,
	Andy Lutomirski, Ingo Molnar, Oleg Nesterov, Paul E. McKenney,
	H. Peter Anvin, Rik van Riel, Thomas Gleixner, Michael Kerrisk,
	LKML, Linux API, linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <20150316232949.GB31751@cloud>

On Mon, Mar 16, 2015 at 11:29 PM,  <josh@joshtriplett.org> wrote:
> On Mon, Mar 16, 2015 at 03:14:14PM -0700, Thiago Macieira wrote:
>> On Monday 16 March 2015 14:44:20 Kees Cook wrote:
>> > >               O_CLOEXEC
>> > >                      Set  the  close-on-exec  flag on the new file
>> > >descriptor. See the description of the O_CLOEXEC flag in open(2)  for
>> > >reasons why this may be useful.
>> >
>> > This begs the question: what happens when all CLONE_FD fds for a
>> > process are closed? Will the parent get SIGCHLD instead, will it
>> > auto-reap, or will it be un-wait-able (I assume not this...)
>>
>> Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
>> wait() on it and the process autoreaps itself.
>
> Minor nit: CLONE_AUTOREAP makes the process autoreap and nobody can wait
> on it, but if you pass SIGCHLD or some other exit signal to clone then
> you'll still get that signal.

Quick query: does CLONE_AUTOREAP also affect waiting for non-exit
events (i.e. WUNTRACED / WCONTINUED), by original parent and/or ptracer?

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Gerd Hoffmann @ 2015-03-23 14:27 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: virtio-dev, open list, open list:ABI/API, virtualization
In-Reply-To: <20150323144803-mutt-send-email-mst@redhat.com>

  Hi,

> > > > You are doing leXXX everywhere, that's VERSION_1 dependency.
> > > > virtio_cread will do byteswaps differently without VERSION_1.
> > > > Just don't go there.

> So to clarify, you dislike using __virtio32 in virtio input header?

Well, as I understand things __virtio32 implies byteorder depends on
whenever we are using VERSION_1 or not.  And non-transitional drivers
should not need it as everything is by definition little endian.

So, yes, your suggestion to just require VERSION_1 in the driver implies
in my eyes that there should be no reason to use __virtio32 instead of
__le32.

Or do I miss something here?

cheers,
  Gerd

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Michael S. Tsirkin @ 2015-03-23 14:54 UTC (permalink / raw)
  To: Gerd Hoffmann
  Cc: virtio-dev-sDuHXQ4OtrM4h7I2RyI4rWD2FQJk+8+b,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	Rusty Russell, open list, open list:ABI/API
In-Reply-To: <1427120855.27137.55.camel-3OfP5uLMi4C46o+2HkPkLj4oCIwMql/M@public.gmane.org>

On Mon, Mar 23, 2015 at 03:27:35PM +0100, Gerd Hoffmann wrote:
>   Hi,
> 
> > > > > You are doing leXXX everywhere, that's VERSION_1 dependency.
> > > > > virtio_cread will do byteswaps differently without VERSION_1.
> > > > > Just don't go there.
> 
> > So to clarify, you dislike using __virtio32 in virtio input header?
> 
> Well, as I understand things __virtio32 implies byteorder depends on
> whenever we are using VERSION_1 or not.  And non-transitional drivers
> should not need it as everything is by definition little endian.
> 
> So, yes, your suggestion to just require VERSION_1 in the driver implies
> in my eyes that there should be no reason to use __virtio32 instead of
> __le32.
> 
> Or do I miss something here?
> 
> cheers,
>   Gerd
> 

You are right but then if you do require VERSION_1 then
__virtio32 becomes identical to __le32.
There's some runtime overhead as we check on each access,
but it shouldn't matter here, right?
I guess we could add virtio_cread_le - is this what
you'd like?


-- 
MST

^ permalink raw reply

* Re: [PATCH v2 0/7] CLONE_FD: Task exit notification via file descriptor
From: josh @ 2015-03-23 15:03 UTC (permalink / raw)
  To: David Drysdale
  Cc: Thiago Macieira, Kees Cook, Al Viro, Andrew Morton,
	Andy Lutomirski, Ingo Molnar, Oleg Nesterov, Paul E. McKenney,
	H. Peter Anvin, Rik van Riel, Thomas Gleixner, Michael Kerrisk,
	LKML, Linux API, linux-fsdevel@vger.kernel.org, x86@kernel.org
In-Reply-To: <CAHse=S-e4Rorgzt=LohtNUeVAXTLbpr8ftPGnBZXgO5sQs28RA@mail.gmail.com>

On Mon, Mar 23, 2015 at 02:12:34PM +0000, David Drysdale wrote:
> On Mon, Mar 16, 2015 at 11:29 PM,  <josh@joshtriplett.org> wrote:
> > On Mon, Mar 16, 2015 at 03:14:14PM -0700, Thiago Macieira wrote:
> >> On Monday 16 March 2015 14:44:20 Kees Cook wrote:
> >> > >               O_CLOEXEC
> >> > >                      Set  the  close-on-exec  flag on the new file
> >> > >descriptor. See the description of the O_CLOEXEC flag in open(2)  for
> >> > >reasons why this may be useful.
> >> >
> >> > This begs the question: what happens when all CLONE_FD fds for a
> >> > process are closed? Will the parent get SIGCHLD instead, will it
> >> > auto-reap, or will it be un-wait-able (I assume not this...)
> >>
> >> Depends on CLONE_AUTOREAP. If it's on, then no one gets SIGCHLD, no one can
> >> wait() on it and the process autoreaps itself.
> >
> > Minor nit: CLONE_AUTOREAP makes the process autoreap and nobody can wait
> > on it, but if you pass SIGCHLD or some other exit signal to clone then
> > you'll still get that signal.
> 
> Quick query: does CLONE_AUTOREAP also affect waiting for non-exit
> events (i.e. WUNTRACED / WCONTINUED), by original parent and/or ptracer?

It shouldn't, no.  You can't wait on the process to exit (you'll get
-ECHLD after it wakes up), but you can wait on it to continue or
similar; none of the autoreap changes should affect that.

- Josh Triplett

^ permalink raw reply

* Re: [PATCH v2 3/7] Introduce a new clone4 syscall with more flag bits and extensible arguments
From: josh @ 2015-03-23 15:05 UTC (permalink / raw)
  To: David Drysdale
  Cc: Al Viro, Andrew Morton, Andy Lutomirski, Ingo Molnar, Kees Cook,
	Oleg Nesterov, Paul E. McKenney, H. Peter Anvin, Rik van Riel,
	Thomas Gleixner, Michael Kerrisk, Thiago Macieira,
	linux-kernel@vger.kernel.org, Linux API, Linux FS Devel, X86 ML
In-Reply-To: <CAHse=S_0G028D2f1vO=1EBO-TM1ZyJZT3-VE4O3a9dS-VcdKzg@mail.gmail.com>

On Mon, Mar 23, 2015 at 02:11:45PM +0000, David Drysdale wrote:
> On Sun, Mar 15, 2015 at 7:59 AM, Josh Triplett <josh@joshtriplett.org> wrote:
> > diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> > index 0286735..ba28306 100644
> > --- a/arch/x86/ia32/ia32entry.S
> > +++ b/arch/x86/ia32/ia32entry.S
> > @@ -483,6 +483,7 @@ GLOBAL(\label)
> >         PTREGSCALL stub32_execveat, compat_sys_execveat
> >         PTREGSCALL stub32_fork, sys_fork
> >         PTREGSCALL stub32_vfork, sys_vfork
> > +       PTREGSCALL stub32_clone4, compat_sys_clone4
> >
> >         ALIGN
> >  GLOBAL(stub32_clone)
> > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> > index 1d74d16..ead143f 100644
> > --- a/arch/x86/kernel/entry_64.S
> > +++ b/arch/x86/kernel/entry_64.S
> > @@ -520,6 +520,7 @@ END(\label)
> >         FORK_LIKE  clone
> >         FORK_LIKE  fork
> >         FORK_LIKE  vfork
> > +       FORK_LIKE  clone4
> >         FIXED_FRAME stub_iopl, sys_iopl
> >
> >  ENTRY(stub_execve)
> > diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
> > index b3560ec..56fcc90 100644
> > --- a/arch/x86/syscalls/syscall_32.tbl
> > +++ b/arch/x86/syscalls/syscall_32.tbl
> > @@ -365,3 +365,4 @@
> >  356    i386    memfd_create            sys_memfd_create
> >  357    i386    bpf                     sys_bpf
> >  358    i386    execveat                sys_execveat                    stub32_execveat
> > +359    i386    clone4                  sys_clone4                      stub32_clone4
> > diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
> > index 8d656fb..af15b0f 100644
> > --- a/arch/x86/syscalls/syscall_64.tbl
> > +++ b/arch/x86/syscalls/syscall_64.tbl
> > @@ -329,6 +329,7 @@
> >  320    common  kexec_file_load         sys_kexec_file_load
> >  321    common  bpf                     sys_bpf
> >  322    64      execveat                stub_execveat
> > +323    64      clone4                  stub_clone4
> >
> >  #
> >  # x32-specific system call numbers start at 512 to avoid cache impact
> > @@ -368,3 +369,4 @@
> >  543    x32     io_setup                compat_sys_io_setup
> >  544    x32     io_submit               compat_sys_io_submit
> >  545    x32     execveat                stub_x32_execveat
> > +546    x32     clone4                  stub32_clone4
> 
> Doesn't this need an x32 specific wrapper (to ensure the full
> set of registers are saved)?

I'm not an x32 expert; I don't know how x32 interacts with pt_regs and
compat syscalls.  Could an x32 expert weigh in, please?

- Josh Triplett

^ permalink raw reply

* Re: [PATCH 1/1] Add virtio-input driver.
From: Gerd Hoffmann @ 2015-03-23 15:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: virtio-dev, virtualization, Rusty Russell, open list,
	open list:ABI/API
In-Reply-To: <20150323155106-mutt-send-email-mst@redhat.com>

On Mo, 2015-03-23 at 15:54 +0100, Michael S. Tsirkin wrote:
> On Mon, Mar 23, 2015 at 03:27:35PM +0100, Gerd Hoffmann wrote:
> >   Hi,
> > 
> > > > > > You are doing leXXX everywhere, that's VERSION_1 dependency.
> > > > > > virtio_cread will do byteswaps differently without VERSION_1.
> > > > > > Just don't go there.
> > 
> > > So to clarify, you dislike using __virtio32 in virtio input header?
> > 
> > Well, as I understand things __virtio32 implies byteorder depends on
> > whenever we are using VERSION_1 or not.  And non-transitional drivers
> > should not need it as everything is by definition little endian.
> > 
> > So, yes, your suggestion to just require VERSION_1 in the driver implies
> > in my eyes that there should be no reason to use __virtio32 instead of
> > __le32.
> > 
> > Or do I miss something here?
> > 
> > cheers,
> >   Gerd
> > 
> 
> You are right but then if you do require VERSION_1 then
> __virtio32 becomes identical to __le32.

Except that sparse doesn't know that and throws errors when I mix the
two.

> There's some runtime overhead as we check on each access,
> but it shouldn't matter here, right?

Correct, config space is used at initialization time only.

> I guess we could add virtio_cread_le - is this what
> you'd like?

I just want something that makes both you and sparse happy.  I don't
care much whenever that is adding virtio_cread_le() or using __virtio32
even though it'll effectively is __le32 due to VERSION_1 being required.

cheers,
  Gerd

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox