Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH ipsec-next 7/7] xfrm: add espintcp (RFC 8229)
From: Steffen Klassert @ 2019-08-29  7:04 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: netdev, Herbert Xu
In-Reply-To: <029b59b8f74dbdbdf202fcf41a9a90b41b4821a2.1566395202.git.sd@queasysnail.net>

On Wed, Aug 21, 2019 at 11:46:25PM +0200, Sabrina Dubroca wrote:
> TCP encapsulation of IKE and IPsec messages (RFC 8229) is implemented
> as a TCP ULP, overriding in particular the sendmsg and recvmsg
> operations. A Stream Parser is used to extract messages out of the TCP
> stream using the first 2 bytes as length marker. Received IKE messages
> are put on "ike_queue", waiting to be dequeued by the custom recvmsg
> implementation. Received ESP messages are sent to XFRM, like with UDP
> encapsulation.
> 
> Some of this code is taken from the original submission by Herbert
> Xu. Currently, only IPv4 is supported, like for UDP encapsulation.
> 
> Co-developed-by: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> ---
>  include/net/espintcp.h   |  38 +++
>  include/net/xfrm.h       |   1 +
>  include/uapi/linux/udp.h |   1 +
>  net/ipv4/esp4.c          | 189 ++++++++++++++-
>  net/xfrm/Kconfig         |   9 +
>  net/xfrm/Makefile        |   1 +
>  net/xfrm/espintcp.c      | 505 +++++++++++++++++++++++++++++++++++++++
>  net/xfrm/xfrm_policy.c   |   7 +
>  net/xfrm/xfrm_state.c    |   3 +
>  9 files changed, 751 insertions(+), 3 deletions(-)
>  create mode 100644 include/net/espintcp.h
>  create mode 100644 net/xfrm/espintcp.c
> 

...

> +static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
> +{
> +	struct xfrm_encap_tmpl *encap = x->encap;
> +	struct esp_tcp_sk *esk;
> +	__be16 sport, dport;
> +	struct sock *nsk;
> +	struct sock *sk;
> +
> +	sk = rcu_dereference(x->encap_sk);
> +	if (sk && sk->sk_state == TCP_ESTABLISHED)
> +		return sk;
> +
> +	spin_lock_bh(&x->lock);
> +	sport = encap->encap_sport;
> +	dport = encap->encap_dport;
> +	nsk = rcu_dereference_protected(x->encap_sk,
> +					lockdep_is_held(&x->lock));
> +	if (sk && sk == nsk) {
> +		esk = kmalloc(sizeof(*esk), GFP_ATOMIC);
> +		if (!esk) {
> +			spin_unlock_bh(&x->lock);
> +			return ERR_PTR(-ENOMEM);
> +		}
> +		RCU_INIT_POINTER(x->encap_sk, NULL);
> +		esk->sk = sk;
> +		call_rcu(&esk->rcu, esp_free_tcp_sk);

I don't understand this, can you please explain what you are doing here?


> +	}
> +	spin_unlock_bh(&x->lock);
> +
> +	sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4,
> +				     dport, x->props.saddr.a4, sport, 0);
> +	if (!sk)
> +		return ERR_PTR(-ENOENT);
> +
> +	if (!tcp_is_ulp_esp(sk)) {
> +		sock_put(sk);
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	spin_lock_bh(&x->lock);
> +	nsk = rcu_dereference_protected(x->encap_sk,
> +					lockdep_is_held(&x->lock));
> +	if (encap->encap_sport != sport ||
> +	    encap->encap_dport != dport) {
> +		sock_put(sk);
> +		sk = nsk ?: ERR_PTR(-EREMCHG);
> +	} else if (sk == nsk) {
> +		sock_put(sk);
> +	} else {
> +		rcu_assign_pointer(x->encap_sk, sk);
> +	}
> +	spin_unlock_bh(&x->lock);
> +
> +	return sk;
> +}
> +
> +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
> +{
> +	struct sock *sk;
> +	int err;
> +
> +	rcu_read_lock();
> +
> +	sk = esp_find_tcp_sk(x);
> +	err = PTR_ERR(sk);
> +	if (IS_ERR(sk))

Maybe better 'if (err)'?

> +		goto out;
> +
> +	bh_lock_sock(sk);
> +	if (sock_owned_by_user(sk)) {
> +		err = espintcp_queue_out(sk, skb);
> +		if (err < 0)
> +			goto unlock_sock;

This goto is not needed.

> +	} else {
> +		err = espintcp_push_skb(sk, skb);
> +	}
> +
> +unlock_sock:
> +	bh_unlock_sock(sk);
> +out:
> +	rcu_read_unlock();
> +	return err;
> +}
> +
> +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
> +				   struct sk_buff *skb)
> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	struct xfrm_state *x = dst->xfrm;
> +
> +	return esp_output_tcp_finish(x, skb);
> +}
> +
> +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
> +{
> +	int err;
> +
> +	local_bh_disable();
> +	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
> +	local_bh_enable();
> +
> +	/* EINPROGRESS just happens to do the right thing.  It
> +	 * actually means that the skb has been consumed and
> +	 * isn't coming back.
> +	 */
> +	return err ?: -EINPROGRESS;
> +}
> +#endif
> +
>  static void esp_output_done(struct crypto_async_request *base, int err)
>  {
>  	struct sk_buff *skb = base->data;
> @@ -147,7 +272,13 @@ static void esp_output_done(struct crypto_async_request *base, int err)
>  		secpath_reset(skb);
>  		xfrm_dev_resume(skb);
>  	} else {
> -		xfrm_output_resume(skb, err);
> +#ifdef CONFIG_XFRM_ESPINTCP

Do we really need all these ifdef? I guess most of them could
be avoided with some code refactorization.

> +		if (!err &&
> +		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
> +			esp_output_tail_tcp(x, skb);
> +		else
> +#endif
> +			xfrm_output_resume(skb, err);
>  	}
>  }

...

> @@ -296,7 +460,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
>  	struct sk_buff *trailer;
>  	int tailen = esp->tailen;
>  
> -	/* this is non-NULL only with UDP Encapsulation */
> +	/* this is non-NULL only with TCP/UDP Encapsulation */
>  	if (x->encap) {
>  		int err = esp_output_encap(x, skb, esp);
>  
> @@ -491,6 +655,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
>  	if (sg != dsg)
>  		esp_ssg_unref(x, tmp);
>  
> +#ifdef CONFIG_XFRM_ESPINTCP
> +	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
> +		err = esp_output_tail_tcp(x, skb);
> +#endif
> +
>  error_free:
>  	kfree(tmp);
>  error:
> @@ -617,10 +786,16 @@ int esp_input_done2(struct sk_buff *skb, int err)
>  
>  	if (x->encap) {
>  		struct xfrm_encap_tmpl *encap = x->encap;
> +		struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);

This gives a unused variable warning if CONFIG_XFRM_ESPINTCP
is not set.

>  		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
>  		__be16 source;
>  
>  		switch (x->encap->encap_type) {
> +#ifdef CONFIG_XFRM_ESPINTCP
> +		case TCP_ENCAP_ESPINTCP:
> +			source = th->source;
> +			break;
> +#endif
>  		case UDP_ENCAP_ESPINUDP:
>  		case UDP_ENCAP_ESPINUDP_NON_IKE:
>  			source = uh->source;
> @@ -1017,6 +1192,14 @@ static int esp_init_state(struct xfrm_state *x)
>  		case UDP_ENCAP_ESPINUDP_NON_IKE:
>  			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
>  			break;
> +#ifdef CONFIG_XFRM_ESPINTCP
> +		case TCP_ENCAP_ESPINTCP:
> +			/* only the length field, TCP encap is done by
> +			 * the socket
> +			 */
> +			x->props.header_len += 2;
> +			break;
> +#endif
>  		}
>  	}
>  
> diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
> index c967fc3c38c8..ccc012b3ea10 100644
> --- a/net/xfrm/Kconfig
> +++ b/net/xfrm/Kconfig
> @@ -71,6 +71,15 @@ config XFRM_IPCOMP
>  	select CRYPTO
>  	select CRYPTO_DEFLATE
>  
> +config XFRM_ESPINTCP
> +	bool "ESP in TCP encapsulation (RFC 8229)"
> +	depends on XFRM && INET_ESP
> +	select STREAM_PARSER

I'm getting these compile errors:

espintcp.o: In function `espintcp_close':
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:469: undefined reference to `sk_msg_free'
net/xfrm/espintcp.o: In function `espintcp_sendmsg':
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:302: undefined reference to `sk_msg_alloc'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:316: undefined reference to `sk_msg_memcopy_from_iter'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:341: undefined reference to `sk_msg_free'
/home/klassert/git/linux-stk/net/xfrm/espintcp.c:321: undefined reference to `sk_msg_memcopy_from_iter'
/home/klassert/git/linux-stk/Makefile:1067: recipe for target 'vmlinux' failed
make[1]: *** [vmlinux] Error 1

I guess you need to select NET_SOCK_MSG.

Btw. I've updated the ipsec-next tree, so patch 1 is not needed anymore.

Everything else looks good!


^ permalink raw reply

* Re: [PATCH net-next v4 3/3] dt-bindings: net: ethernet: Update mt7622 docs and dts to reflect the new phylink API
From: Matthias Brugger @ 2019-08-29  7:03 UTC (permalink / raw)
  To: David Miller
  Cc: opensource, john, sean.wang, nelson.chang, netdev,
	linux-arm-kernel, linux-mediatek, linux-mips, linux, frank-w, sr
In-Reply-To: <20190828.125658.1743313522645522716.davem@davemloft.net>



On 28/08/2019 21:56, David Miller wrote:
> From: Matthias Brugger <matthias.bgg@gmail.com>
> Date: Wed, 28 Aug 2019 11:29:45 +0200
> 
>> Thanks for taking this patch. For the next time, please make sure that dts[i]
>> patches are independent from the binding description, as dts[i] should go
>> through my tree. No problem for this round, just saying for the future.
> 
> That's not always possible nor reasonable, to be quite honest.
> 

Right now no case comes to my mind. What would be a case where this is not
reasonable or possible?

Regards,
Matthias

^ permalink raw reply

* Re: [PATCH v2 3/3] arm: Add support for function error injection
From: Russell King - ARM Linux admin @ 2019-08-29  6:57 UTC (permalink / raw)
  To: Leo Yan
  Cc: Oleg Nesterov, Catalin Marinas, Will Deacon,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, H. Peter Anvin,
	x86, Arnd Bergmann, Alexei Starovoitov, Daniel Borkmann,
	Martin KaFai Lau, Song Liu, Yonghong Song, Naveen N. Rao,
	linux-arm-kernel, linux-kernel, linuxppc-dev, linux-arch, netdev,
	bpf, clang-built-linux, Masami Hiramatsu
In-Reply-To: <20190819091808.GB5599@leoy-ThinkPad-X240s>

I'm sorry, I can't apply this, it produces loads of:

include/linux/error-injection.h:7:10: fatal error: asm/error-injection.h: No such file or directory

Since your patch 1 has been merged by the ARM64 people, I can't take
it until next cycle.

On Mon, Aug 19, 2019 at 05:18:08PM +0800, Leo Yan wrote:
> Hi Russell,
> 
> On Tue, Aug 06, 2019 at 06:00:15PM +0800, Leo Yan wrote:
> > This patch implements arm specific functions regs_set_return_value() and
> > override_function_with_return() to support function error injection.
> > 
> > In the exception flow, it updates pt_regs::ARM_pc with pt_regs::ARM_lr
> > so can override the probed function return.
> 
> Gentle ping ...  Could you review this patch?
> 
> Thanks,
> Leo.
> 
> > Signed-off-by: Leo Yan <leo.yan@linaro.org>
> > ---
> >  arch/arm/Kconfig              |  1 +
> >  arch/arm/include/asm/ptrace.h |  5 +++++
> >  arch/arm/lib/Makefile         |  2 ++
> >  arch/arm/lib/error-inject.c   | 19 +++++++++++++++++++
> >  4 files changed, 27 insertions(+)
> >  create mode 100644 arch/arm/lib/error-inject.c
> > 
> > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> > index 33b00579beff..2d3d44a037f6 100644
> > --- a/arch/arm/Kconfig
> > +++ b/arch/arm/Kconfig
> > @@ -77,6 +77,7 @@ config ARM
> >  	select HAVE_EXIT_THREAD
> >  	select HAVE_FAST_GUP if ARM_LPAE
> >  	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
> > +	select HAVE_FUNCTION_ERROR_INJECTION if !THUMB2_KERNEL
> >  	select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG
> >  	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
> >  	select HAVE_GCC_PLUGINS
> > diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h
> > index 91d6b7856be4..3b41f37b361a 100644
> > --- a/arch/arm/include/asm/ptrace.h
> > +++ b/arch/arm/include/asm/ptrace.h
> > @@ -89,6 +89,11 @@ static inline long regs_return_value(struct pt_regs *regs)
> >  	return regs->ARM_r0;
> >  }
> >  
> > +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
> > +{
> > +	regs->ARM_r0 = rc;
> > +}
> > +
> >  #define instruction_pointer(regs)	(regs)->ARM_pc
> >  
> >  #ifdef CONFIG_THUMB2_KERNEL
> > diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> > index b25c54585048..8f56484a7156 100644
> > --- a/arch/arm/lib/Makefile
> > +++ b/arch/arm/lib/Makefile
> > @@ -42,3 +42,5 @@ ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
> >    CFLAGS_xor-neon.o		+= $(NEON_FLAGS)
> >    obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> >  endif
> > +
> > +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> > diff --git a/arch/arm/lib/error-inject.c b/arch/arm/lib/error-inject.c
> > new file mode 100644
> > index 000000000000..2d696dc94893
> > --- /dev/null
> > +++ b/arch/arm/lib/error-inject.c
> > @@ -0,0 +1,19 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/error-injection.h>
> > +#include <linux/kprobes.h>
> > +
> > +void override_function_with_return(struct pt_regs *regs)
> > +{
> > +	/*
> > +	 * 'regs' represents the state on entry of a predefined function in
> > +	 * the kernel/module and which is captured on a kprobe.
> > +	 *
> > +	 * 'regs->ARM_lr' contains the the link register for the probed
> > +	 * function, when kprobe returns back from exception it will override
> > +	 * the end of probed function and directly return to the predefined
> > +	 * function's caller.
> > +	 */
> > +	instruction_pointer_set(regs, regs->ARM_lr);
> > +}
> > +NOKPROBE_SYMBOL(override_function_with_return);
> > -- 
> > 2.17.1
> > 
> 

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

^ permalink raw reply

* Re: auto-split of commit. Was: [PATCH bpf-next 04/10] tools/bpf: add libbpf_prog_type_(from|to)_str helpers
From: Greg Kroah-Hartman @ 2019-08-29  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jakub Kicinski, Julia Kartseva, ast, Thomas Gleixner, rdna, bpf,
	daniel, netdev, kernel-team
In-Reply-To: <20190828234626.ltfy3qr2nne4uumy@ast-mbp.dhcp.thefacebook.com>

On Wed, Aug 28, 2019 at 04:46:28PM -0700, Alexei Starovoitov wrote:
> On Wed, Aug 28, 2019 at 04:34:22PM -0700, Jakub Kicinski wrote:
> > 
> > Greg, Thomas, libbpf is extracted from the kernel sources and
> > maintained in a clone repo on GitHub for ease of packaging.
> > 
> > IIUC Alexei's concern is that since we are moving the commits from
> > the kernel repo to the GitHub one we have to preserve the commits
> > exactly as they are, otherwise SOB lines lose their power.
> > 
> > Can you provide some guidance on whether that's a valid concern, 
> > or whether it's perfectly fine to apply a partial patch?
> 
> Right. That's exactly the concern.
> 
> Greg, Thomas,
> could you please put your legal hat on and clarify the following.
> Say some developer does a patch that modifies
> include/uapi/linux/bpf.h
> ..some other kernel code...and
> tools/include/uapi/linux/bpf.h
> 
> That tools/include/uapi/linux/bpf.h is used by perf and by libbpf.
> We have automatic mirror of tools/libbpf into github/libbpf/
> so that external projects and can do git submodule of it,
> can build packages out of it, etc.
> 
> The question is whether it's ok to split tools/* part out of
> original commit, keep Author and SOB, create new commit out of it,
> and automatically push that auto-generated commit into github mirror.

Note, I am not a laywer, and am not _your_ lawyer either, only _your_
lawyer can answer questions as to what is best for you.

That being said, from a "are you keeping the correct authorship info",
yes, it sounds like you are doing the correct thing here.

Look at what I do for stable kernels, I take the original commit and add
it to "another tree" keeping the original author and s-o-b chain intact,
and adding a "this is the original git commit id" type message to the
changelog text so that people can link it back to the original.

If you are doing that here as well, I don't see how anyone can object.

thanks,

greg k-h

^ permalink raw reply

* [PATCH bpf-next 00/13] bpf: adding map batch processing support
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song

Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
map entries per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
    
Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations also useful as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
    BPF_MAP_LOOKUP_BATCH
    BPF_MAP_LOOKUP_AND_DELETE_BATCH
    BPF_MAP_UPDATE_BATCH
    BPF_MAP_DELETE_BATCH

In userspace, application can iterate through the whole map one batch
as a time, e.g., bpf_map_lookup_batch() in the below:
    p_key = NULL;
    p_next_key = &key;
    while (true) {
       err = bpf_map_lookup_batch(fd, p_key, &p_next_key, keys, values,
                                  &batch_size, elem_flags, flags);
       if (err) ...
       if (p_next_key) break; // done
       if (!p_key) p_key = p_next_key;
    }
Please look at individual patches for details of new syscall subcommands
and examples of user codes.

The testing is also done in a qemu VM environment:
      measure_lookup: max_entries 1000000, batch 10, time 342ms
      measure_lookup: max_entries 1000000, batch 1000, time 295ms
      measure_lookup: max_entries 1000000, batch 1000000, time 270ms
      measure_lookup: max_entries 1000000, no batching, time 1346ms
      measure_lookup_delete: max_entries 1000000, batch 10, time 433ms
      measure_lookup_delete: max_entries 1000000, batch 1000, time 363ms
      measure_lookup_delete: max_entries 1000000, batch 1000000, time 357ms
      measure_lookup_delete: max_entries 1000000, not batch, time 1894ms
      measure_delete: max_entries 1000000, batch, time 220ms
      measure_delete: max_entries 1000000, not batch, time 1289ms
For a 1M entry hash table, batch size of 10 can reduce cpu time
by 70%. Please see patch "tools/bpf: measure map batching perf"
for details of test codes.

Brian Vazquez (1):
  bpf: add bpf_map_value_size and bp_map_copy_value helper functions

Yonghong Song (12):
  bpf: refactor map_update_elem()
  bpf: refactor map_delete_elem()
  bpf: refactor map_get_next_key()
  bpf: adding map batch processing support
  tools/bpf: sync uapi header bpf.h
  tools/bpf: implement libbpf API functions for map batch operations
  tools/bpf: add test for bpf_map_update_batch()
  tools/bpf: add test for bpf_map_lookup_batch()
  tools/bpf: add test for bpf_map_lookup_and_delete_batch()
  tools/bpf: add test for bpf_map_delete_batch()
  tools/bpf: add a multithreaded test for map batch operations
  tools/bpf: measure map batching perf

 include/uapi/linux/bpf.h                      |  27 +
 kernel/bpf/syscall.c                          | 752 ++++++++++++++----
 tools/include/uapi/linux/bpf.h                |  27 +
 tools/lib/bpf/bpf.c                           |  67 ++
 tools/lib/bpf/bpf.h                           |  17 +
 tools/lib/bpf/libbpf.map                      |   4 +
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 +++
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++
 .../bpf/map_tests/map_update_batch.c          | 115 +++
 12 files changed, 1707 insertions(+), 139 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

-- 
2.17.1


^ permalink raw reply

* [PATCH bpf-next 06/13] tools/bpf: sync uapi header bpf.h
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

sync uapi header include/uapi/linux/bpf.h to
tools/include/uapi/linux/bpf.h.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 08/13] tools/bpf: add test for bpf_map_update_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Add tests for bpf_map_update_batch().
  $ ./test_maps
  ...
  test_map_update_batch:PASS
  ...
---
 .../bpf/map_tests/map_update_batch.c          | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_update_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_update_batch.c b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
new file mode 100644
index 000000000000..67c1e11fc911
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_update_batch.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+void test_map_update_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, key, value;
+	const int max_entries = 10;
+	__u32 count, max_count;
+	int err, i;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	/* do not fill in the whole hash table, so we could test
+	 * update with new elements.
+	 */
+	max_count = max_entries - 2;
+
+	for (i = 0; i < max_count; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	/* test 1: count == 0, expect success. */
+	count = 0;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "count = 0", "error:%s\n", strerror(errno));
+
+	/* test 2: update initial map with BPF_NOEXIST, expect success. */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(err, "elem_flags = BPF_NOEXIST",
+	      "error:%s\n", strerror(errno));
+
+	/* use bpf_map_get_next_key to ensure all keys/values are indeed
+	 * covered.
+	 */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, &key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+	CHECK(key + 1 != value, "key/value checking",
+	      "error: key %d value %d\n", key, value);
+	i = 1;
+	while (!bpf_map_get_next_key(map_fd, &key, &key)) {
+		err = bpf_map_lookup_elem(map_fd, &key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		CHECK(key + 1 != value,
+		      "key/value checking", "error: key %d value %d\n",
+		      key, value);
+		i++;
+	}
+	CHECK(i != max_count, "checking number of entries",
+	      "err: i %u max_count %u\n", i, max_count);
+
+	/* test 3: elem_flags = BPF_NOEXIST, already exists, expect failure */
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	/* failure to due to flag BPF_NOEXIST, count is set to 0 */
+	CHECK(!err || count, "elem_flags = BPF_NOEXIST again",
+	      "unexpected success\n");
+
+	/* test 4: elem_flags = 0, expect success */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "elem_flags = 0", "error %s\n", strerror(errno));
+
+	/* test 5: keys = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, NULL, values,
+				   &count, 0, 0);
+	CHECK(!err, "keys = NULL", "unexpected success\n");
+
+	/* test 6: values = NULL, expect failure */
+	count = max_count;
+	err = bpf_map_update_batch(map_fd, keys, NULL, &count, 0, 0);
+	CHECK(!err, "values = NULL", "unexpected success\n");
+
+	/* test 7: modify the first key to be max_count + 10,
+	 * elem_flags = BPF_NOEXIST,
+	 * expect failure, the return count = 1.
+	 */
+	count = max_count;
+	keys[0] = max_count + 10;
+	err = bpf_map_update_batch(map_fd, keys, values,
+				   &count, BPF_NOEXIST, 0);
+	CHECK(!err, "keys[0] = max_count + 10", "unexpected success\n");
+	CHECK(count != 1, "keys[0] = max_count + 10",
+	      "error: %s, incorrect count %u\n", strerror(errno), count);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 03/13] bpf: refactor map_delete_elem()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_delete_elem() with a new helper
bpf_map_delete_elem(), which will be used later
for batched lookup_and_delete and delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3caa0ab3d30d..b8bba499df11 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -992,6 +992,25 @@ static int map_update_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_delete_elem(struct bpf_map *map, void *key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_delete_elem(map, key);
+
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key
 
 static int map_delete_elem(union bpf_attr *attr)
@@ -1021,20 +1040,8 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_delete_elem(map, key);
-		goto out;
-	}
+	err = bpf_map_delete_elem(map, key);
 
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	rcu_read_lock();
-	err = map->ops->map_delete_elem(map, key);
-	rcu_read_unlock();
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 	kfree(key);
 err_put:
 	fdput(f);
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 10/13] tools/bpf: add test for bpf_map_lookup_and_delete_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_lookup_and_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_and_delete_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../map_tests/map_lookup_and_delete_batch.c   | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
new file mode 100644
index 000000000000..b151513a0ab2
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_and_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup/delete an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* test 2: lookup/delete with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup/delete with count = max_entries, skey && !nskey,
+	 * failure.
+	 */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_and_delete_batch(map_fd, &key, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup/delete with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					      values, &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+	map_batch_verify(visited, max_entries, keys, values);
+
+	/* bpf_map_get_next_key() should return -ENOENT for an empty map. */
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	/* test 5: lookup/delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup/delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_skey,
+							      &p_next_skey,
+							      keys + i * step,
+							      values + i * step,
+							      &count, 0, 0);
+			CHECK(err, "lookup/delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup/delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup/delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	}
+
+	/* test 6: lookup/delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_and_delete_batch(map_fd, NULL, NULL, keys, values,
+					      &count, 0, 0);
+	CHECK(err, "lookup/delete key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 12/13] tools/bpf: add a multithreaded test for map batch operations
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

A multithreaded test is added. Three threads repeatedly did:
  - batch update
  - batch lookup_and_delete
  - batch delete

It is totally possible each batch element operation in kernel
may find that the key, retrieved from bpf_map_get_next_key(),
may fail lookup and/or delete as some other threads in parallel
operates on the same map.

The default mode for new batch APIs is to ignore -ENOENT errors
in case of lookup and delete and move to the next element.
The test would otherwise fail if the kernel reacts as -ENOENT
as a real error and propogates it back to user space.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_mt.c    | 126 ++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_mt.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_mt.c b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
new file mode 100644
index 000000000000..a0e2591d0079
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_mt.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Create three threads. Each thread will iteratively do:
+ *   . update constantly
+ *   . lookup and delete constantly
+ *   . delete constantly
+ * So this will make lookup and delete operations
+ * may fail as the elements may be deleted by another
+ * thread.
+ *
+ * By default, we should not see a problem as
+ * -ENOENT for bpf_map_delete_elem() and bpf_map_lookup_elem()
+ * will be ignored. But with flag, BPF_F_ENFORCE_ENOENT
+ * we may see errors.
+ */
+
+static int map_fd;
+static const __u32 max_entries = 10;
+static volatile bool stop = false;
+
+static void do_batch_update()
+{
+	int i, err, keys[max_entries], values[max_entries];
+	__u32 count;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	while (!stop) {
+		count = max_entries;
+		err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+		CHECK(err, "bpf_map_update_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_delete()
+{
+	__u32 count;
+	int err;
+
+	while (!stop) {
+		count = 0;
+		err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count,
+					   0, 0);
+		CHECK(err, "bpf_map_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void do_batch_lookup_and_delete()
+{
+	int err, key, keys[max_entries], values[max_entries];
+	__u32 count;
+	void *p_key;
+
+	while (!stop) {
+		p_key = &key;
+		count = max_entries;
+		err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &p_key,
+						      keys, values, &count,
+						      0, 0);
+		CHECK(err, "bpf_map_lookup_and_delete_batch()", "error:%s\n",
+		      strerror(errno));
+	}
+}
+
+static void *do_work(void *arg)
+{
+	int work_index = (int)(long)arg;
+
+	if (work_index == 0)
+		do_batch_update();
+	else if (work_index == 1)
+		do_batch_delete();
+	else
+		do_batch_lookup_and_delete();
+
+	return NULL;
+}
+
+void test_map_batch_mt(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const int nr_threads = 3;
+	pthread_t threads[nr_threads];
+	int i, err;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	for (i = 0; i < nr_threads; i++) {
+		err = pthread_create(&threads[i], NULL, do_work,
+				     (void *)(long)i);
+		CHECK(err, "pthread_create", "error: %s\n", strerror(errno));
+	}
+
+	sleep(1);
+	stop = true;
+
+	for (i = 0; i < nr_threads; i++)
+		pthread_join(threads[i], NULL);
+
+	close(map_fd);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 13/13] tools/bpf: measure map batching perf
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

The test program run result:
  $ ./test_maps
  ...
  measure_lookup: max_entries 1000000, batch 10, time 342
  measure_lookup: max_entries 1000000, batch 1000, time 295
  measure_lookup: max_entries 1000000, batch 1000000, time 270
  measure_lookup: max_entries 1000000, no batching, time 1346
  measure_lookup_delete: max_entries 1000000, batch 10, time 433
  measure_lookup_delete: max_entries 1000000, batch 1000, time 363
  measure_lookup_delete: max_entries 1000000, batch 1000000, time 357
  measure_lookup_delete: max_entries 1000000, not batch, time 1894
  measure_delete: max_entries 1000000, batch, time 220
  measure_delete: max_entries 1000000, not batch, time 1289
  test_map_batch_perf:PASS
  ...

  The test is running on a qemu VM environment. The time
  unit is millisecond. A simple hash table with 1M elements
  is created.

  For lookup and lookup_and_deletion, since buffer allocation
  is needed to hold the lookup results, three different
  batch sizes (10, 1000, and 1M) are tried. The performance
  without batching is also measured. A batch size of 10
  can already improves performance dramatically, more than 70%,
  and increasing batch size may continue to improve performance,
  but with diminishing returns.

  For delete, the batch API provides a mechanism to delete all elements
  in the map, which is used here. The deletion of the whole map
  improves performance by 80% compared to non-batch mechanism.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/map_tests/map_batch_perf.c  | 242 ++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_batch_perf.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_batch_perf.c b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
new file mode 100644
index 000000000000..42d95651e1ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_batch_perf.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/* Test map batch performance.
+ * Test three common scenarios:
+ *    - batched lookup
+ *    - batched lookup and delete
+ *    - batched deletion
+ */
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static unsigned long util_gettime(void)
+{
+	struct timeval tv;
+
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void measure_lookup(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	__u32 batches[3] = {10, 1000};
+	int err, key, value, option;
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_key, &p_next_key,
+						   keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_batch()", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+	}
+	end = util_gettime();
+	printf("%s: max_entries %u, no batching, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_lookup_delete(int map_fd, __u32 max_entries, int *keys,
+				  int *values)
+{
+	int err, key, next_key, value, option;
+	__u32 batches[3] = {10, 1000};
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	__u32 count, total;
+
+	/* batched */
+	batches[2] = max_entries;
+	for (option = 0; option < 3; option++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		p_key = NULL;
+		p_next_key = &key;
+		count = batches[option];
+		start = util_gettime();
+		total = 0;
+
+		while (true) {
+			err = bpf_map_lookup_and_delete_batch(map_fd, p_key,
+				&p_next_key, keys, values, &count, 0, 0);
+			CHECK(err, "bpf_map_lookup_and_delete_batch()",
+			      "error: %s\n", strerror(errno));
+
+			total += count;
+			if (!p_next_key)
+				break;
+
+			if (!p_key)
+				p_key = p_next_key;
+		}
+
+		end = util_gettime();
+		CHECK(total != max_entries,
+		      "checking total", "total %u, max_entries %u\n",
+		      total, max_entries);
+		printf("%s: max_entries %u, batch %u, time %ld\n", __func__,
+		       max_entries, batches[option], end - start);
+	}
+
+	/* not batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	start = util_gettime();
+	p_key = NULL;
+	p_next_key = &key;
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+	err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+	CHECK(err, "bpf_map_lookup_elem()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+
+		err = bpf_map_lookup_elem(map_fd, p_next_key, &value);
+		CHECK(err, "bpf_map_lookup_elem()", "error: %s\n",
+		      strerror(errno));
+
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+static void measure_delete(int map_fd, __u32 max_entries, int *keys,
+			   int *values)
+{
+	unsigned long start, end;
+	void *p_key, *p_next_key;
+	int err, key, next_key;
+	__u32 count;
+
+	/* batched */
+	map_batch_update(map_fd, max_entries, keys, values);
+	count = 0;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, NULL, &count, 0, 0);
+	end = util_gettime();
+	CHECK(err, "bpf_map_delete_batch()", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries, "bpf_map_delete_batch()",
+	      "count = %u, max_entries = %u\n", count, max_entries);
+
+	printf("%s: max_entries %u, batch, time %ld\n", __func__,
+	       max_entries, end - start);
+
+	map_batch_update(map_fd, max_entries, keys, values);
+	p_key = NULL;
+	p_next_key = &key;
+	start = util_gettime();
+	err = bpf_map_get_next_key(map_fd, p_key, p_next_key);
+	CHECK(err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	p_key = p_next_key;
+	p_next_key = &next_key;
+	while (!bpf_map_get_next_key(map_fd, p_key, p_next_key)) {
+		err = bpf_map_delete_elem(map_fd, p_key);
+		CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+		      strerror(errno));
+		p_key = p_next_key;
+		p_next_key = (p_next_key == &key) ? &next_key : &key;
+	}
+	err = bpf_map_delete_elem(map_fd, p_key);
+	CHECK(err, "bpf_map_delete_elem()", "error: %s\n",
+	      strerror(errno));
+	end = util_gettime();
+	printf("%s: max_entries %u, not batch, time %ld\n", __func__,
+	       max_entries, end - start);
+}
+
+void test_map_batch_perf(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	const __u32 max_entries = 1000000;  // 1M entries for the hash table
+	int map_fd, *keys, *values;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values, "malloc()", "error:%s\n", strerror(errno));
+
+	measure_lookup(map_fd, max_entries, keys, values);
+	measure_lookup_delete(map_fd, max_entries, keys, values);
+	measure_delete(map_fd, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 11/13] tools/bpf: add test for bpf_map_delete_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_delete_batch() functionality.
  $ ./test_maps
  ...
  test_map_delete_batch:PASS
  ...
---
 .../bpf/map_tests/map_delete_batch.c          | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_delete_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
new file mode 100644
index 000000000000..459495a6d9fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_delete_batch.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	int i, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+void test_map_delete_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: delete an empty hash table, success */
+	for (i = 0; i < 2; i++) {
+		if (i == 0) {
+			count = 0;
+			p_next_skey = NULL;
+		} else {
+			count = max_entries;
+			p_next_skey = &key;
+		}
+		err = bpf_map_delete_batch(map_fd, NULL, &p_next_skey, keys,
+					   &count, 0, 0);
+		CHECK(err, "empty map", "error: %s\n", strerror(errno));
+		CHECK(p_next_skey || count, "empty map",
+		      "i = %d, p_next_skey = %p, count = %u\n", i, p_next_skey,
+		      count);
+	}
+
+	/* test 2: delete with count = 0, success */
+	for (i = 0; i < 2; i++) {
+		/* populate elements to the map */
+		map_batch_update(map_fd, max_entries, keys, values);
+
+		count = 0;
+		if (i == 0) {
+			/* all elements in the map */
+			p_skey = NULL;
+		} else {
+			/* all elements starting from p_skey */
+			p_skey = &key;
+			err = bpf_map_get_next_key(map_fd, NULL, p_skey);
+			CHECK(err, "bpf_map_get_next_key()", "error: %s\n",
+			      strerror(errno));
+		}
+		err = bpf_map_delete_batch(map_fd, p_skey, NULL, NULL,
+				    &count, 0, 0);
+		CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+		/* all elements should be gone. */
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "unexpected success\n");
+	}
+
+	/* test 3: delete in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively delete elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_delete_batch(map_fd, p_skey, &p_next_skey,
+					      keys + i * step, &count, 0, 0);
+			CHECK(err, "delete with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "delete with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "delete with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		err = bpf_map_get_next_key(map_fd, NULL, &key);
+		CHECK(!err, "bpf_map_get_next_key()", "error: %s\n",
+		      strerror(errno));
+	}
+
+	/* test 4: delete with keys in keys[]. */
+	map_batch_update(map_fd, max_entries, keys, values);
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_delete_batch(map_fd, NULL, NULL, keys, &count, 0, 0);
+	CHECK(err, "delete key in keys[]", "error: %s\n", strerror(errno));
+	err = bpf_map_get_next_key(map_fd, NULL, &key);
+	CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 04/13] bpf: refactor map_get_next_key()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_get_next_key() with a new helper
bpf_map_get_next_key(), which will be used later
for batched map lookup/lookup_and_delete/delete operations.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b8bba499df11..06308f0206e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1048,6 +1048,20 @@ static int map_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static int bpf_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return bpf_map_offload_get_next_key(map, key, next_key);
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+
+	return err;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
 
@@ -1088,15 +1102,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_get_next_key(map, key, next_key);
-		goto out;
-	}
-
-	rcu_read_lock();
-	err = map->ops->map_get_next_key(map, key, next_key);
-	rcu_read_unlock();
-out:
+	err = bpf_map_get_next_key(map, key, next_key);
 	if (err)
 		goto free_next_key;
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 09/13] tools/bpf: add test for bpf_map_lookup_batch()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Test bpf_map_lookup_batch() functionality.
  $ ./test_maps
  ...
  test_map_lookup_batch:PASS
  ...

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../bpf/map_tests/map_lookup_batch.c          | 166 ++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_batch.c

diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
new file mode 100644
index 000000000000..9c88e8adc556
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_batch.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook  */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int i;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+	for (i = 0; i < max_entries; i++) {
+		CHECK(keys[i] + 1 != values[i], "key/value checking",
+		      "error: i %d key %d value %d\n", i, keys[i], values[i]);
+		visited[i] = 1;
+	}
+	for (i = 0; i < max_entries; i++) {
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+	}
+}
+
+void test_map_lookup_batch(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "hash_map",
+		.map_type = BPF_MAP_TYPE_HASH,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited, key;
+	const __u32 max_entries = 10;
+	void *p_skey, *p_next_skey;
+	__u32 count, total;
+	int err, i, step;
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+	/* test 1: lookup an empty hash table, success */
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "empty map", "error: %s\n", strerror(errno));
+	CHECK(p_next_skey || count, "empty map",
+	      "p_next_skey = %p, count = %u\n", p_next_skey, count);
+
+	/* populate elements to the map */
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+		values[i] = i + 2;
+	}
+
+	count = max_entries;
+	err = bpf_map_update_batch(map_fd, keys, values, &count, 0, 0);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+
+	/* test 2: lookup with count = 0, success */
+	count = 0;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+	/* test 3: lookup with count = max_entries, skey && !nskey, failure */
+	count = max_entries;
+	key = 1;
+	err = bpf_map_lookup_batch(map_fd, &key, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "skey && !nskey", "unexpected success\n");
+
+	/* test 4: lookup with count = max_entries, success */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+
+	/* test 5: lookup with count = max_entries, it should return
+	 * success with count = max_entries.
+	 */
+	memset(keys, 0, max_entries * sizeof(*keys));
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, NULL, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+	CHECK(count != max_entries || p_next_skey != NULL, "count = max_entries",
+	      "count = %u, max_entries = %u, p_next_skey = %p\n",
+	      count, max_entries, p_next_skey);
+
+	/* test 6: lookup with an invalid start key, failure */
+	count = max_entries;
+	key = max_entries + 10;
+	p_next_skey = &key;
+	err = bpf_map_lookup_batch(map_fd, &key, &p_next_skey, keys, values,
+				   &count, 0, 0);
+	CHECK(!err, "invalid start_key", "unexpected success\n");
+
+	/* test 7: lookup in a loop with various steps. */
+	for (step = 1; step < max_entries; step++) {
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		p_skey = NULL;
+		p_next_skey = &key;
+		total = 0;
+		i = 0;
+		/* iteratively lookup elements with 'step' elements each */
+		count = step;
+		while (true) {
+			err = bpf_map_lookup_batch(map_fd, p_skey, &p_next_skey,
+						   keys + i * step,
+						   values + i * step,
+						   &count, 0, 0);
+			CHECK(err, "lookup with steps", "error: %s\n",
+			      strerror(errno));
+
+			total += count;
+			if (!p_next_skey)
+				break;
+
+			CHECK(count != step, "lookup with steps",
+			      "i = %d, count = %u, step = %d\n",
+			      i, count, step);
+
+			if (!p_skey)
+				p_skey = p_next_skey;
+			i++;
+		}
+
+		CHECK(total != max_entries, "lookup with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+	}
+
+	/* test 8: lookup with keys in keys[]. */
+	memset(values, 0, max_entries * sizeof(*values));
+	count = max_entries;
+	err = bpf_map_lookup_batch(map_fd, NULL, NULL, keys, values,
+				   &count, 0, 0);
+	CHECK(err, "lookup key in keys[]", "error: %s\n", strerror(errno));
+	map_batch_verify(visited, max_entries, keys, values);
+
+	printf("%s:PASS\n", __func__);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 07/13] tools/bpf: implement libbpf API functions for map batch operations
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Added four libbpf API functions to support map batch operations:
  . int bpf_map_delete_batch( ... )
  . int bpf_map_lookup_batch( ... )
  . int bpf_map_lookup_and_delete_batch( ... )
  . int bpf_map_update_batch( ... )

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/lib/bpf/bpf.c      | 67 ++++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/bpf.h      | 17 ++++++++++
 tools/lib/bpf/libbpf.map |  4 +++
 3 files changed, 88 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index cbb933532981..bdbd660aa2b8 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -438,6 +438,73 @@ int bpf_map_freeze(int fd)
 	return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
 }
 
+static int bpf_map_batch_common(int cmd, int fd, const void *start_key,
+				void **next_start_key,
+				void *keys, void *values,
+				__u32 *count, __u64 elem_flags,
+				__u64 flags)
+{
+	union bpf_attr attr = {};
+	int ret;
+
+	attr.batch.map_fd = fd;
+	attr.batch.start_key = ptr_to_u64(start_key);
+	if (next_start_key)
+		attr.batch.next_start_key = ptr_to_u64(*next_start_key);
+	attr.batch.keys = ptr_to_u64(keys);
+	attr.batch.values = ptr_to_u64(values);
+	if (count)
+		attr.batch.count = *count;
+	attr.batch.elem_flags = elem_flags;
+	attr.batch.flags = flags;
+
+	ret = sys_bpf(cmd, &attr, sizeof(attr));
+	if (next_start_key)
+		*next_start_key = (void *)attr.batch.next_start_key;
+	if (count)
+		*count = attr.batch.count;
+
+	return ret;
+}
+
+int bpf_map_delete_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, __u32 *count, __u64 elem_flags,
+			 __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, start_key,
+				    next_start_key, keys, (void *)NULL,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_batch(int fd, const void *start_key, void **next_start_key,
+			 void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+				    fd, start_key,
+				    next_start_key, keys, values,
+				    count, elem_flags, flags);
+}
+
+int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
+			 __u64 elem_flags, __u64 flags)
+{
+	return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH,
+				    fd, (void *)NULL, (void *)NULL,
+				    keys, values,
+				    count, elem_flags, flags);
+}
+
 int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0db01334740f..a36bfcbfe04f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -120,6 +120,23 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
 LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
 LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
 LIBBPF_API int bpf_map_freeze(int fd);
+LIBBPF_API int bpf_map_delete_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+LIBBPF_API int bpf_map_lookup_batch(int fd, const void *start_key,
+				    void **next_start_key, void *keys,
+				    void *values, __u32 *count,
+				    __u64 elem_flags, __u64 flags);
+LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, const void *start_key,
+					       void **next_start_key,
+					       void *keys, void *values,
+					       __u32 *count, __u64 elem_flags,
+					       __u64 flags);
+LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values,
+				    __u32 *count, __u64 elem_flags,
+				    __u64 flags);
+
 LIBBPF_API int bpf_obj_pin(int fd, const char *pathname);
 LIBBPF_API int bpf_obj_get(const char *pathname);
 LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 664ce8e7a60e..7a00630f0ff1 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -188,4 +188,8 @@ LIBBPF_0.0.4 {
 LIBBPF_0.0.5 {
 	global:
 		bpf_btf_get_next_id;
+		bpf_map_delete_batch;
+		bpf_map_lookup_and_delete_batch;
+		bpf_map_lookup_batch;
+		bpf_map_update_batch;
 } LIBBPF_0.0.4;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 05/13] bpf: adding map batch processing support
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Brian Vazquez has proposed BPF_MAP_DUMP command to look up more than one
map entries per syscall.
  https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t

During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
  https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
  https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138

Also, in bcc, we have API to delete all entries in a map.
  https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264

For map update, batched operations also useful as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
  https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550

This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
	BPF_MAP_LOOKUP_BATCH
	BPF_MAP_LOOKUP_AND_DELETE_BATCH
	BPF_MAP_UPDATE_BATCH
	BPF_MAP_DELETE_BATCH

The UAPI attribute structure looks like:
	struct { /* struct used by BPF_MAP_*_BATCH commands */
		__aligned_u64   start_key;      /* input: storing start key,
						 * if NULL, starting from the beginning.
						 */
		__aligned_u64   next_start_key; /* output: storing next batch start_key,
						 * if NULL, no next key.
						 */
		__aligned_u64	keys;		/* input/output: key buffer */
		__aligned_u64	values;		/* input/output: value buffer */
		__u32		count;		/* input: # of keys/values in
						 *   or fits in keys[]/values[].
						 * output: how many successful
						 *   lookup/lookup_and_delete
						 *   /delete/update operations.
						 */
		__u32		map_fd;
		__u64		elem_flags;	/* BPF_MAP_{UPDATE,LOOKUP}_ELEM flags */
		__u64		flags;		/* flags for batch operation */
	} batch;

The 'start_key' and 'next_start_key' are used to BPF_MAP_LOOKUP_BATCH,
BPF_MAP_LOOKUP_AND_DELETE_BATCH and BPF_MAP_DELETE_BATCH
to start the operation on 'start_key' and also set the
next batch start key in 'next_start_key'.

If 'count' is greater than 0 and the return code is 0,
  . the 'count' may be updated to be smaller if there is less
    elements than 'count' for the operation. In such cases,
    'next_start_key' will be set to NULL.
  . the 'count' remains the same. 'next_start_key' could be NULL
    or could point to next start_key for batch processing.

If 'count' is greater than 0 and the return code is an error
other than -EFAULT, the kernel will try to overwrite 'count'
to contain the number of successful element-level (lookup,
lookup_and_delete, update and delete) operations. The following
attributes can be checked:
  . if 'count' value is modified, the new value will be
    the number of successful element-level operations.
  . if 'count' value is modified, the keys[]/values[] will
    contain correct results for new 'count' number of
    operations for lookup[_and_delete] and update.

The implementation in this patch mimics what user space
did, e.g., for lookup_and_delete,
    while(bpf_get_next_key(map, keyp, next_keyp) == 0) {
       bpf_map_delete_elem(map, keyp);
       bpf_map_lookup_elem(map, next_keyp, &value, flags);
       keyp, next_keyp = next_keyp, keyp;
    }
The similar loop is implemented in the kernel, and
each operation, bpf_get_next_key(), bpf_map_delete_elem()
and bpf_map_lookup_elem(), uses existing kernel functions
each of which has its own rcu_read_lock region, bpf_prog_active
protection, etc.
Therefore, it is totally possible that after bpf_get_next_key(),
the bpf_map_delete_elem() or bpf_map_lookup_elem() may fail
as the key may be deleted concurrently by kernel or
other user space processes/threads.
By default, the current implementation permits the loop
to continue, just like typical user space handling. But
a flag, BPF_F_ENFORCE_ENOENT, can be specified, so kernel
will return an error if bpf_map_delete_elem() or
bpf_map_lookup_elem() failed.

The high-level algorithm for BPF_MAP_LOOKUP_BATCH and
BPF_MAP_LOOKUP_AND_DELETE_BATCH:
	if (start_key == NULL and next_start_key == NULL) {
		lookup up to 'count' keys in keys[] and fill
		corresponding values[], and delete those
		keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
	} else if (start_key == NULL && next_start_key != NULL) {
		lookup up to 'count' keys from the beginning
		of the map and fill keys[]/values[], delete
		those keys if BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for next batch operation.
	} else if (start_key != NULL && next_start_key != NULL) {
		lookup to 'count' keys from 'start_key', inclusive,
		and fill keys[]/values[], delete those keys if
		BPF_MAP_LOOKUP_AND_DELETE_BATCH.
		Set 'next_start_key' for next batch operation.
	}

The high-level algorithm for BPF_MAP_UPDATE_BATCH:
	if (count != 0) {
		do 'count' number of updates on keys[]/values[].
	}

The high-level algorithm for BPF_MAP_DELETE_BATCH:
	if (count == 0) {
		if (start_key == NULL) {
			delete all elements from map.
		} else {
			delete all elements from start_key to the end of map.
		}
	} else {
		if (start_key == NULL and next_start_key == NULL) {
			delete 'count' number of keys in keys[].
		} else if (start_key == NULL and next_start_key != NULL) {
			delete 'count' number of keys from the
			beginning of the map and set 'next_start_key'
			properly.
		} else if (start_key != NULL and next_start_keeey != NULL) {
			delete 'count' number of keys from 'start_key',
			and set 'next_start_key' properly.
		}
	}

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h |  27 +++
 kernel/bpf/syscall.c     | 448 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 475 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d2fb183ee2d..576688f13e8c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
 	BPF_MAP_FREEZE,
 	BPF_BTF_GET_NEXT_ID,
+	BPF_MAP_LOOKUP_BATCH,
+	BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+	BPF_MAP_UPDATE_BATCH,
+	BPF_MAP_DELETE_BATCH,
 };
 
 enum bpf_map_type {
@@ -347,6 +351,9 @@ enum bpf_attach_type {
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* flags for BPF_MAP_*_BATCH */
+#define BPF_F_ENFORCE_ENOENT	(1U << 0)
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -396,6 +403,26 @@ union bpf_attr {
 		__u64		flags;
 	};
 
+	struct { /* struct used by BPF_MAP_*_BATCH commands */
+		__aligned_u64	start_key;	/* input: storing start key,
+						 * if NULL, starting from the beginning.
+						 */
+		__aligned_u64	next_start_key;	/* output: storing next batch start_key,
+						 * if NULL, no next key.
+						 */
+		__aligned_u64	keys;		/* input/output: key buffer */
+		__aligned_u64	values;		/* input/output: value buffer */
+		__u32		count;		/* input: # of keys/values in
+						 *   or fits in keys[]/values[].
+						 * output: how many successful
+						 *   lookup/lookup_and_delete
+						 *   update/delete operations.
+						 */
+		__u32		map_fd;
+		__u64		elem_flags;	/* BPF_MAP_*_ELEM flags */
+		__u64		flags;		/* flags for batch operation */
+	} batch;
+
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
 		__u32		prog_type;	/* one of enum bpf_prog_type */
 		__u32		insn_cnt;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 06308f0206e5..8746b55405f9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -33,6 +33,17 @@
 
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
+#define BPF_MAP_BATCH_SWAP_KEYS(key1, key2, buf1, buf2)	\
+	do {						\
+		if (key1 == (buf1)) {			\
+			key1 = buf2;			\
+			key2 = buf1;			\
+		} else {				\
+			key1 = buf1;			\
+			key2 = buf2;			\
+		}					\
+	} while (0)					\
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -1183,6 +1194,431 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
 	return err;
 }
 
+static void __map_batch_get_attrs(const union bpf_attr *attr,
+				  void __user **skey, void __user **nskey,
+				  void __user **keys, void __user **values,
+				  u32 *max_count, u64 *elem_flags, u64 *flags)
+{
+	*skey = u64_to_user_ptr(attr->batch.start_key);
+	*nskey = u64_to_user_ptr(attr->batch.next_start_key);
+	*keys = u64_to_user_ptr(attr->batch.keys);
+	*values = u64_to_user_ptr(attr->batch.values);
+	*max_count = attr->batch.count;
+	*elem_flags = attr->batch.elem_flags;
+	*flags = attr->batch.flags;
+}
+
+static int
+__map_lookup_delete_batch_key_in_keys(struct bpf_map *map, void *key, void *value,
+				      u32 max_count, u32 key_size, u32 value_size,
+				      u64 elem_flags, void __user *keys,
+				      void __user *values,
+				      union bpf_attr __user *uattr,
+				      bool ignore_enoent)
+{
+	u32 count, missed = 0;
+	int ret = 0, err;
+
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_copy_value(map, key, value, elem_flags);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+			continue;
+		}
+
+
+		if (copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_delete_elem(map, key);
+		if (ret) {
+			if (ret != -ENOENT || !ignore_enoent)
+				break;
+
+			missed++;
+		}
+	}
+
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		ret = err ? : ret;
+	}
+
+	return ret;
+}
+
+static int map_lookup_and_delete_batch(struct bpf_map *map,
+				       const union bpf_attr *attr,
+				       union bpf_attr __user *uattr,
+				       bool do_delete)
+{
+	u32 max_count, count = 0, key_size, roundup_key_size, value_size;
+	bool ignore_enoent, nextkey_is_null, copied;
+	void *buf = NULL, *key, *value, *next_key;
+	void __user *skey, *nskey, *keys, *values;
+	u64 elem_flags, flags, zero = 0;
+	int err, ret = 0;
+
+	if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+	    map->map_type == BPF_MAP_TYPE_STACK)
+		return -ENOTSUPP;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags & ~BPF_F_LOCK || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: loop/delete max_count elements in keys[]/values[].
+	 * max_count != 0 && !skey && nskey: loop/delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: loop/delete max_count elements starting from skey.
+	 */
+	if (skey && !nskey)
+		return -EINVAL;
+
+	/* allocate space for two keys and one value. */
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size * 2 + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+	value = buf + roundup_key_size * 2;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	if (!skey && !nskey) {
+		/* handle cases where keys in keys[] */
+		ret = __map_lookup_delete_batch_key_in_keys(map, key, value, max_count,
+							    key_size, value_size,
+							    elem_flags, keys, values,
+							    uattr, ignore_enoent);
+		goto out;
+	}
+
+	/* Get the first key. */
+	if (!skey) {
+		ret = bpf_map_get_next_key(map, NULL, key);
+		if (ret) {
+			nextkey_is_null = true;
+			goto after_loop;
+		}
+	} else if (copy_from_user(key, skey, key_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Copy the first key/value pair */
+	ret = bpf_map_copy_value(map, key, value, elem_flags);
+	if (ret) {
+		if (skey)
+			goto out;
+
+		nextkey_is_null = true;
+		goto after_loop;
+	}
+
+	if (copy_to_user(keys, key, key_size) ||
+	    copy_to_user(values, value, value_size)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We will always try to get next_key first
+	 * and then delete the current key.
+	 */
+	ret = bpf_map_get_next_key(map, key, next_key);
+	count = 0;
+	nextkey_is_null = false;
+	copied = true;
+
+again:
+	/* delete key */
+	if (do_delete) {
+		err = bpf_map_delete_elem(map, key);
+		if (err && (err != -ENOENT || !ignore_enoent)) {
+			/* failed to delete "key".
+			 * lookup_delete will considered failed.
+			 */
+			ret = err;
+			goto after_loop;
+		}
+	}
+
+	/* deletion in lookup_and_delete is successful. */
+	/* check bpf_map_get_next_key() result. */
+	count += !!copied;
+	nextkey_is_null = ret && ret == -ENOENT;
+	if (ret || count >= max_count)
+		goto after_loop;
+
+	/* copy value corresponding to next_key */
+	ret = bpf_map_copy_value(map, next_key, value, elem_flags);
+	copied = true;
+	if (ret) {
+		copied = false;
+		if (ret != -ENOENT || !ignore_enoent)
+			goto after_loop;
+	} else {
+		if (copy_to_user(keys + count * key_size, next_key, key_size) ||
+		    copy_to_user(values + count * value_size, value, value_size)) {
+			ret = -EFAULT;
+			goto after_loop;
+		}
+	}
+
+	BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+	ret = bpf_map_get_next_key(map, key, next_key);
+	goto again;
+
+after_loop:
+	if (!ret) {
+		if (put_user(count, &uattr->batch.count) ||
+		    copy_to_user(nskey, next_key, key_size))
+			ret = -EFAULT;
+	} else if (nextkey_is_null) {
+		ret = 0;
+		if (put_user(count, &uattr->batch.count) ||
+                    copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+			ret = -EFAULT;
+	} else if (ret != -EFAULT) {
+		if (put_user(count, &uattr->batch.count))
+			ret = -EFAULT;
+	}
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+static int map_update_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    struct fd *f)
+{
+	u32 max_count, count, missed, key_size, roundup_key_size, value_size;
+	void __user *skey, *nskey, *keys, *values;
+	void *buf = NULL, *key, *value;
+	u64 elem_flags, flags;
+	bool ignore_enoent;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	if (!max_count)
+		return 0;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count != 0 && !skey && !nskey: update max_count elements in keys[]/values[].
+	 */
+	if (nskey || skey)
+		return -EINVAL;
+
+	if ((elem_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
+		return -EINVAL;
+
+	key_size = map->key_size;
+	roundup_key_size = round_up(key_size, 8);
+	value_size = bpf_map_value_size(map);
+	buf = kmalloc(roundup_key_size + value_size, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	value = buf + roundup_key_size;
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+
+	missed = 0;
+	for (count = 0; count < max_count; count++) {
+		if (copy_from_user(key, keys + count * key_size, key_size) ||
+		    copy_from_user(value, values + count * value_size, value_size)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = bpf_map_update_elem(map, key, value, f, elem_flags);
+		if (ret && (ret != -ENOENT || !ignore_enoent))
+			break;
+
+		missed += !!ret;
+		ret = 0;
+	}
+	count -= missed;
+	if ((!ret && missed) || (ret && ret != -EFAULT)) {
+		err = put_user(count, &uattr->batch.count);
+		// Reset the error code if the above put_user failed.
+		ret = err ? : ret;
+	}
+
+	kfree(buf);
+	return ret;
+}
+
+static int map_delete_batch(struct bpf_map *map,
+			    const union bpf_attr *attr,
+			    union bpf_attr __user *uattr)
+{
+	u32 max_count, count, key_size, roundup_key_size;
+	void __user *skey, *nskey, *keys, *values;
+	bool ignore_enoent, nextkey_is_null;
+	void *buf = NULL, *key, *next_key;
+	u64 elem_flags, flags, zero = 0;
+	int ret, err;
+
+	__map_batch_get_attrs(attr, &skey, &nskey, &keys, &values, &max_count,
+			      &elem_flags, &flags);
+
+	if (elem_flags || flags & ~BPF_F_ENFORCE_ENOENT)
+		return -EINVAL;
+
+	/* The following max_count/skey/nskey combinations are supported:
+	 * max_count == 0 && !skey && !nskey: delete all elements in the map
+	 * max_count == 0 && skey && !nskey: delete all elements starting from skey.
+	 * max_count != 0 && !skey && nskey: delete max_count elements starting from map start.
+	 * max_count != 0 && skey && nskey: delete max_count elements starting from skey.
+	 * max_count != 0 && !skey && !nskey: delete max_count elements in keys[].
+	 */
+	if ((max_count == 0 && nskey) || (max_count != 0 && skey && !nskey))
+		return -EINVAL;
+
+	ignore_enoent = !(flags & BPF_F_ENFORCE_ENOENT);
+	key_size = map->key_size;
+	roundup_key_size = round_up(map->key_size, 8);
+	buf = kmalloc(roundup_key_size * 2, GFP_USER | __GFP_NOWARN);
+	if (!buf)
+		return -ENOMEM;
+
+	key = buf;
+	next_key = buf + roundup_key_size;
+
+	nextkey_is_null = false;
+
+	if (max_count == 0 || nskey) {
+		count = 0;
+		if (skey) {
+			if (copy_from_user(key, skey, key_size)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		} else {
+			ret = bpf_map_get_next_key(map, NULL, key);
+			if (ret) {
+				nextkey_is_null = true;
+				goto after_loop;
+			}
+		}
+
+		while (max_count == 0 || count < max_count) {
+			ret = bpf_map_get_next_key(map, key, next_key);
+
+			err = bpf_map_delete_elem(map, key);
+			if (err && (err != -ENOENT || !ignore_enoent)) {
+				ret = err;
+				break;
+			}
+
+			count += !err;
+
+			/* check bpf_map_get_next_key() result */
+			if (ret) {
+				nextkey_is_null = ret == -ENOENT;
+				break;
+			}
+
+			BPF_MAP_BATCH_SWAP_KEYS(key, next_key, buf, buf + roundup_key_size);
+		}
+
+after_loop:
+		if (!ret) {
+			if (copy_to_user(nskey, key, key_size))
+				ret = -EFAULT;
+		} else if (nextkey_is_null) {
+			ret = 0;
+			if (copy_to_user(&uattr->batch.next_start_key, &zero, sizeof(u64)))
+				ret = -EFAULT;
+		}
+	} else {
+		for (count = 0; count < max_count;) {
+			if (copy_from_user(key, keys + count * key_size, key_size)) {
+				ret = -EFAULT;
+				break;
+			}
+
+			ret = bpf_map_delete_elem(map, key);
+			if (ret && (ret != -ENOENT || !ignore_enoent))
+				break;
+
+			count += !ret;
+		}
+	}
+
+	if (ret != -EFAULT && put_user(count, &uattr->batch.count))
+		ret = -EFAULT;
+
+out:
+	kfree(buf);
+	return ret;
+}
+
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+			    union bpf_attr __user *uattr,
+			    int cmd)
+{
+	struct bpf_map *map;
+	int err, ufd;
+	struct fd f;
+
+	if (CHECK_ATTR(BPF_MAP_BATCH))
+		return -EINVAL;
+
+	ufd = attr->batch.map_fd;
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
+	if (cmd == BPF_MAP_LOOKUP_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, false);
+	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
+		err = map_lookup_and_delete_batch(map, attr, uattr, true);
+	else if (cmd == BPF_MAP_UPDATE_BATCH)
+		err = map_update_batch(map, attr, uattr, &f);
+	else /* BPF_MAP_DELETE_BATCH */
+		err = map_delete_batch(map, attr, uattr);
+
+err_put:
+	fdput(f);
+	return err;
+
+}
+
 #define BPF_MAP_FREEZE_LAST_FIELD map_fd
 
 static int map_freeze(const union bpf_attr *attr)
@@ -2939,6 +3375,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
 		err = map_lookup_and_delete_elem(&attr);
 		break;
+	case BPF_MAP_LOOKUP_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+		break;
+	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+		break;
+	case BPF_MAP_UPDATE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+		break;
+	case BPF_MAP_DELETE_BATCH:
+		err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 02/13] bpf: refactor map_update_elem()
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

Refactor function map_update_elem() by creating a
helper function bpf_map_update_elem() which will be
used later by batched map update operation.

Also reuse function bpf_map_value_size()
in map_update_elem().

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/syscall.c | 113 ++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 56 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 211e0bc667bd..3caa0ab3d30d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -878,6 +878,61 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
 		synchronize_rcu();
 }
 
+static int bpf_map_update_elem(struct bpf_map *map, void *key, void *value,
+			       struct fd *f, __u64 flags) {
+	int err;
+
+	/* Need to create a kthread, thus must support schedule */
+	if (bpf_map_is_dev_bound(map)) {
+		return bpf_map_offload_update_elem(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
+		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
+		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+		return map->ops->map_update_elem(map, key, value, flags);
+	}
+
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+	 * inside bpf map update or delete otherwise deadlocks are possible
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_update(map, key, value, flags);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_update(map, key, value,
+						       flags);
+	} else if (IS_FD_ARRAY(map)) {
+		rcu_read_lock();
+		err = bpf_fd_array_map_update_elem(map, f->file, key, value,
+						   flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+		rcu_read_lock();
+		err = bpf_fd_htab_map_update_elem(map, f->file, key, value,
+						  flags);
+		rcu_read_unlock();
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		/* rcu_read_lock() is not needed */
+		err = bpf_fd_reuseport_array_update_elem(map, key, value,
+							 flags);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_push_elem(map, value, flags);
+	} else {
+		rcu_read_lock();
+		err = map->ops->map_update_elem(map, key, value, flags);
+		rcu_read_unlock();
+	}
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+	maybe_wait_bpf_programs(map);
+
+	return err;
+}
+
 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
@@ -915,13 +970,7 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -932,56 +981,8 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
-	/* Need to create a kthread, thus must support schedule */
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
-		goto out;
-	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
-		   map->map_type == BPF_MAP_TYPE_SOCKHASH ||
-		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		goto out;
-	}
+	err = bpf_map_update_elem(map, key, value, &f, attr->flags);
 
-	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
-	 * inside bpf map update or delete otherwise deadlocks are possible
-	 */
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_update(map, key, value, attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_update(map, key, value,
-						       attr->flags);
-	} else if (IS_FD_ARRAY(map)) {
-		rcu_read_lock();
-		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
-						   attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		rcu_read_lock();
-		err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
-						  attr->flags);
-		rcu_read_unlock();
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		/* rcu_read_lock() is not needed */
-		err = bpf_fd_reuseport_array_update_elem(map, key, value,
-							 attr->flags);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_push_elem(map, value, attr->flags);
-	} else {
-		rcu_read_lock();
-		err = map->ops->map_update_elem(map, key, value, attr->flags);
-		rcu_read_unlock();
-	}
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-	maybe_wait_bpf_programs(map);
-out:
 free_value:
 	kfree(value);
 free_key:
-- 
2.17.1


^ permalink raw reply related

* [PATCH bpf-next 01/13] bpf: add bpf_map_value_size and bp_map_copy_value helper functions
From: Yonghong Song @ 2019-08-29  6:45 UTC (permalink / raw)
  To: bpf, netdev
  Cc: Alexei Starovoitov, Brian Vazquez, Daniel Borkmann, kernel-team,
	Yonghong Song
In-Reply-To: <20190829064502.2750303-1-yhs@fb.com>

From: Brian Vazquez <brianvv@google.com>

Move reusable code from map_lookup_elem to helper functions to avoid code
duplication in kernel/bpf/syscall.c

Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
---
 kernel/bpf/syscall.c | 134 +++++++++++++++++++++++--------------------
 1 file changed, 73 insertions(+), 61 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca60eafa6922..211e0bc667bd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -126,6 +126,76 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 	return map;
 }
 
+static u32 bpf_map_value_size(struct bpf_map *map)
+{
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+		return round_up(map->value_size, 8) * num_possible_cpus();
+	else if (IS_FD_MAP(map))
+		return sizeof(u32);
+	else
+		return  map->value_size;
+}
+
+static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
+			      __u64 flags)
+{
+	void *ptr;
+	int err;
+
+	if (bpf_map_is_dev_bound(map))
+		return  bpf_map_offload_lookup_elem(map, key, value);
+
+	preempt_disable();
+	this_cpu_inc(bpf_prog_active);
+	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+		err = bpf_percpu_hash_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+		err = bpf_percpu_array_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+		err = bpf_percpu_cgroup_storage_copy(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+		err = bpf_stackmap_copy(map, key, value);
+	} else if (IS_FD_ARRAY(map)) {
+		err = bpf_fd_array_map_lookup_elem(map, key, value);
+	} else if (IS_FD_HASH(map)) {
+		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
+		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+		   map->map_type == BPF_MAP_TYPE_STACK) {
+		err = map->ops->map_peek_elem(map, value);
+	} else {
+		rcu_read_lock();
+		if (map->ops->map_lookup_elem_sys_only)
+			ptr = map->ops->map_lookup_elem_sys_only(map, key);
+		else
+			ptr = map->ops->map_lookup_elem(map, key);
+		if (IS_ERR(ptr)) {
+			err = PTR_ERR(ptr);
+		} else if (!ptr) {
+			err = -ENOENT;
+		} else {
+			err = 0;
+			if (flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
+		}
+		rcu_read_unlock();
+	}
+	this_cpu_dec(bpf_prog_active);
+	preempt_enable();
+
+	return err;
+}
+
 void *bpf_map_area_alloc(size_t size, int numa_node)
 {
 	/* We really just want to fail instead of triggering OOM killer
@@ -739,7 +809,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	void __user *uvalue = u64_to_user_ptr(attr->value);
 	int ufd = attr->map_fd;
 	struct bpf_map *map;
-	void *key, *value, *ptr;
+	void *key, *value;
 	u32 value_size;
 	struct fd f;
 	int err;
@@ -771,72 +841,14 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	    map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
-		value_size = round_up(map->value_size, 8) * num_possible_cpus();
-	else if (IS_FD_MAP(map))
-		value_size = sizeof(u32);
-	else
-		value_size = map->value_size;
+	value_size = bpf_map_value_size(map);
 
 	err = -ENOMEM;
 	value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
 	if (!value)
 		goto free_key;
 
-	if (bpf_map_is_dev_bound(map)) {
-		err = bpf_map_offload_lookup_elem(map, key, value);
-		goto done;
-	}
-
-	preempt_disable();
-	this_cpu_inc(bpf_prog_active);
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
-		err = bpf_percpu_hash_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		err = bpf_percpu_array_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
-		err = bpf_percpu_cgroup_storage_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
-		err = bpf_stackmap_copy(map, key, value);
-	} else if (IS_FD_ARRAY(map)) {
-		err = bpf_fd_array_map_lookup_elem(map, key, value);
-	} else if (IS_FD_HASH(map)) {
-		err = bpf_fd_htab_map_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
-		err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
-		   map->map_type == BPF_MAP_TYPE_STACK) {
-		err = map->ops->map_peek_elem(map, value);
-	} else {
-		rcu_read_lock();
-		if (map->ops->map_lookup_elem_sys_only)
-			ptr = map->ops->map_lookup_elem_sys_only(map, key);
-		else
-			ptr = map->ops->map_lookup_elem(map, key);
-		if (IS_ERR(ptr)) {
-			err = PTR_ERR(ptr);
-		} else if (!ptr) {
-			err = -ENOENT;
-		} else {
-			err = 0;
-			if (attr->flags & BPF_F_LOCK)
-				/* lock 'ptr' and copy everything but lock */
-				copy_map_value_locked(map, value, ptr, true);
-			else
-				copy_map_value(map, value, ptr);
-			/* mask lock, since value wasn't zero inited */
-			check_and_init_map_lock(map, value);
-		}
-		rcu_read_unlock();
-	}
-	this_cpu_dec(bpf_prog_active);
-	preempt_enable();
-
-done:
+	err = bpf_map_copy_value(map, key, value, attr->flags);
 	if (err)
 		goto free_value;
 
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 3/3] Enable ptp_kvm for arm64
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

Currently in arm64 virtualization environment, there is no mechanism to
keep time sync between guest and host. Time in guest will drift compared
with host after boot up as they may both use third party time sources
to correct their time respectively. The time deviation will be in order
of milliseconds but some scenarios ask for higher time precision, like
in cloud envirenment, we want all the VMs running in the host aquire the
same level accuracy from host clock.

Use of kvm ptp clock, which choose the host clock source clock as a
reference clock to sync time clock between guest and host has been adopted
by x86 which makes the time sync order from milliseconds to nanoseconds.

This patch enable kvm ptp on arm64 and we get the similar clock drift as
found with x86 with kvm ptp.

Test result comparison between with kvm ptp and without it in arm64 are
as follows. This test derived from the result of command 'chronyc
sources'. we should take more cure of the last sample column which shows
the offset between the local clock and the source at the last measurement.

no kvm ptp in guest:
MS Name/IP address   Stratum Poll Reach LastRx Last sample
========================================================================
^* dns1.synet.edu.cn      2   6   377    13  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    21  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    29  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    37  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    45  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    53  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    61  +1040us[+1581us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377     4   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    12   -130us[ +796us] +/-   21ms
^* dns1.synet.edu.cn      2   6   377    20   -130us[ +796us] +/-   21ms

in host:
MS Name/IP address   Stratum Poll Reach LastRx Last sample
========================================================================
^* 120.25.115.20          2   7   377    72   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377    92   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377   112   -470us[ -603us] +/-   18ms
^* 120.25.115.20          2   7   377     2   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    22   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    43   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    63   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377    83   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377   103   +872ns[-6808ns] +/-   17ms
^* 120.25.115.20          2   7   377   123   +872ns[-6808ns] +/-   17ms

The dns1.synet.edu.cn is the network reference clock for guest and
120.25.115.20 is the network reference clock for host. we can't get the
clock error between guest and host directly, but a roughly estimated value
will be in order of hundreds of us to ms.

with kvm ptp in guest:
chrony has been disabled in host to remove the disturb by network clock.

MS Name/IP address         Stratum Poll Reach LastRx Last sample
========================================================================
* PHC0                    0   3   377     8     -7ns[   +1ns] +/-    3ns
* PHC0                    0   3   377     8     +1ns[  +16ns] +/-    3ns
* PHC0                    0   3   377     6     -4ns[   -0ns] +/-    6ns
* PHC0                    0   3   377     6     -8ns[  -12ns] +/-    5ns
* PHC0                    0   3   377     5     +2ns[   +4ns] +/-    4ns
* PHC0                    0   3   377    13     +2ns[   +4ns] +/-    4ns
* PHC0                    0   3   377    12     -4ns[   -6ns] +/-    4ns
* PHC0                    0   3   377    11     -8ns[  -11ns] +/-    6ns
* PHC0                    0   3   377    10    -14ns[  -20ns] +/-    4ns
* PHC0                    0   3   377     8     +4ns[   +5ns] +/-    4ns

The PHC0 is the ptp clock which choose the host clock as its source
clock. So we can be sure to say that the clock error between host and guest
is in order of ns.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 arch/arm64/include/asm/arch_timer.h  |  3 ++
 arch/arm64/kvm/arch_ptp_kvm.c        | 76 ++++++++++++++++++++++++++++
 drivers/clocksource/arm_arch_timer.c |  6 ++-
 drivers/ptp/Kconfig                  |  2 +-
 include/linux/arm-smccc.h            | 14 +++++
 virt/kvm/arm/psci.c                  | 17 +++++++
 6 files changed, 115 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kvm/arch_ptp_kvm.c

diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h
index 6756178c27db..880576a814b6 100644
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -229,4 +229,7 @@ static inline int arch_timer_arch_init(void)
 	return 0;
 }
 
+extern struct clocksource clocksource_counter;
+extern u64 arch_counter_read(struct clocksource *cs);
+
 #endif
diff --git a/arch/arm64/kvm/arch_ptp_kvm.c b/arch/arm64/kvm/arch_ptp_kvm.c
new file mode 100644
index 000000000000..6b2165ebce62
--- /dev/null
+++ b/arch/arm64/kvm/arch_ptp_kvm.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <asm/hypervisor.h>
+#include <linux/module.h>
+#include <linux/psci.h>
+#include <linux/arm-smccc.h>
+#include <linux/timecounter.h>
+#include <linux/sched/clock.h>
+#include <asm/arch_timer.h>
+
+/*
+ * as trap call cause delay, this function will return the delay in nanosecond
+ */
+static u64 arm_smccc_1_1_invoke_delay(u32 id, struct arm_smccc_res *res)
+{
+	u64 ns, t1, t2;
+
+	t1 = sched_clock();
+	arm_smccc_1_1_invoke(id, res);
+	t2 = sched_clock();
+	t2 -= t1;
+	ns = t2;
+	return ns;
+}
+
+int kvm_arch_ptp_init(void)
+{
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+	u64 ns;
+	struct arm_smccc_res hvc_res;
+
+	if (!kvm_arm_hyp_service_available(
+			ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID)) {
+		return -EOPNOTSUPP;
+	}
+	ns = arm_smccc_1_1_invoke_delay(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+					&hvc_res);
+	ts->tv_sec = hvc_res.a0;
+	ts->tv_nsec = hvc_res.a1;
+	timespec64_add_ns(ts, ns);
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock_fn(long *cycle, struct timespec64 *ts,
+			      struct clocksource **cs)
+{
+	u64 ns;
+	struct arm_smccc_res hvc_res;
+
+	if (!kvm_arm_hyp_service_available(
+			ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID)) {
+		return -EOPNOTSUPP;
+	}
+	ns = arm_smccc_1_1_invoke_delay(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
+					&hvc_res);
+	ts->tv_sec = hvc_res.a0;
+	ts->tv_nsec = hvc_res.a1;
+	timespec64_add_ns(ts, ns);
+	*cycle = hvc_res.a2;
+	*cs = &clocksource_counter;
+
+	return 0;
+}
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 07e57a49d1e8..021e3f69364c 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -175,23 +175,25 @@ static notrace u64 arch_counter_get_cntvct(void)
 u64 (*arch_timer_read_counter)(void) = arch_counter_get_cntvct;
 EXPORT_SYMBOL_GPL(arch_timer_read_counter);
 
-static u64 arch_counter_read(struct clocksource *cs)
+u64 arch_counter_read(struct clocksource *cs)
 {
 	return arch_timer_read_counter();
 }
+EXPORT_SYMBOL(arch_counter_read);
 
 static u64 arch_counter_read_cc(const struct cyclecounter *cc)
 {
 	return arch_timer_read_counter();
 }
 
-static struct clocksource clocksource_counter = {
+struct clocksource clocksource_counter = {
 	.name	= "arch_sys_counter",
 	.rating	= 400,
 	.read	= arch_counter_read,
 	.mask	= CLOCKSOURCE_MASK(56),
 	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 };
+EXPORT_SYMBOL(clocksource_counter);
 
 static struct cyclecounter cyclecounter __ro_after_init = {
 	.read	= arch_counter_read_cc,
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 9b8fee5178e8..e032fafdafa7 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -110,7 +110,7 @@ config PTP_1588_CLOCK_PCH
 config PTP_1588_CLOCK_KVM
 	tristate "KVM virtual PTP clock"
 	depends on PTP_1588_CLOCK
-	depends on KVM_GUEST && X86
+	depends on KVM_GUEST && X86 || ARM64
 	default y
 	help
 	  This driver adds support for using kvm infrastructure as a PTP
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index a6e4d3e3d10a..2a222a1a8594 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -94,6 +94,7 @@
 
 /* KVM "vendor specific" services */
 #define ARM_SMCCC_KVM_FUNC_FEATURES		0
+#define ARM_SMCCC_KVM_PTP			1
 #define ARM_SMCCC_KVM_FUNC_FEATURES_2		127
 #define ARM_SMCCC_KVM_NUM_FUNCS			128
 
@@ -102,6 +103,16 @@
 			   ARM_SMCCC_SMC_32,				\
 			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
 			   ARM_SMCCC_KVM_FUNC_FEATURES)
+/*
+ * This ID used for virtual ptp kvm clock and it will pass second value
+ * and nanosecond value of host real time and system counter by vcpu
+ * register to guest.
+ */
+#define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_32,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   ARM_SMCCC_KVM_PTP)
 
 #ifndef __ASSEMBLY__
 
@@ -373,5 +384,8 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		method;							\
 	})
 
+#include <linux/psci.h>
+#include <linux/clocksource.h>
+
 #endif /*__ASSEMBLY__*/
 #endif /*__LINUX_ARM_SMCCC_H*/
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 0debf49bf259..7fffdb25d32c 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -392,6 +392,8 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	u32 func_id = smccc_get_function(vcpu);
 	u32 val[4] = {};
 	u32 option;
+	struct timespec *ts;
+	u64 cnt;
 
 	val[0] = SMCCC_RET_NOT_SUPPORTED;
 
@@ -431,6 +433,21 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 	case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
 		val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES);
 		break;
+	/*
+	 * This will used for virtual ptp kvm clock. three
+	 * values will be passed back.
+	 * reg0 stores seconds of host real time;
+	 * reg1 stores nanoseconds of host real time;
+	 * reg2 stotes system counter cycle value.
+	 */
+	case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
+		getnstimeofday(ts);
+		cnt = arch_timer_read_counter();
+		val[0] = ts->tv_sec;
+		val[1] = ts->tv_nsec;
+		val[2] = cnt;
+		val[3] = 0;
+		break;
 	default:
 		return kvm_psci_call(vcpu);
 	}
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 2/3] reorganize ptp_kvm modules to make it arch-independent.
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

Currently, ptp_kvm modules implementation is only for x86 which includs
large part of arch-specific code.  This patch move all of those code
into related arch directory.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 arch/x86/kvm/arch_ptp_kvm.c          | 92 ++++++++++++++++++++++++++++
 drivers/ptp/Makefile                 |  1 +
 drivers/ptp/{ptp_kvm.c => kvm_ptp.c} | 77 ++++++-----------------
 include/asm-generic/ptp_kvm.h        | 12 ++++
 4 files changed, 123 insertions(+), 59 deletions(-)
 create mode 100644 arch/x86/kvm/arch_ptp_kvm.c
 rename drivers/ptp/{ptp_kvm.c => kvm_ptp.c} (63%)
 create mode 100644 include/asm-generic/ptp_kvm.h

diff --git a/arch/x86/kvm/arch_ptp_kvm.c b/arch/x86/kvm/arch_ptp_kvm.c
new file mode 100644
index 000000000000..56ea84a86da2
--- /dev/null
+++ b/arch/x86/kvm/arch_ptp_kvm.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <linux/module.h>
+#include <uapi/asm/kvm_para.h>
+#include <uapi/linux/kvm_para.h>
+#include <linux/ptp_clock_kernel.h>
+
+phys_addr_t clock_pair_gpa;
+struct kvm_clock_pairing clock_pair;
+struct pvclock_vsyscall_time_info *hv_clock;
+
+int kvm_arch_ptp_init(void)
+{
+	int ret;
+
+	if (!kvm_para_available())
+		return -ENODEV;
+
+	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+	hv_clock = pvclock_get_pvti_cpu0_va();
+	if (!hv_clock)
+		return -ENODEV;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
+		return -ENODEV;
+
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock(struct timespec64 *ts)
+{
+	long ret;
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+			     clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret != 0)
+		return -EOPNOTSUPP;
+
+	ts->tv_sec = clock_pair.sec;
+	ts->tv_nsec = clock_pair.nsec;
+
+	return 0;
+}
+
+int kvm_arch_ptp_get_clock_fn(long *cycle, struct timespec64 *tspec,
+			      struct clocksource **cs)
+{
+	unsigned long ret;
+	unsigned int version;
+	int cpu;
+	struct pvclock_vcpu_time_info *src;
+
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+
+	do {
+		/*
+		 * We are using a TSC value read in the hosts
+		 * kvm_hc_clock_pairing handling.
+		 * So any changes to tsc_to_system_mul
+		 * and tsc_shift or any other pvclock
+		 * data invalidate that measurement.
+		 */
+		version = pvclock_read_begin(src);
+
+		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+				     clock_pair_gpa,
+				     KVM_CLOCK_PAIRING_WALLCLOCK);
+		tspec->tv_sec = clock_pair.sec;
+		tspec->tv_nsec = clock_pair.nsec;
+		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+	} while (pvclock_read_retry(src, version));
+
+	*cs = &kvm_clock;
+
+	return 0;
+}
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 677d1d178a3e..5a8c6462fc0f 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -4,6 +4,7 @@
 #
 
 ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp_kvm-y				:= ../../arch/$(ARCH)/kvm/arch_ptp_kvm.o kvm_ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_DTE)	+= ptp_dte.o
 obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/kvm_ptp.c
similarity index 63%
rename from drivers/ptp/ptp_kvm.c
rename to drivers/ptp/kvm_ptp.c
index fc7d0b77e118..9d07cf872be7 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/kvm_ptp.c
@@ -8,12 +8,12 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <uapi/linux/kvm_para.h>
 #include <asm/kvm_para.h>
-#include <asm/pvclock.h>
-#include <asm/kvmclock.h>
 #include <uapi/asm/kvm_para.h>
+#include <asm-generic/ptp_kvm.h>
 
 #include <linux/ptp_clock_kernel.h>
 
@@ -24,56 +24,29 @@ struct kvm_ptp_clock {
 
 DEFINE_SPINLOCK(kvm_ptp_lock);
 
-static struct pvclock_vsyscall_time_info *hv_clock;
-
-static struct kvm_clock_pairing clock_pair;
-static phys_addr_t clock_pair_gpa;
-
 static int ptp_kvm_get_time_fn(ktime_t *device_time,
 			       struct system_counterval_t *system_counter,
 			       void *ctx)
 {
-	unsigned long ret;
+	unsigned long ret, cycle;
 	struct timespec64 tspec;
-	unsigned version;
-	int cpu;
-	struct pvclock_vcpu_time_info *src;
+	struct clocksource *cs;
 
 	spin_lock(&kvm_ptp_lock);
 
 	preempt_disable_notrace();
-	cpu = smp_processor_id();
-	src = &hv_clock[cpu].pvti;
-
-	do {
-		/*
-		 * We are using a TSC value read in the hosts
-		 * kvm_hc_clock_pairing handling.
-		 * So any changes to tsc_to_system_mul
-		 * and tsc_shift or any other pvclock
-		 * data invalidate that measurement.
-		 */
-		version = pvclock_read_begin(src);
-
-		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-				     clock_pair_gpa,
-				     KVM_CLOCK_PAIRING_WALLCLOCK);
-		if (ret != 0) {
-			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
-			spin_unlock(&kvm_ptp_lock);
-			preempt_enable_notrace();
-			return -EOPNOTSUPP;
-		}
-
-		tspec.tv_sec = clock_pair.sec;
-		tspec.tv_nsec = clock_pair.nsec;
-		ret = __pvclock_read_cycles(src, clock_pair.tsc);
-	} while (pvclock_read_retry(src, version));
+	ret = kvm_arch_ptp_get_clock_fn(&cycle, &tspec, &cs);
+	if (ret != 0) {
+		pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+		spin_unlock(&kvm_ptp_lock);
+		preempt_enable_notrace();
+		return -EOPNOTSUPP;
+	}
 
 	preempt_enable_notrace();
 
-	system_counter->cycles = ret;
-	system_counter->cs = &kvm_clock;
+	system_counter->cycles = cycle;
+	system_counter->cs = cs;
 
 	*device_time = timespec64_to_ktime(tspec);
 
@@ -116,17 +89,13 @@ static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
 
 	spin_lock(&kvm_ptp_lock);
 
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
-			     clock_pair_gpa,
-			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	ret = kvm_arch_ptp_get_clock(&tspec);
 	if (ret != 0) {
 		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
 		spin_unlock(&kvm_ptp_lock);
 		return -EOPNOTSUPP;
 	}
 
-	tspec.tv_sec = clock_pair.sec;
-	tspec.tv_nsec = clock_pair.nsec;
 	spin_unlock(&kvm_ptp_lock);
 
 	memcpy(ts, &tspec, sizeof(struct timespec64));
@@ -166,21 +135,11 @@ static void __exit ptp_kvm_exit(void)
 
 static int __init ptp_kvm_init(void)
 {
-	long ret;
-
-	if (!kvm_para_available())
-		return -ENODEV;
-
-	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	hv_clock = pvclock_get_pvti_cpu0_va();
-
-	if (!hv_clock)
-		return -ENODEV;
+	int ret;
 
-	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
-			KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
-		return -ENODEV;
+	ret = kvm_arch_ptp_init();
+	if (IS_ERR(ret))
+		return ret;
 
 	kvm_ptp_clock.caps = ptp_kvm_caps;
 
diff --git a/include/asm-generic/ptp_kvm.h b/include/asm-generic/ptp_kvm.h
new file mode 100644
index 000000000000..128a9d7af161
--- /dev/null
+++ b/include/asm-generic/ptp_kvm.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ *  Virtual PTP 1588 clock for use with KVM guests
+ *
+ *  Copyright (C) 2019 ARM Ltd.
+ *  All Rights Reserved
+ */
+
+static int kvm_arch_ptp_init(void);
+static int kvm_arch_ptp_get_clock(struct timespec64 *ts);
+static int kvm_arch_ptp_get_clock_fn(long *cycle,
+		struct timespec64 *tspec, void *cs);
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 1/3] Export psci_ops.conduit symbol as modules will use it.
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu
In-Reply-To: <20190829063952.18470-1-jianyong.wu@arm.com>

If arm_smccc_1_1_invoke used in modules, psci_ops.conduit should
be export.

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
---
 drivers/firmware/psci/psci.c | 6 ++++++
 include/linux/arm-smccc.h    | 2 +-
 include/linux/psci.h         | 1 +
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index f82ccd39a913..35c4eaab1451 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -212,6 +212,12 @@ static unsigned long psci_migrate_info_up_cpu(void)
 			      0, 0, 0);
 }
 
+enum psci_conduit psci_get_conduit(void)
+{
+	return psci_ops.conduit;
+}
+EXPORT_SYMBOL(psci_get_conduit);
+
 static void set_conduit(enum psci_conduit conduit)
 {
 	switch (conduit) {
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 552cbd49abe8..a6e4d3e3d10a 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -357,7 +357,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
  * The return value also provides the conduit that was used.
  */
 #define arm_smccc_1_1_invoke(...) ({					\
-		int method = psci_ops.conduit;				\
+		int method = psci_get_conduit();			\
 		switch (method) {					\
 		case PSCI_CONDUIT_HVC:					\
 			arm_smccc_1_1_hvc(__VA_ARGS__);			\
diff --git a/include/linux/psci.h b/include/linux/psci.h
index a8a15613c157..e5cedc986049 100644
--- a/include/linux/psci.h
+++ b/include/linux/psci.h
@@ -42,6 +42,7 @@ struct psci_operations {
 	enum smccc_version smccc_version;
 };
 
+extern enum psci_conduit psci_get_conduit(void);
 extern struct psci_operations psci_ops;
 
 #if defined(CONFIG_ARM_PSCI_FW)
-- 
2.17.1


^ permalink raw reply related

* [RFC PATCH 0/3] arm64: enable virtual kvm ptp for arm64
From: Jianyong Wu @ 2019-08-29  6:39 UTC (permalink / raw)
  To: netdev, pbonzini, sean.j.christopherson, maz, richardcochran,
	Mark.Rutland, Will.Deacon, suzuki.poulose
  Cc: linux-kernel, Steve.Capper, Kaly.Xin, justin.he, jianyong.wu

kvm ptp targets to provide high precision time sync between guest
and host in virtualization environment. This patch enable kvm ptp
for arm64.

This patch set base on [1][2][3]

[1]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=125ea89e4a21e2fc5235410f966a996a1a7148bf
[2]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=464f5a1741e5959c3e4d2be1966ae0093b4dce06
[3]https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/
commit/?h=kvm/hvc&id=6597490e005d0eeca8ed8c1c1d7b4318ee014681

Jianyong Wu (3):
  Export psci_ops.conduit symbol as modules will use it.
  reorganize ptp_kvm modules to make it arch-independent.
  Enable ptp_kvm for arm64

 arch/arm64/include/asm/arch_timer.h  |  3 +
 arch/arm64/kvm/arch_ptp_kvm.c        | 76 +++++++++++++++++++++++
 arch/x86/kvm/arch_ptp_kvm.c          | 92 ++++++++++++++++++++++++++++
 drivers/clocksource/arm_arch_timer.c |  6 +-
 drivers/firmware/psci/psci.c         |  6 ++
 drivers/ptp/Kconfig                  |  2 +-
 drivers/ptp/Makefile                 |  1 +
 drivers/ptp/{ptp_kvm.c => kvm_ptp.c} | 77 ++++++-----------------
 include/asm-generic/ptp_kvm.h        | 12 ++++
 include/linux/arm-smccc.h            | 16 ++++-
 include/linux/psci.h                 |  1 +
 virt/kvm/arm/psci.c                  | 17 +++++
 12 files changed, 246 insertions(+), 63 deletions(-)
 create mode 100644 arch/arm64/kvm/arch_ptp_kvm.c
 create mode 100644 arch/x86/kvm/arch_ptp_kvm.c
 rename drivers/ptp/{ptp_kvm.c => kvm_ptp.c} (63%)
 create mode 100644 include/asm-generic/ptp_kvm.h

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH net] netdevsim: Restore per-network namespace accounting for fib entries
From: Jiri Pirko @ 2019-08-29  6:28 UTC (permalink / raw)
  To: David Ahern; +Cc: David Ahern, davem, netdev
In-Reply-To: <2c561928-1052-4c33-848d-ed7b81e920cf@gmail.com>

Wed, Aug 28, 2019 at 11:26:03PM CEST, dsahern@gmail.com wrote:
>On 8/28/19 4:37 AM, Jiri Pirko wrote:
>> Tue, Aug 06, 2019 at 09:15:17PM CEST, dsahern@kernel.org wrote:
>>> From: David Ahern <dsahern@gmail.com>
>>>
>>> Prior to the commit in the fixes tag, the resource controller in netdevsim
>>> tracked fib entries and rules per network namespace. Restore that behavior.
>> 
>> David, please help me understand. If the counters are per-device, not
>> per-netns, they are both the same. If we have device (devlink instance)
>> is in a netns and take only things happening in this netns into account,
>> it should count exactly the same amount of fib entries, doesn't it?
>
>if you are only changing where the counters are stored - net_generic vs
>devlink private - then yes, they should be equivalent.

Okay.

>
>> 
>> I re-thinked the devlink netns patchset and currently I'm going in
>> slightly different direction. I'm having netns as an attribute of
>> devlink reload. So all the port netdevices and everything gets
>> re-instantiated into new netns. Works fine with mlxsw. There we also
>> re-register the fib notifier.
>> 
>> I think that this can work for your usecase in netdevsim too:
>> 1) devlink instance is registering a fib notifier to track all fib
>>    entries in a namespace it belongs to. The counters are per-device -
>>    counting fib entries in a namespace the device is in.
>> 2) another devlink instance can do the same tracking in the same
>>    namespace. No problem, it's a separate counter, but the numbers are
>>    the same. One can set different limits to different devlink
>>    instances, but you can have only one. That is the bahaviour you have
>>    now.
>> 3) on devlink reload, netdevsim re-instantiates ports and re-registers
>>    fib notifier
>> 4) on devlink reload with netns change, all should be fine as the
>>    re-registered fib nofitier replays the entries. The ports are
>>    re-instatiated in new netns.
>> 
>> This way, we would get consistent behaviour between netdevsim and real
>> devices (mlxsw), correct devlink-netns implementation (you also
>> suggested to move ports to the namespace). Everyone should be happy.
>> 
>> What do you think?
>> 
>
>Right now, registering the fib notifier walks all namespaces. That is
>not a scalable solution. Are you changing that to replay only a given
>netns? Are you changing the notifiers to be per-namespace?

Eventually, that seems like good idea. Currently I want to do
if (net==nsim_dev->mynet)
	done
check at the beginning of the notifier.


>
>Also, you are still allowing devlink instances to be created within a
>namespace?

Yes, netdevsim is planned to be created directly in namespace.

^ permalink raw reply

* Re: [PATCH v2 bpf-next 3/3] perf: implement CAP_TRACING
From: Song Liu @ 2019-08-29  6:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, David S . Miller, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <20190829051253.1927291-3-ast@kernel.org>



> On Aug 28, 2019, at 10:12 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 
> Implement permissions as stated in uapi/linux/capability.h
> 
> Similar to CAP_BPF it's highly unlikely that s/CAP_SYS_ADMIN/CAP_TRACING/
> replacement will cause user breakage.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Song Liu <songliubraving@fb.com>

^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Song Liu @ 2019-08-29  6:04 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, davem@davemloft.net, peterz@infradead.org,
	rostedt@goodmis.org, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <20190829051253.1927291-2-ast@kernel.org>



> On Aug 28, 2019, at 10:12 PM, Alexei Starovoitov <ast@kernel.org> wrote:
> 

[...]

> diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
> index 44e2d640b088..91a7f25512ca 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -805,10 +805,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
> 	}
> }
> 
> +struct libcap {
> +	struct __user_cap_header_struct hdr;
> +	struct __user_cap_data_struct data[2];
> +};
> +

I am confused by struct libcap. Why do we need it? 

> static int set_admin(bool admin)
> {
> 	cap_t caps;
> -	const cap_value_t cap_val = CAP_SYS_ADMIN;
> +	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
> +	 * and CAP_TRACING to create stackmap
> +	 */
> +	const cap_value_t cap_net_admin = CAP_NET_ADMIN;
> +	const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
> +	struct libcap *cap;
> 	int ret = -1;
> 
> 	caps = cap_get_proc();
> @@ -816,11 +826,26 @@ static int set_admin(bool admin)
> 		perror("cap_get_proc");
> 		return -1;
> 	}
> -	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
> +	cap = (struct libcap *)caps;
> +	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
> +		perror("cap_set_flag clear admin");
> +		goto out;
> +	}
> +	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
> 				admin ? CAP_SET : CAP_CLEAR)) {
> -		perror("cap_set_flag");
> +		perror("cap_set_flag set_or_clear net");
> 		goto out;
> 	}
> +	/* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
> +	 * so update effective bits manually
> +	 */
> +	if (admin) {
> +		cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
> +		cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
> +	} else {
> +		cap->data[1].effective &= ~(1 << (38 - 32));
> +		cap->data[1].effective &= ~(1 << (39 - 32));
> +	}

And why we do not need cap->data[0]?

Thanks,
Song


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox