Netdev List
 help / color / mirror / Atom feed
* linux-next: build failure after merge of the net-next tree
From: Stephen Rothwell @ 2017-12-22  0:45 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Ido Schimmel,
	Eric Dumazet

Hi all,

After merging the net-next tree, today's linux-next build (arm
multi_v7_defconfig) failed like this:

net/ipv6/route.c: In function 'inet6_rtm_getroute':
net/ipv6/route.c:4324:25: error: 'struct dst_entry' has no member named 'from'
  if (fibmatch && rt->dst.from) {
                         ^
In file included from include/linux/uio.h:12:0,
                 from include/linux/socket.h:8,
                 from net/ipv6/route.c:34:
net/ipv6/route.c:4325:46: error: 'struct dst_entry' has no member named 'from'
   struct rt6_info *ort = container_of(rt->dst.from,
                                              ^
include/linux/kernel.h:929:26: note: in definition of macro 'container_of'
  void *__mptr = (void *)(ptr);     \
                          ^
In file included from include/linux/kernel.h:10:0,
                 from include/linux/uio.h:12,
                 from include/linux/socket.h:8,
                 from net/ipv6/route.c:34:
net/ipv6/route.c:4325:46: error: 'struct dst_entry' has no member named 'from'
   struct rt6_info *ort = container_of(rt->dst.from,
                                              ^
include/linux/compiler.h:301:19: note: in definition of macro '__compiletime_assert'
   bool __cond = !(condition);    \
                   ^
include/linux/compiler.h:324:2: note: in expansion of macro '_compiletime_assert'
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^
include/linux/build_bug.h:47:37: note: in expansion of macro 'compiletime_assert'
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                     ^
include/linux/kernel.h:930:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
  ^
include/linux/kernel.h:930:20: note: in expansion of macro '__same_type'
  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
                    ^
net/ipv6/route.c:4325:26: note: in expansion of macro 'container_of'
   struct rt6_info *ort = container_of(rt->dst.from,
                          ^
net/ipv6/route.c:4325:46: error: 'struct dst_entry' has no member named 'from'
   struct rt6_info *ort = container_of(rt->dst.from,
                                              ^
include/linux/compiler.h:301:19: note: in definition of macro '__compiletime_assert'
   bool __cond = !(condition);    \
                   ^
include/linux/compiler.h:324:2: note: in expansion of macro '_compiletime_assert'
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^
include/linux/build_bug.h:47:37: note: in expansion of macro 'compiletime_assert'
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
                                     ^
include/linux/kernel.h:930:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
  BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
  ^
include/linux/kernel.h:931:6: note: in expansion of macro '__same_type'
     !__same_type(*(ptr), void),   \
      ^
net/ipv6/route.c:4325:26: note: in expansion of macro 'container_of'
   struct rt6_info *ort = container_of(rt->dst.from,
                          ^

Caused by commit

  3a2232e92e87 ("ipv6: Move dst->from into struct rt6_info")

interacting with commit

  58acfd714e6b ("ipv6: Honor specified parameters in fibmatch lookup")

from the net tree.

I have added the following merge fix patch for today (I am guessing
a bit here):

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 22 Dec 2017 11:25:13 +1100
Subject: [PATCH] ipv6: fix up for "ipv6: Move dst->from into struct rt6_info"

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 net/ipv6/route.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4efaac956f0c..2490280b3394 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4321,9 +4321,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		goto errout;
 	}
 
-	if (fibmatch && rt->dst.from) {
-		struct rt6_info *ort = container_of(rt->dst.from,
-						    struct rt6_info, dst);
+	if (fibmatch && rt->from) {
+		struct rt6_info *ort = rt->from;
 
 		dst_hold(&ort->dst);
 		ip6_rt_put(rt);
-- 
2.15.0

-- 
Cheers,
Stephen Rothwell

^ permalink raw reply related

* Re: [PATCH bpf-next] samples/bpf: adjust rlimit RLIMIT_MEMLOCK for sampleip
From: Daniel Borkmann @ 2017-12-22  0:24 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov; +Cc: netdev, bgregg
In-Reply-To: <20171221094905.4872-1-bhole_prashant_q7@lab.ntt.co.jp>

On 12/21/2017 10:49 AM, Prashant Bhole wrote:
> The default memlock rlimit is 64KB, which causes failure in
> creating a map
> 
> For example:
> test@test# ./sampleip
> failed to create a map: 1 Operation not permitted
> ERROR: loading BPF program (errno 1):
> Try: ulimit -l unlimited
> 
> Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> ---
>  samples/bpf/sampleip_user.c | 17 ++++++++++-------
>  1 file changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
> index 4ed690b907ff..f240a7db7c0a 100644
> --- a/samples/bpf/sampleip_user.c
> +++ b/samples/bpf/sampleip_user.c
> @@ -19,6 +19,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/bpf.h>
>  #include <sys/ioctl.h>
> +#include <sys/resource.h>
>  #include "libbpf.h"
>  #include "bpf_load.h"
>  #include "perf-sys.h"
> @@ -132,8 +133,9 @@ static void int_exit(int sig)
>  
>  int main(int argc, char **argv)
>  {
> -	char filename[256];
>  	int *pmu_fd, opt, freq = DEFAULT_FREQ, secs = DEFAULT_SECS;
> +	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
> +	char filename[256];
>  
>  	/* process arguments */
>  	while ((opt = getopt(argc, argv, "F:h")) != -1) {
> @@ -154,6 +156,11 @@ int main(int argc, char **argv)
>  		return 1;
>  	}
>  
> +	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
> +		perror("Failed to set memlock rlimit");
> +		return 1;
> +	}
> +
>  	/* initialize kernel symbol translation */
>  	if (load_kallsyms()) {
>  		fprintf(stderr, "ERROR: loading /proc/kallsyms\n");
> @@ -171,12 +178,8 @@ int main(int argc, char **argv)
>  	/* load BPF program */
>  	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
>  	if (load_bpf_file(filename)) {
> -		fprintf(stderr, "ERROR: loading BPF program (errno %d):\n",
> -			errno);
> -		if (strcmp(bpf_log_buf, "") == 0)
> -			fprintf(stderr, "Try: ulimit -l unlimited\n");

Given the author of that sample code clearly gave this as a hint to make
the decision up to the user to tweak ulimit, I don't think we should
then do it unconditionally in the sample program here. Therefore, I'm
not taking this, sorry.

> -		else
> -			fprintf(stderr, "%s", bpf_log_buf);
> +		fprintf(stderr, "ERROR: loading BPF program (errno %d): %s\n",
> +			errno, bpf_log_buf);
>  		return 1;
>  	}
>  	signal(SIGINT, int_exit);
> 

^ permalink raw reply

* Re: [PATCH bpf-next 0/11] bpf: more sock_ops callbacks
From: Lawrence Brakmo @ 2017-12-22  0:23 UTC (permalink / raw)
  To: Daniel Borkmann, netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Eric Dumazet,
	Neal Cardwell, Yuchung Cheng
In-Reply-To: <c77a7fb5-5bd2-7b2b-8fca-b2e6f9502499@iogearbox.net>

Daniel,

Damn, by mistake I copied the “consists of the following patches” from the previous bpf branch commit. I will send a corrected patch set in a few minutes.

Thanks,

- Lawrence

On 12/21/17, 4:03 PM, "Daniel Borkmann" <daniel@iogearbox.net> wrote:

    On 12/20/2017 10:16 PM, Lawrence Brakmo wrote:
    > This patchset adds support for:
    > 
    > - direct R or R/W access to many tcp_sock fields
    > - passing up to 4 arguments to sock_ops BPF functions
    > - tcp_sock field bpf_sock_ops_flags for controlling callbacks
    > - optionally calling sock_ops BPF program when RTO fires
    > - optionally calling sock_ops BPF program when packet is retransmitted
    > - optionally calling sock_ops BPF program when TCP state changes
    > - access to tclass and sk_txhash
    > - new selftest
    > 
    > Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
    > 
    > Consists of the following patches:
    > [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
    > [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
    > [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
    > [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
    > [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
    > [PATCH bpf 06/11] bpf: Add sock_ops RTO callback
    > [PATCH bpf 07/11] bpf: Add support for reading sk_state and more
    > [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
    > [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
    > [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
    > [PATCH bpf 11/11] bpf: add selftest for tcpbpf
    
    Hmm, looks like only ever [1] and [2] made it into patchwork for some
    reason and both under a different series. Something wrong with mailer
    config?
    
    Cheers,
    Daniel
    
      [1] https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_851690_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=pq_Mqvzfy-C8ltkgyx1u_g&m=Kg_lciwL9AOJWdB5GjpWeRoRn3Vx0n3O4ttPPITzmf0&s=bl0Hj1SWmDCUF9_ZkT6QI-kbMiTyUOh0xhoy0FIsS9A&e=
      [2] https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_851689_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=pq_Mqvzfy-C8ltkgyx1u_g&m=Kg_lciwL9AOJWdB5GjpWeRoRn3Vx0n3O4ttPPITzmf0&s=BitYJKyncTLIJ35HMAPqjXpU5gm4B4B5tDgk1KOLU6o&e=
    
      (First two in: https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_project_netdev_list_-3Fsubmitter-3D66772-26state-3D-2A&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=pq_Mqvzfy-C8ltkgyx1u_g&m=Kg_lciwL9AOJWdB5GjpWeRoRn3Vx0n3O4ttPPITzmf0&s=0BGtuzYNs3pIzBEnWZUpCVEyT0DcZqccyQwAk5H1SH8&e=)
    


^ permalink raw reply

* linux-next: manual merge of the net-next tree with the net tree
From: Stephen Rothwell @ 2017-12-22  0:11 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Jann Horn,
	Daniel Borkmann, Alexei Starovoitov

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  kernel/bpf/verifier.c

between commit:

  0c17d1d2c619 ("bpf: fix incorrect tracking of register size truncation")

from the net tree and commits:

  f4d7e40a5b71 ("bpf: introduce function calls (verification)")
  1ea47e01ad6e ("bpf: add support for bpf_call to interpreter")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc kernel/bpf/verifier.c
index 04b24876cd23,48b2901cf483..000000000000
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@@ -1072,29 -1425,54 +1430,77 @@@ static int check_ptr_alignment(struct b
  					   strict);
  }
  
 +/* truncate register to smaller size (in bytes)
 + * must be called with size < BPF_REG_SIZE
 + */
 +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
 +{
 +	u64 mask;
 +
 +	/* clear high bits in bit representation */
 +	reg->var_off = tnum_cast(reg->var_off, size);
 +
 +	/* fix arithmetic bounds */
 +	mask = ((u64)1 << (size * 8)) - 1;
 +	if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
 +		reg->umin_value &= mask;
 +		reg->umax_value &= mask;
 +	} else {
 +		reg->umin_value = 0;
 +		reg->umax_value = mask;
 +	}
 +	reg->smin_value = reg->umin_value;
 +	reg->smax_value = reg->umax_value;
 +}
 +
+ static int update_stack_depth(struct bpf_verifier_env *env,
+ 			      const struct bpf_func_state *func,
+ 			      int off)
+ {
+ 	u16 stack = env->subprog_stack_depth[func->subprogno], total = 0;
+ 	struct bpf_verifier_state *cur = env->cur_state;
+ 	int i;
+ 
+ 	if (stack >= -off)
+ 		return 0;
+ 
+ 	/* update known max for given subprogram */
+ 	env->subprog_stack_depth[func->subprogno] = -off;
+ 
+ 	/* compute the total for current call chain */
+ 	for (i = 0; i <= cur->curframe; i++) {
+ 		u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno];
+ 
+ 		/* round up to 32-bytes, since this is granularity
+ 		 * of interpreter stack sizes
+ 		 */
+ 		depth = round_up(depth, 32);
+ 		total += depth;
+ 	}
+ 
+ 	if (total > MAX_BPF_STACK) {
+ 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
+ 			cur->curframe, total);
+ 		return -EACCES;
+ 	}
+ 	return 0;
+ }
+ 
+ static int get_callee_stack_depth(struct bpf_verifier_env *env,
+ 				  const struct bpf_insn *insn, int idx)
+ {
+ 	int start = idx + insn->imm + 1, subprog;
+ 
+ 	subprog = find_subprog(env, start);
+ 	if (subprog < 0) {
+ 		WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
+ 			  start);
+ 		return -EFAULT;
+ 	}
+ 	subprog++;
+ 	return env->subprog_stack_depth[subprog];
+ }
+ 
  /* check whether memory at (regno + off) is accessible for t = (read | write)
   * if t==write, value_regno is a register which value is stored into memory
   * if t==read, value_regno is a register which will receive the value from memory
@@@ -1302,15 -1678,14 +1704,15 @@@ static int check_stack_boundary(struct 
  	}
  
  	/* Only allow fixed-offset stack reads */
- 	if (!tnum_is_const(regs[regno].var_off)) {
+ 	if (!tnum_is_const(reg->var_off)) {
  		char tn_buf[48];
  
- 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
+ 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
  		verbose(env, "invalid variable stack read R%d var_off=%s\n",
  			regno, tn_buf);
 +		return -EACCES;
  	}
- 	off = regs[regno].off + regs[regno].var_off.value;
+ 	off = reg->off + reg->var_off.value;
  	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
  	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
  		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
@@@ -2294,9 -2758,12 +2828,11 @@@ static int adjust_scalar_min_max_vals(s
  static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
  				   struct bpf_insn *insn)
  {
- 	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
+ 	struct bpf_verifier_state *vstate = env->cur_state;
+ 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
+ 	struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
  	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
  	u8 opcode = BPF_OP(insn->code);
 -	int rc;
  
  	dst_reg = &regs[insn->dst_reg];
  	src_reg = NULL;

^ permalink raw reply

* [Patch net-next] net_sched: remove the unsafe __skb_array_empty()
From: Cong Wang @ 2017-12-22  0:03 UTC (permalink / raw)
  To: netdev; +Cc: jakub.kicinski, Cong Wang, John Fastabend
In-Reply-To: <20171222000330.29009-1-xiyou.wangcong@gmail.com>

__skb_array_empty() is only safe if array is never resized.
pfifo_fast_dequeue() is called in TX BH context and without
qdisc lock, so even after we disable BH on ->reset() path
we can still race with other CPUs.

Fixes: c5ad119fb6c0 ("net: sched: pfifo_fast use skb_array")
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/sch_generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 00ddb5f8f430..9279258ce060 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -622,9 +622,6 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
 		struct skb_array *q = band2list(priv, band);
 
-		if (__skb_array_empty(q))
-			continue;
-
 		skb = skb_array_consume_bh(q);
 	}
 	if (likely(skb)) {
-- 
2.13.0

^ permalink raw reply related

* [Patch net-next] net_sched: call qdisc_reset() with qdisc lock
From: Cong Wang @ 2017-12-22  0:03 UTC (permalink / raw)
  To: netdev; +Cc: jakub.kicinski, Cong Wang, John Fastabend

qdisc_reset() should always be called with qdisc spinlock
and with BH disabled, otherwise qdisc ->reset() could race
with TX BH.

Fixes: 7bbde83b1860 ("net: sched: drop qdisc_reset from dev_graft_qdisc")
Reported-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/sched/sch_generic.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 10aaa3b615ce..00ddb5f8f430 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1097,8 +1097,11 @@ static void dev_qdisc_reset(struct net_device *dev,
 {
 	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
 
-	if (qdisc)
+	if (qdisc) {
+		spin_lock_bh(qdisc_lock(qdisc));
 		qdisc_reset(qdisc);
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
 }
 
 /**
-- 
2.13.0

^ permalink raw reply related

* Re: [PATCH bpf-next 0/11] bpf: more sock_ops callbacks
From: Daniel Borkmann @ 2017-12-22  0:03 UTC (permalink / raw)
  To: Lawrence Brakmo, netdev
  Cc: Kernel Team, Blake Matheny, Alexei Starovoitov, Eric Dumazet,
	Neal Cardwell, Yuchung Cheng
In-Reply-To: <20171220211644.3993770-1-brakmo@fb.com>

On 12/20/2017 10:16 PM, Lawrence Brakmo wrote:
> This patchset adds support for:
> 
> - direct R or R/W access to many tcp_sock fields
> - passing up to 4 arguments to sock_ops BPF functions
> - tcp_sock field bpf_sock_ops_flags for controlling callbacks
> - optionally calling sock_ops BPF program when RTO fires
> - optionally calling sock_ops BPF program when packet is retransmitted
> - optionally calling sock_ops BPF program when TCP state changes
> - access to tclass and sk_txhash
> - new selftest
> 
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> 
> Consists of the following patches:
> [PATCH bpf 01/11] bpf: Make SOCK_OPS_GET_TCP size independent
> [PATCH bpf 02/11] bpf: Make SOCK_OPS_GET_TCP struct independent
> [PATCH bpf 03/11] bpf: Add write access to tcp_sock and sock fields
> [PATCH bpf 04/11] bpf: Support passing args to sock_ops bpf function
> [PATCH bpf 05/11] bpf: Adds field bpf_sock_ops_flags to tcp_sock
> [PATCH bpf 06/11] bpf: Add sock_ops RTO callback
> [PATCH bpf 07/11] bpf: Add support for reading sk_state and more
> [PATCH bpf 08/11] bpf: Add sock_ops R/W access to tclass & sk_txhash
> [PATCH bpf 09/11] bpf: Add BPF_SOCK_OPS_RETRANS_CB
> [PATCH bpf 10/11] bpf: Add BPF_SOCK_OPS_STATE_CB
> [PATCH bpf 11/11] bpf: add selftest for tcpbpf

Hmm, looks like only ever [1] and [2] made it into patchwork for some
reason and both under a different series. Something wrong with mailer
config?

Cheers,
Daniel

  [1] https://patchwork.ozlabs.org/patch/851690/
  [2] https://patchwork.ozlabs.org/patch/851689/

  (First two in: https://patchwork.ozlabs.org/project/netdev/list/?submitter=66772&state=*)

^ permalink raw reply

* Re: [PATCH bpf] selftests/bpf: fix Makefile for passing LLC to the command line
From: Daniel Borkmann @ 2017-12-21 23:55 UTC (permalink / raw)
  To: Jakub Kicinski, netdev, alexei.starovoitov; +Cc: oss-drivers, Quentin Monnet
In-Reply-To: <20171221165250.21138-1-jakub.kicinski@netronome.com>

On 12/21/2017 05:52 PM, Jakub Kicinski wrote:
> From: Quentin Monnet <quentin.monnet@netronome.com>
> 
> Makefile has a LLC variable that is initialised to "llc", but can
> theoretically be overridden from the command line ("make LLC=llc-6.0").
> However, this fails because for LLVM probe check, "llc" is called
> directly. Use the $(LLC) variable instead to fix this.
> 
> Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
> Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Applied to bpf tree, thanks Jakub!

^ permalink raw reply

* [PATCH net-next] net: erspan: remove md NULL check
From: William Tu @ 2017-12-21 23:51 UTC (permalink / raw)
  To: netdev; +Cc: Haishuang Yan

The 'md' is allocated from 'tun_dst = ip_tun_rx_dst' and
since we've checked 'tun_dst', 'md' will never be NULL.
The patch removes it at both ipv4 and ipv6 erspan.

Fixes: afb4c97d90e6 ("ip6_gre: fix potential memory leak in ip6erspan_rcv")
Fixes: 50670b6ee9bc ("ip_gre: fix potential memory leak in erspan_rcv")
Cc: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Signed-off-by: William Tu <u9012063@gmail.com>
---
 net/ipv4/ip_gre.c  | 5 -----
 net/ipv6/ip6_gre.c | 4 ----
 2 files changed, 9 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 90c912307814..47c7de3ca458 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -313,11 +313,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 				return PACKET_REJECT;
 
 			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
-			if (!md) {
-				dst_release((struct dst_entry *)tun_dst);
-				return PACKET_REJECT;
-			}
-
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 8451d00b210b..1aabc8df7cb7 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -550,10 +550,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
 
 			info = &tun_dst->u.tun_info;
 			md = ip_tunnel_info_opts(info);
-			if (!md) {
-				dst_release((struct dst_entry *)tun_dst);
-				return PACKET_REJECT;
-			}
 
 			memcpy(md, pkt_md, sizeof(*md));
 			md->version = ver;
-- 
2.7.4

^ permalink raw reply related

* Re: pull-request: bpf-next 2017-12-18
From: Daniel Borkmann @ 2017-12-21 23:48 UTC (permalink / raw)
  To: David Miller, alexei.starovoitov; +Cc: ast, netdev
In-Reply-To: <20171221.112819.2190571738067203400.davem@davemloft.net>

On 12/21/2017 05:28 PM, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Wed, 20 Dec 2017 16:16:44 -0500 (EST)
> 
>> I think I understand how this new stuff works, I'll take a stab at
>> doing the sparc64 JIT bits.
> 
> This patch should do it, please queue up for bpf-next.
> 
> But this is really overkill on sparc64.
> 
> No matter where you relocate the call destination to, the size of the
> program and the code output will be identical except for the call
> instruction PC relative offset field.
> 
> So at some point as a follow-up I should change this code to simply
> scan the insns for the function calls and fixup the offsets, rather
> than do a full set of code generation passes.
> 
> Thanks.
> 
> ====================
> bpf: sparc64: Add JIT support for multi-function programs.
> 
> Modelled strongly upon the arm64 implementation.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
> index a2f1b5e..4ee417f 100644
> --- a/arch/sparc/net/bpf_jit_comp_64.c
> +++ b/arch/sparc/net/bpf_jit_comp_64.c
> @@ -1507,11 +1507,19 @@ static void jit_fill_hole(void *area, unsigned int size)
>  		*ptr++ = 0x91d02005; /* ta 5 */
>  }
>  
> +struct sparc64_jit_data {
> +	struct bpf_binary_header *header;
> +	u8 *image;
> +	struct jit_ctx ctx;
> +};
> +
>  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>  {
>  	struct bpf_prog *tmp, *orig_prog = prog;
> +	struct sparc64_jit_data *jit_data;
>  	struct bpf_binary_header *header;
>  	bool tmp_blinded = false;
> +	bool extra_pass = false;
>  	struct jit_ctx ctx;
>  	u32 image_size;
>  	u8 *image_ptr;
> @@ -1531,13 +1539,30 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>  		prog = tmp;
>  	}
>  
> +	jit_data = prog->aux->jit_data;
> +	if (!jit_data) {
> +		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
> +		if (!jit_data) {
> +			prog = orig_prog;
> +			goto out;
> +		}

Looks good, one thing: If I spot this correctly, isn't there a ...

		prog->aux->jit_data = jit_data;

... missing? Otherwise the context from the initial pass is neither
saved for the extra pass nor freed.

> +	}
> +	if (jit_data->ctx.offset) {
> +		ctx = jit_data->ctx;
> +		image_ptr = jit_data->image;
> +		header = jit_data->header;
> +		extra_pass = true;
> +		image_size = sizeof(u32) * ctx.idx;
> +		goto skip_init_ctx;
> +	}
> +
>  	memset(&ctx, 0, sizeof(ctx));
>  	ctx.prog = prog;
>  
>  	ctx.offset = kcalloc(prog->len, sizeof(unsigned int), GFP_KERNEL);
>  	if (ctx.offset == NULL) {
>  		prog = orig_prog;
> -		goto out;
> +		goto out_off;
>  	}
>  
>  	/* Fake pass to detect features used, and get an accurate assessment
> @@ -1560,7 +1585,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>  	}
>  
>  	ctx.image = (u32 *)image_ptr;
> -
> +skip_init_ctx:
>  	for (pass = 1; pass < 3; pass++) {
>  		ctx.idx = 0;
>  
> @@ -1591,14 +1616,24 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>  
>  	bpf_flush_icache(header, (u8 *)header + (header->pages * PAGE_SIZE));
>  
> -	bpf_jit_binary_lock_ro(header);
> +	if (!prog->is_func || extra_pass) {
> +		bpf_jit_binary_lock_ro(header);
> +	} else {
> +		jit_data->ctx = ctx;
> +		jit_data->image = image_ptr;
> +		jit_data->header = header;
> +	}
>  
>  	prog->bpf_func = (void *)ctx.image;
>  	prog->jited = 1;
>  	prog->jited_len = image_size;
>  
> +	if (!prog->is_func || extra_pass) {
>  out_off:
> -	kfree(ctx.offset);
> +		kfree(ctx.offset);
> +		kfree(jit_data);
> +		prog->aux->jit_data = NULL;
> +	}
>  out:
>  	if (tmp_blinded)
>  		bpf_jit_prog_release_other(prog, prog == orig_prog ?
> 

^ permalink raw reply

* [PATCH net-next v2] enic: add wq clean up budget
From: Govindarajulu Varadarajan @ 2017-12-21 16:12 UTC (permalink / raw)
  To: davem, netdev; +Cc: govindarajulu90, benve, Govindarajulu Varadarajan

In case of tx clean up, we set '-1' as budget. This means clean up until
wq is empty or till (1 << 32) pkts are cleaned. Under heavy load this
will run for a long time and cause
"watchdog: BUG: soft lockup - CPU#25 stuck for 21s!" warning.

This patch sets wq clean up budget to 256.

Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
---
v2: resubmit: previous discussion: https://patchwork.ozlabs.org/patch/845011/

 drivers/net/ethernet/cisco/enic/enic.h      | 2 ++
 drivers/net/ethernet/cisco/enic/enic_main.c | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 6a9527004cb1..9b218f0e5a4c 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -43,6 +43,8 @@
 #define ENIC_CQ_MAX		(ENIC_WQ_MAX + ENIC_RQ_MAX)
 #define ENIC_INTR_MAX		(ENIC_CQ_MAX + 2)
 
+#define ENIC_WQ_NAPI_BUDGET	256
+
 #define ENIC_AIC_LARGE_PKT_DIFF	3
 
 struct enic_msix_entry {
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index d98676e43e03..f202ba72a811 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1500,7 +1500,7 @@ static int enic_poll(struct napi_struct *napi, int budget)
 	unsigned int cq_wq = enic_cq_wq(enic, 0);
 	unsigned int intr = enic_legacy_io_intr();
 	unsigned int rq_work_to_do = budget;
-	unsigned int wq_work_to_do = -1; /* no limit */
+	unsigned int wq_work_to_do = ENIC_WQ_NAPI_BUDGET;
 	unsigned int  work_done, rq_work_done = 0, wq_work_done;
 	int err;
 
@@ -1598,7 +1598,7 @@ static int enic_poll_msix_wq(struct napi_struct *napi, int budget)
 	struct vnic_wq *wq = &enic->wq[wq_index];
 	unsigned int cq;
 	unsigned int intr;
-	unsigned int wq_work_to_do = -1; /* clean all desc possible */
+	unsigned int wq_work_to_do = ENIC_WQ_NAPI_BUDGET;
 	unsigned int wq_work_done;
 	unsigned int wq_irq;
 
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH net 1/2] dt-bindings: net: mediatek: add condition to property mediatek,pctl
From: Rob Herring @ 2017-12-21 22:55 UTC (permalink / raw)
  To: sean.wang-NuS5LvNUpcJWk0Htik3J/w
  Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q, mark.rutland-5wv7dgnIgG8,
	matthias.bgg-Re5JQEeQqe8AvxtiuMwx3w, john-Pj+rj9U5foFAfugRpC6u6w,
	nbd-p3rKhJxN3npAfugRpC6u6w, nelson.chang-NuS5LvNUpcJWk0Htik3J/w,
	devicetree-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mediatek-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <e366efc29985d3292c8a1afb1389b5eac57c9037.1513762066.git.sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>

On Wed, Dec 20, 2017 at 05:47:05PM +0800, sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org wrote:
> From: Sean Wang <sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>
> 
> The property "mediatek,pctl" is only required for SoCs such as MT2701 and
> MT7623, so adding a few words for stating the condition.
> 
> Signed-off-by: Sean Wang <sean.wang-NuS5LvNUpcJWk0Htik3J/w@public.gmane.org>
> ---
>  Documentation/devicetree/bindings/net/mediatek-net.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Rob Herring <robh-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH ipsec-next] xfrm: update the stats documentation
From: Shannon Nelson @ 2017-12-21 22:26 UTC (permalink / raw)
  To: steffen.klassert; +Cc: netdev

Add a couple of stats that aren't in the documentation file
and rework the top description to be a little more readable.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 Documentation/networking/xfrm_proc.txt | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/xfrm_proc.txt b/Documentation/networking/xfrm_proc.txt
index d0d8baf..2eae619 100644
--- a/Documentation/networking/xfrm_proc.txt
+++ b/Documentation/networking/xfrm_proc.txt
@@ -5,13 +5,15 @@ Masahide NAKAMURA <nakam@linux-ipv6.org>
 
 Transformation Statistics
 -------------------------
-xfrm_proc is a statistics shown factor dropped by transformation
-for developer.
-It is a counter designed from current transformation source code
-and defined like linux private MIB.
 
-Inbound statistics
-~~~~~~~~~~~~~~~~~~
+The xfrm_proc code is a set of statistics showing numbers of packets
+dropped by the transformation code and why.  These counters are defined
+as part of the linux private MIB.  These counters can be viewed in
+/proc/net/xfrm_stat.
+
+
+Inbound errors
+~~~~~~~~~~~~~~
 XfrmInError:
 	All errors which is not matched others
 XfrmInBufferError:
@@ -46,6 +48,10 @@ XfrmInPolBlock:
 	Policy discards
 XfrmInPolError:
 	Policy error
+XfrmAcquireError:
+	State hasn't been fully acquired before use
+XfrmFwdHdrError:
+	Forward routing of a packet is not allowed
 
 Outbound errors
 ~~~~~~~~~~~~~~~
@@ -72,3 +78,5 @@ XfrmOutPolDead:
 	Policy is dead
 XfrmOutPolError:
 	Policy error
+XfrmOutStateInvalid:
+	State is invalid, perhaps expired
-- 
2.7.4

^ permalink raw reply related

* [PATCH net] rtnetlink: fix struct net reference leak
From: Craig Gallek @ 2017-12-21 22:18 UTC (permalink / raw)
  To: David Miller, Jiri Benc; +Cc: netdev, Nicolas Dichtel, Jason A . Donenfeld

From: Craig Gallek <kraig@google.com>

The below referenced commit extended the RTM_GETLINK interface to
allow querying by netns id.  The netnsid property was previously
defined as a signed integer, but this patch assumes that the user
always passes a positive integer.  syzkaller discovered this problem
by setting a negative netnsid and then calling the get-link path
in a tight loop.  This surprisingly quickly overflows the reference
count on the associated struct net, potentially destroying it.  When the
default namespace is used, the machine crashes in strange and interesting
ways.

Unfortunately, this is not easy to reproduce with just the ip tool
as it enforces unsigned integer parsing despite the interface interpreting
the NETNSID attribute as signed.

I'm not sure why this attribute is signed in the first place, but
the first commit that introduced it (6621dd29eb9b) is in v4.15-rc4,
so I assume it's too late to change.

This patch removes the positive netns id assumption, but adds another
assumption that the netns id 0 is always the 'self' identifying id (for
which an additional struct net reference is not necessary).

Fixes: 79e1ad148c84 ("rtnetlink: use netnsid to query interface")
CC: Jiri Benc <jbenc@redhat.com>
CC: Nicolas Dichtel <nicolas.dichtel@6wind.com>
CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Craig Gallek <kraig@google.com>
---
 net/core/rtnetlink.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dabba2a91fc8..3de033b7e4b9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1451,7 +1451,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	ifm->ifi_flags = dev_get_flags(dev);
 	ifm->ifi_change = change;
 
-	if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+	if (tgt_netnsid && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
 		goto nla_put_failure;
 
 	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
@@ -1712,7 +1712,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct rtnl_link_ops *kind_ops = NULL;
 	unsigned int flags = NLM_F_MULTI;
 	int master_idx = 0;
-	int netnsid = -1;
+	int netnsid = 0;
 	int err;
 	int hdrlen;
 
@@ -1733,10 +1733,12 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 			ifla_policy, NULL) >= 0) {
 		if (tb[IFLA_IF_NETNSID]) {
 			netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-			tgt_net = get_target_net(skb, netnsid);
-			if (IS_ERR(tgt_net)) {
-				tgt_net = net;
-				netnsid = -1;
+			if (netnsid) {
+				tgt_net = get_target_net(skb, netnsid);
+				if (IS_ERR(tgt_net)) {
+					tgt_net = net;
+					netnsid = 0;
+				}
 			}
 		}
 
@@ -1786,7 +1788,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	cb->args[0] = h;
 	cb->seq = net->dev_base_seq;
 	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-	if (netnsid >= 0)
+	if (netnsid)
 		put_net(tgt_net);
 
 	return err;
@@ -2873,7 +2875,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct nlattr *tb[IFLA_MAX+1];
 	struct net_device *dev = NULL;
 	struct sk_buff *nskb;
-	int netnsid = -1;
+	int netnsid = 0;
 	int err;
 	u32 ext_filter_mask = 0;
 
@@ -2883,9 +2885,11 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
-		tgt_net = get_target_net(skb, netnsid);
-		if (IS_ERR(tgt_net))
-			return PTR_ERR(tgt_net);
+		if (netnsid) {
+			tgt_net = get_target_net(skb, netnsid);
+			if (IS_ERR(tgt_net))
+				return PTR_ERR(tgt_net);
+		}
 	}
 
 	if (tb[IFLA_IFNAME])
@@ -2923,7 +2927,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	} else
 		err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
 out:
-	if (netnsid >= 0)
+	if (netnsid)
 		put_net(tgt_net);
 
 	return err;
-- 
2.15.1.620.gb9897f4670-goog

^ permalink raw reply related

* Re: [PATCH net-next] tcp: md5: Handle RCU dereference of md5sig_info
From: Christoph Paasch @ 2017-12-21 22:13 UTC (permalink / raw)
  To: Mat Martineau; +Cc: netdev
In-Reply-To: <20171221182910.4785-2-mathew.j.martineau@linux.intel.com>

On Thu, Dec 21, 2017 at 10:29 AM, Mat Martineau
<mathew.j.martineau@linux.intel.com> wrote:
> Dereference tp->md5sig_info in tcp_v4_destroy_sock() the same way it is
> done in the adjacent call to tcp_clear_md5_list().
>
> Resolves this sparse warning:
>
> net/ipv4/tcp_ipv4.c:1914:17: warning: incorrect type in argument 1 (different address spaces)
> net/ipv4/tcp_ipv4.c:1914:17:    expected struct callback_head *head
> net/ipv4/tcp_ipv4.c:1914:17:    got struct callback_head [noderef] <asn:4>*<noident>
>
> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
> ---
>  net/ipv4/tcp_ipv4.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index dd945b114215..5d203248123e 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
>         /* Clean up the MD5 key list, if any */
>         if (tp->md5sig_info) {
>                 tcp_clear_md5_list(sk);
> -               kfree_rcu(tp->md5sig_info, rcu);
> +               kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);

Acked-by: Christoph Paasch <cpaasch@apple.com>

^ permalink raw reply

* Re: [patch iproute2 v2] tc: add -bs option for batch mode
From: David Ahern @ 2017-12-21 22:04 UTC (permalink / raw)
  To: Chris Mi, netdev; +Cc: gerlitz.or
In-Reply-To: <20171220092628.6673-1-chrism@mellanox.com>

On 12/20/17 2:26 AM, Chris Mi wrote:
> Currently in tc batch mode, only one command is read from the batch
> file and sent to kernel to process. With this patch, we can accumulate
> several commands before sending to kernel. The batch size is specified
> using option -bs or -batchsize.
> 
> To accumulate the commands in tc, we allocate an array of struct iovec.
> If batchsize is bigger than 1 and we haven't accumulated enough
> commands, rtnl_talk() will return without actually sending the message.
> One exception is that there is no more command in the batch file.
> 
> But please note that kernel still processes the requests one by one.
> To process the requests in parallel in kernel is another effort.
> The time we're saving in this patch is the user mode and kernel mode
> context switch. So this patch works on top of the current kernel.
> 
> Using the following script in kernel, we can generate 1,000,000 rules.
>         tools/testing/selftests/tc-testing/tdc_batch.py
> 
> Without this patch, 'tc -b $file' execution time is:
> 
> real    0m14.916s
> user    0m6.808s
> sys     0m8.046s
> 
> With this patch, 'tc -b $file -bs 10' execution time is:
> 
> real    0m12.286s
> user    0m5.903s
> sys     0m6.312s
> 
> The insertion rate is improved more than 10%.
> 
> Signed-off-by: Chris Mi <chrism@mellanox.com>
> ---
>  include/libnetlink.h |  6 ++++
>  include/utils.h      |  4 +++
>  lib/libnetlink.c     | 25 ++++++++++-----
>  lib/utils.c          | 20 ++++++++++++
>  man/man8/tc.8        |  5 +++
>  tc/m_action.c        | 62 +++++++++++++++++++++++++++---------
>  tc/tc.c              | 24 ++++++++++++--
>  tc/tc_common.h       |  3 ++
>  tc/tc_filter.c       | 88 ++++++++++++++++++++++++++++++++++++----------------
>  9 files changed, 186 insertions(+), 51 deletions(-)
> 
> diff --git a/include/libnetlink.h b/include/libnetlink.h
> index a4d83b9e..775f830b 100644
> --- a/include/libnetlink.h
> +++ b/include/libnetlink.h
> @@ -13,6 +13,8 @@
>  #include <linux/netconf.h>
>  #include <arpa/inet.h>
>  
> +#define MSG_IOV_MAX 256
> +
>  struct rtnl_handle {
>  	int			fd;
>  	struct sockaddr_nl	local;
> @@ -93,6 +95,10 @@ int rtnl_dump_filter_nc(struct rtnl_handle *rth,
>  			void *arg, __u16 nc_flags);
>  #define rtnl_dump_filter(rth, filter, arg) \
>  	rtnl_dump_filter_nc(rth, filter, arg, 0)
> +
> +extern int msg_iov_index;
> +extern int batch_size;
> +
>  int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
>  	      struct nlmsghdr **answer)
>  	__attribute__((warn_unused_result));
> diff --git a/include/utils.h b/include/utils.h
> index d3895d56..113a8c31 100644
> --- a/include/utils.h
> +++ b/include/utils.h
> @@ -235,6 +235,10 @@ void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n);
>  
>  extern int cmdlineno;
>  ssize_t getcmdline(char **line, size_t *len, FILE *in);
> +
> +extern int cmdlinetotal;
> +void setcmdlinetotal(const char *name);
> +
>  int makeargs(char *line, char *argv[], int maxargs);
>  int inet_get_addr(const char *src, __u32 *dst, struct in6_addr *dst6);
>  
> diff --git a/lib/libnetlink.c b/lib/libnetlink.c
> index 00e6ce0c..7ff32d64 100644
> --- a/lib/libnetlink.c
> +++ b/lib/libnetlink.c
> @@ -24,6 +24,7 @@
>  #include <sys/uio.h>
>  
>  #include "libnetlink.h"
> +#include "utils.h"
>  
>  #ifndef SOL_NETLINK
>  #define SOL_NETLINK 270
> @@ -581,6 +582,10 @@ static void rtnl_talk_error(struct nlmsghdr *h, struct nlmsgerr *err,
>  		strerror(-err->error));
>  }
>  
> +static struct iovec msg_iov[MSG_IOV_MAX];
> +int msg_iov_index;
> +int batch_size = 1;
> +
>  static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
>  		       struct nlmsghdr **answer,
>  		       bool show_rtnl_err, nl_ext_ack_fn_t errfn)
> @@ -589,23 +594,29 @@ static int __rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n,
>  	unsigned int seq;
>  	struct nlmsghdr *h;
>  	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
> -	struct iovec iov = {
> -		.iov_base = n,
> -		.iov_len = n->nlmsg_len
> -	};
> +	struct iovec *iov = &msg_iov[msg_iov_index];
> +	char *buf;
> +
> +	iov->iov_base = n;
> +	iov->iov_len = n->nlmsg_len;
> +
>  	struct msghdr msg = {
>  		.msg_name = &nladdr,
>  		.msg_namelen = sizeof(nladdr),
> -		.msg_iov = &iov,
> -		.msg_iovlen = 1,
> +		.msg_iov = msg_iov,
> +		.msg_iovlen = ++msg_iov_index,
>  	};
> -	char *buf;
>  
>  	n->nlmsg_seq = seq = ++rtnl->seq;
>  
>  	if (answer == NULL)
>  		n->nlmsg_flags |= NLM_F_ACK;
>  
> +	msg_iov_index %= batch_size;
> +	if (msg_iov_index != 0 && batch_size > 1 && cmdlineno != cmdlinetotal &&
> +	    (n->nlmsg_type == RTM_NEWTFILTER || n->nlmsg_type == RTM_NEWACTION))
> +		return 3;
> +
>  	status = sendmsg(rtnl->fd, &msg, 0);
>  	if (status < 0) {
>  		perror("Cannot talk to rtnetlink");

I have a fair idea of why you did it this way, but relying on global
variables and magic return codes is not a great solution.

Why not refactor rtnl_talk -- move the sendmsg piece to a helper that
takes an iov or a msg as input arg? Then have the tc code piece together
the iov and call rtnl_talk. If batch_size == 1 it calls rtnl_talk; if it
is > 1 it puts together the iov and hands it off.

^ permalink raw reply

* Re: RCU callback crashes
From: Jakub Kicinski @ 2017-12-21 21:45 UTC (permalink / raw)
  To: Cong Wang; +Cc: John Fastabend, Jiri Pirko, netdev@vger.kernel.org
In-Reply-To: <CAM_iQpV8+NknPgGbNDzF+=S8Px4rxO2=PMwV5BLDQJhX5CGmDQ@mail.gmail.com>

On Thu, 21 Dec 2017 13:31:01 -0800, Cong Wang wrote:
> >    629          if (likely(skb)) {
> >    630                  qdisc_qstats_cpu_backlog_dec(qdisc, skb);
> >    631                  qdisc_bstats_cpu_update(qdisc, skb);
> >    632                  qdisc_qstats_cpu_qlen_dec(qdisc);
> >    633          }
> >    634
> >    635          return skb;
> >    636  }  
> 
> Hi, Jakub
> 
> Could you test the attached patch? It looks like the __skb_array_empty()
> use is unsafe.

I don't have a reproducer, unfortunately, I haven't seen the splat
since :(  FWIW the kernel config was with all debug/checks disabled,
only KASAN on.

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2017-12-21 21:32 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


What's a holiday weekend without some networking bug fixes?

1) Fix some eBPF JIT bugs wrt. SKB pointers across helper function
   calls, from Daniel Borkmann.

2) Fix regression from errata limiting change to marvell PHY driver,
   from Zhao Qiang.

3) Fix u16 overflow in SCTP, from Xin Long.

4) Fix potential memory leak during bridge newlink, from Nikolay
   Aleksandrov.

5) Fix BPF selftest build on s390, from Hendrik Brueckner.

6) Don't append to cfg80211 automatically generated certs file,
   always write new ones from scratch.  From Thierry Reding.

7) Fix sleep in atomic in mac80211 hwsim, from Jia-Ju Bai.

8) Fix hang on tg3 MTU change with certain chips, from Brian King.

9) Add stall detection to arc emac driver and reset chip when this
   happens, from Alexander Kochetkov.

10) Fix MTU limiting in GRE tunnel drivers, from Xin Long.

11) Fix stmmac timestamping bug due to mis-shifting of field.
    From Fredrik Hallenberg.

12) Fix metrics match when deleting an ipv4 route.  The kernel sets
    some internal metrics bits which the user isn't going to set
    when it makes the delete request.  From Phil Sutter.

13) mvneta driver loop over RX queues limits on "txq_number" :-)
    Fix from Yelena Krivosheev.

14) Fix double free and memory corruption in get_net_ns_by_id, from
    Eric W. Biederman.

15) Flush ipv4 FIB tables in the reverse order.  Some tables can
    share their actual backing data, in particular this happens
    for the MAIN and LOCAL tables.  We have to kill the LOCAL
    table first, because it uses MAIN's backing memory.  Fix from
    Ido Schimmel.

16) Several eBPF verifier value tracking fixes, from Edward Cree,
    Jann Horn, and Alexei Starovoitov.

17) Make changes to ipv6 autoflowlabel sysctl really propagate to
    sockets, unless the socket has set the per-socket value
    explicitly.  From Shaohua Li.

18) Fix leaks and double callback invocations of zerocopy SKBs,
    from Willem de Bruijn.

Please pull, thanks a lot!

The following changes since commit f3b5ad89de16f5d42e8ad36fbdf85f705c1ae051:

  Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma (2017-12-16 13:43:08 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 

for you to fetch changes up to c50b7c473f609189da3bccd28ee5dcf3b55109cd:

  Merge branch 'net-zerocopy-fixes' (2017-12-21 15:00:59 -0500)

----------------------------------------------------------------
Adiel Aloni (1):
      mac80211_hwsim: enable TODS BIT in null data frame

Alexander Kochetkov (2):
      net: arc_emac: fix arc_emac_rx() error paths
      net: arc_emac: restart stalled EMAC

Alexei Starovoitov (3):
      Merge branch 'bpf-jit-fixes'
      bpf: fix integer overflows
      bpf: do not allow root to mangle valid pointers

Alexey Khoroshilov (1):
      net: phy: xgene: disable clk on error paths

Alexey Kodanev (1):
      vxlan: restore dev->mtu setting based on lower device

Brendan McGrath (1):
      ipv6: icmp6: Allow icmp messages to be looped back

Brian King (1):
      tg3: Fix rx hang on MTU change with 5717/5719

Daniel Borkmann (6):
      bpf, s390x: do not reload skb pointers in non-skb context
      bpf, ppc64: do not reload skb pointers in non-skb context
      bpf: guarantee r1 to be ctx in case of bpf_helper_changes_pkt_data
      bpf, sparc: fix usage of wrong reg for load_skb_regs after call
      bpf: add test case for ld_abs and helper changing pkt data
      Merge branch 'bpf-verifier-sec-fixes'

David Miller (1):
      bpf: Fix tools and testing build.

David S. Miller (7):
      Merge git://git.kernel.org/.../bpf/bpf
      Merge tag 'mac80211-for-davem-2017-12-19' of git://git.kernel.org/.../jberg/mac80211
      Merge branch 'mvneta-fixes'
      Merge branch 'cls_bpf-fix-offload-state-tracking-with-block-callbacks'
      Merge tag 'mlx5-fixes-2017-12-19' of git://git.kernel.org/.../saeed/linux
      Merge git://git.kernel.org/.../bpf/bpf
      Merge branch 'net-zerocopy-fixes'

Edward Cree (1):
      bpf/verifier: fix bounds calculation on BPF_RSH

Eran Ben Elisha (1):
      net/mlx5: Fix rate limit packet pacing naming and struct

Eric Garver (1):
      openvswitch: Fix pop_vlan action for double tagged frames

Eric W. Biederman (1):
      net: Fix double free and memory corruption in get_net_ns_by_id()

Eugenia Emantayev (2):
      net/mlx5e: Fix defaulting RX ring size when not needed
      net/mlx5: Fix misspelling in the error message and comment

Fredrik Hallenberg (2):
      net: stmmac: Fix TX timestamp calculation
      net: stmmac: Fix bad RX timestamp extraction

Gal Pressman (4):
      net/mlx5e: Fix features check of IPv6 traffic
      net/mlx5e: Fix possible deadlock of VXLAN lock
      net/mlx5e: Add refcount to VXLAN structure
      net/mlx5e: Prevent possible races in VXLAN control flow

Hemanth Puranik (1):
      net: qcom/emac: Change the order of mac up and sgmii open

Hendrik Brueckner (1):
      bpf: fix broken BPF selftest build on s390

Huy Nguyen (1):
      net/mlx5e: Fix ETS BW check

Ido Schimmel (2):
      ipv4: Fix use-after-free when flushing FIB tables
      ipv6: Honor specified parameters in fibmatch lookup

Jakub Kicinski (2):
      cls_bpf: fix offload assumptions after callback conversion
      nfp: bpf: keep track of the offloaded program

Jann Horn (7):
      bpf: fix incorrect sign extension in check_alu_op()
      bpf: fix incorrect tracking of register size truncation
      bpf: fix 32-bit ALU op verification
      bpf: fix missing error return in check_stack_boundary()
      bpf: force strict alignment checks for stack pointers
      bpf: don't prune branches when a scalar is replaced with a pointer
      selftests/bpf: add tests for recent bugfixes

Jia-Ju Bai (1):
      mac80211_hwsim: Fix a possible sleep-in-atomic bug in hwsim_get_radio_nl

Johannes Berg (2):
      nl80211: fix nl80211_send_iface() error paths
      cfg80211: ship certificates as hex files

Jon Maloy (4):
      tipc: fix lost member events bug
      tipc: remove leaving group member from all lists
      tipc: fix list sorting bug in function tipc_group_update_member()
      tipc: remove joining group member from congested list

Jonathan Corbet (1):
      nl80211: Remove obsolete kerneldoc line

Julian Wiedmann (1):
      s390/qeth: fix error handling in checksum cmd callback

Kamal Heib (1):
      net/mlx5: FPGA, return -EINVAL if size is zero

Maor Gottlieb (1):
      net/mlx5: Fix steering memory leak

Moni Shoua (1):
      net/mlx5: Fix error flow in CREATE_QP command

Moshe Shemesh (2):
      net/mlx5: Cleanup IRQs in case of unload failure
      net/mlx5: Stay in polling mode when command EQ destroy fails

Naresh Kamboju (1):
      selftests: net: Adding config fragment CONFIG_NUMA=y

Nikolay Aleksandrov (1):
      net: bridge: fix early call to br_stp_change_bridge_id and plug newlink leaks

Petr Machata (1):
      mlxsw: spectrum_router: Remove batch neighbour deletion causing FW bug

Phil Sutter (1):
      ipv4: fib: Fix metrics match when deleting a route

Russell King (1):
      net: phy: marvell: avoid pause mode on SGMII-to-Copper for 88e151x

Saeed Mahameed (1):
      Revert "mlx5: move affinity hints assignments to generic code"

Sean Wang (1):
      net: mediatek: setup proper state for disabled GMAC on the default

Shaohua Li (1):
      net: reevalulate autoflowlabel setting after sysctl setting

Song Liu (1):
      xdp: linearize skb in netif_receive_generic_xdp()

Thierry Reding (1):
      cfg80211: always rewrite generated files from scratch

Willem de Bruijn (2):
      skbuff: orphan frags before zerocopy clone
      skbuff: skb_copy_ubufs must release uarg even without user frags

Xin Long (6):
      sctp: fix the issue that a __u16 variable may overflow in sctp_ulpq_renege
      sctp: add SCTP_CID_RECONF conversion in sctp_cname
      vxlan: update skb dst pmtu on tx path
      ip_gre: remove the incorrect mtu limit for ipgre tap
      ip6_gre: remove the incorrect mtu limit for ipgre tap
      ip6_tunnel: get the min mtu properly in ip6_tnl_xmit

Yelena Krivosheev (3):
      net: mvneta: clear interface link status on port disable
      net: mvneta: use proper rxq_number in loop on rx queues
      net: mvneta: eliminate wrong call to handle rx descriptor error

Zhao Qiang (1):
      net: phy: marvell: Limit 88m1101 autoneg errata to 88E1145 as well.

 arch/powerpc/net/bpf_jit_comp64.c                     |   6 +-
 arch/s390/net/bpf_jit_comp.c                          |  11 +-
 arch/sparc/net/bpf_jit_comp_64.c                      |   6 +-
 drivers/net/ethernet/arc/emac.h                       |   2 +
 drivers/net/ethernet/arc/emac_main.c                  | 164 ++++++++++++++++++++++++----
 drivers/net/ethernet/broadcom/tg3.c                   |   4 +-
 drivers/net/ethernet/marvell/mvneta.c                 |   8 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c           |  11 +-
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c         |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h          |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c    |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c  |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c     |  63 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/eq.c          |  20 ++--
 drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c    |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c     |  16 ++-
 drivers/net/ethernet/mellanox/mlx5/core/health.c      |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/main.c        |  75 ++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/qp.c          |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/rl.c          |  22 ++--
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.c       |  64 ++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/vxlan.h       |   1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c |  15 +--
 drivers/net/ethernet/netronome/nfp/bpf/main.c         |  55 ++++++++--
 drivers/net/ethernet/netronome/nfp/bpf/main.h         |   8 ++
 drivers/net/ethernet/qualcomm/emac/emac.c             |   6 +-
 drivers/net/ethernet/stmicro/stmmac/common.h          |   2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c    |   5 +-
 drivers/net/ethernet/stmicro/stmmac/enh_desc.c        |   3 +-
 drivers/net/ethernet/stmicro/stmmac/norm_desc.c       |   2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c |   6 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c     |   2 +-
 drivers/net/phy/marvell.c                             |  14 ++-
 drivers/net/phy/mdio-xgene.c                          |  21 +++-
 drivers/net/vxlan.c                                   |  19 ++++
 drivers/net/wireless/mac80211_hwsim.c                 |   3 +-
 drivers/s390/net/qeth_core_main.c                     |   9 +-
 include/linux/bpf_verifier.h                          |   4 +-
 include/linux/ipv6.h                                  |   3 +-
 include/linux/mlx5/driver.h                           |   3 +-
 include/linux/mlx5/mlx5_ifc.h                         |   8 +-
 include/net/cfg80211.h                                |   1 -
 include/net/pkt_cls.h                                 |   5 +-
 kernel/bpf/verifier.c                                 | 283 +++++++++++++++++++++++++++--------------------
 lib/test_bpf.c                                        |  43 ++++++++
 net/bridge/br_netlink.c                               |  11 +-
 net/core/dev.c                                        |   2 +-
 net/core/net_namespace.c                              |   2 +-
 net/core/skbuff.c                                     |   7 +-
 net/ipv4/fib_frontend.c                               |   9 +-
 net/ipv4/fib_semantics.c                              |   8 +-
 net/ipv4/ip_gre.c                                     |   1 +
 net/ipv6/af_inet6.c                                   |   1 -
 net/ipv6/ip6_gre.c                                    |   1 +
 net/ipv6/ip6_output.c                                 |  12 +-
 net/ipv6/ip6_tunnel.c                                 |   9 +-
 net/ipv6/ipv6_sockglue.c                              |   1 +
 net/ipv6/route.c                                      |  20 ++--
 net/openvswitch/flow.c                                |  15 ++-
 net/sched/cls_bpf.c                                   |  93 +++++++---------
 net/sctp/debug.c                                      |   3 +
 net/sctp/ulpqueue.c                                   |  24 ++--
 net/tipc/group.c                                      |  16 +--
 net/wireless/Makefile                                 |  31 ++----
 net/wireless/certs/sforshee.hex                       |  86 +++++++++++++++
 net/wireless/certs/sforshee.x509                      | Bin 680 -> 0 bytes
 net/wireless/nl80211.c                                |   6 +-
 tools/arch/s390/include/uapi/asm/bpf_perf_event.h     |   2 +-
 tools/testing/selftests/bpf/Makefile                  |   2 +-
 tools/testing/selftests/bpf/test_progs.c              |   8 +-
 tools/testing/selftests/bpf/test_verifier.c           | 629 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 tools/testing/selftests/net/config                    |   1 +
 73 files changed, 1548 insertions(+), 492 deletions(-)
 create mode 100644 net/wireless/certs/sforshee.hex
 delete mode 100644 net/wireless/certs/sforshee.x509

^ permalink raw reply

* Re: RCU callback crashes
From: Cong Wang @ 2017-12-21 21:31 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: John Fastabend, Jiri Pirko, netdev@vger.kernel.org
In-Reply-To: <20171220164419.42c63ebf@cakuba.netronome.com>

[-- Attachment #1: Type: text/plain, Size: 2276 bytes --]

On Wed, Dec 20, 2017 at 4:50 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Wed, 20 Dec 2017 16:41:14 -0800, Jakub Kicinski wrote:
>> Just as I hit send... :)  but this looks unrelated, "Comm: sshd" -
>> so probably from the management interface.
>>
>> [  154.604041] ==================================================================
>> [  154.612245] BUG: KASAN: slab-out-of-bounds in pfifo_fast_dequeue+0x140/0x2d0
>> [  154.620219] Read of size 8 at addr ffff88086bb64040 by task sshd/983
>> [  154.627403]
>> [  154.629161] CPU: 10 PID: 983 Comm: sshd Not tainted 4.15.0-rc3-perf-00984-g82d3fc87a4aa-dirty #13
>> [  154.639190] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.3.4 11/08/2016
>> [  154.647665] Call Trace:
>> [  154.650494]  dump_stack+0xa6/0x118
>> [  154.654387]  ? _atomic_dec_and_lock+0xe8/0xe8
>> [  154.659355]  ? trace_event_raw_event_rcu_torture_read+0x190/0x190
>> [  154.666263]  ? rcu_segcblist_enqueue+0xe9/0x120
>> [  154.671422]  ? _raw_spin_unlock_bh+0x91/0xc0
>> [  154.676286]  ? pfifo_fast_dequeue+0x140/0x2d0
>> [  154.681251]  print_address_description+0x6a/0x270
>> [  154.686601]  ? pfifo_fast_dequeue+0x140/0x2d0
>> [  154.691565]  kasan_report+0x23f/0x350
>> [  154.695752]  pfifo_fast_dequeue+0x140/0x2d0
>
> If we trust stack decode it's:
>
>    615  static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
>    616  {
>    617          struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
>    618          struct sk_buff *skb = NULL;
>    619          int band;
>    620
>    621          for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
>    622                  struct skb_array *q = band2list(priv, band);
>    623
>>> 624                  if (__skb_array_empty(q))
>    625                          continue;
>    626
>    627                  skb = skb_array_consume_bh(q);
>    628          }
>    629          if (likely(skb)) {
>    630                  qdisc_qstats_cpu_backlog_dec(qdisc, skb);
>    631                  qdisc_bstats_cpu_update(qdisc, skb);
>    632                  qdisc_qstats_cpu_qlen_dec(qdisc);
>    633          }
>    634
>    635          return skb;
>    636  }

Hi, Jakub

Could you test the attached patch? It looks like the __skb_array_empty()
use is unsafe.

Thanks!

[-- Attachment #2: pfifo_fast_dequeue.diff --]
[-- Type: text/plain, Size: 463 bytes --]

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 10aaa3b615ce..8d47fb4aadb4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -621,10 +621,6 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 
 	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
 		struct skb_array *q = band2list(priv, band);
-
-		if (__skb_array_empty(q))
-			continue;
-
 		skb = skb_array_consume_bh(q);
 	}
 	if (likely(skb)) {

^ permalink raw reply related

* Re: [PATCH RFC 00/18] r8169: separate r8168 driver and add experimental phylib support
From: Heiner Kallweit @ 2017-12-21 21:27 UTC (permalink / raw)
  To: David Miller; +Cc: andrew, nic_swsd, hau, netdev
In-Reply-To: <20171221.161603.944790580293624594.davem@davemloft.net>

Am 21.12.2017 um 22:16 schrieb David Miller:
> From: Heiner Kallweit <hkallweit1@gmail.com>
> Date: Thu, 21 Dec 2017 21:38:11 +0100
> 
>> This experimental series separates drivers for PCI / PCIE NIC's and
>> adds initial phylib support to the separated r8168 driver.
> 
> Thanks for working on this.
> 
> The RX and TX ring handling is basically going to be identical
> for the two chips, so it's very undesirable to duplicate that
> code in the two drivers.
> 
Agree .. My approach would be:
- remove everything that's not needed from both drivers
- see what's still identical and factor it out into lib(s)

Currently the driver is one source code file with 8.700 LoC.
That's way too big anyway IMO and should be split.

> Getting good test coverage is going to be extremely challenging
> for this, so the more code you share between the two drivers
> rather than duplicate the better.
> 
I'm aware of this (seeing that basically every chip needs certain
quirks) and don't expect the patch set to be mainline-ready
very soon. I have access to one supported chip only, so I hope
others give it a try too.

^ permalink raw reply

* Re: [PATCH RFC 00/18] r8169: separate r8168 driver and add experimental phylib support
From: David Miller @ 2017-12-21 21:16 UTC (permalink / raw)
  To: hkallweit1; +Cc: andrew, nic_swsd, hau, netdev
In-Reply-To: <83321b2e-8402-26c5-9703-3fe795cc893d@gmail.com>

From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 21 Dec 2017 21:38:11 +0100

> This experimental series separates drivers for PCI / PCIE NIC's and
> adds initial phylib support to the separated r8168 driver.

Thanks for working on this.

The RX and TX ring handling is basically going to be identical
for the two chips, so it's very undesirable to duplicate that
code in the two drivers.

Getting good test coverage is going to be extremely challenging
for this, so the more code you share between the two drivers
rather than duplicate the better.

^ permalink raw reply

* [PATCH bpf-next v2 8/8] selftests/bpf: test device info reporting for bound progs
From: Jakub Kicinski @ 2017-12-21 21:01 UTC (permalink / raw)
  To: netdev, alexei.starovoitov, daniel; +Cc: ktkhai, oss-drivers, Jakub Kicinski
In-Reply-To: <20171221210120.30166-1-jakub.kicinski@netronome.com>

Check if bound programs report correct device info.  Test
in local namespace, in remote one, back to the local ns,
remove the device and check that information is cleared.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
--
v2:
 - check the error code from "prog show pin XX" with device
   removed is -ENODEV.
---
 tools/testing/selftests/bpf/test_offload.py | 112 +++++++++++++++++++++++++---
 1 file changed, 101 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
index c940505c2978..e3c750f17cb8 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -18,6 +18,8 @@ import argparse
 import json
 import os
 import pprint
+import random
+import string
 import subprocess
 import time
 
@@ -27,6 +29,7 @@ bpf_test_dir = os.path.dirname(os.path.realpath(__file__))
 pp = pprint.PrettyPrinter()
 devs = [] # devices we created for clean up
 files = [] # files to be removed
+netns = [] # net namespaces to be removed
 
 def log_get_sec(level=0):
     return "*" * (log_level + level)
@@ -128,22 +131,25 @@ files = [] # files to be removed
     if f in files:
         files.remove(f)
 
-def tool(name, args, flags, JSON=True, fail=True):
+def tool(name, args, flags, JSON=True, ns="", fail=True):
     params = ""
     if JSON:
         params += "%s " % (flags["json"])
 
-    ret, out = cmd(name + " " + params + args, fail=fail)
+    if ns != "":
+        ns = "ip netns exec %s " % (ns)
+
+    ret, out = cmd(ns + name + " " + params + args, fail=fail)
     if JSON and len(out.strip()) != 0:
         return ret, json.loads(out)
     else:
         return ret, out
 
-def bpftool(args, JSON=True, fail=True):
-    return tool("bpftool", args, {"json":"-p"}, JSON=JSON, fail=fail)
+def bpftool(args, JSON=True, ns="", fail=True):
+    return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail)
 
-def bpftool_prog_list(expected=None):
-    _, progs = bpftool("prog show", JSON=True, fail=True)
+def bpftool_prog_list(expected=None, ns=""):
+    _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True)
     if expected is not None:
         if len(progs) != expected:
             fail(True, "%d BPF programs loaded, expected %d" %
@@ -158,13 +164,13 @@ files = [] # files to be removed
         time.sleep(0.05)
     raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
 
-def ip(args, force=False, JSON=True, fail=True):
+def ip(args, force=False, JSON=True, ns="", fail=True):
     if force:
         args = "-force " + args
-    return tool("ip", args, {"json":"-j"}, JSON=JSON, fail=fail)
+    return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns, fail=fail)
 
-def tc(args, JSON=True, fail=True):
-    return tool("tc", args, {"json":"-p"}, JSON=JSON, fail=fail)
+def tc(args, JSON=True, ns="", fail=True):
+    return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns, fail=fail)
 
 def ethtool(dev, opt, args, fail=True):
     return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail)
@@ -178,6 +184,15 @@ files = [] # files to be removed
 def bpf_bytecode(bytecode):
     return "bytecode \"%s\"" % (bytecode)
 
+def mknetns(n_retry=10):
+    for i in range(n_retry):
+        name = ''.join([random.choice(string.ascii_letters) for i in range(8)])
+        ret, _ = ip("netns add %s" % (name), fail=False)
+        if ret == 0:
+            netns.append(name)
+            return name
+    return None
+
 class DebugfsDir:
     """
     Class for accessing DebugFS directories as a dictionary.
@@ -237,6 +252,8 @@ files = [] # files to be removed
         self.dev = self._netdevsim_create()
         devs.append(self)
 
+        self.ns = ""
+
         self.dfs_dir = '/sys/kernel/debug/netdevsim/%s' % (self.dev['ifname'])
         self.dfs_refresh()
 
@@ -257,7 +274,7 @@ files = [] # files to be removed
 
     def remove(self):
         devs.remove(self)
-        ip("link del dev %s" % (self.dev["ifname"]))
+        ip("link del dev %s" % (self.dev["ifname"]), ns=self.ns)
 
     def dfs_refresh(self):
         self.dfs = DebugfsDir(self.dfs_dir)
@@ -285,6 +302,11 @@ files = [] # files to be removed
             time.sleep(0.05)
         raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs))
 
+    def set_ns(self, ns):
+        name = "1" if ns == "" else ns
+        ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns)
+        self.ns = ns
+
     def set_mtu(self, mtu, fail=True):
         return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu),
                   fail=fail)
@@ -372,6 +394,8 @@ files = [] # files to be removed
         dev.remove()
     for f in files:
         cmd("rm -f %s" % (f))
+    for ns in netns:
+        cmd("ip netns delete %s" % (ns))
 
 def pin_prog(file_name, idx=0):
     progs = bpftool_prog_list(expected=(idx + 1))
@@ -381,6 +405,35 @@ files = [] # files to be removed
 
     return file_name, bpf_pinned(file_name)
 
+def check_dev_info(other_ns, ns, pin_file=None, removed=False):
+    if removed:
+        bpftool_prog_list(expected=0)
+        ret, err = bpftool("prog show pin %s" % (pin_file), fail=False)
+        fail(ret == 0, "Showing prog with removed device did not fail")
+        fail(err["error"].find("No such device") == -1,
+             "Showing prog with removed device expected ENODEV, error is %s" %
+             (err["error"]))
+        return
+    progs = bpftool_prog_list(expected=int(not removed), ns=ns)
+    prog = progs[0]
+
+    fail("dev" not in prog.keys(), "Device parameters not reported")
+    dev = prog["dev"]
+    fail("ifindex" not in dev.keys(), "Device parameters not reported")
+    fail("ns_dev" not in dev.keys(), "Device parameters not reported")
+    fail("ns_inode" not in dev.keys(), "Device parameters not reported")
+
+    if not removed and not other_ns:
+        fail("ifname" not in dev.keys(), "Ifname not reported")
+        fail(dev["ifname"] != sim["ifname"],
+             "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"]))
+    else:
+        fail("ifname" in dev.keys(), "Ifname is reported for other ns")
+        if removed:
+            fail(dev["ifindex"] != 0, "Device perameters not zero on removed")
+            fail(dev["ns_dev"] != 0, "Device perameters not zero on removed")
+            fail(dev["ns_inode"] != 0, "Device perameters not zero on removed")
+
 # Parse command line
 parser = argparse.ArgumentParser()
 parser.add_argument("--log", help="output verbose log to given file")
@@ -417,6 +470,12 @@ samples = ["sample_ret0.o"]
     skip(ret != 0, "sample %s/%s not found, please compile it" %
          (bpf_test_dir, s))
 
+# Check if net namespaces seem to work
+ns = mknetns()
+skip(ns is None, "Could not create a net namespace")
+cmd("ip netns delete %s" % (ns))
+netns = []
+
 try:
     obj = bpf_obj("sample_ret0.o")
     bytecode = bpf_bytecode("1,6 0 0 4294967295,")
@@ -549,6 +608,8 @@ samples = ["sample_ret0.o"]
     progs = bpftool_prog_list(expected=1)
     fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
          "Loaded program has wrong ID")
+    fail("dev" in progs[0].keys(),
+         "Device parameters reported for non-offloaded program")
 
     start_test("Test XDP prog replace with bad flags...")
     ret, _ = sim.set_xdp(obj, "offload", force=True, fail=False)
@@ -673,6 +734,35 @@ samples = ["sample_ret0.o"]
     fail(time_diff < delay_sec, "Removal process took %s, expected %s" %
          (time_diff, delay_sec))
 
+    # Remove all pinned files and reinstantiate the netdev
+    clean_up()
+    bpftool_prog_list_wait(expected=0)
+
+    sim = NetdevSim()
+    sim.set_ethtool_tc_offloads(True)
+    sim.set_xdp(obj, "offload")
+
+    start_test("Test bpftool bound info reporting (own ns)...")
+    check_dev_info(False, "")
+
+    start_test("Test bpftool bound info reporting (other ns)...")
+    ns = mknetns()
+    sim.set_ns(ns)
+    check_dev_info(True, "")
+
+    start_test("Test bpftool bound info reporting (remote ns)...")
+    check_dev_info(False, ns)
+
+    start_test("Test bpftool bound info reporting (back to own ns)...")
+    sim.set_ns("")
+    check_dev_info(False, "")
+
+    pin_file, _ = pin_prog("/sys/fs/bpf/tmp")
+    sim.remove()
+
+    start_test("Test bpftool bound info reporting (removed dev)...")
+    check_dev_info(True, "", pin_file=pin_file, removed=True)
+
     print("%s: OK" % (os.path.basename(__file__)))
 
 finally:
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v2 7/8] tools: bpftool: report device information for offloaded programs
From: Jakub Kicinski @ 2017-12-21 21:01 UTC (permalink / raw)
  To: netdev, alexei.starovoitov, daniel; +Cc: ktkhai, oss-drivers, Jakub Kicinski
In-Reply-To: <20171221210120.30166-1-jakub.kicinski@netronome.com>

Print the just-exposed information about the device to which
a program is bound.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 tools/bpf/bpftool/common.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++
 tools/bpf/bpftool/main.h   |  2 ++
 tools/bpf/bpftool/prog.c   |  3 +++
 3 files changed, 57 insertions(+)

diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index b62c94e3997a..6601c95a9258 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -44,7 +44,9 @@
 #include <unistd.h>
 #include <linux/limits.h>
 #include <linux/magic.h>
+#include <net/if.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/vfs.h>
 
@@ -412,3 +414,53 @@ void delete_pinned_obj_table(struct pinned_obj_table *tab)
 		free(obj);
 	}
 }
+
+static char *
+ifindex_to_name_ns(__u32 ifindex, __u32 ns_dev, __u32 ns_ino, char *buf)
+{
+	struct stat st;
+	int err;
+
+	err = stat("/proc/self/ns/net", &st);
+	if (err) {
+		p_err("Can't stat /proc/self: %s", strerror(errno));
+		return NULL;
+	}
+
+	if (st.st_dev != ns_dev || st.st_ino != ns_ino)
+		return NULL;
+
+	return if_indextoname(ifindex, buf);
+}
+
+void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
+{
+	char name[IF_NAMESIZE];
+
+	if (!ifindex)
+		return;
+
+	printf(" dev ");
+	if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
+		printf("%s", name);
+	else
+		printf("ifindex %u ns_dev %llu ns_ino %llu",
+		       ifindex, ns_dev, ns_inode);
+}
+
+void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
+{
+	char name[IF_NAMESIZE];
+
+	if (!ifindex)
+		return;
+
+	jsonw_name(json_wtr, "dev");
+	jsonw_start_object(json_wtr);
+	jsonw_uint_field(json_wtr, "ifindex", ifindex);
+	jsonw_uint_field(json_wtr, "ns_dev", ns_dev);
+	jsonw_uint_field(json_wtr, "ns_inode", ns_inode);
+	if (ifindex_to_name_ns(ifindex, ns_dev, ns_inode, name))
+		jsonw_string_field(json_wtr, "ifname", name);
+	jsonw_end_object(json_wtr);
+}
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 8f6d3cac0347..65b526fe6e7e 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -96,6 +96,8 @@ struct pinned_obj {
 int build_pinned_obj_table(struct pinned_obj_table *table,
 			   enum bpf_obj_type type);
 void delete_pinned_obj_table(struct pinned_obj_table *tab);
+void print_dev_plain(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
+void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode);
 
 struct cmd {
 	const char *cmd;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 037484ceaeaf..4ccf6301f0fe 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -230,6 +230,8 @@ static void print_prog_json(struct bpf_prog_info *info, int fd)
 		     info->tag[0], info->tag[1], info->tag[2], info->tag[3],
 		     info->tag[4], info->tag[5], info->tag[6], info->tag[7]);
 
+	print_dev_json(info->ifindex, info->netns_dev, info->netns_ino);
+
 	if (info->load_time) {
 		char buf[32];
 
@@ -287,6 +289,7 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd)
 
 	printf("tag ");
 	fprint_hex(stdout, info->tag, BPF_TAG_SIZE, "");
+	print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino);
 	printf("\n");
 
 	if (info->load_time) {
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v2 5/8] bpf: offload: free program id when device disappears
From: Jakub Kicinski @ 2017-12-21 21:01 UTC (permalink / raw)
  To: netdev, alexei.starovoitov, daniel; +Cc: ktkhai, oss-drivers, Jakub Kicinski
In-Reply-To: <20171221210120.30166-1-jakub.kicinski@netronome.com>

Bound programs are quite useless after their device disappears.
They are simply waiting for their reference count to go to zero,
so don't list them in BPF_PROG_GET_NEXT_ID: free their ID
early.

Note that orphaned offload programs will return -ENODEV on
BPF_OBJ_GET_INFO_BY_FD, so the user will never see ID 0.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 ++
 kernel/bpf/offload.c | 3 +++
 kernel/bpf/syscall.c | 9 +++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 669549f7e3e8..9a916ab34299 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -357,6 +357,8 @@ void bpf_prog_put(struct bpf_prog *prog);
 int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
 
+void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 60be15b9d8f1..1e6064ea3609 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -135,6 +135,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 	if (offload->dev_state)
 		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
 
+	/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+	bpf_prog_free_id(prog, true);
+
 	list_del_init(&offload->offloads);
 	kfree(offload);
 	prog->aux->offload = NULL;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1143db61584c..7d9f5b0f0e49 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -905,9 +905,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 {
-	/* cBPF to eBPF migrations are currently not in the idr store. */
+	/* cBPF to eBPF migrations are currently not in the idr store.
+	 * Offloaded programs are removed from the store when their device
+	 * disappears - even if someone grabs an fd to them they are unusable,
+	 * simply waiting for refcnt to drop to be freed.
+	 */
 	if (!prog->aux->id)
 		return;
 
@@ -917,6 +921,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 		__acquire(&prog_idr_lock);
 
 	idr_remove(&prog_idr, prog->aux->id);
+	prog->aux->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_bh(&prog_idr_lock);
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next v2 6/8] bpf: offload: report device information for offloaded programs
From: Jakub Kicinski @ 2017-12-21 21:01 UTC (permalink / raw)
  To: netdev, alexei.starovoitov, daniel
  Cc: ktkhai, oss-drivers, Jakub Kicinski, Eric W . Biederman
In-Reply-To: <20171221210120.30166-1-jakub.kicinski@netronome.com>

Report the ifindex and namespace information of offloaded
programs to the user.  If the device has disappeared, return
-ENODEV.  Specify the namespace using a dev/inode combination.

CC: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
v2:
 - take RTNL lock to grab a coherent snapshot of device state
   (ifindex vs name space) and avoid races with name space
   moves (based on Eric's comment on Kirill's patch to
   peernet2id_alloc()).
---
 fs/nsfs.c                      |  2 +-
 include/linux/bpf.h            |  2 ++
 include/linux/proc_ns.h        |  1 +
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/offload.c           | 44 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  6 ++++++
 tools/include/uapi/linux/bpf.h |  3 +++
 7 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 7c6f76d29f56..e50628675935 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -51,7 +51,7 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-static void *__ns_get_path(struct path *path, struct ns_common *ns)
+void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
 	struct vfsmount *mnt = nsfs_mnt;
 	struct dentry *dentry;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9a916ab34299..7810ae57b357 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 2ff18c9840a7..1733359cf713 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -76,6 +76,7 @@ static inline int ns_alloc_inum(struct ns_common *ns)
 
 extern struct file *proc_ns_fget(int fd);
 #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private)
+extern void *__ns_get_path(struct path *path, struct ns_common *ns);
 extern void *ns_get_path(struct path *path, struct task_struct *task,
 			const struct proc_ns_operations *ns_ops);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d01f1cb3cfc0..72b37fc3bc0c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 1e6064ea3609..4d50000bd1e3 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -16,9 +16,11 @@
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bug.h>
+#include <linux/kdev_t.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
 #include <linux/printk.h>
+#include <linux/proc_ns.h>
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
@@ -181,6 +183,48 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
+			       struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload;
+	struct inode *ns_inode;
+	struct path ns_path;
+	int ifindex, err;
+	struct net *net;
+
+again:
+	rtnl_lock();
+	down_read(&bpf_devs_lock);
+
+	offload = prog->aux->offload;
+	if (!offload) {
+		up_read(&bpf_devs_lock);
+		rtnl_unlock();
+		return -ENODEV;
+	}
+
+	ifindex = offload->netdev->ifindex;
+	net = dev_net(offload->netdev);
+	get_net(net); /* __ns_get_path() drops the reference */
+
+	up_read(&bpf_devs_lock);
+	rtnl_unlock();
+
+	err = PTR_ERR_OR_ZERO(__ns_get_path(&ns_path, &net->ns));
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		return err;
+	}
+	ns_inode = ns_path.dentry->d_inode;
+
+	info->ifindex = ifindex;
+	info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+	info->netns_ino = ns_inode->i_ino;
+
+	return 0;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7d9f5b0f0e49..20444fd678d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1624,6 +1624,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		err = bpf_prog_offload_info_fill(&info, prog);
+		if (err)
+			return err;
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index db1b0923a308..4e8c60acfa32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -921,6 +921,9 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u64 netns_dev;
+	__u64 netns_ino;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
-- 
2.15.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox