Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH bpf-next 05/13] bpf: get better bpf_prog ksyms based on btf func type_id
From: Martin Lau @ 2018-10-15 23:12 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Alexei Starovoitov, daniel@iogearbox.net, netdev@vger.kernel.org,
	Kernel Team
In-Reply-To: <20181012185446.2379289-1-yhs@fb.com>

On Fri, Oct 12, 2018 at 11:54:42AM -0700, Yonghong Song wrote:
> This patch added interface to load a program with the following
> additional information:
>    . prog_btf_fd
>    . func_info and func_info_len
> where func_info will provide function range and type_id
> corresponding to each function.
> 
> If verifier agrees with function range provided by the user,
> the bpf_prog ksym for each function will use the func name
> provided in the type_id, which is supposed to provide better
> encoding as it is not limited by 16 bytes program name
> limitation and this is better for bpf program which contains
> multiple subprograms.
> 
> The bpf_prog_info interface is also extended to
> return btf_id and jited_func_types, so user spaces can
> print out the function prototype for each jited function.
Some nits.

> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/linux/bpf.h          |  2 +
>  include/linux/bpf_verifier.h |  1 +
>  include/linux/btf.h          |  2 +
>  include/uapi/linux/bpf.h     | 11 +++++
>  kernel/bpf/btf.c             | 16 +++++++
>  kernel/bpf/core.c            |  9 ++++
>  kernel/bpf/syscall.c         | 86 +++++++++++++++++++++++++++++++++++-
>  kernel/bpf/verifier.c        | 50 +++++++++++++++++++++
>  8 files changed, 176 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 9b558713447f..e9c63ffa01af 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -308,6 +308,8 @@ struct bpf_prog_aux {
>  	void *security;
>  #endif
>  	struct bpf_prog_offload *offload;
> +	struct btf *btf;
> +	u32 type_id; /* type id for this prog/func */
>  	union {
>  		struct work_struct work;
>  		struct rcu_head	rcu;
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 9e8056ec20fa..e84782ec50ac 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -201,6 +201,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
>  struct bpf_subprog_info {
>  	u32 start; /* insn idx of function entry point */
>  	u16 stack_depth; /* max. stack depth used by this function */
> +	u32 type_id; /* btf type_id for this subprog */
>  };
>  
>  /* single container for all structs
> diff --git a/include/linux/btf.h b/include/linux/btf.h
> index e076c4697049..90e91b52aa90 100644
> --- a/include/linux/btf.h
> +++ b/include/linux/btf.h
> @@ -46,5 +46,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
>  		       struct seq_file *m);
>  int btf_get_fd_by_id(u32 id);
>  u32 btf_id(const struct btf *btf);
> +bool is_btf_func_type(const struct btf *btf, u32 type_id);
> +const char *btf_get_name_by_id(const struct btf *btf, u32 type_id);
>  
>  #endif
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f9187b41dff6..7ebbf4f06a65 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -332,6 +332,9 @@ union bpf_attr {
>  		 * (context accesses, allowed helpers, etc).
>  		 */
>  		__u32		expected_attach_type;
> +		__u32		prog_btf_fd;	/* fd pointing to BTF type data */
> +		__u32		func_info_len;	/* func_info length */
> +		__aligned_u64	func_info;	/* func type info */
>  	};
>  
>  	struct { /* anonymous struct used by BPF_OBJ_* commands */
> @@ -2585,6 +2588,9 @@ struct bpf_prog_info {
>  	__u32 nr_jited_func_lens;
>  	__aligned_u64 jited_ksyms;
>  	__aligned_u64 jited_func_lens;
> +	__u32 btf_id;
> +	__u32 nr_jited_func_types;
> +	__aligned_u64 jited_func_types;
>  } __attribute__((aligned(8)));
>  
>  struct bpf_map_info {
> @@ -2896,4 +2902,9 @@ struct bpf_flow_keys {
>  	};
>  };
>  
> +struct bpf_func_info {
> +	__u32	insn_offset;
> +	__u32	type_id;
> +};
> +
>  #endif /* _UAPI__LINUX_BPF_H__ */
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 794a185f11bf..85b8eeccddbd 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -486,6 +486,15 @@ static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
>  	return btf->types[type_id];
>  }
>  
> +bool is_btf_func_type(const struct btf *btf, u32 type_id)
> +{
> +	const struct btf_type *type = btf_type_by_id(btf, type_id);
> +
> +	if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC)
> +		return false;
> +	return true;
> +}
Can btf_type_is_func() (from patch 2) be reused?
The btf_type_by_id() can be done by the caller.
I don't think it is worth adding a similar helper
for just one user for now.

The !type check can be added to btf_type_is_func() if
it is needed.

> +
>  /*
>   * Regular int is not a bit field and it must be either
>   * u8/u16/u32/u64.
> @@ -2579,3 +2588,10 @@ u32 btf_id(const struct btf *btf)
>  {
>  	return btf->id;
>  }
> +
> +const char *btf_get_name_by_id(const struct btf *btf, u32 type_id)
> +{
> +	const struct btf_type *t = btf_type_by_id(btf, type_id);
> +
> +	return btf_name_by_offset(btf, t->name_off);
> +}
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 3f5bf1af0826..add3866a120e 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -27,6 +27,7 @@
>  #include <linux/random.h>
>  #include <linux/moduleloader.h>
>  #include <linux/bpf.h>
> +#include <linux/btf.h>
>  #include <linux/frame.h>
>  #include <linux/rbtree_latch.h>
>  #include <linux/kallsyms.h>
> @@ -387,6 +388,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
>  static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
>  {
>  	const char *end = sym + KSYM_NAME_LEN;
> +	const char *func_name;
>  
>  	BUILD_BUG_ON(sizeof("bpf_prog_") +
>  		     sizeof(prog->tag) * 2 +
> @@ -401,6 +403,13 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
>  
>  	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
>  	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
> +
> +	if (prog->aux->btf) {
> +		func_name = btf_get_name_by_id(prog->aux->btf, prog->aux->type_id);
> +		snprintf(sym, (size_t)(end - sym), "_%s", func_name);
> +		return;
> +	}
> +
>  	if (prog->aux->name[0])
>  		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
>  	else
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 4f416234251f..aa4688a1a137 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1120,6 +1120,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>  		/* bpf_prog_free_id() must be called first */
>  		bpf_prog_free_id(prog, do_idr_lock);
>  		bpf_prog_kallsyms_del_all(prog);
> +		btf_put(prog->aux->btf);
>  
>  		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
>  	}
> @@ -1343,8 +1344,45 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
>  	}
>  }
>  
> +static int prog_check_btf(const struct bpf_prog *prog, const struct btf *btf,
> +			  union bpf_attr *attr)
> +{
> +	struct bpf_func_info __user *uinfo, info;
> +	int i, nfuncs, usize;
> +	u32 prev_offset;
> +
> +	usize = sizeof(struct bpf_func_info);
> +	if (attr->func_info_len % usize)
> +		return -EINVAL;
> +
> +	/* func_info section should have increasing and valid insn_offset
> +	 * and type should be BTF_KIND_FUNC.
> +	 */
> +	nfuncs = attr->func_info_len / usize;
> +	uinfo = u64_to_user_ptr(attr->func_info);
> +	for (i = 0; i < nfuncs; i++) {
> +		if (copy_from_user(&info, uinfo, usize))
> +			return -EFAULT;
> +
> +		if (!is_btf_func_type(btf, info.type_id))
> +			return -EINVAL;
> +
> +		if (i == 0) {
> +			if (info.insn_offset)
> +				return -EINVAL;
> +			prev_offset = 0;
> +		} else if (info.insn_offset < prev_offset) {
> +			return -EINVAL;
> +		}
> +
> +		prev_offset = info.insn_offset;
> +	}
> +
> +	return 0;
> +}
> +
>  /* last field in 'union bpf_attr' used by this command */
> -#define	BPF_PROG_LOAD_LAST_FIELD expected_attach_type
> +#define	BPF_PROG_LOAD_LAST_FIELD func_info
>  
>  static int bpf_prog_load(union bpf_attr *attr)
>  {
> @@ -1431,6 +1469,27 @@ static int bpf_prog_load(union bpf_attr *attr)
>  	if (err)
>  		goto free_prog;
>  
> +	/* copy func_info before verifier which may make
> +	 * some adjustment.
> +	 */
Is this a leftover comment?  From the code below, I don't see any
intention of copying func_info to avoid verifier modification.
I could be missing something.

Or should the comment be moved to the new "check_btf_func()" below?

> +	if (attr->func_info_len) {
> +		struct btf *btf;
> +
> +		btf = btf_get_by_fd(attr->prog_btf_fd);
> +		if (IS_ERR(btf)) {
> +			err = PTR_ERR(btf);
> +			goto free_prog;
> +		}
> +
> +		err = prog_check_btf(prog, btf, attr);
> +		if (err) {
> +			btf_put(btf);
> +			goto free_prog;
> +		}
> +
> +		prog->aux->btf = btf;
> +	}
> +
>  	/* run eBPF verifier */
>  	err = bpf_check(&prog, attr);
>  	if (err < 0)
> @@ -1463,6 +1522,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>  	bpf_prog_kallsyms_del_subprogs(prog);
>  	free_used_maps(prog->aux);
>  free_prog:
> +	btf_put(prog->aux->btf);
>  	bpf_prog_uncharge_memlock(prog);
>  free_prog_sec:
>  	security_bpf_prog_free(prog->aux);
> @@ -2108,6 +2168,30 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
>  		}
>  	}
>  
> +	if (prog->aux->btf) {
> +		info.btf_id = btf_id(prog->aux->btf);
> +
> +		ulen = info.nr_jited_func_types;
> +		info.nr_jited_func_types = prog->aux->func_cnt;
> +		if (info.nr_jited_func_types && ulen) {
> +			if (bpf_dump_raw_ok()) {
> +				u32 __user *user_types;
> +				u32 func_type, i;
> +
> +				ulen = min_t(u32, info.nr_jited_func_types,
> +					     ulen);
> +				user_types = u64_to_user_ptr(info.jited_func_types);
> +				for (i = 0; i < ulen; i++) {
> +					func_type = prog->aux->func[i]->aux->type_id;
> +					if (put_user(func_type, &user_types[i]))
> +						return -EFAULT;
> +				}
> +			} else {
> +				info.jited_func_types = 0;
> +			}
> +		}
> +	}
> +
>  done:
>  	if (copy_to_user(uinfo, &info, info_len) ||
>  	    put_user(info_len, &uattr->info.info_len))
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 3f93a548a642..97c408e84322 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -4589,6 +4589,50 @@ static int check_cfg(struct bpf_verifier_env *env)
>  	return ret;
>  }
>  
> +static int check_btf_func(struct bpf_prog *prog, struct bpf_verifier_env *env,
> +			  union bpf_attr *attr)
> +{
> +	struct bpf_func_info *data;
> +	int i, nfuncs, ret = 0;
> +
> +	if (!attr->func_info_len)
> +		return 0;
> +
> +	nfuncs = attr->func_info_len / sizeof(struct bpf_func_info);
> +	if (env->subprog_cnt != nfuncs) {
> +		verbose(env, "number of funcs in func_info does not match verifier\n");
> +		return -EINVAL;
> +	}
> +
> +	data = kvmalloc(attr->func_info_len, GFP_KERNEL | __GFP_NOWARN);
> +	if (!data) {
> +		verbose(env, "no memory to allocate attr func_info\n");
> +		return -ENOMEM;
> +	}
> +
> +	if (copy_from_user(data, u64_to_user_ptr(attr->func_info),
> +			   attr->func_info_len)) {
> +		verbose(env, "memory copy error for attr func_info\n");
> +		ret = -EFAULT;
> +		goto cleanup;
> +		}
Extra indentation.

> +
> +	for (i = 0; i < nfuncs; i++) {
> +		if (env->subprog_info[i].start != data[i].insn_offset) {
> +			verbose(env, "func_info subprog start (%d) does not match verifier (%d)\n",
> +				env->subprog_info[i].start, data[i].insn_offset);
Are the printing args swapped?  Shouldn't env->subprog_info[i].start
go to the "verifier (%d)"?

Also s/%d/%u/, since both values are u32.

> +			ret = -EINVAL;
> +			goto cleanup;
> +		}
> +		env->subprog_info[i].type_id = data[i].type_id;
> +	}
> +
> +	prog->aux->type_id = data[0].type_id;
> +cleanup:
> +	kvfree(data);
> +	return ret;
> +}
> +
>  /* check %cur's range satisfies %old's */
>  static bool range_within(struct bpf_reg_state *old,
>  			 struct bpf_reg_state *cur)
> @@ -5873,6 +5917,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>  		func[i]->aux->name[0] = 'F';
>  		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
>  		func[i]->jit_requested = 1;
> +		func[i]->aux->btf = prog->aux->btf;
> +		func[i]->aux->type_id = env->subprog_info[i].type_id;
>  		func[i] = bpf_int_jit_compile(func[i]);
>  		if (!func[i]->jited) {
>  			err = -ENOTSUPP;
> @@ -6307,6 +6353,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
>  	if (ret < 0)
>  		goto skip_full_check;
>  
> +	ret = check_btf_func(env->prog, env, attr);
> +	if (ret < 0)
> +		goto skip_full_check;
> +
>  	ret = do_check(env);
>  	if (env->cur_state) {
>  		free_verifier_state(env->cur_state, true);
> -- 
> 2.17.1
> 

^ permalink raw reply

* Re: [bpf-next PATCH v3 0/2] bpftool support for sockmap use cases
From: Alexei Starovoitov @ 2018-10-15 23:17 UTC (permalink / raw)
  To: John Fastabend; +Cc: jakub.kicinski, ast, daniel, netdev
In-Reply-To: <20181015181857.8673.46183.stgit@john-Precision-Tower-5810>

On Mon, Oct 15, 2018 at 11:19:44AM -0700, John Fastabend wrote:
> The first patch adds support for attaching programs to maps. This is
> needed to support sock{map|hash} use from bpftool. Currently, I carry
> around custom code to do this so doing it using standard bpftool will
> be great.
> 
> The second patch adds a compat mode to ignore non-zero entries in
> the map def. This allows using bpftool with maps that have extra
> fields that the user knows can be ignored. This is needed to work
> correctly with maps being loaded by other tools or directly via
> syscalls.
> 
> v3: add bash completion and doc updates for --mapcompat

Applied, Thanks

^ permalink raw reply

* [PATCH bpf-next v2] tools: bpftool: add map create command
From: Jakub Kicinski @ 2018-10-15 23:30 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski

Add a way of creating maps from user space.  The command takes
as parameters most of the attributes of the map creation system
call command.  After the map is created it is pinned to bpffs.  This makes
it possible to easily and dynamically (without rebuilding programs)
test various corner cases related to map creation.

Map type names are taken from bpftool's array used for printing.
In general these days we try to make use of libbpf type names, but
there are no map type names in libbpf as of today.

As with most features I add, the motivation is testing (offloads) :)

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 .../bpf/bpftool/Documentation/bpftool-map.rst |  15 ++-
 tools/bpf/bpftool/Documentation/bpftool.rst   |   4 +-
 tools/bpf/bpftool/bash-completion/bpftool     |  38 +++++-
 tools/bpf/bpftool/common.c                    |  21 ++++
 tools/bpf/bpftool/main.h                      |   1 +
 tools/bpf/bpftool/map.c                       | 110 +++++++++++++++++-
 6 files changed, 183 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index a6258bc8ec4f..3497f2d80328 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -15,13 +15,15 @@ SYNOPSIS
 	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
 
 	*COMMANDS* :=
-	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
-	| **pin** | **help** }
+	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext**
+	| **delete** | **pin** | **help** }
 
 MAP COMMANDS
 =============
 
 |	**bpftool** **map { show | list }**   [*MAP*]
+|	**bpftool** **map create**     *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \
+|		**entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
 |	**bpftool** **map dump**       *MAP*
 |	**bpftool** **map update**     *MAP*  **key** *DATA*   **value** *VALUE* [*UPDATE_FLAGS*]
 |	**bpftool** **map lookup**     *MAP*  **key** *DATA*
@@ -36,6 +38,11 @@ MAP COMMANDS
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |	*VALUE* := { *DATA* | *MAP* | *PROG* }
 |	*UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
+|	*TYPE* := { **hash** | **array** | **prog_array** | **perf_event_array** | **percpu_hash**
+|		| **percpu_array** | **stack_trace** | **cgroup_array** | **lru_hash**
+|		| **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps**
+|		| **devmap** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
+|		| **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** }
 
 DESCRIPTION
 ===========
@@ -47,6 +54,10 @@ DESCRIPTION
 		  Output will start with map ID followed by map type and
 		  zero or more named attributes (depending on kernel version).
 
+	**bpftool map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE*  **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*]
+		  Create a new map with given parameters and pin it to *bpffs*
+		  as *FILE*.
+
 	**bpftool map dump**    *MAP*
 		  Dump all entries in a given *MAP*.
 
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 65488317fefa..04cd4f92ab89 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -22,8 +22,8 @@ SYNOPSIS
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
 
 	*MAP-COMMANDS* :=
-	{ **show** | **list** | **dump** | **update** | **lookup** | **getnext** | **delete**
-	| **pin** | **event_pipe** | **help** }
+	{ **show** | **list** | **create** | **dump** | **update** | **lookup** | **getnext**
+	| **delete** | **pin** | **event_pipe** | **help** }
 
 	*PROG-COMMANDS* := { **show** | **list** | **dump jited** | **dump xlated** | **pin**
 	| **load** | **attach** | **detach** | **help** }
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index ac85207cba8d..c56545e87b0d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -387,6 +387,42 @@ _bpftool()
                             ;;
                     esac
                     ;;
+                create)
+                    case $prev in
+                        $command)
+                            _filedir
+                            return 0
+                            ;;
+                        type)
+                            COMPREPLY=( $( compgen -W 'hash array prog_array \
+                                perf_event_array percpu_hash percpu_array \
+                                stack_trace cgroup_array lru_hash \
+                                lru_percpu_hash lpm_trie array_of_maps \
+                                hash_of_maps devmap sockmap cpumap xskmap \
+                                sockhash cgroup_storage reuseport_sockarray \
+                                percpu_cgroup_storage' -- \
+                                                   "$cur" ) )
+                            return 0
+                            ;;
+                        key|value|flags|name|entries)
+                            return 0
+                            ;;
+                        dev)
+                            _sysfs_get_netdevs
+                            return 0
+                            ;;
+                        *)
+                            _bpftool_once_attr 'type'
+                            _bpftool_once_attr 'key'
+                            _bpftool_once_attr 'value'
+                            _bpftool_once_attr 'entries'
+                            _bpftool_once_attr 'name'
+                            _bpftool_once_attr 'flags'
+                            _bpftool_once_attr 'dev'
+                            return 0
+                            ;;
+                    esac
+                    ;;
                 lookup|getnext|delete)
                     case $prev in
                         $command)
@@ -500,7 +536,7 @@ _bpftool()
                 *)
                     [[ $prev == $object ]] && \
                         COMPREPLY=( $( compgen -W 'delete dump getnext help \
-                            lookup pin event_pipe show list update' -- \
+                            lookup pin event_pipe show list update create' -- \
                             "$cur" ) )
                     ;;
             esac
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index b3a0709ea7ed..3318da8060bd 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -618,3 +618,24 @@ void print_dev_json(__u32 ifindex, __u64 ns_dev, __u64 ns_inode)
 		jsonw_string_field(json_wtr, "ifname", name);
 	jsonw_end_object(json_wtr);
 }
+
+int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what)
+{
+	char *endptr;
+
+	NEXT_ARGP();
+
+	if (*val) {
+		p_err("%s already specified", what);
+		return -1;
+	}
+
+	*val = strtoul(**argv, &endptr, 0);
+	if (*endptr) {
+		p_err("can't parse %s as %s", **argv, what);
+		return -1;
+	}
+	NEXT_ARGP();
+
+	return 0;
+}
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 91fd697303cb..28ee769bd11b 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -139,6 +139,7 @@ int do_cgroup(int argc, char **arg);
 int do_perf(int argc, char **arg);
 int do_net(int argc, char **arg);
 
+int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 9f5de48f8a99..7bf38f0e152e 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -36,6 +36,7 @@
 #include <fcntl.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
+#include <net/if.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -94,6 +95,17 @@ static bool map_is_map_of_progs(__u32 type)
 	return type == BPF_MAP_TYPE_PROG_ARRAY;
 }
 
+static int map_type_from_str(const char *type)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(map_type_name); i++)
+		/* Don't allow prefixing in case of possible future shadowing */
+		if (map_type_name[i] && !strcmp(map_type_name[i], type))
+			return i;
+	return -1;
+}
+
 static void *alloc_value(struct bpf_map_info *info)
 {
 	if (map_is_per_cpu(info->type))
@@ -1058,6 +1070,92 @@ static int do_pin(int argc, char **argv)
 	return err;
 }
 
+static int do_create(int argc, char **argv)
+{
+	struct bpf_create_map_attr attr = { NULL, };
+	const char *pinfile;
+	int err, fd;
+
+	if (!REQ_ARGS(7))
+		return -1;
+	pinfile = GET_ARG();
+
+	while (argc) {
+		if (!REQ_ARGS(2))
+			return -1;
+
+		if (is_prefix(*argv, "type")) {
+			NEXT_ARG();
+
+			if (attr.map_type) {
+				p_err("map type already specified");
+				return -1;
+			}
+
+			attr.map_type = map_type_from_str(*argv);
+			if ((int)attr.map_type < 0) {
+				p_err("unrecognized map type: %s", *argv);
+				return -1;
+			}
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "name")) {
+			NEXT_ARG();
+			attr.name = GET_ARG();
+		} else if (is_prefix(*argv, "key")) {
+			if (parse_u32_arg(&argc, &argv, &attr.key_size,
+					  "key size"))
+				return -1;
+		} else if (is_prefix(*argv, "value")) {
+			if (parse_u32_arg(&argc, &argv, &attr.value_size,
+					  "value size"))
+				return -1;
+		} else if (is_prefix(*argv, "entries")) {
+			if (parse_u32_arg(&argc, &argv, &attr.max_entries,
+					  "max entries"))
+				return -1;
+		} else if (is_prefix(*argv, "flags")) {
+			if (parse_u32_arg(&argc, &argv, &attr.map_flags,
+					  "flags"))
+				return -1;
+		} else if (is_prefix(*argv, "dev")) {
+			NEXT_ARG();
+
+			if (attr.map_ifindex) {
+				p_err("offload device already specified");
+				return -1;
+			}
+
+			attr.map_ifindex = if_nametoindex(*argv);
+			if (!attr.map_ifindex) {
+				p_err("unrecognized netdevice '%s': %s",
+				      *argv, strerror(errno));
+				return -1;
+			}
+			NEXT_ARG();
+		}
+	}
+
+	if (!attr.name) {
+		p_err("map name not specified");
+		return -1;
+	}
+
+	fd = bpf_create_map_xattr(&attr);
+	if (fd < 0) {
+		p_err("map create failed: %s", strerror(errno));
+		return -1;
+	}
+
+	err = do_pin_fd(fd, pinfile);
+	close(fd);
+	if (err)
+		return err;
+
+	if (json_output)
+		jsonw_null(json_wtr);
+	return 0;
+}
+
 static int do_help(int argc, char **argv)
 {
 	if (json_output) {
@@ -1067,6 +1165,9 @@ static int do_help(int argc, char **argv)
 
 	fprintf(stderr,
 		"Usage: %s %s { show | list }   [MAP]\n"
+		"       %s %s create     FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n"
+		"                              entries MAX_ENTRIES name NAME [flags FLAGS] \\\n"
+		"                              [dev NAME]\n"
 		"       %s %s dump       MAP\n"
 		"       %s %s update     MAP  key DATA value VALUE [UPDATE_FLAGS]\n"
 		"       %s %s lookup     MAP  key DATA\n"
@@ -1081,11 +1182,17 @@ static int do_help(int argc, char **argv)
 		"       " HELP_SPEC_PROGRAM "\n"
 		"       VALUE := { DATA | MAP | PROG }\n"
 		"       UPDATE_FLAGS := { any | exist | noexist }\n"
+		"       TYPE := { hash | array | prog_array | perf_event_array | percpu_hash |\n"
+		"                 percpu_array | stack_trace | cgroup_array | lru_hash |\n"
+		"                 lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n"
+		"                 devmap | sockmap | cpumap | xskmap | sockhash |\n"
+		"                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
-		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
+		bin_name, argv[-2]);
 
 	return 0;
 }
@@ -1101,6 +1208,7 @@ static const struct cmd cmds[] = {
 	{ "delete",	do_delete },
 	{ "pin",	do_pin },
 	{ "event_pipe",	do_event_pipe },
+	{ "create",	do_create },
 	{ 0 }
 };
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH bpf-next 2/3] bpf: emit RECORD_MMAP events for bpf prog load/unload
From: Song Liu @ 2018-10-15 23:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: acme, Peter Zijlstra, Alexei Starovoitov, David S . Miller,
	Daniel Borkmann, Networking, kernel-team
In-Reply-To: <20180921221343.g52n7c4edisvice3@ast-mbp>

On Fri, Sep 21, 2018 at 3:15 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Sep 21, 2018 at 09:25:00AM -0300, Arnaldo Carvalho de Melo wrote:
> >
> > > I have considered adding MUNMAP to match existing MMAP, but went
> > > without it because I didn't want to introduce new bit in perf_event_attr
> > > and emit these new events in a misbalanced conditional way for prog load/unload.
> > > Like old perf is asking kernel for mmap events via mmap bit, so prog load events
> >
> > By prog load events you mean that old perf, having perf_event_attr.mmap = 1 ||
> > perf_event_attr.mmap2 = 1 will cause the new kernel to emit
> > PERF_RECORD_MMAP records for the range of addresses that a BPF program
> > is being loaded on, right?
>
> right. it would be weird when prog load events are there, but not unload.
>
> > > will be in perf.data, but old perf report won't recognize them anyway.
> >
> > Why not? It should lookup the symbol and find it in the rb_tree of maps,
> > with a DSO name equal to what was in the PERF_RECORD_MMAP emitted by the
> > BPF core, no? It'll be an unresolved symbol, but a resolved map.
> >
> > > Whereas new perf would certainly want to catch bpf events and will set
> > > both mmap and munmap bits.
> >
> > new perf with your code will find a symbol, not a map, because your code
> > catches a special case PERF_RECORD_MMAP and instead of creating a
> > 'struct map' will create a 'struct symbol' and insert it in the kallsyms
> > 'struct map', right?
>
> right.
> bpf progs are more similar to kernel functions than to modules.
> For modules it makes sense to create a new map and insert symbols into it.
> For bpf JITed images there is no DSO to parse.
> Single bpf elf file may contain multiple bpf progs and each prog may contain
> multiple bpf functions. They will be loaded at different time and
> will have different life time.
>
> > In theory the old perf should catch the PERF_RECORD_MMAP with a string
> > in the filename part and insert a new map into the kernel mmap rb_tree,
> > and then samples would be resolved to this map, but since there is no
> > backing DSO with a symtab, it would stop at that, just stating that the
> > map is called NAME-OF-BPF-PROGRAM. This is all from memory, possibly
> > there is something in there that makes it ignore this PERF_RECORD_MMAP
> > emitted by the BPF kernel code when loading a new program.
>
> In /proc/kcore there is already a section for module range.
> Hence when perf processes bpf load/unload events the map is already created.
> Therefore the patch 3 only searches for it and inserts new symbol into it.
>
> In that sense the reuse of RECORD_MMAP event for bpf progs is indeed
> not exactly clean, since no new map is created.
> It's probably better to introduce PERF_RECORD_[INSERT|ERASE]_KSYM events ?
>
> Such event potentially can be used for offline ksym resolution.
> perf could process /proc/kallsyms during perf record and emit all of them
> as synthetic PERF_RECORD_INSERT_KSYM into perf.data, so perf report can run
> on a different server and still find the right symbols.
>
> I guess, we can do bpf specific events too and keep RECORD_MMAP as-is.
> How about single PERF_RECORD_BPF event with internal flag for load/unload ?
>
> > Right, that is another unfortunate state of affairs, kernel module
> > load/unload should already be supported, reported by the kernel via a
> > proper PERF_RECORD_MODULE_LOAD/UNLOAD
>
> I agree with Peter here. It would be nice, but low priority.
> modules are mostly static. Loaded once and stay there.
>
> > There is another longstanding TODO list entry: PERF_RECORD_MMAP records
> > should include a build-id, to avoid either userspace getting confused
> > when there is an update of some mmap DSO, for long running sessions, for
> > instance, or to have to scan the just recorded perf.data file for DSOs
> > with samples to then read it from the file system (more races).
> >
> > Have you ever considered having a build-id for bpf objects that could be
> > used here?
>
> build-id concept is not applicable to bpf.
> bpf elf files on the disc don't have good correlation with what is
> running in the kernel. bpf bytestream is converted and optimized
> by the verifier. Then JITed.
> So debug info left in .o file and original bpf bytestream in .o are
> mostly useless.
> For bpf programs we have 'program tag'. It is computed over original
> bpf bytestream, so both kernel and user space can compute it.
> In libbcc we use /var/tmp/bcc/bpf_prog_TAG/ directory to store original
> source code of the program, so users looking at kernel stack traces
> with bpf_prog_TAG can find the source.
> It's similar to build-id, but not going to help perf to annotate
> actual x86 instructions inside JITed image and show src code.
> Since JIT runs in the kernel this problem cannot be solved by user space only.
> It's a difficult problem and we have a plan to tackle that,
> but it's step 2. A bunch of infra is needed on bpf side to
> preserve the association during src_in_C -> original bpf insns ->
> translated bpf insns -> JITed asm.
> Then report it back to user space and then teach perf to properly annotate progs.
>
> > Will the JITed code from some BPF bytecode be different if the same
> > bytecode is JITed in several machines, all having the exact same
> > hardware?
>
> Yes. JITed code will be different depending on sysctl knobs (like const blinding)
> So the same original bpf byte stream loaded at different times
> may have different JITed image.
>
> Even without security features like blinding the JIT can be different.
> the bpf maps are separate from bpf progs. The bpf map is created first.
> Then the same bpf instruction stream (depending on map type that it uses)
> may be optimized by the verifier differently causing difference in JIT.
>
> > > (instead of passing kallsyms's bpf prog name in event->mmap.filename)
> > > but bpf functions don't have their own prog_id. Multiple bpf funcs
> > > with different JITed blobs are considered to be a part of single prog_id.
> > > So as a step one I'm only extending RECORD_MMAP with addr and kallsym
> > > name of bpf function/prog.
> > > As a step two the plan is to add notification mechanism for prog load/unload
> > > that will include prog_id and design new synthetic RECORD_* events in
> > > perf user space that will contain JITed code, line info, BTF, etc.
> >
> > So, will the kernel JIT a bytecode, load it somewhere and run it, then,
> > when unloading it, keep it somewhere (a filesystem with some limits,
> > etc) so that at some later time (with some timeouts) tooling can, using
> > its id/buildid cookie get the contents and symbol table to have a better
> > annotation experience?
>
> yes. The plan is to let perf fetch JITed image via BPF_OBJ_GET_INFO_BY_FD cmd
> during perf record run and store it inside perf.data in synthetic records.
> Then perf report can annotate it later.

Hi Peter and Arnaldo,

I am working with Alexei on the idea of fetching BPF program information via
BPF_OBJ_GET_INFO_BY_FD cmd. I added PERF_RECORD_BPF_EVENT
to perf_event_type, and dumped these events to perf event ring buffer.

I found that perf will not process events until the end of perf-record:

root@virt-test:~# ~/perf record -ag -- sleep 10
...... 10 seconds later
[ perf record: Woken up 34 times to write data ]
machine__process_bpf_event: prog_id 6 loaded
machine__process_bpf_event: prog_id 6 unloaded
[ perf record: Captured and wrote 9.337 MB perf.data (93178 samples) ]

In this example, the bpf program was loaded and then unloaded in
another terminal. When machine__process_bpf_event() processes
the load event, the bpf program is already unloaded. Therefore,
machine__process_bpf_event() will not be able to get information
about the program via BPF_OBJ_GET_INFO_BY_FD cmd.

To solve this problem, we will need to run BPF_OBJ_GET_INFO_BY_FD
as soon as perf gets the event from the kernel. I looked around the perf
code for a while. But I haven't found a good example where some
events are processed before the end of perf-record. Could you
please help me with this?

Thanks,
Song

^ permalink raw reply

* Re: [PATCH bpf-next v2] tools: bpftool: add map create command
From: Alexei Starovoitov @ 2018-10-15 23:41 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: daniel, oss-drivers, netdev
In-Reply-To: <20181015233036.2822-1-jakub.kicinski@netronome.com>

On Mon, Oct 15, 2018 at 04:30:36PM -0700, Jakub Kicinski wrote:
> Add a way of creating maps from user space.  The command takes
> as parameters most of the attributes of the map creation system
> call command.  After the map is created it's pinned to bpffs.  This makes
> it possible to easily and dynamically (without rebuilding programs)
> test various corner cases related to map creation.
> 
> Map type names are taken from bpftool's array used for printing.
> In general these days we try to make use of libbpf type names, but
> there are no map type names in libbpf as of today.
> 
> As with most features I add the motivation is testing (offloads) :)
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>

Applied, Thanks

^ permalink raw reply

* [PATCH] wlcore: Fix the return value in case of error in 'wlcore_vendor_cmd_smart_config_start()'
From: Christophe JAILLET @ 2018-10-16  7:39 UTC (permalink / raw)
  To: kvalo, davem, tony
  Cc: linux-wireless, netdev, linux-kernel, kernel-janitors,
	Christophe JAILLET

We return 0 unconditionally at the end of
'wlcore_vendor_cmd_smart_config_start()'.
However, 'ret' is set to some error codes in several error handling paths
and we already return some error codes at the beginning of the function.

Return 'ret' instead to propagate the error code.

Fixes: 80ff8063e87c ("wlcore: handle smart config vendor commands")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
 drivers/net/wireless/ti/wlcore/vendor_cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ti/wlcore/vendor_cmd.c b/drivers/net/wireless/ti/wlcore/vendor_cmd.c
index dbe78d8491ef..7f34ec077ee5 100644
--- a/drivers/net/wireless/ti/wlcore/vendor_cmd.c
+++ b/drivers/net/wireless/ti/wlcore/vendor_cmd.c
@@ -70,7 +70,7 @@ wlcore_vendor_cmd_smart_config_start(struct wiphy *wiphy,
 out:
 	mutex_unlock(&wl->mutex);
 
-	return 0;
+	return ret;
 }
 
 static int
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 0/3] nfp: fix pedit set action offloads
From: Jakub Kicinski @ 2018-10-15 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Jakub Kicinski

Hi,

Pieter says:

This set fixes set actions when using multiple pedit actions with
partial masks and with multiple keys per pedit action. Additionally
it fixes set ipv6 pedit action offloads when using it in combination
with other header keys.

The problem would only trigger if one combines multiple pedit actions
of the same type with partial masks, e.g.:

$ tc filter add dev netdev protocol ip parent ffff: \
    flower indev netdev \
    ip_proto tcp \
    action pedit ex munge \
    ip src set 11.11.11.11 retain 65535 munge \
    ip src set 22.22.22.22 retain 4294901760 pipe \
    csum ip and tcp pipe \
    mirred egress redirect dev netdev

Pieter Jansen van Vuuren (3):
  nfp: flower: fix pedit set actions for multiple partial masks
  nfp: flower: fix multiple keys per pedit action
  nfp: flower: use offsets provided by pedit instead of index for ipv6

 .../ethernet/netronome/nfp/flower/action.c    | 51 ++++++++++++-------
 1 file changed, 33 insertions(+), 18 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH net 1/3] nfp: flower: fix pedit set actions for multiple partial masks
From: Jakub Kicinski @ 2018-10-15 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren
In-Reply-To: <20181015235225.17574-1-jakub.kicinski@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously we did not correctly change headers when using multiple
pedit actions with partial masks. We now take this into account and
no longer just commit the last pedit action.

Fixes: c0b1bd9a8b8a ("nfp: add set ipv4 header action flower offload")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/flower/action.c    | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 46ba0cf257c6..91de7a9b0190 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -429,12 +429,14 @@ nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
 
 	switch (off) {
 	case offsetof(struct iphdr, daddr):
-		set_ip_addr->ipv4_dst_mask = mask;
-		set_ip_addr->ipv4_dst = exact;
+		set_ip_addr->ipv4_dst_mask |= mask;
+		set_ip_addr->ipv4_dst &= ~mask;
+		set_ip_addr->ipv4_dst |= exact & mask;
 		break;
 	case offsetof(struct iphdr, saddr):
-		set_ip_addr->ipv4_src_mask = mask;
-		set_ip_addr->ipv4_src = exact;
+		set_ip_addr->ipv4_src_mask |= mask;
+		set_ip_addr->ipv4_src &= ~mask;
+		set_ip_addr->ipv4_src |= exact & mask;
 		break;
 	default:
 		return -EOPNOTSUPP;
@@ -451,8 +453,9 @@ static void
 nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask,
 		      struct nfp_fl_set_ipv6_addr *ip6)
 {
-	ip6->ipv6[idx % 4].mask = mask;
-	ip6->ipv6[idx % 4].exact = exact;
+	ip6->ipv6[idx % 4].mask |= mask;
+	ip6->ipv6[idx % 4].exact &= ~mask;
+	ip6->ipv6[idx % 4].exact |= exact & mask;
 
 	ip6->reserved = cpu_to_be16(0);
 	ip6->head.jump_id = opcode_tag;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 2/3] nfp: flower: fix multiple keys per pedit action
From: Jakub Kicinski @ 2018-10-15 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren
In-Reply-To: <20181015235225.17574-1-jakub.kicinski@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously we only allowed a single header key per pedit action to
change the header. This used to result in the last header key in the
pedit action overwriting previous headers. We now keep track of them
and allow multiple header keys per pedit action.

Fixes: c0b1bd9a8b8a ("nfp: add set ipv4 header action flower offload")
Fixes: 354b82bb320e ("nfp: add set ipv6 source and destination address")
Fixes: f8b7b0a6b113 ("nfp: add set tcp and udp header action flower offload")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/flower/action.c   | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 91de7a9b0190..c39d7fdf73e6 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -544,7 +544,7 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
 	struct nfp_fl_set_eth set_eth;
 	enum pedit_header_type htype;
 	int idx, nkeys, err;
-	size_t act_size;
+	size_t act_size = 0;
 	u32 offset, cmd;
 	u8 ip_proto = 0;
 
@@ -602,7 +602,9 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
 		act_size = sizeof(set_eth);
 		memcpy(nfp_action, &set_eth, act_size);
 		*a_len += act_size;
-	} else if (set_ip_addr.head.len_lw) {
+	}
+	if (set_ip_addr.head.len_lw) {
+		nfp_action += act_size;
 		act_size = sizeof(set_ip_addr);
 		memcpy(nfp_action, &set_ip_addr, act_size);
 		*a_len += act_size;
@@ -610,10 +612,12 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
 		/* Hardware will automatically fix IPv4 and TCP/UDP checksum. */
 		*csum_updated |= TCA_CSUM_UPDATE_FLAG_IPV4HDR |
 				nfp_fl_csum_l4_to_flag(ip_proto);
-	} else if (set_ip6_dst.head.len_lw && set_ip6_src.head.len_lw) {
+	}
+	if (set_ip6_dst.head.len_lw && set_ip6_src.head.len_lw) {
 		/* TC compiles set src and dst IPv6 address as a single action,
 		 * the hardware requires this to be 2 separate actions.
 		 */
+		nfp_action += act_size;
 		act_size = sizeof(set_ip6_src);
 		memcpy(nfp_action, &set_ip6_src, act_size);
 		*a_len += act_size;
@@ -626,6 +630,7 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
 		/* Hardware will automatically fix TCP/UDP checksum. */
 		*csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
 	} else if (set_ip6_dst.head.len_lw) {
+		nfp_action += act_size;
 		act_size = sizeof(set_ip6_dst);
 		memcpy(nfp_action, &set_ip6_dst, act_size);
 		*a_len += act_size;
@@ -633,13 +638,16 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
 		/* Hardware will automatically fix TCP/UDP checksum. */
 		*csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
 	} else if (set_ip6_src.head.len_lw) {
+		nfp_action += act_size;
 		act_size = sizeof(set_ip6_src);
 		memcpy(nfp_action, &set_ip6_src, act_size);
 		*a_len += act_size;
 
 		/* Hardware will automatically fix TCP/UDP checksum. */
 		*csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
-	} else if (set_tport.head.len_lw) {
+	}
+	if (set_tport.head.len_lw) {
+		nfp_action += act_size;
 		act_size = sizeof(set_tport);
 		memcpy(nfp_action, &set_tport, act_size);
 		*a_len += act_size;
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 3/3] nfp: flower: use offsets provided by pedit instead of index for ipv6
From: Jakub Kicinski @ 2018-10-15 23:52 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, Pieter Jansen van Vuuren
In-Reply-To: <20181015235225.17574-1-jakub.kicinski@netronome.com>

From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>

Previously when populating the set ipv6 address action, we incorrectly
made use of pedit's key index to determine which 32bit word should be
set. We now calculate which word has been selected based on the offset
provided by the pedit action.

Fixes: 354b82bb320e ("nfp: add set ipv6 source and destination address")
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../ethernet/netronome/nfp/flower/action.c    | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index c39d7fdf73e6..7a1e9cd9cc62 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -450,12 +450,12 @@ nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
 }
 
 static void
-nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask,
+nfp_fl_set_ip6_helper(int opcode_tag, u8 word, __be32 exact, __be32 mask,
 		      struct nfp_fl_set_ipv6_addr *ip6)
 {
-	ip6->ipv6[idx % 4].mask |= mask;
-	ip6->ipv6[idx % 4].exact &= ~mask;
-	ip6->ipv6[idx % 4].exact |= exact & mask;
+	ip6->ipv6[word].mask |= mask;
+	ip6->ipv6[word].exact &= ~mask;
+	ip6->ipv6[word].exact |= exact & mask;
 
 	ip6->reserved = cpu_to_be16(0);
 	ip6->head.jump_id = opcode_tag;
@@ -468,6 +468,7 @@ nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
 	       struct nfp_fl_set_ipv6_addr *ip_src)
 {
 	__be32 exact, mask;
+	u8 word;
 
 	/* We are expecting tcf_pedit to return a big endian value */
 	mask = (__force __be32)~tcf_pedit_mask(action, idx);
@@ -476,17 +477,20 @@ nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
 	if (exact & ~mask)
 		return -EOPNOTSUPP;
 
-	if (off < offsetof(struct ipv6hdr, saddr))
+	if (off < offsetof(struct ipv6hdr, saddr)) {
 		return -EOPNOTSUPP;
-	else if (off < offsetof(struct ipv6hdr, daddr))
-		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx,
+	} else if (off < offsetof(struct ipv6hdr, daddr)) {
+		word = (off - offsetof(struct ipv6hdr, saddr)) / sizeof(exact);
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, word,
 				      exact, mask, ip_src);
-	else if (off < offsetof(struct ipv6hdr, daddr) +
-		       sizeof(struct in6_addr))
-		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx,
+	} else if (off < offsetof(struct ipv6hdr, daddr) +
+		       sizeof(struct in6_addr)) {
+		word = (off - offsetof(struct ipv6hdr, daddr)) / sizeof(exact);
+		nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, word,
 				      exact, mask, ip_dst);
-	else
+	} else {
 		return -EOPNOTSUPP;
+	}
 
 	return 0;
 }
-- 
2.17.1

^ permalink raw reply related

* pull-request: bpf-next 2018-10-16
From: Daniel Borkmann @ 2018-10-16  0:33 UTC (permalink / raw)
  To: davem; +Cc: daniel, ast, netdev

Hi David,

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Convert BPF sockmap and kTLS to both use a new sk_msg API and enable
   sk_msg BPF integration for the latter, from Daniel and John.

2) Enable BPF syscall side to indicate for maps that they do not support
   a map lookup operation as opposed to just missing key, from Prashant.

3) Add bpftool map create command which after map creation pins the
   map into bpf fs for further processing, from Jakub.

4) Add bpftool support for attaching programs to maps allowing sock_map
   and sock_hash to be used from bpftool, from John.

5) Improve syscall BPF map update/delete path for map-in-map types to
   wait a RCU grace period for pending references to complete, from Daniel.

6) Couple of follow-up fixes for the BPF socket lookup to get it
   enabled also when IPv6 is compiled as a module, from Joe.

7) Fix a generic-XDP bug to handle the case when the Ethernet header
   was mangled and thus update skb's protocol and data, from Jesper.

8) Add a missing BTF header length check between header copies from
   user space, from Wenwen.

9) Minor fixups in libbpf to use __u32 instead of u32 types and include
   proper perf_event.h uapi header instead of perf internal one, from Yonghong.

10) Allow to pass user-defined flags through EXTRA_CFLAGS and EXTRA_LDFLAGS
    to bpftool's build, from Jiri.

11) BPF kselftest tweaks to add LWTUNNEL to config fragment and to install
    with_addr.sh script from flow dissector selftest, from Anders.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git

Thanks a lot!

----------------------------------------------------------------

The following changes since commit 071a234ad744ab9a1e9c948874d5f646a2964734:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next (2018-10-08 23:42:44 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git 

for you to fetch changes up to 0b592b5a01bef5416472ec610d3191e019c144a5:

  tools: bpftool: add map create command (2018-10-15 16:39:21 -0700)

----------------------------------------------------------------
Alexei Starovoitov (5):
      Merge branch 'unsupported-map-lookup'
      Merge branch 'xdp-vlan'
      Merge branch 'sockmap_and_ktls'
      Merge branch 'ipv6_sk_lookup_fixes'
      Merge branch 'bpftool_sockmap'

Anders Roxell (2):
      selftests: bpf: add config fragment LWTUNNEL
      selftests: bpf: install script with_addr.sh

Daniel Borkmann (5):
      tcp, ulp: enforce sock_owned_by_me upon ulp init and cleanup
      tcp, ulp: remove ulp bits from sockmap
      bpf, sockmap: convert to generic sk_msg interface
      tls: convert to generic sk_msg interface
      bpf, doc: add maintainers entry to related files

Daniel Colascione (1):
      bpf: wait for running BPF programs when updating map-in-map

Jakub Kicinski (1):
      tools: bpftool: add map create command

Jesper Dangaard Brouer (3):
      net: fix generic XDP to handle if eth header was mangled
      bpf: make TC vlan bpf_helpers avail to selftests
      selftests/bpf: add XDP selftests for modifying and popping VLAN headers

Jiri Olsa (2):
      bpftool: Allow to add compiler flags via EXTRA_CFLAGS variable
      bpftool: Allow add linker flags via EXTRA_LDFLAGS variable

Joe Stringer (3):
      bpf: Fix dev pointer dereference from sk_skb
      bpf: Allow sk_lookup with IPv6 module
      bpf: Fix IPv6 dport byte-order in bpf_sk_lookup

John Fastabend (5):
      tls: replace poll implementation with read hook
      tls: add bpf support to sk_msg handling
      bpf: add tls support for testing in test_sockmap
      bpf: bpftool, add support for attaching programs to maps
      bpf: bpftool, add flag to allow non-compat map definitions

Prashant Bhole (6):
      bpf: error handling when map_lookup_elem isn't supported
      bpf: return EOPNOTSUPP when map lookup isn't supported
      tools/bpf: bpftool, split the function do_dump()
      tools/bpf: bpftool, print strerror when map lookup error occurs
      selftests/bpf: test_verifier, change names of fixup maps
      selftests/bpf: test_verifier, check bpf_map_lookup_elem access in bpf prog

Wenwen Wang (1):
      bpf: btf: Fix a missing check bug

Yonghong Song (1):
      tools/bpf: use proper type and uapi perf_event.h header for libbpf

 MAINTAINERS                                      |   10 +
 include/linux/bpf.h                              |   33 +-
 include/linux/bpf_types.h                        |    2 +-
 include/linux/filter.h                           |   21 -
 include/linux/skmsg.h                            |  410 ++++
 include/net/addrconf.h                           |    5 +
 include/net/sock.h                               |    4 -
 include/net/tcp.h                                |   28 +-
 include/net/tls.h                                |   24 +-
 kernel/bpf/Makefile                              |    5 -
 kernel/bpf/arraymap.c                            |    2 +-
 kernel/bpf/btf.c                                 |    3 +
 kernel/bpf/core.c                                |    2 -
 kernel/bpf/sockmap.c                             | 2629 ----------------------
 kernel/bpf/stackmap.c                            |    2 +-
 kernel/bpf/syscall.c                             |   28 +-
 kernel/bpf/xskmap.c                              |    2 +-
 net/Kconfig                                      |   11 +
 net/core/Makefile                                |    2 +
 net/core/dev.c                                   |   14 +
 net/core/filter.c                                |  290 +--
 net/core/skmsg.c                                 |  802 +++++++
 net/core/sock.c                                  |   61 -
 net/core/sock_map.c                              | 1002 +++++++++
 net/ipv4/Makefile                                |    1 +
 net/ipv4/tcp_bpf.c                               |  655 ++++++
 net/ipv4/tcp_ulp.c                               |   73 +-
 net/ipv6/af_inet6.c                              |    1 +
 net/strparser/Kconfig                            |    4 +-
 net/tls/Kconfig                                  |    1 +
 net/tls/tls_device.c                             |    2 +-
 net/tls/tls_main.c                               |   11 +-
 net/tls/tls_sw.c                                 |  900 +++++---
 tools/bpf/bpftool/Documentation/bpftool-map.rst  |   15 +-
 tools/bpf/bpftool/Documentation/bpftool-prog.rst |   11 +
 tools/bpf/bpftool/Documentation/bpftool.rst      |   10 +-
 tools/bpf/bpftool/Makefile                       |    9 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   59 +-
 tools/bpf/bpftool/common.c                       |   21 +
 tools/bpf/bpftool/main.c                         |    7 +-
 tools/bpf/bpftool/main.h                         |    4 +-
 tools/bpf/bpftool/map.c                          |  212 +-
 tools/bpf/bpftool/prog.c                         |  101 +-
 tools/lib/bpf/Makefile                           |    2 +-
 tools/lib/bpf/bpf.h                              |    3 +
 tools/lib/bpf/libbpf.c                           |   35 +-
 tools/lib/bpf/libbpf.h                           |    2 +
 tools/testing/selftests/bpf/Makefile             |    8 +-
 tools/testing/selftests/bpf/bpf_helpers.h        |    4 +
 tools/testing/selftests/bpf/config               |    1 +
 tools/testing/selftests/bpf/test_sockmap.c       |   89 +
 tools/testing/selftests/bpf/test_verifier.c      |  501 +++--
 tools/testing/selftests/bpf/test_xdp_vlan.c      |  292 +++
 tools/testing/selftests/bpf/test_xdp_vlan.sh     |  195 ++
 54 files changed, 4962 insertions(+), 3659 deletions(-)
 create mode 100644 include/linux/skmsg.h
 delete mode 100644 kernel/bpf/sockmap.c
 create mode 100644 net/core/skmsg.c
 create mode 100644 net/core/sock_map.c
 create mode 100644 net/ipv4/tcp_bpf.c
 create mode 100644 tools/testing/selftests/bpf/test_xdp_vlan.c
 create mode 100755 tools/testing/selftests/bpf/test_xdp_vlan.sh

^ permalink raw reply

* Re: [PATCH ipsec-next] xfrm: use complete IPv6 addresses for hash
From: Steffen Klassert @ 2018-10-16  8:26 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: Herbert Xu, David S. Miller, netdev, linux-kernel
In-Reply-To: <20181012122444.0448FA0ED9@unicorn.suse.cz>

On Fri, Oct 12, 2018 at 02:24:44PM +0200, Michal Kubecek wrote:
> In some environments it is common that many hosts share the same lower half
> of their IPv6 addresses (in particular ::1). As __xfrm6_addr_hash() and
> __xfrm6_daddr_saddr_hash() calculate the hash only from the lower halves,
> as many as 1/3 of the hosts end up in one hashtable chain, which harms the
> performance.
> 
> Use complete IPv6 addresses when calculating the hashes. Rather than just
> adding two more words to the xor, use jhash2() for consistency with
> __xfrm6_pref_hash() and __xfrm6_dpref_spref_hash().
> 
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>

Applied to ipsec-next, thanks a lot!

^ permalink raw reply

* Re: [PATCH bpf-next v2 7/8] bpf: add tls support for testing in test_sockmap
From: Andrey Ignatov @ 2018-10-16  0:42 UTC (permalink / raw)
  To: Daniel Borkmann, john.fastabend@gmail.com
  Cc: alexei.starovoitov@gmail.com, Dave Watson, netdev@vger.kernel.org
In-Reply-To: <20181013004603.3747-8-daniel@iogearbox.net>

Hi Daniel and John!

Daniel Borkmann <daniel@iogearbox.net> [Fri, 2018-10-12 17:46 -0700]:
> From: John Fastabend <john.fastabend@gmail.com>
> 
> This adds a --ktls option to test_sockmap in order to enable the
> combination of ktls and sockmap to run, which makes for another
> batch of 648 test cases for both in combination.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> ---
>  tools/testing/selftests/bpf/test_sockmap.c | 89 ++++++++++++++++++++++++++++++
>  1 file changed, 89 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index ac7de38..10a5fa8 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -71,6 +71,7 @@ int txmsg_start;
>  int txmsg_end;
>  int txmsg_ingress;
>  int txmsg_skb;
> +int ktls;
>  
>  static const struct option long_options[] = {
>  	{"help",	no_argument,		NULL, 'h' },
> @@ -92,6 +93,7 @@ static const struct option long_options[] = {
>  	{"txmsg_end",	required_argument,	NULL, 'e'},
>  	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
>  	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
> +	{"ktls", no_argument,			&ktls, 1 },
>  	{0, 0, NULL, 0 }
>  };
>  
> @@ -112,6 +114,76 @@ static void usage(char *argv[])
>  	printf("\n");
>  }
>  
> +#define TCP_ULP 31
> +#define TLS_TX 1
> +#define TLS_RX 2
> +#include <linux/tls.h>

This breaks selftest build for me:
  test_sockmap.c:120:23: fatal error: linux/tls.h: No such file or directory
   #include <linux/tls.h>
                         ^
  compilation terminated.

Should include/uapi/linux/tls.h be copied to tools/ not to depend on
host headers?

> +
> +char *sock_to_string(int s)
> +{
> +	if (s == c1)
> +		return "client1";
> +	else if (s == c2)
> +		return "client2";
> +	else if (s == s1)
> +		return "server1";
> +	else if (s == s2)
> +		return "server2";
> +	else if (s == p1)
> +		return "peer1";
> +	else if (s == p2)
> +		return "peer2";
> +	else
> +		return "unknown";
> +}
> +
> +static int sockmap_init_ktls(int verbose, int s)
> +{
> +	struct tls12_crypto_info_aes_gcm_128 tls_tx = {
> +		.info = {
> +			.version     = TLS_1_2_VERSION,
> +			.cipher_type = TLS_CIPHER_AES_GCM_128,
> +		},
> +	};
> +	struct tls12_crypto_info_aes_gcm_128 tls_rx = {
> +		.info = {
> +			.version     = TLS_1_2_VERSION,
> +			.cipher_type = TLS_CIPHER_AES_GCM_128,
> +		},
> +	};
> +	int so_buf = 6553500;
> +	int err;
> +
> +	err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls"));
> +	if (err) {
> +		fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err);
> +		return -EINVAL;
> +	}
> +	err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx));
> +	if (err) {
> +		fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err);
> +		return -EINVAL;
> +	}
> +	err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx));
> +	if (err) {
> +		fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err);
> +		return -EINVAL;
> +	}
> +	err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf));
> +	if (err) {
> +		fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err);
> +		return -EINVAL;
> +	}
> +	err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf));
> +	if (err) {
> +		fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err);
> +		return -EINVAL;
> +	}
> +
> +	if (verbose)
> +		fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s));
> +	return 0;
> +}
>  static int sockmap_init_sockets(int verbose)
>  {
>  	int i, err, one = 1;
> @@ -456,6 +528,21 @@ static int sendmsg_test(struct sockmap_options *opt)
>  	else
>  		rx_fd = p2;
>  
> +	if (ktls) {
> +		/* Redirecting into non-TLS socket which sends into a TLS
> +		 * socket is not a valid test. So in this case lets not
> +		 * enable kTLS but still run the test.
> +		 */
> +		if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
> +			err = sockmap_init_ktls(opt->verbose, rx_fd);
> +			if (err)
> +				return err;
> +		}
> +		err = sockmap_init_ktls(opt->verbose, c1);
> +		if (err)
> +			return err;
> +	}
> +
>  	rxpid = fork();
>  	if (rxpid == 0) {
>  		if (opt->drop_expected)
> @@ -907,6 +994,8 @@ static void test_options(char *options)
>  		strncat(options, "ingress,", OPTSTRING);
>  	if (txmsg_skb)
>  		strncat(options, "skb,", OPTSTRING);
> +	if (ktls)
> +		strncat(options, "ktls,", OPTSTRING);
>  }
>  
>  static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
> -- 
> 2.9.5
> 

-- 
Andrey Ignatov

^ permalink raw reply

* Re: [PATCH bpf-next v2 7/8] bpf: add tls support for testing in test_sockmap
From: Daniel Borkmann @ 2018-10-16  0:48 UTC (permalink / raw)
  To: Andrey Ignatov, john.fastabend@gmail.com
  Cc: alexei.starovoitov@gmail.com, Dave Watson, netdev@vger.kernel.org
In-Reply-To: <20181016004243.GA95609@rdna-mbp.dhcp.thefacebook.com>

On 10/16/2018 02:42 AM, Andrey Ignatov wrote:
> Hi Daniel and John!
> 
> Daniel Borkmann <daniel@iogearbox.net> [Fri, 2018-10-12 17:46 -0700]:
>> From: John Fastabend <john.fastabend@gmail.com>
>>
>> This adds a --ktls option to test_sockmap in order to enable the
>> combination of ktls and sockmap to run, which makes for another
>> batch of 648 test cases for both in combination.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
>> ---
>>  tools/testing/selftests/bpf/test_sockmap.c | 89 ++++++++++++++++++++++++++++++
>>  1 file changed, 89 insertions(+)
>>
>> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
>> index ac7de38..10a5fa8 100644
>> --- a/tools/testing/selftests/bpf/test_sockmap.c
>> +++ b/tools/testing/selftests/bpf/test_sockmap.c
>> @@ -71,6 +71,7 @@ int txmsg_start;
>>  int txmsg_end;
>>  int txmsg_ingress;
>>  int txmsg_skb;
>> +int ktls;
>>  
>>  static const struct option long_options[] = {
>>  	{"help",	no_argument,		NULL, 'h' },
>> @@ -92,6 +93,7 @@ static const struct option long_options[] = {
>>  	{"txmsg_end",	required_argument,	NULL, 'e'},
>>  	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
>>  	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
>> +	{"ktls", no_argument,			&ktls, 1 },
>>  	{0, 0, NULL, 0 }
>>  };
>>  
>> @@ -112,6 +114,76 @@ static void usage(char *argv[])
>>  	printf("\n");
>>  }
>>  
>> +#define TCP_ULP 31
>> +#define TLS_TX 1
>> +#define TLS_RX 2
>> +#include <linux/tls.h>
> 
> This breaks selftest build for me:
>   test_sockmap.c:120:23: fatal error: linux/tls.h: No such file or directory
>    #include <linux/tls.h>
>                          ^
>   compilation terminated.
> 
> Should include/uapi/linux/tls.h be copied to tools/ not to depend on
> host headers?

Good point, yes, that should happen; will send a fix tomorrow morning.

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH] virtio_net: enable tx after resuming from suspend
From: Jason Wang @ 2018-10-16  8:53 UTC (permalink / raw)
  To: ake
  Cc: Michael S. Tsirkin, David S. Miller, virtualization, netdev,
	linux-kernel
In-Reply-To: <e2baaccc-4ead-e61d-fc1e-d79435012e1c@igel.co.jp>


On 2018/10/15 下午6:08, ake wrote:
>
> On 2018年10月12日 18:18, ake wrote:
>>
>> On 2018年10月12日 17:23, Jason Wang wrote:
>>>
>>> On 2018年10月12日 12:30, ake wrote:
>>>> On 2018年10月11日 22:06, Jason Wang wrote:
>>>>> On 2018年10月11日 18:22, ake wrote:
>>>>>> On 2018年10月11日 18:44, Jason Wang wrote:
>>>>>>> On 2018年10月11日 15:51, Ake Koomsin wrote:
>>>>>>>> commit 713a98d90c5e ("virtio-net: serialize tx routine during reset")
>>>>>>>> disabled the virtio tx before going to suspend to avoid a use after
>>>>>>>> free.
>>>>>>>> However, after resuming, it causes the virtio_net device to lose its
>>>>>>>> network connectivity.
>>>>>>>>
>>>>>>>> To solve the issue, we need to enable tx after resuming.
>>>>>>>>
>>>>>>>> Fixes commit 713a98d90c5e ("virtio-net: serialize tx routine during
>>>>>>>> reset")
>>>>>>>> Signed-off-by: Ake Koomsin <ake@igel.co.jp>
>>>>>>>> ---
>>>>>>>>      drivers/net/virtio_net.c | 1 +
>>>>>>>>      1 file changed, 1 insertion(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>>>>>> index dab504ec5e50..3453d80f5f81 100644
>>>>>>>> --- a/drivers/net/virtio_net.c
>>>>>>>> +++ b/drivers/net/virtio_net.c
>>>>>>>> @@ -2256,6 +2256,7 @@ static int virtnet_restore_up(struct
>>>>>>>> virtio_device *vdev)
>>>>>>>>          }
>>>>>>>>            netif_device_attach(vi->dev);
>>>>>>>> +    netif_start_queue(vi->dev);
>>>>>>> I believe this is duplicated with netif_tx_wake_all_queues() in
>>>>>>> netif_device_attach() above?
>>>>>> Thank you for your review.
>>>>>>
>>>>>> If both netif_tx_wake_all_queues() and netif_start_queue() result in
>>>>>> clearing __QUEUE_STATE_DRV_XOFF, then is it possible that some
>>>>>> conditions in netif_device_attach() is not satisfied?
>>>>> Yes, maybe. One case I can see now is when the device is down, in this
>>>>> case netif_device_attach() won't try to wakeup the queue.
>>>>>
>>>>>>     Without
>>>>>> netif_start_queue(), the virtio_net device does not resume properly
>>>>>> after waking up.
>>>>> How do you trigger the issue? Just do suspend/resume?
>>>> Yes, simply suspend and resume.
>>>>
>>>> Here is how I trigger the issue:
>>>>
>>>> 1) Start the Virtual Machine Manager GUI program.
>>>> 2) Create a guest Linux OS. Make sure that the guest OS kernel is
>>>>      >= 4.12. Make sure that it uses virtio_net as its network device.
>>>>      In addition, make sure that the video adapter is VGA. Otherwise,
>>>>      waking up with the virtual power button does not work.
>>>> 3) After installing the guest OS, log in, and test the network
>>>>      connectivity by ping the host machine.
>>>> 4) Suspend. After this, the screen is blank.
>>>> 5) Resume by hitting the virtual power button. The login screen
>>>>      appears again.
>>>> 6) Log in again. The guest loses its network connection.
>>>>
>>>> In my test:
>>>> Guest: Ubuntu 16.04/18.04 with kernel 4.15.0-36-generic
>>>> Host: Ubuntu 16.04 with kernel 4.15.0-36-generic/4.4.0-137-generic
>>> I can not reproduce this issue if virtio-net interface is up in guest
>>> before the suspend. I'm using net-next.git and qemu master. But I do
>>> reproduce when virtio-net interface is down in guest before suspend,
>>> after resume, even if I make it up, the network is still lost.
>>>
>>> I think the interface is up in your case, but please confirm this.
>> If you mean the interface state before I hit the suspend button,
>> the answer is yes. The interface is up before I suspend the guest
>> machine.
>>
>> Note that my current QEMU version is QEMU emulator version 2.5.0
>> (Debian 1:2.5+dfsg-5ubuntu10.32).
>>
>> I will try with net-next.git and qemu master later and see if I can
>> reproduce the issue.
> Update. I tried with net-next and qemu master. Interestingly, the result
> is different from yours. The network is lost even if the virtio_net
> interface is up before suspending.
>
> Host: Ubuntu 16.04 with net-next kernel (default configuration)
> Guest: Ubuntu 18.04 with net-next kernel (default configuration)
> Qemu: master
> Qemu command:
> qemu-system-x86_64 -cpu host -m 2048 -enable-kvm \
> -bios /usr/share/OVMF/OVMF_CODE.fd \
> -drive file=/var/lib/libvirt/images/virtio_test.qcow2,if=virtio \
> -netdev user,id=hostnet0 \
> -device virtio-net-pci,netdev=hostnet0 \
> -device VGA,id=video0,vgamem_mb=16 \
> -global PIIX4_PM.disable_s3=1 \
> -global PIIX4_PM.disable_s4=1 -monitor stdio


Interesting, I just noticed you're using userspace networking. To isolate the
issue, can you retry with e.g. tap or e1000 to make sure it's not a fault
of slirp or virtio-net?

Thanks

^ permalink raw reply

* [PATCH v2 net-next 01/11] netlink: Add answer_flags to netlink_callback
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

With dump filtering we need a way to ensure the NLM_F_DUMP_FILTERED
flag is set on a message back to the user if the data returned is
influenced by some input attributes. Normally this can be done as
messages are added to the skb, but if the filter results in no data
being returned, the user could be confused as to why.

This patch adds answer_flags to the netlink_callback allowing dump
handlers to set the NLM_F_DUMP_FILTERED at a minimum in the
NLMSG_DONE message ensuring the flag gets back to the user.

The netlink_callback space is initialized to 0 via a memset in
__netlink_dump_start, so init of the new answer_flags is covered.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/netlink.h  | 1 +
 net/netlink/af_netlink.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 72580f1a72a2..4da90a6ab536 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -180,6 +180,7 @@ struct netlink_callback {
 	u16			family;
 	u16			min_dump_alloc;
 	bool			strict_check;
+	u16			answer_flags;
 	unsigned int		prev_seq, seq;
 	long			args[6];
 };
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e613a9f89600..6bb9f3cde0b0 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2257,7 +2257,8 @@ static int netlink_dump(struct sock *sk)
 	}

 	nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE,
-			       sizeof(nlk->dump_done_errno), NLM_F_MULTI);
+			       sizeof(nlk->dump_done_errno),
+			       NLM_F_MULTI | cb->answer_flags);
 	if (WARN_ON(!nlh))
 		goto errout_skb;

-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 05/11] net/mpls: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
protocol. MPLS uses only a single table and route type.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bfcb4759c9ee..48f4cbd9fb38 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2067,12 +2067,35 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 }
 #endif
 
+static bool mpls_rt_uses_dev(struct mpls_route *rt,
+			     const struct net_device *dev)
+{
+	struct net_device *nh_dev;
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		nh_dev = rtnl_dereference(nh->nh_dev);
+		if (dev == nh_dev)
+			return true;
+	} else {
+		for_nexthops(rt) {
+			nh_dev = rtnl_dereference(nh->nh_dev);
+			if (nh_dev == dev)
+				return true;
+		} endfor_nexthops(rt);
+	}
+
+	return false;
+}
+
 static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
 	struct fib_dump_filter filter = {};
+	unsigned int flags = NLM_F_MULTI;
 	size_t platform_labels;
 	unsigned int index;
 
@@ -2084,6 +2107,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
+
+		/* for MPLS, there is only 1 table with fixed type and flags.
+		 * If the filter asks for anything else then return nothing.
+		 */
+		if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) ||
+		    (filter.rt_type && filter.rt_type != RTN_UNICAST) ||
+		     filter.flags)
+			return skb->len;
 	}
 
 	index = cb->args[0];
@@ -2092,15 +2123,24 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	platform_labels = net->mpls.platform_labels;
+
+	if (filter.filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	for (; index < platform_labels; index++) {
 		struct mpls_route *rt;
+
 		rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
 
+		if ((filter.dev && !mpls_rt_uses_dev(rt, filter.dev)) ||
+		    (filter.protocol && rt->rt_protocol != filter.protocol))
+			continue;
+
 		if (mpls_dump_route(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    index, rt, NLM_F_MULTI) < 0)
+				    index, rt, flags) < 0)
 			break;
 	}
 	cb->args[0] = index;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 00/11] net: Kernel side filtering for route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of route dumps by protocol (e.g., which
routing daemon installed the route), route type (e.g., unicast), table
id and nexthop device.

iproute2 has been doing this filtering in userspace for years; pushing
the filters to the kernel side reduces the amount of data the kernel
sends and reduces wasted cycles on both sides processing unwanted data.
These initial options provide a huge improvement for efficiently
examining routes on large scale systems.

v2
- better handling of requests for a specific table. Rather than walking
  the hash of all tables, lookup the specific table and dump it
- refactor mr_rtm_dumproute moving the loop over the table into a
  helper that can be invoked directly
- add hook to return NLM_F_DUMP_FILTERED in DONE message to ensure
  it is returned even when the dump returns nothing

David Ahern (11):
  netlink: Add answer_flags to netlink_callback
  net: Add struct for fib dump filter
  net/ipv4: Plumb support for filtering route dumps
  net/ipv6: Plumb support for filtering route dumps
  net/mpls: Plumb support for filtering route dumps
  ipmr: Refactor mr_rtm_dumproute
  net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
  net: Enable kernel side filtering of route dumps
  net/mpls: Handle kernel side filtering of route dumps
  net/ipv6: Bail early if user only wants cloned entries
  net/ipv4: Bail early if user only wants prefix entries

 include/linux/mroute_base.h |  11 +++-
 include/linux/netlink.h     |   1 +
 include/net/ip6_route.h     |   1 +
 include/net/ip_fib.h        |  17 ++++--
 net/ipv4/fib_frontend.c     |  76 ++++++++++++++++++++++----
 net/ipv4/fib_trie.c         |  37 +++++++++----
 net/ipv4/ipmr.c             |  22 ++++++--
 net/ipv4/ipmr_base.c        | 126 ++++++++++++++++++++++++++++++++------------
 net/ipv6/ip6_fib.c          |  34 +++++++++---
 net/ipv6/ip6mr.c            |  21 ++++++--
 net/ipv6/route.c            |  40 +++++++++++---
 net/mpls/af_mpls.c          |  92 +++++++++++++++++++++++++++-----
 net/netlink/af_netlink.c    |   3 +-
 13 files changed, 386 insertions(+), 95 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH v2 net-next 03/11] net/ipv4: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device index,
protocol and route type. If the table id is given in the filter, look up the
table and call fib_table_dump directly for it.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip_fib.h    |  2 +-
 net/ipv4/fib_frontend.c | 13 ++++++++++++-
 net/ipv4/fib_trie.c     | 37 ++++++++++++++++++++++++++-----------
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 667013bf4266..1eabc9edd2b9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -239,7 +239,7 @@ int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
 int fib_table_delete(struct net *, struct fib_table *, struct fib_config *,
 		     struct netlink_ext_ack *extack);
 int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
-		   struct netlink_callback *cb);
+		   struct netlink_callback *cb, struct fib_dump_filter *filter);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
 void fib_table_flush_external(struct fib_table *table);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 850850dd80e1..37dc8ac366fd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -855,6 +855,17 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	    ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED)
 		return skb->len;
 
+	if (filter.table_id) {
+		tb = fib_get_table(net, filter.table_id);
+		if (!tb) {
+			NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+			return -ENOENT;
+		}
+
+		err = fib_table_dump(tb, skb, cb, &filter);
+		return skb->len ? : err;
+	}
+
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
@@ -869,7 +880,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 			if (dumped)
 				memset(&cb->args[2], 0, sizeof(cb->args) -
 						 2 * sizeof(cb->args[0]));
-			err = fib_table_dump(tb, skb, cb);
+			err = fib_table_dump(tb, skb, cb, &filter);
 			if (err < 0) {
 				if (likely(skb->len))
 					goto out;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..237c9f72b265 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2003,12 +2003,17 @@ void fib_free_table(struct fib_table *tb)
 }
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
-			     struct sk_buff *skb, struct netlink_callback *cb)
+			     struct sk_buff *skb, struct netlink_callback *cb,
+			     struct fib_dump_filter *filter)
 {
+	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
 	struct fib_alias *fa;
 	int i, s_i;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	s_i = cb->args[4];
 	i = 0;
 
@@ -2016,25 +2021,35 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
 		int err;
 
-		if (i < s_i) {
-			i++;
-			continue;
-		}
+		if (i < s_i)
+			goto next;
 
-		if (tb->tb_id != fa->tb_id) {
-			i++;
-			continue;
+		if (tb->tb_id != fa->tb_id)
+			goto next;
+
+		if (filter->filter_set) {
+			if (filter->rt_type && fa->fa_type != filter->rt_type)
+				goto next;
+
+			if ((filter->protocol &&
+			     fa->fa_info->fib_protocol != filter->protocol))
+				goto next;
+
+			if (filter->dev &&
+			    !fib_info_nh_uses_dev(fa->fa_info, filter->dev))
+				goto next;
 		}
 
 		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
 				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 				    tb->tb_id, fa->fa_type,
 				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+				    fa->fa_tos, fa->fa_info, flags);
 		if (err < 0) {
 			cb->args[4] = i;
 			return err;
 		}
+next:
 		i++;
 	}
 
@@ -2044,7 +2059,7 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 /* rcu_read_lock needs to be hold by caller from readside */
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
-		   struct netlink_callback *cb)
+		   struct netlink_callback *cb, struct fib_dump_filter *filter)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *l, *tp = t->kv;
@@ -2057,7 +2072,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
 		int err;
 
-		err = fn_trie_dump_leaf(l, tb, skb, cb);
+		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
 		if (err < 0) {
 			cb->args[3] = key;
 			cb->args[2] = count;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 04/11] net/ipv6: Plumb support for filtering route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by table id, egress device
index, protocol, and route type. If the table id is given in the filter,
look up the table and call fib6_dump_table directly for it.

Move the existing route flags check for prefix only routes to the new
filter.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c | 28 ++++++++++++++++++++++------
 net/ipv6/route.c   | 40 ++++++++++++++++++++++++++++++++--------
 2 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 94e61fe47ff8..a51fc357a05c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -583,10 +583,12 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
-	}
+	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+		struct rtmsg *rtm = nlmsg_data(nlh);
 
-	s_h = cb->args[0];
-	s_e = cb->args[1];
+		if (rtm->rtm_flags & RTM_F_PREFIX)
+			arg.filter.flags = RTM_F_PREFIX;
+	}
 
 	w = (void *)cb->args[2];
 	if (!w) {
@@ -612,6 +614,20 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	arg.net = net;
 	w->args = &arg;
 
+	if (arg.filter.table_id) {
+		tb = fib6_get_table(net, arg.filter.table_id);
+		if (!tb) {
+			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
+			return -ENOENT;
+		}
+
+		res = fib6_dump_table(tb, skb, cb);
+		goto out;
+	}
+
+	s_h = cb->args[0];
+	s_e = cb->args[1];
+
 	rcu_read_lock();
 	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
 		e = 0;
@@ -621,16 +637,16 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 				goto next;
 			res = fib6_dump_table(tb, skb, cb);
 			if (res != 0)
-				goto out;
+				goto out_unlock;
 next:
 			e++;
 		}
 	}
-out:
+out_unlock:
 	rcu_read_unlock();
 	cb->args[1] = e;
 	cb->args[0] = h;
-
+out:
 	res = res < 0 ? res : skb->len;
 	if (res <= 0)
 		fib6_dump_end(cb);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4e08b0689a8..9fd600e42f9d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4767,28 +4767,52 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+			       const struct net_device *dev)
+{
+	if (f6i->fib6_nh.nh_dev == dev)
+		return true;
+
+	if (f6i->fib6_nsiblings) {
+		struct fib6_info *sibling, *next_sibling;
+
+		list_for_each_entry_safe(sibling, next_sibling,
+					 &f6i->fib6_siblings, fib6_siblings) {
+			if (sibling->fib6_nh.nh_dev == dev)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
 {
 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+	struct fib_dump_filter *filter = &arg->filter;
+	unsigned int flags = NLM_F_MULTI;
 	struct net *net = arg->net;
 
 	if (rt == net->ipv6.fib6_null_entry)
 		return 0;
 
-	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
-		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
-
-		/* user wants prefix routes only */
-		if (rtm->rtm_flags & RTM_F_PREFIX &&
-		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
-			/* success since this is not a prefix route */
+	if ((filter->flags & RTM_F_PREFIX) &&
+	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
+		/* success since this is not a prefix route */
+		return 1;
+	}
+	if (filter->filter_set) {
+		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
 			return 1;
 		}
+		flags |= NLM_F_DUMP_FILTERED;
 	}
 
 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
-			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+			     arg->cb->nlh->nlmsg_seq, flags);
 }
 
 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 10/11] net/ipv6: Bail early if user only wants cloned entries
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Similar to IPv4, IPv6 fib no longer contains cloned routes. If a user
requests a route dump for only cloned entries, no sense walking the FIB
and returning everything.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/ip6_fib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5562c77022c6..2a058b408a6a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -586,10 +586,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
 		struct rtmsg *rtm = nlmsg_data(nlh);
 
-		if (rtm->rtm_flags & RTM_F_PREFIX)
-			arg.filter.flags = RTM_F_PREFIX;
+		arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED);
 	}
 
+	/* fib entries are never clones */
+	if (arg.filter.flags & RTM_F_CLONED)
+		return skb->len;
+
 	w = (void *)cb->args[2];
 	if (!w) {
 		/* New dump:
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 07/11] net: Plumb support for filtering ipv4 and ipv6 multicast route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Implement kernel side filtering of routes by egress device index and
table id. If the table id is given in the filter, look up the table and
call mr_table_dump directly for it.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/mroute_base.h |  7 ++++---
 net/ipv4/ipmr.c             | 18 +++++++++++++++---
 net/ipv4/ipmr_base.c        | 42 +++++++++++++++++++++++++++++++++++++++---
 net/ipv6/ip6mr.c            | 18 +++++++++++++++---
 4 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index db85373c8d15..34de06b426ef 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -7,6 +7,7 @@
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/fib_notifier.h>
+#include <net/ip_fib.h>
 
 /**
  * struct vif_device - interface representor for multicast routing
@@ -288,7 +289,7 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
 		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
 			      u32 portid, u32 seq, struct mr_mfc *c,
 			      int cmd, int flags),
-		  spinlock_t *lock);
+		  spinlock_t *lock, struct fib_dump_filter *filter);
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -296,7 +297,7 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock);
+		     spinlock_t *lock, struct fib_dump_filter *filter);
 
 int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
 	    int (*rules_dump)(struct net *net,
@@ -346,7 +347,7 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 			     struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mr_mfc *c,
 			     int cmd, int flags),
-		 spinlock_t *lock)
+		 spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	return -EINVAL;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 44d777058960..3fa988e6a3df 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2528,18 +2528,30 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct fib_dump_filter filter = {};
+	int err;
 
 	if (cb->strict_check) {
-		int err;
-
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
 					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
 
+	if (filter.table_id) {
+		struct mr_table *mrt;
+
+		mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id);
+		if (!mrt) {
+			NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+			return -ENOENT;
+		}
+		err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+				    &mfc_unres_lock, &filter);
+		return skb->len ? : err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
-				_ipmr_fill_mroute, &mfc_unres_lock);
+				_ipmr_fill_mroute, &mfc_unres_lock, &filter);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 132dd2613ca5..bfe8fd04afa0 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,21 +268,45 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+			    const struct mr_mfc *c,
+			    const struct net_device *dev)
+{
+	int ct;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			const struct vif_device *vif;
+
+			vif = &mrt->vif_table[ct];
+			if (vif->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
 int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
 		  struct netlink_callback *cb,
 		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
 			      u32 portid, u32 seq, struct mr_mfc *c,
 			      int cmd, int flags),
-		  spinlock_t *lock)
+		  spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int e = 0, s_e = cb->args[1];
 	unsigned int flags = NLM_F_MULTI;
 	struct mr_mfc *mfc;
 	int err;
 
+	if (filter->filter_set)
+		flags |= NLM_F_DUMP_FILTERED;
+
 	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 		if (e < s_e)
 			goto next_entry;
+		if (filter->dev &&
+		    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry;
 
 		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
@@ -298,6 +322,9 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
 	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 		if (e < s_e)
 			goto next_entry2;
+		if (filter->dev &&
+		    !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+			goto next_entry2;
 
 		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
 			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
@@ -324,19 +351,28 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 struct sk_buff *skb,
 				 u32 portid, u32 seq, struct mr_mfc *c,
 				 int cmd, int flags),
-		     spinlock_t *lock)
+		     spinlock_t *lock, struct fib_dump_filter *filter)
 {
 	unsigned int t = 0, s_t = cb->args[0];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
 	int err;
 
+	/* multicast does not track protocol or have route type other
+	 * than RTN_MULTICAST
+	 */
+	if (filter->filter_set) {
+		if (filter->protocol || filter->flags ||
+		    (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+			return skb->len;
+	}
+
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
 
-		err = mr_table_dump(mrt, skb, cb, fill, lock);
+		err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
 		if (err < 0)
 			break;
 next_table:
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dbd5166c5599..9759b0aecdd6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2459,16 +2459,28 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct fib_dump_filter filter = {};
+	int err;
 
 	if (cb->strict_check) {
-		int err;
-
 		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
 					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
 
+	if (filter.table_id) {
+		struct mr_table *mrt;
+
+		mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id);
+		if (!mrt) {
+			NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
+			return -ENOENT;
+		}
+		err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
+				    &mfc_unres_lock, &filter);
+		return skb->len ? : err;
+	}
+
 	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
-				_ip6mr_fill_mroute, &mfc_unres_lock);
+				_ip6mr_fill_mroute, &mfc_unres_lock, &filter);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 09/11] net/mpls: Handle kernel side filtering of route dumps
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Update the dump request parsing in MPLS for the non-INET case to
enable kernel side filtering. If INET is disabled the only filters
that make sense for MPLS are protocol and nexthop device.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/mpls/af_mpls.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 24381696932a..7d55d4c04088 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2044,7 +2044,9 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 				   struct netlink_callback *cb)
 {
 	struct netlink_ext_ack *extack = cb->extack;
+	struct nlattr *tb[RTA_MAX + 1];
 	struct rtmsg *rtm;
+	int err, i;
 
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request");
@@ -2053,15 +2055,36 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 
 	rtm = nlmsg_data(nlh);
 	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
-	    rtm->rtm_table   || rtm->rtm_protocol || rtm->rtm_scope ||
-	    rtm->rtm_type    || rtm->rtm_flags) {
+	    rtm->rtm_table   || rtm->rtm_scope    || rtm->rtm_type  ||
+	    rtm->rtm_flags) {
 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request");
 		return -EINVAL;
 	}
 
-	if (nlmsg_attrlen(nlh, sizeof(*rtm))) {
-		NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in FIB dump request");
-		return -EINVAL;
+	if (rtm->rtm_protocol) {
+		filter->protocol = rtm->rtm_protocol;
+		filter->filter_set = 1;
+		cb->answer_flags = NLM_F_DUMP_FILTERED;
+	}
+
+	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+				 rtm_mpls_policy, extack);
+	if (err < 0)
+		return err;
+
+	for (i = 0; i <= RTA_MAX; ++i) {
+		int ifindex;
+
+		if (i == RTA_OIF) {
+			ifindex = nla_get_u32(tb[i]);
+			filter->dev = __dev_get_by_index(net, ifindex);
+			if (!filter->dev)
+				return -ENODEV;
+			filter->filter_set = 1;
+		} else if (tb[i]) {
+			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 02/11] net: Add struct for fib dump filter
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Add struct fib_dump_filter for options on limiting which routes are
returned in a dump request. The current list is table id, protocol,
route type, rtm_flags and nexthop device index. struct net is needed
to look up the net_device from the index.

Declare the filter for each route dump handler and plumb the new
arguments from dump handlers to ip_valid_fib_dump_req.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/ip6_route.h |  1 +
 include/net/ip_fib.h    | 13 ++++++++++++-
 net/ipv4/fib_frontend.c |  6 ++++--
 net/ipv4/ipmr.c         |  6 +++++-
 net/ipv6/ip6_fib.c      |  5 +++--
 net/ipv6/ip6mr.c        |  5 ++++-
 net/mpls/af_mpls.c      | 12 ++++++++----
 7 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7ab119936e69 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -174,6 +174,7 @@ struct rt6_rtnl_dump_arg {
 	struct sk_buff *skb;
 	struct netlink_callback *cb;
 	struct net *net;
+	struct fib_dump_filter filter;
 };
 
 int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 852e4ebf2209..667013bf4266 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,6 +222,16 @@ struct fib_table {
 	unsigned long		__data[0];
 };
 
+struct fib_dump_filter {
+	u32			table_id;
+	/* filter_set is an optimization that an entry is set */
+	bool			filter_set;
+	unsigned char		protocol;
+	unsigned char		rt_type;
+	unsigned int		flags;
+	struct net_device	*dev;
+};
+
 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 		     struct fib_result *res, int fib_flags);
 int fib_table_insert(struct net *, struct fib_table *, struct fib_config *,
@@ -453,6 +463,7 @@ static inline void fib_proc_exit(struct net *net)
 
 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack);
 #endif  /* _NET_FIB_H */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0f1beceb47d5..850850dd80e1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -802,7 +802,8 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-int ip_valid_fib_dump_req(const struct nlmsghdr *nlh,
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+			  struct fib_dump_filter *filter,
 			  struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -837,6 +838,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct fib_dump_filter filter = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
 	struct fib_table *tb;
@@ -844,7 +846,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 	int dumped = 0, err;
 
 	if (cb->strict_check) {
-		err = ip_valid_fib_dump_req(nlh, cb->extack);
+		err = ip_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 91b0d5671649..44d777058960 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2527,9 +2527,13 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct fib_dump_filter filter = {};
+
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(cb->nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0783af11b0b7..94e61fe47ff8 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -569,17 +569,18 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
+	struct rt6_rtnl_dump_arg arg = {};
 	unsigned int h, s_h;
 	unsigned int e = 0, s_e;
-	struct rt6_rtnl_dump_arg arg;
 	struct fib6_walker *w;
 	struct fib6_table *tb;
 	struct hlist_head *head;
 	int res = 0;
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d7563ef76518..dbd5166c5599 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2458,10 +2458,13 @@ static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nlmsghdr *nlh = cb->nlh;
+	struct fib_dump_filter filter = {};
 
 	if (cb->strict_check) {
-		int err = ip_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+					    &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5fe274c47c41..bfcb4759c9ee 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2032,13 +2032,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 }
 
 #if IS_ENABLED(CONFIG_INET)
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
-	return ip_valid_fib_dump_req(nlh, extack);
+	return ip_valid_fib_dump_req(net, nlh, filter, extack);
 }
 #else
-static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh,
+static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+				   struct fib_dump_filter *filter,
 				   struct netlink_ext_ack *extack)
 {
 	struct rtmsg *rtm;
@@ -2070,14 +2072,16 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb)
 	const struct nlmsghdr *nlh = cb->nlh;
 	struct net *net = sock_net(skb->sk);
 	struct mpls_route __rcu **platform_label;
+	struct fib_dump_filter filter = {};
 	size_t platform_labels;
 	unsigned int index;
 
 	ASSERT_RTNL();
 
 	if (cb->strict_check) {
-		int err = mpls_valid_fib_dump_req(nlh, cb->extack);
+		int err;
 
+		err = mpls_valid_fib_dump_req(net, nlh, &filter, cb->extack);
 		if (err < 0)
 			return err;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 06/11] ipmr: Refactor mr_rtm_dumproute
From: David Ahern @ 2018-10-16  1:56 UTC (permalink / raw)
  To: netdev, davem; +Cc: David Ahern
In-Reply-To: <20181016015651.22696-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Move per-table loops from mr_rtm_dumproute to mr_table_dump and export
mr_table_dump for dumps by specific table id.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/linux/mroute_base.h |  6 ++++
 net/ipv4/ipmr_base.c        | 88 ++++++++++++++++++++++++++++-----------------
 2 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index 6675b9f81979..db85373c8d15 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -283,6 +283,12 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg);
 
 int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		   struct mr_mfc *c, struct rtmsg *rtm);
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+		  struct netlink_callback *cb,
+		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags),
+		  spinlock_t *lock);
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..132dd2613ca5 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -268,6 +268,55 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(mr_fill_mroute);
 
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+		  struct netlink_callback *cb,
+		  int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags),
+		  spinlock_t *lock)
+{
+	unsigned int e = 0, s_e = cb->args[1];
+	unsigned int flags = NLM_F_MULTI;
+	struct mr_mfc *mfc;
+	int err;
+
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+		if (e < s_e)
+			goto next_entry;
+
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0)
+			goto out;
+next_entry:
+		e++;
+	}
+	e = 0;
+	s_e = 0;
+
+	spin_lock_bh(lock);
+	list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+		if (e < s_e)
+			goto next_entry2;
+
+		err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+			   cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+		if (err < 0) {
+			spin_unlock_bh(lock);
+			goto out;
+		}
+next_entry2:
+		e++;
+	}
+	spin_unlock_bh(lock);
+	err = 0;
+	e = 0;
+
+out:
+	cb->args[1] = e;
+	return err;
+}
+
 int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 		     struct mr_table *(*iter)(struct net *net,
 					      struct mr_table *mrt),
@@ -277,51 +326,24 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
 				 int cmd, int flags),
 		     spinlock_t *lock)
 {
-	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+	unsigned int t = 0, s_t = cb->args[0];
 	struct net *net = sock_net(skb->sk);
 	struct mr_table *mrt;
-	struct mr_mfc *mfc;
+	int err;
 
 	rcu_read_lock();
 	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
 		if (t < s_t)
 			goto next_table;
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
-			if (e < s_e)
-				goto next_entry;
-			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
-				 cb->nlh->nlmsg_seq, mfc,
-				 RTM_NEWROUTE, NLM_F_MULTI) < 0)
-				goto done;
-next_entry:
-			e++;
-		}
-		e = 0;
-		s_e = 0;
-
-		spin_lock_bh(lock);
-		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
-			if (e < s_e)
-				goto next_entry2;
-			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
-				 cb->nlh->nlmsg_seq, mfc,
-				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
-				spin_unlock_bh(lock);
-				goto done;
-			}
-next_entry2:
-			e++;
-		}
-		spin_unlock_bh(lock);
-		e = 0;
-		s_e = 0;
+
+		err = mr_table_dump(mrt, skb, cb, fill, lock);
+		if (err < 0)
+			break;
 next_table:
 		t++;
 	}
-done:
 	rcu_read_unlock();
 
-	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox