Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH btf 0/3] Add BTF types deduplication algorithm
From: Yonghong Song @ 2019-01-31  7:18 UTC (permalink / raw)
  To: Andrii Nakryiko, netdev@vger.kernel.org, Alexei Starovoitov,
	Martin Lau, acme@kernel.org, daniel@iogearbox.net, Kernel Team,
	Edward Cree
In-Reply-To: <20190131065837.3380288-1-andriin@fb.com>


add Edward.

On 1/30/19 10:58 PM, Andrii Nakryiko wrote:
> This patch series adds BTF deduplication algorithm to libbpf. This algorithm
> allows to take BTF type information containing duplicate per-compilation unit
> information and reduce it to equivalent set of BTF types with no duplication without
> loss of information. It also deduplicates strings and removes those strings that
> are not referenced from any BTF type (and line information in .BTF.ext section,
> if any).
> 
> Algorithm also resolves struct/union forward declarations into concrete BTF types
> across multiple compilation units to facilitate better deduplication ratio. If
> undesired, this resolution can be disabled through specifying corresponding options.
> 
> When applied to BTF data emitted by pahole's DWARF->BTF converter, it reduces
> the overall size of .BTF section by about 65x, from about 112MB to 1.75MB, leaving
> only 29247 out of initial 3073497 BTF type descriptors.
> 
> Algorithm with minor differences and preliminary results before FUNC/FUNC_PROTO
> support is also described more verbosely at:
> https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html
> 
> Andrii Nakryiko (3):
>    btf: extract BTF type size calculation
>    btf: add BTF types deduplication algorithm
>    selftests/btf: add initial BTF dedup tests
> 
>   tools/lib/bpf/btf.c                    | 1851 +++++++++++++++++++++++-
>   tools/lib/bpf/btf.h                    |   11 +
>   tools/lib/bpf/libbpf.map               |    3 +
>   tools/testing/selftests/bpf/test_btf.c |  535 ++++++-
>   4 files changed, 2333 insertions(+), 67 deletions(-)
> 

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: add optional memory accounting for maps
From: Y Song @ 2019-01-31  7:15 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20190130140251.23784-1-m@lambda.lt>

On Wed, Jan 30, 2019 at 6:05 AM Martynas Pumputis <m@lambda.lt> wrote:
>
> Previously, memory allocated for a map was not accounted. Therefore,
> this memory could not be taken into consideration by the cgroups
> memory controller.
>
> This patch introduces the "BPF_F_ACCOUNT_MEM" flag which enables
> the memory accounting for a map, and it can be set during
> the map creation ("BPF_MAP_CREATE") in "map_flags".
>
> When enabled, we account only that amount of memory which is charged
> against the "RLIMIT_MEMLOCK" limit.
>
> To validate the change, first we create the memory cgroup "test-map":
>
>     # mkdir /sys/fs/cgroup/memory/test-map
>
> And then we run the following program against the cgroup:
>
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     397312
>     258048
>
>     <after 3 sec the map has been created>
>
>     # bpftool map list
>     19: hash  flags 0x0
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     401408
>     262144
>
> As we can see, the memory allocated for map is not accounted, as
> 397312B + 5771264B > 401408B.
>
> Next, we enabled the accounting and re-run the test:
>
>     $ cat test_map.c
>     <..>
>     int main() {
>         usleep(3 * 1000000);
>         assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, BPF_F_ACCOUNT_MEM) > 0);
>         usleep(3 * 1000000);
>     }
>     # cgexec -g memory:test-map ./test_map &
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     450560
>     307200
>
>     <after 3 sec the map has been created>
>
>     # bpftool map list
>     20: hash  flags 0x80
>         key 8B  value 16B  max_entries 65536  memlock 5771264B
>     # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
>     6221824
>     6078464
>
> This time, the memory (including kmem) is accounted, as
> 450560B + 5771264B <= 6221824B

The above test result is for cgroup v1.
I also tested the patch for cgroup v2. It works fine too.

>
> Signed-off-by: Martynas Pumputis <m@lambda.lt>
> ---
>  include/linux/bpf.h            |  5 +++--
>  include/uapi/linux/bpf.h       |  2 ++
>  kernel/bpf/arraymap.c          | 14 +++++++++-----
>  kernel/bpf/bpf_lru_list.c      | 11 +++++++++--
>  kernel/bpf/bpf_lru_list.h      |  1 +
>  kernel/bpf/cpumap.c            | 12 +++++++++---
>  kernel/bpf/devmap.c            | 10 ++++++++--
>  kernel/bpf/hashtab.c           | 19 ++++++++++++++-----
>  kernel/bpf/lpm_trie.c          | 19 +++++++++++++------
>  kernel/bpf/queue_stack_maps.c  |  5 +++--
>  kernel/bpf/reuseport_array.c   |  3 ++-
>  kernel/bpf/stackmap.c          | 12 ++++++++----
>  kernel/bpf/syscall.c           | 12 ++++++++----
>  kernel/bpf/xskmap.c            |  9 +++++++--
>  net/core/sock_map.c            | 13 +++++++++----
>  tools/include/uapi/linux/bpf.h |  3 +++
>  16 files changed, 108 insertions(+), 42 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e734f163bd0b..353a3f4304fe 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -79,7 +79,8 @@ struct bpf_map {
>         u32 btf_value_type_id;
>         struct btf *btf;
>         bool unpriv_array;
> -       /* 55 bytes hole */
> +       bool account_mem;
> +       /* 54 bytes hole */
>
>         /* The 3rd and 4th cacheline with misc members to avoid false sharing
>          * particularly with refcounting.
> @@ -506,7 +507,7 @@ void bpf_map_put(struct bpf_map *map);
>  int bpf_map_precharge_memlock(u32 pages);
>  int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
>  void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
> -void *bpf_map_area_alloc(size_t size, int numa_node);
> +void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem);
>  void bpf_map_area_free(void *base);
>  void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 91c43884f295..a374ccbaa51b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -278,6 +278,8 @@ enum bpf_attach_type {
>  #define BPF_F_NO_COMMON_LRU    (1U << 1)
>  /* Specify numa node during map creation */
>  #define BPF_F_NUMA_NODE                (1U << 2)
> +/* Enable memory accounting for map */
> +#define BPF_F_ACCOUNT_MEM      (1U << 7)

Could you put the above definition after BPF_F_ZERO_SEED (1U << 6)?
The same for tools.
With this change, you can add my ack:
Acked-by: Yonghong Song <yhs@fb.com>

^ permalink raw reply

* [PATCH btf 2/3] btf: add BTF types deduplication algorithm
From: Andrii Nakryiko @ 2019-01-31  6:58 UTC (permalink / raw)
  To: netdev, ast, yhs, kafai, acme, daniel, kernel-team; +Cc: Andrii Nakryiko
In-Reply-To: <20190131065837.3380288-1-andriin@fb.com>

This patch implements BTF types deduplication algorithm. It allows to
greatly compress typical output of pahole's DWARF-to-BTF conversion or
LLVM's compilation output by detecting and collapsing identical types emitted in
isolation per compilation unit. Algorithm also resolves struct/union forward
declarations into concrete BTF types representing referenced struct/union. If
undesired, this resolution can be disabled through specifying corresponding options.

Algorithm itself and its application to Linux kernel's BTF types is
described in details at:
https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/btf.c      | 1741 ++++++++++++++++++++++++++++++++++++++
 tools/lib/bpf/btf.h      |    8 +
 tools/lib/bpf/libbpf.map |    1 +
 3 files changed, 1750 insertions(+)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 79b782963917..981f166b401c 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -780,3 +780,1744 @@ __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext)
 {
 	return btf_ext->line_info.rec_size;
 }
+
+struct btf_dedup;
+
+static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
+				       const struct btf_dedup_opts *opts);
+static void btf_dedup_free(struct btf_dedup *d);
+static int btf_dedup_strings(struct btf_dedup *d);
+static int btf_dedup_prim_types(struct btf_dedup *d);
+static int btf_dedup_struct_types(struct btf_dedup *d);
+static int btf_dedup_ref_types(struct btf_dedup *d);
+static int btf_dedup_compact_types(struct btf_dedup *d);
+static int btf_dedup_remap_types(struct btf_dedup *d);
+
+/*
+ * Deduplicate BTF types and strings.
+ *
+ * BTF dedup algorithm takes as an input `struct btf` representing `.BTF` ELF
+ * section with all BTF type descriptors and string data. It overwrites that
+ * memory in-place with deduplicated types and strings without any loss of
+ * information. If optional `struct btf_ext` representing '.BTF.ext' ELF section
+ * is provided, all the strings referenced from .BTF.ext section are honored
+ * and updated to point to the right offsets after deduplication.
+ *
+ * If function returns with error, type/string data might be garbled and should
+ * be discarded.
+ *
+ * More verbose and detailed description of both problem btf_dedup is solving,
+ * as well as solution could be found at:
+ * https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html
+ *
+ * Problem description and justification
+ * =====================================
+ *
+ * BTF type information is typically emitted either as a result of conversion
+ * from DWARF to BTF or directly by compiler. In both cases, each compilation
+ * unit contains information about a subset of all the types that are used
+ * in an application. These subsets are frequently overlapping and contain a lot
+ * of duplicated information when later concatenated together into a single
+ * binary. This algorithm ensures that each unique type is represented by single
+ * BTF type descriptor, greatly reducing resulting size of BTF data.
+ *
+ * Compilation unit isolation and subsequent duplication of data is not the only
+ * problem. The same type hierarchy (e.g., struct and all the type that struct
+ * references) in different compilation units can be represented in BTF to
+ * various degrees of completeness (or, rather, incompleteness) due to
+ * struct/union forward declarations.
+ *
+ * Let's take a look at an example, that we'll use to better understand the
+ * problem (and solution). Suppose we have two compilation units, each using
+ * same `struct S`, but each of them having incomplete type information about
+ * struct's fields:
+ *
+ * // CU #1:
+ * struct S;
+ * struct A {
+ *	int a;
+ *	struct A* self;
+ *	struct S* parent;
+ * };
+ * struct B;
+ * struct S {
+ *	struct A* a_ptr;
+ *	struct B* b_ptr;
+ * };
+ *
+ * // CU #2:
+ * struct S;
+ * struct A;
+ * struct B {
+ *	int b;
+ *	struct B* self;
+ *	struct S* parent;
+ * };
+ * struct S {
+ *	struct A* a_ptr;
+ *	struct B* b_ptr;
+ * };
+ *
+ * In case of CU #1, BTF data will know only that `struct B` exist (but no
+ * more), but will know the complete type information about `struct A`. While
+ * for CU #2, it will know full type information about `struct B`, but will
+ * only know about forward declaration of `struct A` (in BTF terms, it will
+ * have `BTF_KIND_FWD` type descriptor with name `B`).
+ *
+ * This compilation unit isolation means that it's possible that there is no
+ * single CU with complete type information describing structs `S`, `A`, and
+ * `B`. Also, we might get tons of duplicated and redundant type information.
+ *
+ * Additional complication we need to keep in mind comes from the fact that
+ * types, in general, can form graphs containing cycles, not just DAGs.
+ *
+ * While algorithm does deduplication, it also merges and resolves type
+ * information (unless disabled throught `struct btf_opts`), whenever possible.
+ * E.g., in the example above with two compilation units having partial type
+ * information for structs `A` and `B`, the output of algorithm will emit
+ * a single copy of each BTF type that describes structs `A`, `B`, and `S`
+ * (as well as type information for `int` and pointers), as if they were defined
+ * in a single compilation unit as:
+ *
+ * struct A {
+ *	int a;
+ *	struct A* self;
+ *	struct S* parent;
+ * };
+ * struct B {
+ *	int b;
+ *	struct B* self;
+ *	struct S* parent;
+ * };
+ * struct S {
+ *	struct A* a_ptr;
+ *	struct B* b_ptr;
+ * };
+ *
+ * Algorithm summary
+ * =================
+ *
+ * Algorithm completes its work in 6 separate passes:
+ *
+ * 1. Strings deduplication.
+ * 2. Primitive types deduplication (int, enum, fwd).
+ * 3. Struct/union types deduplication.
+ * 4. Reference types deduplication (pointers, typedefs, arrays, funcs, func
+ *    protos, and const/volatile/restrict modifiers).
+ * 5. Types compaction.
+ * 6. Types remapping.
+ *
+ * Algorithm determines canonical type descriptor, which is a single
+ * representative type for each truly unique type. This canonical type is the
+ * one that will go into final deduplicated BTF type information. For
+ * struct/unions, it is also the type that algorithm will merge additional type
+ * information into (while resolving FWDs), as it discovers it from data in
+ * other CUs. Each input BTF type eventually gets either mapped to itself, if
+ * that type is canonical, or to some other type, if that type is equivalent
+ * and was chosen as canonical representative. This mapping is stored in
+ * `btf_dedup->map` array. This map is also used to record STRUCT/UNION that
+ * FWD type got resolved to.
+ *
+ * To facilitate fast discovery of canonical types, we also maintain canonical
+ * index (`btf_dedup->dedup_table`), which maps type descriptor's signature hash
+ * (i.e., hashed kind, name, size, fields, etc) into a list of canonical types
+ * that match that signature. With sufficiently good choice of type signature
+ * hashing function, we can limit number of canonical types for each unique type
+ * signature to a very small number, allowing to find canonical type for any
+ * duplicated type very quickly.
+ *
+ * Struct/union deduplication is the most critical part and algorithm for
+ * deduplicating structs/unions is described in greater details in comments for
+ * `btf_dedup_is_equiv` function.
+ */
+int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
+	       const struct btf_dedup_opts *opts, btf_print_fn_t err_log)
+{
+	struct btf_dedup *d = btf_dedup_new(btf, btf_ext, opts);
+	int err;
+
+	if (IS_ERR(d)) {
+		elog("btf_dedup_new failed: %ld", PTR_ERR(d));
+		return -EINVAL;
+	}
+
+	err = btf_dedup_strings(d);
+	if (err < 0) {
+		elog("btf_dedup_strings failed:%d\n", err);
+		goto done;
+	}
+	err = btf_dedup_prim_types(d);
+	if (err < 0) {
+		elog("btf_dedup_prim_types failed:%d\n", err);
+		goto done;
+	}
+	err = btf_dedup_struct_types(d);
+	if (err < 0) {
+		elog("btf_dedup_struct_types failed:%d\n", err);
+		goto done;
+	}
+	err = btf_dedup_ref_types(d);
+	if (err < 0) {
+		elog("btf_dedup_ref_types failed:%d\n", err);
+		goto done;
+	}
+	err = btf_dedup_compact_types(d);
+	if (err < 0) {
+		elog("btf_dedup_compact_types failed:%d\n", err);
+		goto done;
+	}
+	err = btf_dedup_remap_types(d);
+	if (err < 0) {
+		elog("btf_dedup_remap_types failed:%d\n", err);
+		goto done;
+	}
+
+done:
+	btf_dedup_free(d);
+	return err;
+}
+
+#define BTF_DEDUP_TABLE_SIZE_LOG 14
+#define BTF_DEDUP_TABLE_MOD ((1 << BTF_DEDUP_TABLE_SIZE_LOG) - 1)
+#define BTF_UNPROCESSED_ID ((__u32)-1)
+#define BTF_IN_PROGRESS_ID ((__u32)-2)
+
+struct btf_dedup_node {
+	struct btf_dedup_node *next;
+	__u32 type_id;
+};
+
+struct btf_dedup {
+	/* .BTF section to be deduped in-place */
+	struct btf *btf;
+	/*
+	 * Optional .BTF.ext section. When provided, any strings referenced
+	 * from it will be taken into account when deduping strings
+	 */
+	struct btf_ext *btf_ext;
+	/*
+	 * This is a map from any type's signature hash to a list of possible
+	 * canonical representative type candidates. Hash collisions are
+	 * ignored, so even types of various kinds can share same list of
+	 * candidates, which is fine because we rely on subsequent
+	 * btf_xxx_equal() checks to authoritatively verify type equality.
+	 */
+	struct btf_dedup_node **dedup_table;
+	/* Canonical types map */
+	__u32 *map;
+	/* Hypothetical mapping, used during type graph equivalence checks */
+	__u32 *hypot_map;
+	__u32 *hypot_list;
+	size_t hypot_cnt;
+	size_t hypot_cap;
+	/* Various option modifying behavior of algorithm */
+	struct btf_dedup_opts opts;
+};
+
+struct btf_str_ptr {
+	const char *str;
+	__u32 new_off;
+	bool used;
+};
+
+struct btf_str_ptrs {
+	struct btf_str_ptr *ptrs;
+	const char *data;
+	__u32 cnt;
+	__u32 cap;
+};
+
+static inline __u32 hash_combine(__u32 h, __u32 value)
+{
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e370001UL
+	return h * 37 + value * GOLDEN_RATIO_PRIME;
+#undef GOLDEN_RATIO_PRIME
+}
+
+#define for_each_hash_node(table, hash, node) \
+	for (node = table[hash & BTF_DEDUP_TABLE_MOD]; node; node = node->next)
+
+static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id)
+{
+	struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node));
+
+	if (!node)
+		return -ENOMEM;
+	node->type_id = type_id;
+	node->next = d->dedup_table[hash & BTF_DEDUP_TABLE_MOD];
+	d->dedup_table[hash & BTF_DEDUP_TABLE_MOD] = node;
+	return 0;
+}
+
+static int btf_dedup_hypot_map_add(struct btf_dedup *d,
+				   __u32 from_id, __u32 to_id)
+{
+	if (d->hypot_cnt == d->hypot_cap) {
+		__u32 *new_list;
+
+		d->hypot_cap += max(16, d->hypot_cap / 2);
+		new_list = realloc(d->hypot_list, sizeof(__u32) * d->hypot_cap);
+		if (!new_list)
+			return -ENOMEM;
+		d->hypot_list = new_list;
+	}
+	d->hypot_list[d->hypot_cnt++] = from_id;
+	d->hypot_map[from_id] = to_id;
+	return 0;
+}
+
+static void btf_dedup_clear_hypot_map(struct btf_dedup *d)
+{
+	int i;
+
+	for (i = 0; i < d->hypot_cnt; i++)
+		d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID;
+	d->hypot_cnt = 0;
+}
+
+static void btf_dedup_table_free(struct btf_dedup *d)
+{
+	struct btf_dedup_node *head, *tmp;
+	int i;
+
+	if (!d->dedup_table)
+		return;
+
+	for (i = 0; i < (1 << BTF_DEDUP_TABLE_SIZE_LOG); i++) {
+		while (d->dedup_table[i]) {
+			tmp = d->dedup_table[i];
+			d->dedup_table[i] = tmp->next;
+			free(tmp);
+		}
+
+		head = d->dedup_table[i];
+		while (head) {
+			tmp = head;
+			head = head->next;
+			free(tmp);
+		}
+	}
+
+	free(d->dedup_table);
+	d->dedup_table = NULL;
+}
+
+static void btf_dedup_free(struct btf_dedup *d)
+{
+	btf_dedup_table_free(d);
+
+	free(d->map);
+	d->map = NULL;
+
+	free(d->hypot_map);
+	d->hypot_map = NULL;
+
+	free(d->hypot_list);
+	d->hypot_list = NULL;
+
+	free(d);
+}
+
+static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
+				       const struct btf_dedup_opts *opts)
+{
+	struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup));
+	int i, err = 0;
+
+	if (!d)
+		return ERR_PTR(-ENOMEM);
+
+	d->btf = btf;
+	d->btf_ext = btf_ext;
+
+	d->dedup_table = calloc(1 << BTF_DEDUP_TABLE_SIZE_LOG,
+				sizeof(struct btf_dedup_node *));
+	if (!d->dedup_table) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	d->map = malloc(sizeof(__u32) * (1 + btf->nr_types));
+	if (!d->map) {
+		err = -ENOMEM;
+		goto done;
+	}
+	/* special BTF "void" type is made canonical immediately */
+	d->map[0] = 0;
+	for (i = 1; i <= btf->nr_types; i++)
+		d->map[i] = BTF_UNPROCESSED_ID;
+
+	d->hypot_map = malloc(sizeof(__u32) * (1 + btf->nr_types));
+	if (!d->hypot_map) {
+		err = -ENOMEM;
+		goto done;
+	}
+	for (i = 0; i <= btf->nr_types; i++)
+		d->hypot_map[i] = BTF_UNPROCESSED_ID;
+
+	d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds;
+
+done:
+	if (err) {
+		btf_dedup_free(d);
+		return ERR_PTR(err);
+	}
+
+	return d;
+}
+
+typedef int (*str_off_fn_t)(__u32 *str_off_ptr, void *ctx);
+
+/*
+ * Iterate over all possible places in .BTF and .BTF.ext that can reference
+ * string and pass pointer to it to a provided callback `fn`.
+ */
+static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx)
+{
+	void *line_data_cur, *line_data_end;
+	int i, j, r, rec_size;
+	struct btf_type *t;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		t = d->btf->types[i];
+		r = fn(&t->name_off, ctx);
+		if (r)
+			return r;
+
+		switch (BTF_INFO_KIND(t->info)) {
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION: {
+			struct btf_member *m = (struct btf_member *)(t + 1);
+			__u16 vlen = BTF_INFO_VLEN(t->info);
+
+			for (j = 0; j < vlen; j++) {
+				r = fn(&m->name_off, ctx);
+				if (r)
+					return r;
+				m++;
+			}
+			break;
+		}
+		case BTF_KIND_ENUM: {
+			struct btf_enum *m = (struct btf_enum *)(t + 1);
+			__u16 vlen = BTF_INFO_VLEN(t->info);
+
+			for (j = 0; j < vlen; j++) {
+				r = fn(&m->name_off, ctx);
+				if (r)
+					return r;
+				m++;
+			}
+			break;
+		}
+		case BTF_KIND_FUNC_PROTO: {
+			struct btf_param *m = (struct btf_param *)(t + 1);
+			__u16 vlen = BTF_INFO_VLEN(t->info);
+
+			for (j = 0; j < vlen; j++) {
+				r = fn(&m->name_off, ctx);
+				if (r)
+					return r;
+				m++;
+			}
+			break;
+		}
+		default:
+			break;
+		}
+	}
+
+	if (!d->btf_ext)
+		return 0;
+
+	line_data_cur = d->btf_ext->line_info.info;
+	line_data_end = d->btf_ext->line_info.info + d->btf_ext->line_info.len;
+	rec_size = d->btf_ext->line_info.rec_size;
+
+	while (line_data_cur < line_data_end) {
+		struct btf_ext_info_sec *sec = line_data_cur;
+		struct bpf_line_info_min *line_info;
+		__u32 num_info = sec->num_info;
+
+		r = fn(&sec->sec_name_off, ctx);
+		if (r)
+			return r;
+
+		line_data_cur += sizeof(struct btf_ext_info_sec);
+		for (i = 0; i < num_info; i++) {
+			line_info = line_data_cur;
+			r = fn(&line_info->file_name_off, ctx);
+			if (r)
+				return r;
+			r = fn(&line_info->line_off, ctx);
+			if (r)
+				return r;
+			line_data_cur += rec_size;
+		}
+	}
+
+	return 0;
+}
+
+static int str_sort_by_content(const void *a1, const void *a2)
+{
+	const struct btf_str_ptr *p1 = a1;
+	const struct btf_str_ptr *p2 = a2;
+
+	return strcmp(p1->str, p2->str);
+}
+
+static int str_sort_by_offset(const void *a1, const void *a2)
+{
+	const struct btf_str_ptr *p1 = a1;
+	const struct btf_str_ptr *p2 = a2;
+
+	if (p1->str != p2->str)
+		return p1->str < p2->str ? -1 : 1;
+	return 0;
+}
+
+static int btf_dedup_str_ptr_cmp(const void *str_ptr, const void *pelem)
+{
+	const struct btf_str_ptr *p = pelem;
+
+	if (str_ptr != p->str)
+		return (const char *)str_ptr < p->str ? -1 : 1;
+	return 0;
+}
+
+static int btf_str_mark_as_used(__u32 *str_off_ptr, void *ctx)
+{
+	struct btf_str_ptrs *strs;
+	struct btf_str_ptr *s;
+
+	if (*str_off_ptr == 0)
+		return 0;
+
+	strs = ctx;
+	s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt,
+		    sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp);
+	if (!s)
+		return -EINVAL;
+	s->used = true;
+	return 0;
+}
+
+static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx)
+{
+	struct btf_str_ptrs *strs;
+	struct btf_str_ptr *s;
+
+	if (*str_off_ptr == 0)
+		return 0;
+
+	strs = ctx;
+	s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt,
+		    sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp);
+	if (!s)
+		return -EINVAL;
+	*str_off_ptr = s->new_off;
+	return 0;
+}
+
+/*
+ * Dedup string and filter out those that are not referenced from either .BTF
+ * or .BTF.ext (if provided) sections.
+ *
+ * This is done by building index of all strings in BTF's string section,
+ * then iterating over all entities that can reference strings (e.g., type
+ * names, struct field names, .BTF.ext line info, etc) and marking corresponding
+ * strings as used. After that all used strings are deduped and compacted into
+ * sequential blob of memory and new offsets are calculated. Then all the string
+ * references are iterated again and rewritten using new offsets.
+ */
+static int btf_dedup_strings(struct btf_dedup *d)
+{
+	const struct btf_header *hdr = d->btf->hdr;
+	char *start = (char *)d->btf->nohdr_data + hdr->str_off;
+	char *end = start + d->btf->hdr->str_len;
+	char *p = start, *tmp_strs = NULL;
+	struct btf_str_ptrs strs = {
+		.cnt = 0,
+		.cap = 0,
+		.ptrs = NULL,
+		.data = start,
+	};
+	int i, j, err = 0, grp_idx;
+	bool grp_used;
+
+	/* build index of all strings */
+	while (p < end) {
+		if (strs.cnt + 1 > strs.cap) {
+			struct btf_str_ptr *new_ptrs;
+
+			strs.cap += max(strs.cnt / 2, 16);
+			new_ptrs = realloc(strs.ptrs,
+					   sizeof(strs.ptrs[0]) * strs.cap);
+			if (!new_ptrs) {
+				err = -ENOMEM;
+				goto done;
+			}
+			strs.ptrs = new_ptrs;
+		}
+
+		strs.ptrs[strs.cnt].str = p;
+		strs.ptrs[strs.cnt].used = false;
+
+		p += strlen(p) + 1;
+		strs.cnt++;
+	}
+
+	/* temporary storage for deduplicated strings */
+	tmp_strs = malloc(d->btf->hdr->str_len);
+	if (!tmp_strs) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	/* mark all used strings */
+	strs.ptrs[0].used = true;
+	err = btf_for_each_str_off(d, btf_str_mark_as_used, &strs);
+	if (err)
+		goto done;
+
+	/* sort strings by context, so that we can identify duplicates */
+	qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_content);
+
+	/*
+	 * iterate groups of equal strings and if any instance in a group was
+	 * referenced, emit single instance and remember new offset
+	 */
+	p = tmp_strs;
+	grp_idx = 0;
+	grp_used = strs.ptrs[0].used;
+	/* iterate past end to avoid code duplication after loop */
+	for (i = 1; i <= strs.cnt; i++) {
+		/*
+		 * when i == strs.cnt, we want to skip string comparison and go
+		 * straight to handling last group of strings (otherwise we'd
+		 * need to handle last group after the loop w/ duplicated code)
+		 */
+		if (i < strs.cnt &&
+		    !strcmp(strs.ptrs[i].str, strs.ptrs[grp_idx].str)) {
+			grp_used = grp_used || strs.ptrs[i].used;
+			continue;
+		}
+
+		/*
+		 * this check would have been required after the loop to handle
+		 * last group of strings, but due to <= condition in a loop
+		 * we avoid that duplication
+		 */
+		if (grp_used) {
+			int new_off = p - tmp_strs;
+			__u32 len = strlen(strs.ptrs[grp_idx].str);
+
+			memmove(p, strs.ptrs[grp_idx].str, len + 1);
+			for (j = grp_idx; j < i; j++)
+				strs.ptrs[j].new_off = new_off;
+			p += len + 1;
+		}
+
+		if (i < strs.cnt) {
+			grp_idx = i;
+			grp_used = strs.ptrs[i].used;
+		}
+	}
+
+	/* replace original strings with deduped ones */
+	d->btf->hdr->str_len = p - tmp_strs;
+	memmove(start, tmp_strs, d->btf->hdr->str_len);
+	end = start + d->btf->hdr->str_len;
+
+	/* restore original order for further binary search lookups */
+	qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_offset);
+
+	/* remap string offsets */
+	err = btf_for_each_str_off(d, btf_str_remap_offset, &strs);
+	if (err)
+		goto done;
+
+	d->btf->hdr->str_len = end - start;
+
+done:
+	free(tmp_strs);
+	free(strs.ptrs);
+	return err;
+}
+
+static __u32 btf_hash_common(struct btf_type *t)
+{
+	__u32 h;
+
+	h = hash_combine(0, t->name_off);
+	h = hash_combine(h, t->info);
+	h = hash_combine(h, t->size);
+	return h;
+}
+
+static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2)
+{
+	return t1->name_off == t2->name_off &&
+	       t1->info == t2->info &&
+	       t1->size == t2->size;
+}
+
+/* Calculate type signature hash of INT. */
+static __u32 btf_hash_int(struct btf_type *t)
+{
+	__u32 info = *(__u32 *)(t + 1);
+	__u32 h;
+
+	h = btf_hash_common(t);
+	h = hash_combine(h, info);
+	return h;
+}
+
+/* Check structural equality of two INTs. */
+static bool btf_equal_int(struct btf_type *t1, struct btf_type *t2)
+{
+	__u32 info1, info2;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+	info1 = *(__u32 *)(t1 + 1);
+	info2 = *(__u32 *)(t2 + 1);
+	return info1 == info2;
+}
+
+/* Calculate type signature hash of ENUM. */
+static __u32 btf_hash_enum(struct btf_type *t)
+{
+	struct btf_enum *member = (struct btf_enum *)(t + 1);
+	__u32 vlen = BTF_INFO_VLEN(t->info);
+	__u32 h = btf_hash_common(t);
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		h = hash_combine(h, member->name_off);
+		h = hash_combine(h, member->val);
+		member++;
+	}
+	return h;
+}
+
+/* Check structural equality of two ENUMs. */
+static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_enum *m1, *m2;
+	__u16 vlen;
+	int i;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+
+	vlen = BTF_INFO_VLEN(t1->info);
+	m1 = (struct btf_enum *)(t1 + 1);
+	m2 = (struct btf_enum *)(t2 + 1);
+	for (i = 0; i < vlen; i++) {
+		if (m1->name_off != m2->name_off || m1->val != m2->val)
+			return false;
+		m1++;
+		m2++;
+	}
+	return true;
+}
+
+/*
+ * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs,
+ * as referenced type IDs equivalence is established separately during type
+ * graph equivalence check algorithm.
+ */
+static __u32 btf_hash_struct(struct btf_type *t)
+{
+	struct btf_member *member = (struct btf_member *)(t + 1);
+	__u32 vlen = BTF_INFO_VLEN(t->info);
+	__u32 h = btf_hash_common(t);
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		h = hash_combine(h, member->name_off);
+		h = hash_combine(h, member->offset);
+		/* no hashing of referenced type ID, it can be unresolved yet */
+		member++;
+	}
+	return h;
+}
+
+/*
+ * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type
+ * IDs. This check is performed during type graph equivalence check and
+ * referenced types equivalence is checked separately.
+ */
+static bool btf_equal_struct(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_member *m1, *m2;
+	__u16 vlen;
+	int i;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+
+	vlen = BTF_INFO_VLEN(t1->info);
+	m1 = (struct btf_member *)(t1 + 1);
+	m2 = (struct btf_member *)(t2 + 1);
+	for (i = 0; i < vlen; i++) {
+		if (m1->name_off != m2->name_off || m1->offset != m2->offset)
+			return false;
+		m1++;
+		m2++;
+	}
+	return true;
+}
+
+/*
+ * Calculate type signature hash of ARRAY, including referenced type IDs,
+ * under assumption that they were already resolved to canonical type IDs and
+ * are not going to change.
+ */
+static __u32 btf_hash_array(struct btf_type *t)
+{
+	struct btf_array *info = (struct btf_array *)(t + 1);
+	__u32 h = btf_hash_common(t);
+
+	h = hash_combine(h, info->type);
+	h = hash_combine(h, info->index_type);
+	h = hash_combine(h, info->nelems);
+	return h;
+}
+
+/*
+ * Check exact equality of two ARRAYs, taking into account referenced
+ * type IDs, under assumption that they were already resolved to canonical
+ * type IDs and are not going to change.
+ * This function is called during reference types deduplication to compare
+ * ARRAY to potential canonical representative.
+ */
+static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_array *info1, *info2;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+
+	info1 = (struct btf_array *)(t1 + 1);
+	info2 = (struct btf_array *)(t2 + 1);
+	return info1->type == info2->type &&
+	       info1->index_type == info2->index_type &&
+	       info1->nelems == info2->nelems;
+}
+
+/*
+ * Check structural compatibility of two ARRAYs, ignoring referenced type
+ * IDs. This check is performed during type graph equivalence check and
+ * referenced types equivalence is checked separately.
+ */
+static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_array *info1, *info2;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+
+	info1 = (struct btf_array *)(t1 + 1);
+	info2 = (struct btf_array *)(t2 + 1);
+	return info1->nelems == info2->nelems;
+}
+
+/*
+ * Calculate type signature hash of FUNC_PROTO, including referenced type IDs,
+ * under assumption that they were already resolved to canonical type IDs and
+ * are not going to change.
+ */
+static inline __u32 btf_hash_fnproto(struct btf_type *t)
+{
+	struct btf_param *member = (struct btf_param *)(t + 1);
+	__u16 vlen = BTF_INFO_VLEN(t->info);
+	__u32 h = btf_hash_common(t);
+	int i;
+
+	for (i = 0; i < vlen; i++) {
+		h = hash_combine(h, member->name_off);
+		h = hash_combine(h, member->type);
+		member++;
+	}
+	return h;
+}
+
+/*
+ * Check exact equality of two FUNC_PROTOs, taking into account referenced
+ * type IDs, under assumption that they were already resolved to canonical
+ * type IDs and are not going to change.
+ * This function is called during reference types deduplication to compare
+ * FUNC_PROTO to potential canonical representative.
+ */
+static inline bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_param *m1, *m2;
+	__u16 vlen;
+	int i;
+
+	if (!btf_equal_common(t1, t2))
+		return false;
+
+	vlen = BTF_INFO_VLEN(t1->info);
+	m1 = (struct btf_param *)(t1 + 1);
+	m2 = (struct btf_param *)(t2 + 1);
+	for (i = 0; i < vlen; i++) {
+		if (m1->name_off != m2->name_off || m1->type != m2->type)
+			return false;
+		m1++;
+		m2++;
+	}
+	return true;
+}
+
+/*
+ * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type
+ * IDs. This check is performed during type graph equivalence check and
+ * referenced types equivalence is checked separately.
+ */
+static inline bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2)
+{
+	struct btf_param *m1, *m2;
+	__u16 vlen;
+	int i;
+
+	/* skip return type ID */
+	if (t1->name_off != t2->name_off || t1->info != t2->info)
+		return false;
+
+	vlen = BTF_INFO_VLEN(t1->info);
+	m1 = (struct btf_param *)(t1 + 1);
+	m2 = (struct btf_param *)(t2 + 1);
+	for (i = 0; i < vlen; i++) {
+		if (m1->name_off != m2->name_off)
+			return false;
+		m1++;
+		m2++;
+	}
+	return true;
+}
+
+/*
+ * Deduplicate primitive types, that can't reference other types, by calculating
+ * their type signature hash and comparing them with any possible canonical
+ * candidate. If no canonical candidate matches, type itself is marked as
+ * canonical and is added into `btf_dedup->dedup_table` as another candidate.
+ */
+static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id)
+{
+	struct btf_type *t = d->btf->types[type_id];
+	struct btf_type *cand;
+	struct btf_dedup_node *cand_node;
+	/* if we don't find equivalent type, then we are canonical */
+	__u32 new_id = type_id;
+	__u32 h;
+
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_ARRAY:
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+	case BTF_KIND_FUNC:
+	case BTF_KIND_FUNC_PROTO:
+		return 0;
+
+	case BTF_KIND_INT:
+		h = btf_hash_int(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_int(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+
+	case BTF_KIND_ENUM:
+		h = btf_hash_enum(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_enum(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+
+	case BTF_KIND_FWD:
+		h = btf_hash_common(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_common(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	d->map[type_id] = new_id;
+	if (type_id == new_id && btf_dedup_table_add(d, h, type_id))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int btf_dedup_prim_types(struct btf_dedup *d)
+{
+	int i, err;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		err = btf_dedup_prim_type(d, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Check whether type is already mapped into canonical one (could be to itself).
+ */
+static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id)
+{
+	return d->map[type_id] <= BTF_MAX_TYPE;
+}
+
+/*
+ * Resolve type ID into its canonical type ID, if any; otherwise return original
+ * type ID. If type is FWD and is resolved into STRUCT/UNION already, follow
+ * STRUCT/UNION link and resolve it into canonical type ID as well.
+ */
+static inline __u32 resolve_type_id(struct btf_dedup *d, __u32 type_id)
+{
+	while (is_type_mapped(d, type_id) && d->map[type_id] != type_id)
+		type_id = d->map[type_id];
+	return type_id;
+}
+
+/*
+ * Resolve FWD to underlying STRUCT/UNION, if any; otherwise return original
+ * type ID.
+ */
+static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id)
+{
+	__u32 orig_type_id = type_id;
+
+	if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD)
+		return type_id;
+
+	while (is_type_mapped(d, type_id) && d->map[type_id] != type_id)
+		type_id = d->map[type_id];
+
+	if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD)
+		return type_id;
+
+	return orig_type_id;
+}
+
+
+static inline __u16 btf_fwd_kind(struct btf_type *t)
+{
+	return BTF_INFO_KFLAG(t->info) ? BTF_KIND_UNION : BTF_KIND_STRUCT;
+}
+
+/*
+ * Check equivalence of BTF type graph formed by candidate struct/union (we'll
+ * call it "candidate graph" in this description for brevity) to a type graph
+ * formed by (potential) canonical struct/union ("canonical graph" for brevity
+ * here, though keep in mind that not all types in canonical graph are
+ * necessarily canonical representatives themselves, some of them might be
+ * duplicates or its uniqueness might not have been established yet).
+ * Returns:
+ *  - >0, if type graphs are equivalent;
+ *  -  0, if not equivalent;
+ *  - <0, on error.
+ *
+ * Algorithm performs side-by-side DFS traversal of both type graphs and checks
+ * equivalence of BTF types at each step. If at any point BTF types in candidate
+ * and canonical graphs are not compatible structurally, whole graphs are
+ * incompatible. If types are structurally equivalent (i.e., all information
+ * except referenced type IDs is exactly the same), a mapping from `canon_id` to
+ * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`).
+ * If a type references other types, then those referenced types are checked
+ * for equivalence recursively.
+ *
+ * During DFS traversal, if we find that for current `canon_id` type we
+ * already have some mapping in hypothetical map, we check for two possible
+ * situations:
+ *   - `canon_id` is mapped to exactly the same type as `cand_id`. This will
+ *     happen when type graphs have cycles. In this case we assume those two
+ *     types are equivalent.
+ *   - `canon_id` is mapped to different type. This is contradiction in our
+ *     hypothetical mapping, because same graph in canonical graph corresponds
+ *     to two different types in candidate graph, which for equivalent type
+ *     graphs shouldn't happen. This condition terminates equivalence check
+ *     with negative result.
+ *
+ * If type graphs traversal exhausts types to check and find no contradiction,
+ * then type graphs are equivalent.
+ *
+ * When checking types for equivalence, there is one special case: FWD types.
+ * If FWD type resolution is allowed and one of the types (either from canonical
+ * or candidate graph) is FWD and other is STRUCT/UNION (depending on FWD's kind
+ * flag) and their names match, hypothetical mapping is updated to point from
+ * FWD to STRUCT/UNION. If graphs will be determined as equivalent successfully,
+ * this mapping will be used to record FWD -> STRUCT/UNION mapping permanently.
+ *
+ * Technically, this could lead to incorrect FWD to STRUCT/UNION resolution,
+ * if there are two exactly named (or anonymous) structs/unions that are
+ * compatible structurally, one of which has FWD field, while other is concrete
+ * STRUCT/UNION, but according to C sources they are different structs/unions
+ * that are referencing different types with the same name. This is extremely
+ * unlikely to happen, but btf_dedup API allows to disable FWD resolution if
+ * this logic is causing problems.
+ *
+ * Doing FWD resolution means that both candidate and/or canonical graphs can
+ * consists of portions of the graph that come from multiple compilation units.
+ * This is due to the fact that types within single compilation unit are always
+ * deduplicated and FWDs are already resolved, if referenced struct/union
+ * definiton is available. So, if we had unresolved FWD and found corresponding
+ * STRUCT/UNION, they will be from different compilation units. This
+ * consequently means that when we "link" FWD to corresponding STRUCT/UNION,
+ * type graph will likely have at least two different BTF types that describe
+ * same type (e.g., most probably there will be two different BTF types for the
+ * same 'int' primitive type) and could even have "overlapping" parts of type
+ * graph that describe same subset of types.
+ *
+ * This in turn means that our assumption that each type in canonical graph
+ * must correspond to exactly one type in candidate graph might not hold
+ * anymore and will make it harder to detect contradictions using hypothetical
+ * map. To handle this problem, we allow to follow FWD -> STRUCT/UNION
+ * resolution only in canonical graph. FWDs in candidate graphs are never
+ * resolved. To see why it's OK, let's check all possible situations w.r.t. FWDs
+ * that can occur:
+ *   - Both types in canonical and candidate graphs are FWDs. If they are
+ *     structurally equivalent, then they can either be both resolved to the
+ *     same STRUCT/UNION or not resolved at all. In both cases they are
+ *     equivalent and there is no need to resolve FWD on candidate side.
+ *   - Both types in canonical and candidate graphs are concrete STRUCT/UNION,
+ *     so nothing to resolve as well, algorithm will check equivalence anyway.
+ *   - Type in canonical graph is FWD, while type in candidate is concrete
+ *     STRUCT/UNION. In this case candidate graph comes from single compilation
+ *     unit, so there is exactly one BTF type for each unique C type. After
+ *     resolving FWD into STRUCT/UNION, there might be more than one BTF type
+ *     in canonical graph mapping to single BTF type in candidate graph, but
+ *     because hypothetical mapping maps from canonical to candidate types, it's
+ *     alright, and we still maintain the property of having single `canon_id`
+ *     mapping to single `cand_id` (there could be two different `canon_id`
+ *     mapped to the same `cand_id`, but it's not contradictory).
+ *   - Type in canonical graph is concrete STRUCT/UNION, while type in candidate
+ *     graph is FWD. In this case we are just going to check compatibility of
+ *     STRUCT/UNION and corresponding FWD, and if they are compatible, we'll
+ *     assume that whatever STRUCT/UNION FWD resolves to must be equivalent to
+ *     a concrete STRUCT/UNION from canonical graph. If the rest of type graphs
+ *     turn out equivalent, we'll re-resolve FWD to concrete STRUCT/UNION from
+ *     canonical graph.
+ */
+static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
+			      __u32 canon_id)
+{
+	struct btf_type *cand_type;
+	struct btf_type *canon_type;
+	__u32 hypot_type_id;
+	__u16 cand_kind;
+	__u16 canon_kind;
+	int i, eq;
+
+	/* if both resolve to the same canonical, they must be equivalent */
+	if (resolve_type_id(d, cand_id) == resolve_type_id(d, canon_id))
+		return 1;
+
+	canon_id = resolve_fwd_id(d, canon_id);
+
+	hypot_type_id = d->hypot_map[canon_id];
+	if (hypot_type_id <= BTF_MAX_TYPE)
+		return hypot_type_id == cand_id;
+
+	if (btf_dedup_hypot_map_add(d, canon_id, cand_id))
+		return -ENOMEM;
+
+	cand_type = d->btf->types[cand_id];
+	canon_type = d->btf->types[canon_id];
+	cand_kind = BTF_INFO_KIND(cand_type->info);
+	canon_kind = BTF_INFO_KIND(canon_type->info);
+
+	if (cand_type->name_off != canon_type->name_off)
+		return 0;
+
+	/* FWD <--> STRUCT/UNION equivalence check, if enabled */
+	if (!d->opts.dont_resolve_fwds
+	    && (cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD)
+	    && cand_kind != canon_kind) {
+		__u16 real_kind;
+		__u16 fwd_kind;
+
+		if (cand_kind == BTF_KIND_FWD) {
+			real_kind = canon_kind;
+			fwd_kind = btf_fwd_kind(cand_type);
+		} else {
+			real_kind = cand_kind;
+			fwd_kind = btf_fwd_kind(canon_type);
+		}
+		return fwd_kind == real_kind;
+	}
+
+	if (cand_type->info != canon_type->info)
+		return 0;
+
+	switch (cand_kind) {
+	case BTF_KIND_INT:
+		return btf_equal_int(cand_type, canon_type);
+
+	case BTF_KIND_ENUM:
+		return btf_equal_enum(cand_type, canon_type);
+
+	case BTF_KIND_FWD:
+		return btf_equal_common(cand_type, canon_type);
+
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		return btf_dedup_is_equiv(d, cand_type->type, canon_type->type);
+
+	case BTF_KIND_ARRAY: {
+		struct btf_array *cand_arr, *canon_arr;
+
+		if (!btf_compat_array(cand_type, canon_type))
+			return 0;
+		cand_arr = (struct btf_array *)(cand_type + 1);
+		canon_arr = (struct btf_array *)(canon_type + 1);
+		eq = btf_dedup_is_equiv(d,
+			cand_arr->index_type, canon_arr->index_type);
+		if (eq <= 0)
+			return eq;
+		return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type);
+	}
+
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION: {
+		struct btf_member *cand_m, *canon_m;
+		__u16 vlen;
+
+		if (!btf_equal_struct(cand_type, canon_type))
+			return 0;
+		vlen = BTF_INFO_VLEN(cand_type->info);
+		cand_m = (struct btf_member *)(cand_type + 1);
+		canon_m = (struct btf_member *)(canon_type + 1);
+		for (i = 0; i < vlen; i++) {
+			eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type);
+			if (eq <= 0)
+				return eq;
+			cand_m++;
+			canon_m++;
+		}
+
+		return 1;
+	}
+
+	case BTF_KIND_FUNC_PROTO: {
+		struct btf_param *cand_p, *canon_p;
+		__u16 vlen;
+
+		if (!btf_compat_fnproto(cand_type, canon_type))
+			return 0;
+		eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type);
+		if (eq <= 0)
+			return eq;
+		vlen = BTF_INFO_VLEN(cand_type->info);
+		cand_p = (struct btf_param *)(cand_type + 1);
+		canon_p = (struct btf_param *)(canon_type + 1);
+		for (i = 0; i < vlen; i++) {
+			eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type);
+			if (eq <= 0)
+				return eq;
+			cand_p++;
+			canon_p++;
+		}
+		return 1;
+	}
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Use hypothetical mapping, produced by successful type graph equivalence
+ * check, to augment existing struct/union canonical mapping, where possible.
+ *
+ * If BTF_KIND_FWD resolution is allowed, this mapping is also used to record
+ * FWD -> STRUCT/UNION correspondence as well. FWD resolution is bidirectional:
+ * it doesn't matter if FWD type was part of canonical graph or candidate one,
+ * we are recording the mapping anyway. As opposed to carefulness required
+ * for struct/union correspondence mapping (described below), for FWD resolution
+ * it's not important, as by the time that FWD type (reference type) will be
+ * deduplicated all structs/unions will be deduped already anyway.
+ *
+ * Recording STRUCT/UNION mapping is purely a performance optimization and is
+ * not required for correctness. It needs to be done carefully to ensure that
+ * struct/union from candidate's type graph is not mapped into corresponding
+ * struct/union from canonical type graph that itself hasn't been resolved into
+ * canonical representative. The only guarantee we have is that canonical
+ * struct/union was determined as canonical and that won't change. But any
+ * types referenced through that struct/union fields could have been not yet
+ * resolved, so in case like that it's too early to establish any kind of
+ * correspondence between structs/unions.
+ *
+ * No canonical correspondence is derived for primitive types (they are already
+ * deduplicated completely already anyway) or reference types (they rely on
+ * stability of struct/union canonical relationship for equivalence checks).
+ */
+static void btf_dedup_merge_hypot_map(struct btf_dedup *d)
+{
+	__u32 cand_type_id, targ_type_id;
+	__u16 t_kind, c_kind;
+	__u32 t_id, c_id;
+	int i;
+
+	for (i = 0; i < d->hypot_cnt; i++) {
+		cand_type_id = d->hypot_list[i];
+		targ_type_id = d->hypot_map[cand_type_id];
+		t_id = resolve_type_id(d, targ_type_id);
+		c_id = resolve_type_id(d, cand_type_id);
+		t_kind = BTF_INFO_KIND(d->btf->types[t_id]->info);
+		c_kind = BTF_INFO_KIND(d->btf->types[c_id]->info);
+		/*
+		 * Resolve FWD into STRUCT/UNION.
+		 * It's ok to resolve FWD into STRUCT/UNION that's not yet
+		 * mapped to canonical representative (as opposed to
+		 * STRUCT/UNION <--> STRUCT/UNION mapping logic below), because
+		 * eventually that struct is going to be mapped and all resolved
+		 * FWDs will automatically resolve to correct canonical
+		 * representative. This will happen before ref type deduping,
+		 * which critically depends on stability of these mapping. This
+		 * stability is not a requirement for STRUCT/UNION equivalence
+		 * checks, though.
+		 */
+		if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD)
+			d->map[c_id] = t_id;
+		else if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD)
+			d->map[t_id] = c_id;
+
+		if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) &&
+		    c_kind != BTF_KIND_FWD &&
+		    is_type_mapped(d, c_id) &&
+		    !is_type_mapped(d, t_id)) {
+			/*
+			 * as a perf optimization, we can map struct/union
+			 * that's part of type graph we just verified for
+			 * equivalence. We can do that for struct/union that has
+			 * canonical representative only, though.
+			 */
+			d->map[t_id] = c_id;
+		}
+	}
+}
+
+/*
+ * Deduplicate struct/union types.
+ *
+ * For each struct/union type its type signature hash is calculated, taking
+ * into account type's name, size, number, order and names of fields, but
+ * ignoring type ID's referenced from fields, because they might not be deduped
+ * completely until after reference types deduplication phase. This type hash
+ * is used to iterate over all potential canonical types, sharing same hash.
+ * For each canonical candidate we check whether type graphs that they form
+ * (through referenced types in fields and so on) are equivalent using algorithm
+ * implemented in `btf_dedup_is_equiv`. If such equivalence is found and
+ * BTF_KIND_FWD resolution is allowed, then hypothetical mapping
+ * (btf_dedup->hypot_map) produced by aforementioned type graph equivalence
+ * algorithm is used to record FWD -> STRUCT/UNION mapping. It's also used to
+ * potentially map other structs/unions to their canonical representatives,
+ * if such relationship hasn't yet been established. This speeds up algorithm
+ * by eliminating some of the duplicate work.
+ *
+ * If no matching canonical representative was found, struct/union is marked
+ * as canonical for itself and is added into btf_dedup->dedup_table hash map
+ * for further look ups.
+ */
+static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
+{
+	struct btf_dedup_node *cand_node;
+	struct btf_type *t;
+	/* if we don't find equivalent type, then we are canonical */
+	__u32 new_id = type_id;
+	__u16 kind;
+	__u32 h;
+
+	/* already deduped or is in process of deduping (loop detected) */
+	if (d->map[type_id] <= BTF_MAX_TYPE)
+		return 0;
+
+	t = d->btf->types[type_id];
+	kind = BTF_INFO_KIND(t->info);
+
+	if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION)
+		return 0;
+
+	h = btf_hash_struct(t);
+	for_each_hash_node(d->dedup_table, h, cand_node) {
+		int eq;
+
+		btf_dedup_clear_hypot_map(d);
+		eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id);
+		if (eq < 0)
+			return eq;
+		if (!eq)
+			continue;
+		new_id = cand_node->type_id;
+		btf_dedup_merge_hypot_map(d);
+		break;
+	}
+
+	d->map[type_id] = new_id;
+	if (type_id == new_id && btf_dedup_table_add(d, h, type_id))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int btf_dedup_struct_types(struct btf_dedup *d)
+{
+	int i, err;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		err = btf_dedup_struct_type(d, i);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Deduplicate reference type.
+ *
+ * Once all primitive and struct/union types got deduplicated, we can easily
+ * deduplicate all other (reference) BTF types. This is done in two steps:
+ *
+ * 1. Resolve all referenced type IDs into their canonical type IDs. This
+ * resolution can be done either immediately for primitive or struct/union types
+ * (because they were deduped in previous two phases) or recursively for
+ * reference types. Recursion will always terminate at either primitive or
+ * struct/union type, at which point we can "unwind" chain of reference types
+ * one by one. There is no danger of encountering cycles because in C type
+ * system the only way to form type cycle is through struct/union, so any chain
+ * of reference types, even those taking part in a type cycle, will inevitably
+ * reach struct/union at some point.
+ *
+ * 2. Once all referenced type IDs are resolved into canonical ones, BTF type
+ * becomes "stable", in the sense that no further deduplication will cause
+ * any changes to it. With that, it's now possible to calculate type's signature
+ * hash (this time taking into account referenced type IDs) and loop over all
+ * potential canonical representatives. If no match was found, current type
+ * will become canonical representative of itself and will be added into
+ * btf_dedup->dedup_table as another possible canonical representative.
+ */
+static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
+{
+	struct btf_dedup_node *cand_node;
+	struct btf_type *t, *cand;
+	/* if we don't find equivalent type, then we are representative type */
+	__u32 new_id = type_id;
+	__u32 h, ref_type_id;
+
+	if (d->map[type_id] == BTF_IN_PROGRESS_ID)
+		return -ELOOP;
+	if (d->map[type_id] <= BTF_MAX_TYPE)
+		return resolve_type_id(d, type_id);
+
+	t = d->btf->types[type_id];
+	d->map[type_id] = BTF_IN_PROGRESS_ID;
+
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		ref_type_id = btf_dedup_ref_type(d, t->type);
+		if (ref_type_id < 0)
+			return ref_type_id;
+		t->type = ref_type_id;
+
+		h = btf_hash_common(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_common(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+
+	case BTF_KIND_ARRAY: {
+		struct btf_array *info = (struct btf_array *)(t + 1);
+
+		ref_type_id = btf_dedup_ref_type(d, info->type);
+		if (ref_type_id < 0)
+			return ref_type_id;
+		info->type = ref_type_id;
+
+		ref_type_id = btf_dedup_ref_type(d, info->index_type);
+		if (ref_type_id < 0)
+			return ref_type_id;
+		info->index_type = ref_type_id;
+
+		h = btf_hash_array(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_array(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+	}
+
+	case BTF_KIND_FUNC_PROTO: {
+		struct btf_param *param;
+		__u16 vlen;
+		int i;
+
+		ref_type_id = btf_dedup_ref_type(d, t->type);
+		if (ref_type_id < 0)
+			return ref_type_id;
+		t->type = ref_type_id;
+
+		vlen = BTF_INFO_VLEN(t->info);
+		param = (struct btf_param *)(t + 1);
+		for (i = 0; i < vlen; i++) {
+			ref_type_id = btf_dedup_ref_type(d, param->type);
+			if (ref_type_id < 0)
+				return ref_type_id;
+			param->type = ref_type_id;
+			param++;
+		}
+
+		h = btf_hash_fnproto(t);
+		for_each_hash_node(d->dedup_table, h, cand_node) {
+			cand = d->btf->types[cand_node->type_id];
+			if (btf_equal_fnproto(t, cand)) {
+				new_id = cand_node->type_id;
+				break;
+			}
+		}
+		break;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	d->map[type_id] = new_id;
+	if (type_id == new_id && btf_dedup_table_add(d, h, type_id))
+		return -ENOMEM;
+
+	return new_id;
+}
+
+static int btf_dedup_ref_types(struct btf_dedup *d)
+{
+	int i, err;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		err = btf_dedup_ref_type(d, i);
+		if (err < 0)
+			return err;
+	}
+	btf_dedup_table_free(d);
+	return 0;
+}
+
+/*
+ * Compact types.
+ *
+ * After we established for each type its corresponding canonical representative
+ * type, we now can eliminate types that are not canonical and leave only
+ * canonical ones layed out sequentially in memory by copying them over
+ * duplicates. During compaction btf_dedup->hypot_map array is reused to store
+ * a map from original type ID to a new compacted type ID, which will be used
+ * during next phase to "fix up" type IDs, referenced from struct/union and
+ * reference types.
+ */
+static int btf_dedup_compact_types(struct btf_dedup *d)
+{
+	struct btf_type **new_types;
+	__u32 next_type_id = 1;
+	char *types_start, *p;
+	int i, len;
+
+	/* we are going to reuse hypot_map to store compaction remapping */
+	d->hypot_map[0] = 0;
+	for (i = 1; i <= d->btf->nr_types; i++)
+		d->hypot_map[i] = BTF_UNPROCESSED_ID;
+
+	types_start = d->btf->nohdr_data + d->btf->hdr->type_off;
+	p = types_start;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		if (d->map[i] != i)
+			continue;
+
+		len = btf_type_size(d->btf->types[i], NULL);
+		if (len < 0)
+			return len;
+
+		memmove(p, d->btf->types[i], len);
+		d->hypot_map[i] = next_type_id;
+		d->btf->types[next_type_id] = (struct btf_type *)p;
+		p += len;
+		next_type_id++;
+	}
+
+	/* shrink struct btf's internal types index and update btf_header */
+	d->btf->nr_types = next_type_id - 1;
+	d->btf->types_size = d->btf->nr_types;
+	d->btf->hdr->type_len = p - types_start;
+	new_types = realloc(d->btf->types,
+			    (1 + d->btf->nr_types) * sizeof(struct btf_type *));
+	if (!new_types)
+		return -ENOMEM;
+	d->btf->types = new_types;
+
+	/* make sure string section follows type information without gaps */
+	d->btf->hdr->str_off = p - (char *)d->btf->nohdr_data;
+	memmove(p, d->btf->strings, d->btf->hdr->str_len);
+	d->btf->strings = p;
+	p += d->btf->hdr->str_len;
+
+	d->btf->data_size = p - (char *)d->btf->data;
+	return 0;
+}
+
+/*
+ * Figure out final (deduplicated and compacted) type ID for provided original
+ * `type_id` by first resolving it into corresponding canonical type ID and
+ * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map,
+ * which is populated during compaction phase.
+ */
+static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id)
+{
+	__u32 resolved_type_id, new_type_id;
+
+	resolved_type_id = resolve_type_id(d, type_id);
+	new_type_id = d->hypot_map[resolved_type_id];
+	if (new_type_id > BTF_MAX_TYPE)
+		return -EINVAL;
+	return new_type_id;
+}
+
+/*
+ * Remap referenced type IDs into deduped type IDs.
+ *
+ * After BTF types are deduplicated and compacted, their final type IDs may
+ * differ from original ones. The map from original to a corresponding
+ * deduped type ID is stored in btf_dedup->hypot_map and is populated during
+ * compaction phase. During remapping phase we are rewriting all type IDs
+ * referenced from any BTF type (e.g., struct fields, func proto args, etc) to
+ * their final deduped type IDs.
+ */
+static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
+{
+	struct btf_type *t = d->btf->types[type_id];
+	int i, r;
+
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_INT:
+	case BTF_KIND_ENUM:
+		break;
+
+	case BTF_KIND_FWD:
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		r = btf_dedup_remap_type_id(d, t->type);
+		if (r < 0)
+			return r;
+		t->type = r;
+		break;
+
+	case BTF_KIND_ARRAY: {
+		struct btf_array *arr_info = (struct btf_array *)(t + 1);
+
+		r = btf_dedup_remap_type_id(d, arr_info->type);
+		if (r < 0)
+			return r;
+		arr_info->type = r;
+		r = btf_dedup_remap_type_id(d, arr_info->index_type);
+		if (r < 0)
+			return r;
+		arr_info->index_type = r;
+		break;
+	}
+
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION: {
+		struct btf_member *member = (struct btf_member *)(t + 1);
+		__u16 vlen = BTF_INFO_VLEN(t->info);
+
+		for (i = 0; i < vlen; i++) {
+			r = btf_dedup_remap_type_id(d, member->type);
+			if (r < 0)
+				return r;
+			member->type = r;
+			member++;
+		}
+		break;
+	}
+
+	case BTF_KIND_FUNC_PROTO: {
+		struct btf_param *param = (struct btf_param *)(t + 1);
+		__u16 vlen = BTF_INFO_VLEN(t->info);
+
+		r = btf_dedup_remap_type_id(d, t->type);
+		if (r < 0)
+			return r;
+		t->type = r;
+
+		for (i = 0; i < vlen; i++) {
+			r = btf_dedup_remap_type_id(d, param->type);
+			if (r < 0)
+				return r;
+			param->type = r;
+			param++;
+		}
+		break;
+	}
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_dedup_remap_types(struct btf_dedup *d)
+{
+	int i, r;
+
+	for (i = 1; i <= d->btf->nr_types; i++) {
+		r = btf_dedup_remap_type(d, i);
+		if (r < 0)
+			return r;
+	}
+	return 0;
+}
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index b0610dcdae6b..0c1d739fd4e8 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -83,6 +83,14 @@ int btf_ext__reloc_line_info(const struct btf *btf,
 __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext);
 __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext);
 
+struct btf_dedup_opts {
+	bool dont_resolve_fwds;
+};
+
+LIBBPF_API int btf__dedup(struct btf *btf, struct btf_ext *btf_ext,
+			  const struct btf_dedup_opts *opts,
+			  btf_print_fn_t err_log);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 266bc95d0142..42524133779b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -107,6 +107,7 @@ LIBBPF_0.0.1 {
 		bpf_set_link_xdp_fd;
 		bpf_task_fd_query;
 		bpf_verify_program;
+		btf__dedup;
 		btf__fd;
 		btf__find_by_name;
 		btf__free;
-- 
2.17.1


^ permalink raw reply related

* [PATCH btf 3/3] selftests/btf: add initial BTF dedup tests
From: Andrii Nakryiko @ 2019-01-31  6:58 UTC (permalink / raw)
  To: netdev, ast, yhs, kafai, acme, daniel, kernel-team; +Cc: Andrii Nakryiko
In-Reply-To: <20190131065837.3380288-1-andriin@fb.com>

This patch sets up a new kind of tests (BTF dedup tests) and tests few aspects of
BTF dedup algorithm. More complete set of tests will come in follow up patches.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/btf.c                    |  12 +
 tools/lib/bpf/btf.h                    |   3 +
 tools/lib/bpf/libbpf.map               |   2 +
 tools/testing/selftests/bpf/test_btf.c | 535 ++++++++++++++++++++++++-
 4 files changed, 537 insertions(+), 15 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 981f166b401c..abfcf80e8625 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -235,6 +235,11 @@ static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log)
 	return 0;
 }
 
+__u32 btf__get_nr_types(const struct btf *btf)
+{
+	return btf->nr_types;
+}
+
 const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 {
 	if (type_id > btf->nr_types)
@@ -426,6 +431,13 @@ int btf__fd(const struct btf *btf)
 	return btf->fd;
 }
 
+void btf__get_strings(const struct btf *btf, const char **strings,
+		      __u32 *str_len)
+{
+	*strings = btf->strings;
+	*str_len = btf->hdr->str_len;
+}
+
 const char *btf__name_by_offset(const struct btf *btf, __u32 offset)
 {
 	if (offset < btf->hdr->str_len)
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 0c1d739fd4e8..afffe30e84ae 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -62,11 +62,14 @@ LIBBPF_API void btf__free(struct btf *btf);
 LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log);
 LIBBPF_API __s32 btf__find_by_name(const struct btf *btf,
 				   const char *type_name);
+LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf);
 LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf,
 						  __u32 id);
 LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id);
 LIBBPF_API int btf__fd(const struct btf *btf);
+LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings,
+				 __u32 *str_len);
 LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset);
 LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf);
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 42524133779b..9b4a1f1bd3bf 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -112,6 +112,8 @@ LIBBPF_0.0.1 {
 		btf__find_by_name;
 		btf__free;
 		btf__get_from_id;
+		btf__get_nr_types;
+		btf__get_strings;
 		btf__name_by_offset;
 		btf__new;
 		btf__resolve_size;
diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c
index 179f1d8ec5bf..dcc770245815 100644
--- a/tools/testing/selftests/bpf/test_btf.c
+++ b/tools/testing/selftests/bpf/test_btf.c
@@ -78,12 +78,21 @@ static int __base_pr(const char *format, ...)
 	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz),	\
 	BTF_INT_ENC(encoding, bits_offset, bits)
 
+#define BTF_FWD_ENC(name, kind_flag) \
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FWD, kind_flag, 0), 0)
+
 #define BTF_ARRAY_ENC(type, index_type, nr_elems)	\
 	(type), (index_type), (nr_elems)
 #define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
 	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
 	BTF_ARRAY_ENC(type, index_type, nr_elems)
 
+#define BTF_STRUCT_ENC(name, nr_elems, sz)	\
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, nr_elems), sz)
+
+#define BTF_UNION_ENC(name, nr_elems, sz)	\
+	BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_UNION, 0, nr_elems), sz)
+
 #define BTF_MEMBER_ENC(name, type, bits_offset)	\
 	(name), (type), (bits_offset)
 #define BTF_ENUM_ENC(name, val) (name), (val)
@@ -99,6 +108,12 @@ static int __base_pr(const char *format, ...)
 #define BTF_CONST_ENC(type) \
 	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), type)
 
+#define BTF_VOLATILE_ENC(type) \
+	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), type)
+
+#define BTF_RESTRICT_ENC(type) \
+	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), type)
+
 #define BTF_FUNC_PROTO_ENC(ret_type, nargs) \
 	BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, nargs), ret_type)
 
@@ -111,6 +126,10 @@ static int __base_pr(const char *format, ...)
 #define BTF_END_RAW 0xdeadbeef
 #define NAME_TBD 0xdeadb33f
 
+#define NAME_NTH(N) (0xffff0000 | N)
+#define IS_NAME_NTH(X) ((X & 0xffff0000) == 0xffff0000)
+#define GET_NAME_NTH_IDX(X) (X & 0x0000ffff)
+
 #define MAX_NR_RAW_U32 1024
 #define BTF_LOG_BUF_SIZE 65535
 
@@ -119,12 +138,14 @@ static struct args {
 	unsigned int file_test_num;
 	unsigned int get_info_test_num;
 	unsigned int info_raw_test_num;
+	unsigned int dedup_test_num;
 	bool raw_test;
 	bool file_test;
 	bool get_info_test;
 	bool pprint_test;
 	bool always_log;
 	bool info_raw_test;
+	bool dedup_test;
 } args;
 
 static char btf_log_buf[BTF_LOG_BUF_SIZE];
@@ -2835,11 +2856,13 @@ static void *btf_raw_create(const struct btf_header *hdr,
 			    const char **ret_next_str)
 {
 	const char *next_str = str, *end_str = str + str_sec_size;
+	const char **strs_idx = NULL, **tmp_strs_idx;
+	int strs_cap = 0, strs_cnt = 0, next_str_idx = 0;
 	unsigned int size_needed, offset;
 	struct btf_header *ret_hdr;
-	int i, type_sec_size;
+	int i, type_sec_size, err = 0;
 	uint32_t *ret_types;
-	void *raw_btf;
+	void *raw_btf = NULL;
 
 	type_sec_size = get_raw_sec_size(raw_types);
 	if (CHECK(type_sec_size < 0, "Cannot get nr_raw_types"))
@@ -2854,17 +2877,44 @@ static void *btf_raw_create(const struct btf_header *hdr,
 	memcpy(raw_btf, hdr, sizeof(*hdr));
 	offset = sizeof(*hdr);
 
+	/* Index strings */
+	while ((next_str = get_next_str(next_str, end_str))) {
+		if (strs_cnt == strs_cap) {
+			strs_cap += max(16, strs_cap / 2);
+			tmp_strs_idx = realloc(strs_idx,
+					       sizeof(*strs_idx) * strs_cap);
+			if (CHECK(!tmp_strs_idx,
+				  "Cannot allocate memory for strs_idx")) {
+				err = -1;
+				goto done;
+			}
+			strs_idx = tmp_strs_idx;
+		}
+		strs_idx[strs_cnt++] = next_str;
+		next_str += strlen(next_str);
+	}
+
 	/* Copy type section */
 	ret_types = raw_btf + offset;
 	for (i = 0; i < type_sec_size / sizeof(raw_types[0]); i++) {
 		if (raw_types[i] == NAME_TBD) {
-			next_str = get_next_str(next_str, end_str);
-			if (CHECK(!next_str, "Error in getting next_str")) {
-				free(raw_btf);
-				return NULL;
+			if (CHECK(next_str_idx == strs_cnt,
+				  "Error in getting next_str #%d",
+				  next_str_idx)) {
+				err = -1;
+				goto done;
 			}
-			ret_types[i] = next_str - str;
-			next_str += strlen(next_str);
+			ret_types[i] = strs_idx[next_str_idx++] - str;
+		} else if (IS_NAME_NTH(raw_types[i])) {
+			int idx = GET_NAME_NTH_IDX(raw_types[i]);
+
+			if (CHECK(idx <= 0 || idx > strs_cnt,
+				  "Error getting string #%d, strs_cnt:%d",
+				  idx, strs_cnt)) {
+				err = -1;
+				goto done;
+			}
+			ret_types[i] = strs_idx[idx-1] - str;
 		} else {
 			ret_types[i] = raw_types[i];
 		}
@@ -2881,8 +2931,17 @@ static void *btf_raw_create(const struct btf_header *hdr,
 
 	*btf_size = size_needed;
 	if (ret_next_str)
-		*ret_next_str = next_str;
+		*ret_next_str =
+			next_str_idx < strs_cnt ? strs_idx[next_str_idx] : NULL;
 
+done:
+	if (err) {
+		if (raw_btf)
+			free(raw_btf);
+		if (strs_idx)
+			free(strs_idx);
+		return NULL;
+	}
 	return raw_btf;
 }
 
@@ -5551,20 +5610,450 @@ static int test_info_raw(void)
 	return err;
 }
 
+struct btf_raw_data {
+	__u32 raw_types[MAX_NR_RAW_U32];
+	const char *str_sec;
+	__u32 str_sec_size;
+};
+
+struct btf_dedup_test {
+	const char *descr;
+	struct btf_raw_data input;
+	struct btf_raw_data expect;
+	struct btf_dedup_opts opts;
+};
+
+const struct btf_dedup_test dedup_tests[] = {
+
+{
+	.descr = "dedup: unused strings filtering",
+	.input = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 64, 8),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0unused\0int\0foo\0bar\0long"),
+	},
+	.expect = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0long"),
+	},
+	.opts = {
+		.dont_resolve_fwds = false,
+	},
+},
+{
+	.descr = "dedup: strings deduplication",
+	.input = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+			BTF_TYPE_INT_ENC(NAME_NTH(3), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 64, 8),
+			BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0long int\0int\0long int\0int"),
+	},
+	.expect = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0long int"),
+	},
+	.opts = {
+		.dont_resolve_fwds = false,
+	},
+},
+{
+	.descr = "dedup: struct example #1",
+	/*
+	 * struct s {
+	 *	struct s *next;
+	 *	const int *a;
+	 *	int b[16];
+	 *	int c;
+	 * }
+	 */
+	.input = {
+		.raw_types = {
+			/* int */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			/* int[16] */
+			BTF_TYPE_ARRAY_ENC(1, 1, 16),					/* [2] */
+			/* struct s { */
+			BTF_STRUCT_ENC(NAME_NTH(2), 4, 84),				/* [3] */
+				BTF_MEMBER_ENC(NAME_NTH(3), 4, 0),	/* struct s *next;	*/
+				BTF_MEMBER_ENC(NAME_NTH(4), 5, 64),	/* const int *a;	*/
+				BTF_MEMBER_ENC(NAME_NTH(5), 2, 128),	/* int b[16];		*/
+				BTF_MEMBER_ENC(NAME_NTH(6), 1, 640),	/* int c;		*/
+			/* ptr -> [3] struct s */
+			BTF_PTR_ENC(3),							/* [4] */
+			/* ptr -> [6] const int */
+			BTF_PTR_ENC(6),							/* [5] */
+			/* const -> [1] int */
+			BTF_CONST_ENC(1),						/* [6] */
+
+			/* full copy of the above */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),	/* [7] */
+			BTF_TYPE_ARRAY_ENC(7, 7, 16),					/* [8] */
+			BTF_STRUCT_ENC(NAME_NTH(2), 4, 84),				/* [9] */
+				BTF_MEMBER_ENC(NAME_NTH(3), 10, 0),
+				BTF_MEMBER_ENC(NAME_NTH(4), 11, 64),
+				BTF_MEMBER_ENC(NAME_NTH(5), 8, 128),
+				BTF_MEMBER_ENC(NAME_NTH(6), 7, 640),
+			BTF_PTR_ENC(9),							/* [10] */
+			BTF_PTR_ENC(12),						/* [11] */
+			BTF_CONST_ENC(7),						/* [12] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"),
+	},
+	.expect = {
+		.raw_types = {
+			/* int */
+			BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4),	/* [1] */
+			/* int[16] */
+			BTF_TYPE_ARRAY_ENC(1, 1, 16),					/* [2] */
+			/* struct s { */
+			BTF_STRUCT_ENC(NAME_NTH(6), 4, 84),				/* [3] */
+				BTF_MEMBER_ENC(NAME_NTH(5), 4, 0),	/* struct s *next;	*/
+				BTF_MEMBER_ENC(NAME_NTH(1), 5, 64),	/* const int *a;	*/
+				BTF_MEMBER_ENC(NAME_NTH(2), 2, 128),	/* int b[16];		*/
+				BTF_MEMBER_ENC(NAME_NTH(3), 1, 640),	/* int c;		*/
+			/* ptr -> [3] struct s */
+			BTF_PTR_ENC(3),							/* [4] */
+			/* ptr -> [6] const int */
+			BTF_PTR_ENC(6),							/* [5] */
+			/* const -> [1] int */
+			BTF_CONST_ENC(1),						/* [6] */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"),
+	},
+	.opts = {
+		.dont_resolve_fwds = false,
+	},
+},
+{
+	.descr = "dedup: all possible kinds (no duplicates)",
+	.input = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 8),		/* [1] int */
+			BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 4),	/* [2] enum */
+				BTF_ENUM_ENC(NAME_TBD, 0),
+				BTF_ENUM_ENC(NAME_TBD, 1),
+			BTF_FWD_ENC(NAME_TBD, 1 /* union kind_flag */),			/* [3] fwd */
+			BTF_TYPE_ARRAY_ENC(2, 1, 7),					/* [4] array */
+			BTF_STRUCT_ENC(NAME_TBD, 1, 4),					/* [5] struct */
+				BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+			BTF_UNION_ENC(NAME_TBD, 1, 4),					/* [6] union */
+				BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+			BTF_TYPEDEF_ENC(NAME_TBD, 1),					/* [7] typedef */
+			BTF_PTR_ENC(0),							/* [8] ptr */
+			BTF_CONST_ENC(8),						/* [9] const */
+			BTF_VOLATILE_ENC(8),						/* [10] volatile */
+			BTF_RESTRICT_ENC(8),						/* [11] restrict */
+			BTF_FUNC_PROTO_ENC(1, 2),					/* [12] func_proto */
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+			BTF_FUNC_ENC(NAME_TBD, 12),					/* [13] func */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+	},
+	.expect = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 8),		/* [1] int */
+			BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 4),	/* [2] enum */
+				BTF_ENUM_ENC(NAME_TBD, 0),
+				BTF_ENUM_ENC(NAME_TBD, 1),
+			BTF_FWD_ENC(NAME_TBD, 1 /* union kind_flag */),			/* [3] fwd */
+			BTF_TYPE_ARRAY_ENC(2, 1, 7),					/* [4] array */
+			BTF_STRUCT_ENC(NAME_TBD, 1, 4),					/* [5] struct */
+				BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+			BTF_UNION_ENC(NAME_TBD, 1, 4),					/* [6] union */
+				BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+			BTF_TYPEDEF_ENC(NAME_TBD, 1),					/* [7] typedef */
+			BTF_PTR_ENC(0),							/* [8] ptr */
+			BTF_CONST_ENC(8),						/* [9] const */
+			BTF_VOLATILE_ENC(8),						/* [10] volatile */
+			BTF_RESTRICT_ENC(8),						/* [11] restrict */
+			BTF_FUNC_PROTO_ENC(1, 2),					/* [12] func_proto */
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+				BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+			BTF_FUNC_ENC(NAME_TBD, 12),					/* [13] func */
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+	},
+	.opts = {
+		.dont_resolve_fwds = false,
+	},
+},
+{
+	.descr = "dedup: no int duplicates",
+	.input = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8),
+			/* different name */
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 8),
+			/* different encoding */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_CHAR, 0, 32, 8),
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_BOOL, 0, 32, 8),
+			/* different bit offset */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 8, 32, 8),
+			/* different bit size */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
+			/* different byte size */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0some other int"),
+	},
+	.expect = {
+		.raw_types = {
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8),
+			/* different name */
+			BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 8),
+			/* different encoding */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_CHAR, 0, 32, 8),
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_BOOL, 0, 32, 8),
+			/* different bit offset */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 8, 32, 8),
+			/* different bit size */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
+			/* different byte size */
+			BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+			BTF_END_RAW,
+		},
+		BTF_STR_SEC("\0int\0some other int"),
+	},
+	.opts = {
+		.dont_resolve_fwds = false,
+	},
+},
+
+};
+
+static int btf_type_size(const struct btf_type *t)
+{
+	int base_size = sizeof(struct btf_type);
+	__u16 vlen = BTF_INFO_VLEN(t->info);
+	__u16 kind = BTF_INFO_KIND(t->info);
+
+	switch (kind) {
+	case BTF_KIND_FWD:
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		return base_size;
+	case BTF_KIND_INT:
+		return base_size + sizeof(__u32);
+	case BTF_KIND_ENUM:
+		return base_size + vlen * sizeof(struct btf_enum);
+	case BTF_KIND_ARRAY:
+		return base_size + sizeof(struct btf_array);
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+		return base_size + vlen * sizeof(struct btf_member);
+	case BTF_KIND_FUNC_PROTO:
+		return base_size + vlen * sizeof(struct btf_param);
+	default:
+		fprintf(stderr, "Unsupported BTF_KIND:%u\n", kind);
+		return -EINVAL;
+	}
+}
+
+static void dump_btf_strings(const char *strs, __u32 len)
+{
+	const char *cur = strs;
+	int i = 0;
+
+	while (cur < strs + len) {
+		fprintf(stderr, "string #%d: '%s'\n", i, cur);
+		cur += strlen(cur) + 1;
+		i++;
+	}
+}
+
+static int do_test_dedup(unsigned int test_num)
+{
+	const struct btf_dedup_test *test = &dedup_tests[test_num - 1];
+	int err = 0, i;
+	__u32 test_nr_types, expect_nr_types, test_str_len, expect_str_len;
+	void *raw_btf;
+	unsigned int raw_btf_size;
+	struct btf *test_btf = NULL, *expect_btf = NULL;
+	const char *ret_test_next_str, *ret_expect_next_str;
+	const char *test_strs, *expect_strs;
+	const char *test_str_cur, *test_str_end;
+	const char *expect_str_cur, *expect_str_end;
+
+	fprintf(stderr, "BTF dedup test[%u] (%s):", test_num, test->descr);
+
+	raw_btf = btf_raw_create(&hdr_tmpl, test->input.raw_types,
+				 test->input.str_sec, test->input.str_sec_size,
+				 &raw_btf_size, &ret_test_next_str);
+	if (!raw_btf)
+		return -1;
+	test_btf = btf__new((__u8 *)raw_btf, raw_btf_size, __base_pr);
+	free(raw_btf);
+	if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld",
+		  PTR_ERR(test_btf))) {
+		err = -1;
+		goto done;
+	}
+
+	raw_btf = btf_raw_create(&hdr_tmpl, test->expect.raw_types,
+				 test->expect.str_sec,
+				 test->expect.str_sec_size,
+				 &raw_btf_size, &ret_expect_next_str);
+	if (!raw_btf)
+		return -1;
+	expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size, __base_pr);
+	free(raw_btf);
+	if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld",
+		  PTR_ERR(expect_btf))) {
+		err = -1;
+		goto done;
+	}
+
+	err = btf__dedup(test_btf, NULL, &test->opts, __base_pr);
+	if (CHECK(err, "btf_dedup failed errno:%d", err)) {
+		err = -1;
+		goto done;
+	}
+
+	btf__get_strings(test_btf, &test_strs, &test_str_len);
+	btf__get_strings(expect_btf, &expect_strs, &expect_str_len);
+	if (CHECK(test_str_len != expect_str_len,
+		  "test_str_len:%u != expect_str_len:%u",
+		  test_str_len, expect_str_len)) {
+		fprintf(stderr, "\ntest strings:\n");
+		dump_btf_strings(test_strs, test_str_len);
+		fprintf(stderr, "\nexpected strings:\n");
+		dump_btf_strings(expect_strs, expect_str_len);
+		err = -1;
+		goto done;
+	}
+
+	test_str_cur = test_strs;
+	test_str_end = test_strs + test_str_len;
+	expect_str_cur = expect_strs;
+	expect_str_end = expect_strs + expect_str_len;
+	while (test_str_cur < test_str_end && expect_str_cur < expect_str_end) {
+		size_t test_len, expect_len;
+
+		test_len = strlen(test_str_cur);
+		expect_len = strlen(expect_str_cur);
+		if (CHECK(test_len != expect_len,
+			  "test_len:%zu != expect_len:%zu "
+			  "(test_str:%s, expect_str:%s)",
+			  test_len, expect_len, test_str_cur, expect_str_cur)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(strcmp(test_str_cur, expect_str_cur),
+			  "test_str:%s != expect_str:%s",
+			  test_str_cur, expect_str_cur)) {
+			err = -1;
+			goto done;
+		}
+		test_str_cur += test_len + 1;
+		expect_str_cur += expect_len + 1;
+	}
+	if (CHECK(test_str_cur != test_str_end,
+		  "test_str_cur:%p != test_str_end:%p",
+		  test_str_cur, test_str_end)) {
+		err = -1;
+		goto done;
+	}
+
+	test_nr_types = btf__get_nr_types(test_btf);
+	expect_nr_types = btf__get_nr_types(expect_btf);
+	if (CHECK(test_nr_types != expect_nr_types,
+		  "test_nr_types:%u != expect_nr_types:%u",
+		  test_nr_types, expect_nr_types)) {
+		err = -1;
+		goto done;
+	}
+
+	for (i = 1; i <= test_nr_types; i++) {
+		const struct btf_type *test_type, *expect_type;
+		int test_size, expect_size;
+
+		test_type = btf__type_by_id(test_btf, i);
+		expect_type = btf__type_by_id(expect_btf, i);
+		test_size = btf_type_size(test_type);
+		expect_size = btf_type_size(expect_type);
+
+		if (CHECK(test_size != expect_size,
+			  "type #%d: test_size:%d != expect_size:%u",
+			  i, test_size, expect_size)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(memcmp((void *)test_type,
+				 (void *)expect_type,
+				 test_size),
+			  "type #%d: contents differ", i)) {
+			err = -1;
+			goto done;
+		}
+	}
+
+done:
+	if (!err)
+		fprintf(stderr, "OK");
+	if (!IS_ERR(test_btf))
+		btf__free(test_btf);
+	if (!IS_ERR(expect_btf))
+		btf__free(expect_btf);
+
+	return err;
+}
+
+static int test_dedup(void)
+{
+	unsigned int i;
+	int err = 0;
+
+	if (args.dedup_test_num)
+		return count_result(do_test_dedup(args.dedup_test_num));
+
+	for (i = 1; i <= ARRAY_SIZE(dedup_tests); i++)
+		err |= count_result(do_test_dedup(i));
+
+	return err;
+}
+
 static void usage(const char *cmd)
 {
 	fprintf(stderr, "Usage: %s [-l] [[-r btf_raw_test_num (1 - %zu)] |\n"
 			"\t[-g btf_get_info_test_num (1 - %zu)] |\n"
 			"\t[-f btf_file_test_num (1 - %zu)] |\n"
 			"\t[-k btf_prog_info_raw_test_num (1 - %zu)] |\n"
-			"\t[-p (pretty print test)]]\n",
+			"\t[-p (pretty print test)] |\n"
+			"\t[-d btf_dedup_test_num (1 - %zu)]]\n",
 		cmd, ARRAY_SIZE(raw_tests), ARRAY_SIZE(get_info_tests),
-		ARRAY_SIZE(file_tests), ARRAY_SIZE(info_raw_tests));
+		ARRAY_SIZE(file_tests), ARRAY_SIZE(info_raw_tests),
+		ARRAY_SIZE(dedup_tests));
 }
 
 static int parse_args(int argc, char **argv)
 {
-	const char *optstr = "lpk:f:r:g:";
+	const char *optstr = "hlpk:f:r:g:d:";
 	int opt;
 
 	while ((opt = getopt(argc, argv, optstr)) != -1) {
@@ -5591,12 +6080,16 @@ static int parse_args(int argc, char **argv)
 			args.info_raw_test_num = atoi(optarg);
 			args.info_raw_test = true;
 			break;
+		case 'd':
+			args.dedup_test_num = atoi(optarg);
+			args.dedup_test = true;
+			break;
 		case 'h':
 			usage(argv[0]);
 			exit(0);
 		default:
-				usage(argv[0]);
-				return -1;
+			usage(argv[0]);
+			return -1;
 		}
 	}
 
@@ -5632,6 +6125,14 @@ static int parse_args(int argc, char **argv)
 		return -1;
 	}
 
+	if (args.dedup_test_num &&
+	    (args.dedup_test_num < 1 ||
+	     args.dedup_test_num > ARRAY_SIZE(dedup_tests))) {
+		fprintf(stderr, "BTF dedup test number must be [1 - %zu]\n",
+			ARRAY_SIZE(dedup_tests));
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -5667,14 +6168,18 @@ int main(int argc, char **argv)
 	if (args.info_raw_test)
 		err |= test_info_raw();
 
+	if (args.dedup_test)
+		err |= test_dedup();
+
 	if (args.raw_test || args.get_info_test || args.file_test ||
-	    args.pprint_test || args.info_raw_test)
+	    args.pprint_test || args.info_raw_test || args.dedup_test)
 		goto done;
 
 	err |= test_raw();
 	err |= test_get_info();
 	err |= test_file();
 	err |= test_info_raw();
+	err |= test_dedup();
 
 done:
 	print_summary();
-- 
2.17.1


^ permalink raw reply related

* [PATCH btf 1/3] btf: extract BTF type size calculation
From: Andrii Nakryiko @ 2019-01-31  6:58 UTC (permalink / raw)
  To: netdev, ast, yhs, kafai, acme, daniel, kernel-team; +Cc: Andrii Nakryiko
In-Reply-To: <20190131065837.3380288-1-andriin@fb.com>

This pre-patch extracts calculation of amount of space taken by BTF type descriptor
for later reuse by btf_dedup functionality.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/btf.c | 98 +++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 52 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index d682d3b8f7b9..79b782963917 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -180,6 +180,37 @@ static int btf_parse_str_sec(struct btf *btf, btf_print_fn_t err_log)
 	return 0;
 }
 
+static int btf_type_size(struct btf_type *t, btf_print_fn_t err_log)
+{
+	int base_size = sizeof(struct btf_type);
+	__u16 vlen = BTF_INFO_VLEN(t->info);
+
+	switch (BTF_INFO_KIND(t->info)) {
+	case BTF_KIND_FWD:
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		return base_size;
+	case BTF_KIND_INT:
+		return base_size + sizeof(__u32);
+	case BTF_KIND_ENUM:
+		return base_size + vlen * sizeof(struct btf_enum);
+	case BTF_KIND_ARRAY:
+		return base_size + sizeof(struct btf_array);
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+		return base_size + vlen * sizeof(struct btf_member);
+	case BTF_KIND_FUNC_PROTO:
+		return base_size + vlen * sizeof(struct btf_param);
+	default:
+		elog("Unsupported BTF_KIND:%u\n", BTF_INFO_KIND(t->info));
+		return -EINVAL;
+	}
+}
+
 static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log)
 {
 	struct btf_header *hdr = btf->hdr;
@@ -189,41 +220,13 @@ static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log)
 
 	while (next_type < end_type) {
 		struct btf_type *t = next_type;
-		__u16 vlen = BTF_INFO_VLEN(t->info);
+		int type_size;
 		int err;
 
-		next_type += sizeof(*t);
-		switch (BTF_INFO_KIND(t->info)) {
-		case BTF_KIND_INT:
-			next_type += sizeof(int);
-			break;
-		case BTF_KIND_ARRAY:
-			next_type += sizeof(struct btf_array);
-			break;
-		case BTF_KIND_STRUCT:
-		case BTF_KIND_UNION:
-			next_type += vlen * sizeof(struct btf_member);
-			break;
-		case BTF_KIND_ENUM:
-			next_type += vlen * sizeof(struct btf_enum);
-			break;
-		case BTF_KIND_FUNC_PROTO:
-			next_type += vlen * sizeof(struct btf_param);
-			break;
-		case BTF_KIND_FUNC:
-		case BTF_KIND_TYPEDEF:
-		case BTF_KIND_PTR:
-		case BTF_KIND_FWD:
-		case BTF_KIND_VOLATILE:
-		case BTF_KIND_CONST:
-		case BTF_KIND_RESTRICT:
-			break;
-		default:
-			elog("Unsupported BTF_KIND:%u\n",
-			     BTF_INFO_KIND(t->info));
-			return -EINVAL;
-		}
-
+		type_size = btf_type_size(t, err_log);
+		if (type_size < 0)
+			return type_size;
+		next_type += type_size;
 		err = btf_add_type(btf, t);
 		if (err)
 			return err;
@@ -250,21 +253,6 @@ static bool btf_type_is_void_or_null(const struct btf_type *t)
 	return !t || btf_type_is_void(t);
 }
 
-static __s64 btf_type_size(const struct btf_type *t)
-{
-	switch (BTF_INFO_KIND(t->info)) {
-	case BTF_KIND_INT:
-	case BTF_KIND_STRUCT:
-	case BTF_KIND_UNION:
-	case BTF_KIND_ENUM:
-		return t->size;
-	case BTF_KIND_PTR:
-		return sizeof(void *);
-	default:
-		return -EINVAL;
-	}
-}
-
 #define MAX_RESOLVE_DEPTH 32
 
 __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
@@ -278,11 +266,16 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 	t = btf__type_by_id(btf, type_id);
 	for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
 	     i++) {
-		size = btf_type_size(t);
-		if (size >= 0)
-			break;
-
 		switch (BTF_INFO_KIND(t->info)) {
+		case BTF_KIND_INT:
+		case BTF_KIND_STRUCT:
+		case BTF_KIND_UNION:
+		case BTF_KIND_ENUM:
+			size = t->size;
+			goto done;
+		case BTF_KIND_PTR:
+			size = sizeof(void *);
+			goto done;
 		case BTF_KIND_TYPEDEF:
 		case BTF_KIND_VOLATILE:
 		case BTF_KIND_CONST:
@@ -306,6 +299,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 	if (size < 0)
 		return -EINVAL;
 
+done:
 	if (nelems && size > UINT32_MAX / nelems)
 		return -E2BIG;
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH btf 0/3] Add BTF types deduplication algorithm
From: Andrii Nakryiko @ 2019-01-31  6:58 UTC (permalink / raw)
  To: netdev, ast, yhs, kafai, acme, daniel, kernel-team; +Cc: Andrii Nakryiko

This patch series adds BTF deduplication algorithm to libbpf. This algorithm
allows to take BTF type information containing duplicate per-compilation unit
information and reduce it to equivalent set of BTF types with no duplication without
loss of information. It also deduplicates strings and removes those strings that
are not referenced from any BTF type (and line information in .BTF.ext section,
if any).

Algorithm also resolves struct/union forward declarations into concrete BTF types
across multiple compilation units to facilitate better deduplication ratio. If
undesired, this resolution can be disabled through specifying corresponding options.

When applied to BTF data emitted by pahole's DWARF->BTF converter, it reduces
the overall size of .BTF section by about 65x, from about 112MB to 1.75MB, leaving
only 29247 out of initial 3073497 BTF type descriptors.

Algorithm with minor differences and preliminary results before FUNC/FUNC_PROTO
support is also described more verbosely at:
https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html

Andrii Nakryiko (3):
  btf: extract BTF type size calculation
  btf: add BTF types deduplication algorithm
  selftests/btf: add initial BTF dedup tests

 tools/lib/bpf/btf.c                    | 1851 +++++++++++++++++++++++-
 tools/lib/bpf/btf.h                    |   11 +
 tools/lib/bpf/libbpf.map               |    3 +
 tools/testing/selftests/bpf/test_btf.c |  535 ++++++-
 4 files changed, 2333 insertions(+), 67 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH] can: flexcan: fix timeout when set small bitrate
From: Joakim Zhang @ 2019-01-31  6:58 UTC (permalink / raw)
  To: mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, dl-linux-imx, Aisheng Dong,
	Joakim Zhang

From: Dong Aisheng <aisheng.dong@nxp.com>

Current we can meet timeout issue when setting a small bitrate
like 10000 as follows:
root@imx6qdlsolo:~# ip link set can0 up type can bitrate 10000
A link change request failed with some changes committed already.
Interface can0 may have been left with an inconsistent configuration,
please check.
RTNETLINK answers: Connection timed out

It is caused by calling of flexcan_chip_unfreeze() timeout.

Originally the code is using usleep_range(10, 20) for unfreeze operation,
but the patch (8badd65 can: flexcan: avoid calling usleep_range from
interrupt context) changed it into udelay(10) which is only a half delay
of before, there're also some other delay changes.

After only changed unfreeze delay back to udelay(20), the issue is gone.
So other timeout values are kept the same as 8badd65 changed.

Signed-off-by: Dong Aisheng <aisheng.dong@nxp.com>
Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
---
 drivers/net/can/flexcan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index 2bca867bcfaa..1d3a9053bbeb 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -530,7 +530,7 @@ static int flexcan_chip_unfreeze(struct flexcan_priv *priv)
 	priv->write(reg, &regs->mcr);
 
 	while (timeout-- && (priv->read(&regs->mcr) & FLEXCAN_MCR_FRZ_ACK))
-		udelay(10);
+		udelay(20);
 
 	if (priv->read(&regs->mcr) & FLEXCAN_MCR_FRZ_ACK)
 		return -ETIMEDOUT;
-- 
2.17.1


^ permalink raw reply related

* Re: r8169 Driver - Poor Network Performance Since Kernel 4.19
From: Heiner Kallweit @ 2019-01-31  6:49 UTC (permalink / raw)
  To: David Chang; +Cc: Peter Ceiley, Realtek linux nic maintainers, netdev
In-Reply-To: <4d832c16-8830-b746-a818-6026c2e6725c@gmail.com>

And one more inquiry ..

So far I read about the issue only in combination with NFS.
Does the issue also occur with iperf or some other type of
high network load?

Heiner


On 31.01.2019 07:35, Heiner Kallweit wrote:
> Hi David, two more things:
> 
> 1. Could you please test a recent linux-next kernel?
> 2. Please get a register dump (ethtool -d <if>) from 4.18 and 4.19
>    and compare them.
> 
> Heiner
> 
> 
> On 31.01.2019 07:21, Heiner Kallweit wrote:
>> David, thanks for the link to the bug ticket.
>> I think only a proper bisect can help to find the offending commit.
>>
>> Heiner
>>
>>
>> On 31.01.2019 03:32, David Chang wrote:
>>> Hi,
>>>
>>> We had a similr case here.
>>> - Realtek r8169 receive performance regression in kernel 4.19
>>>   https://bugzilla.suse.com/show_bug.cgi?id=1119649
>>>
>>> kernel: r8169 0000:01:00.0 eth0: RTL8168h/8111h, XID 54100880
>>> The major symptom is there are many rx_missed count.
>>>
>>>
>>> On Jan 30, 2019 at 20:15:45 +0100, Heiner Kallweit wrote:
>>>> Hi Peter,
>>>>
>>>> recently I had somebody where pcie_aspm=off for whatever reason didn't
>>>> do the trick, can you also check with pcie_aspm.policy=performance.
>>>
>>> We will give it a try later.
>>>
>>>> And please check with "ethtool -S <if>" whether the chip statistics
>>>> show a significant number of errors.
>>>>
>>>> If this doesn't help you may have to bisect to find the offending commit.
>>>
>>> We had tried fallback driver to a few previous commits as following,
>>> but with no luck.
>>>
>>> 9675931e6b65 r8169: re-enable MSI-X on RTL8168g (v4.19)
>>> 098b01ad9837 r8169: don't include asm headers directly (v4.19-rc1)
>>> a2965f12fde6 r8169: remove rtl8169_set_speed_xmii (v4.19-rc1)
>>> 6fcf9b1d4d6c r8169: fix runtime suspend (v4.19-rc1)
>>> e397286b8e89 r8169: remove TBI 1000BaseX support (v4.19-rc1)
>>>
>>> Thanks,
>>> David Chang
>>>
>>>>
>>>> Heiner
>>>>
>>>>
>>>> On 30.01.2019 10:59, Peter Ceiley wrote:
>>>>> Hi Heiner,
>>>>>
>>>>> I tried disabling the ASPM using the pcie_aspm=off kernel parameter
>>>>> and this made no difference.
>>>>>
>>>>> I tried compiling the 4.18.16 r8169.c with the 4.19.18 source and
>>>>> subsequently loaded the module in the running 4.19.18 kernel. I can
>>>>> confirm that this immediately resolved the issue and access to the NFS
>>>>> shares operated as expected.
>>>>>
>>>>> I presume this means it is an issue with the r8169 driver included in
>>>>> 4.19 onwards?
>>>>>
>>>>> To answer your last questions:
>>>>>
>>>>> Base Board Information
>>>>>     Manufacturer: Alienware
>>>>>     Product Name: 0PGRP5
>>>>>     Version: A02
>>>>>
>>>>> ... and yes, the RTL8168 is the onboard network chip.
>>>>>
>>>>> Regards,
>>>>>
>>>>> Peter.
>>>>>
>>>>> On Tue, 29 Jan 2019 at 17:44, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>
>>>>>> Hi Peter,
>>>>>>
>>>>>> I think the vendor driver doesn't enable ASPM per default.
>>>>>> So it's worth a try to disable ASPM in the BIOS or via sysfs.
>>>>>> Few older systems seem to have issues with ASPM, what kind of
>>>>>> system / mainboard are you using? The RTL8168 is the onboard
>>>>>> network chip?
>>>>>>
>>>>>> Rgds, Heiner
>>>>>>
>>>>>>
>>>>>> On 29.01.2019 07:20, Peter Ceiley wrote:
>>>>>>> Hi Heiner,
>>>>>>>
>>>>>>> Thanks, I'll do some more testing. It might not be the driver - I
>>>>>>> assumed it was due to the fact that using the r8168 driver 'resolves'
>>>>>>> the issue. I'll see if I can test the r8169.c on top of 4.19 - this is
>>>>>>> a good idea.
>>>>>>>
>>>>>>> Cheers,
>>>>>>>
>>>>>>> Peter.
>>>>>>>
>>>>>>> On Tue, 29 Jan 2019 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>
>>>>>>>> Hi Peter,
>>>>>>>>
>>>>>>>> at a first glance it doesn't look like a typical driver issue.
>>>>>>>> What you could do:
>>>>>>>>
>>>>>>>> - Test the r8169.c from 4.18 on top of 4.19.
>>>>>>>>
>>>>>>>> - Check whether disabling ASPM (/sys/module/pcie_aspm) has an effect.
>>>>>>>>
>>>>>>>> - Bisect between 4.18 and 4.19 to find the offending commit.
>>>>>>>>
>>>>>>>> Any specific reason why you think root cause is in the driver and not
>>>>>>>> elsewhere in the network subsystem?
>>>>>>>>
>>>>>>>> Heiner
>>>>>>>>
>>>>>>>>
>>>>>>>> On 28.01.2019 23:10, Peter Ceiley wrote:
>>>>>>>>> Hi Heiner,
>>>>>>>>>
>>>>>>>>> Thanks for getting back to me.
>>>>>>>>>
>>>>>>>>> No, I don't use jumbo packets.
>>>>>>>>>
>>>>>>>>> Bandwidth is *generally* good, and iperf results to my NAS provide
>>>>>>>>> over 900 Mbits/s in both circumstances. The issue seems to appear when
>>>>>>>>> establishing a connection and is most notable, for example, on my
>>>>>>>>> mounted NFS shares where it takes seconds (up to 10's of seconds on
>>>>>>>>> larger directories) to list the contents of each directory. Once a
>>>>>>>>> transfer begins on a file, I appear to get good bandwidth.
>>>>>>>>>
>>>>>>>>> I'm unsure of the best scientific data to provide you in order to
>>>>>>>>> troubleshoot this issue. Running the following
>>>>>>>>>
>>>>>>>>>     netstat -s |grep retransmitted
>>>>>>>>>
>>>>>>>>> shows a steady increase in retransmitted segments each time I list the
>>>>>>>>> contents of a remote directory, for example, running 'ls' on a
>>>>>>>>> directory containing 345 media files did the following using kernel
>>>>>>>>> 4.19.18:
>>>>>>>>>
>>>>>>>>> increased retransmitted segments by 21 and the 'time' command showed
>>>>>>>>> the following:
>>>>>>>>>     real    0m19.867s
>>>>>>>>>     user    0m0.012s
>>>>>>>>>     sys    0m0.036s
>>>>>>>>>
>>>>>>>>> The same command shows no retransmitted segments running kernel
>>>>>>>>> 4.18.16 and 'time' showed:
>>>>>>>>>     real    0m0.300s
>>>>>>>>>     user    0m0.004s
>>>>>>>>>     sys    0m0.007s
>>>>>>>>>
>>>>>>>>> ifconfig does not show any RX/TX errors nor dropped packets in either case.
>>>>>>>>>
>>>>>>>>> dmesg XID:
>>>>>>>>> [    2.979984] r8169 0000:03:00.0 eth0: RTL8168g/8111g,
>>>>>>>>> f8:b1:56:fe:67:e0, XID 4c000800, IRQ 32
>>>>>>>>>
>>>>>>>>> # lspci -vv
>>>>>>>>> 03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
>>>>>>>>> RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 0c)
>>>>>>>>>     Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>>     Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>>>>>>>>> ParErr- Stepping- SERR- FastB2B- DisINTx+
>>>>>>>>>     Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
>>>>>>>>> <TAbort- <MAbort- >SERR- <PERR- INTx-
>>>>>>>>>     Latency: 0, Cache Line Size: 64 bytes
>>>>>>>>>     Interrupt: pin A routed to IRQ 19
>>>>>>>>>     Region 0: I/O ports at d000 [size=256]
>>>>>>>>>     Region 2: Memory at f7b00000 (64-bit, non-prefetchable) [size=4K]
>>>>>>>>>     Region 4: Memory at f2100000 (64-bit, prefetchable) [size=16K]
>>>>>>>>>     Capabilities: [40] Power Management version 3
>>>>>>>>>         Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
>>>>>>>>> PME(D0+,D1+,D2+,D3hot+,D3cold+)
>>>>>>>>>         Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
>>>>>>>>>     Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>>>>>>>>>         Address: 0000000000000000  Data: 0000
>>>>>>>>>     Capabilities: [70] Express (v2) Endpoint, MSI 01
>>>>>>>>>         DevCap:    MaxPayload 128 bytes, PhantFunc 0, Latency L0s
>>>>>>>>> <512ns, L1 <64us
>>>>>>>>>             ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
>>>>>>>>> SlotPowerLimit 10.000W
>>>>>>>>>         DevCtl:    CorrErr- NonFatalErr- FatalErr- UnsupReq-
>>>>>>>>>             RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
>>>>>>>>>             MaxPayload 128 bytes, MaxReadReq 4096 bytes
>>>>>>>>>         DevSta:    CorrErr+ NonFatalErr- FatalErr- UnsupReq- AuxPwr+ TransPend-
>>>>>>>>>         LnkCap:    Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, Exit
>>>>>>>>> Latency L0s unlimited, L1 <64us
>>>>>>>>>             ClockPM+ Surprise- LLActRep- BwNot- ASPMOptComp+
>>>>>>>>>         LnkCtl:    ASPM L1 Enabled; RCB 64 bytes Disabled- CommClk+
>>>>>>>>>             ExtSynch- ClockPM+ AutWidDis- BWInt- AutBWInt-
>>>>>>>>>         LnkSta:    Speed 2.5GT/s (ok), Width x1 (ok)
>>>>>>>>>             TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
>>>>>>>>>         DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR+,
>>>>>>>>> OBFF Via message/WAKE#
>>>>>>>>>              AtomicOpsCap: 32bit- 64bit- 128bitCAS-
>>>>>>>>>         DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-, LTR+,
>>>>>>>>> OBFF Disabled
>>>>>>>>>              AtomicOpsCtl: ReqEn-
>>>>>>>>>         LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance- SpeedDis-
>>>>>>>>>              Transmit Margin: Normal Operating Range,
>>>>>>>>> EnterModifiedCompliance- ComplianceSOS-
>>>>>>>>>              Compliance De-emphasis: -6dB
>>>>>>>>>         LnkSta2: Current De-emphasis Level: -6dB,
>>>>>>>>> EqualizationComplete-, EqualizationPhase1-
>>>>>>>>>              EqualizationPhase2-, EqualizationPhase3-, LinkEqualizationRequest-
>>>>>>>>>     Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>>>>>>>>>         Vector table: BAR=4 offset=00000000
>>>>>>>>>         PBA: BAR=4 offset=00000800
>>>>>>>>>     Capabilities: [d0] Vital Product Data
>>>>>>>>> pcilib: sysfs_read_vpd: read failed: Input/output error
>>>>>>>>>         Not readable
>>>>>>>>>     Capabilities: [100 v1] Advanced Error Reporting
>>>>>>>>>         UESta:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>>>         UEMsk:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>>>         UESvrt:    DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>>> RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
>>>>>>>>>         CESta:    RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout+ AdvNonFatalErr-
>>>>>>>>>         CEMsk:    RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
>>>>>>>>>         AERCap:    First Error Pointer: 00, ECRCGenCap+ ECRCGenEn-
>>>>>>>>> ECRCChkCap+ ECRCChkEn-
>>>>>>>>>             MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
>>>>>>>>>         HeaderLog: 00000000 00000000 00000000 00000000
>>>>>>>>>     Capabilities: [140 v1] Virtual Channel
>>>>>>>>>         Caps:    LPEVC=0 RefClk=100ns PATEntryBits=1
>>>>>>>>>         Arb:    Fixed- WRR32- WRR64- WRR128-
>>>>>>>>>         Ctrl:    ArbSelect=Fixed
>>>>>>>>>         Status:    InProgress-
>>>>>>>>>         VC0:    Caps:    PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
>>>>>>>>>             Arb:    Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
>>>>>>>>>             Ctrl:    Enable+ ID=0 ArbSelect=Fixed TC/VC=01
>>>>>>>>>             Status:    NegoPending- InProgress-
>>>>>>>>>     Capabilities: [160 v1] Device Serial Number 01-00-00-00-68-4c-e0-00
>>>>>>>>>     Capabilities: [170 v1] Latency Tolerance Reporting
>>>>>>>>>         Max snoop latency: 71680ns
>>>>>>>>>         Max no snoop latency: 71680ns
>>>>>>>>>     Kernel driver in use: r8169
>>>>>>>>>     Kernel modules: r8169
>>>>>>>>>
>>>>>>>>> Please let me know if you have any other ideas in terms of testing.
>>>>>>>>>
>>>>>>>>> Thanks!
>>>>>>>>>
>>>>>>>>> Peter.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Tue, 29 Jan 2019 at 05:28, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 28.01.2019 12:13, Peter Ceiley wrote:
>>>>>>>>>>> Hi,
>>>>>>>>>>>
>>>>>>>>>>> I have been experiencing very poor network performance since Kernel
>>>>>>>>>>> 4.19 and I'm confident it's related to the r8169 driver.
>>>>>>>>>>>
>>>>>>>>>>> I have no issue with kernel versions 4.18 and prior. I am experiencing
>>>>>>>>>>> this issue in kernels 4.19 and 4.20 (currently running/testing with
>>>>>>>>>>> 4.20.4 & 4.19.18).
>>>>>>>>>>>
>>>>>>>>>>> If someone could guide me in the right direction, I'm happy to help
>>>>>>>>>>> troubleshoot this issue. Note that I have been keeping an eye on one
>>>>>>>>>>> issue related to loading of the PHY driver, however, my symptoms
>>>>>>>>>>> differ in that I still have a network connection. I have attempted to
>>>>>>>>>>> reload the driver on a running system, but this does not improve the
>>>>>>>>>>> situation.
>>>>>>>>>>>
>>>>>>>>>>> Using the proprietary r8168 driver returns my device to proper working order.
>>>>>>>>>>>
>>>>>>>>>>> lshw shows:
>>>>>>>>>>>        description: Ethernet interface
>>>>>>>>>>>        product: RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>>>>        vendor: Realtek Semiconductor Co., Ltd.
>>>>>>>>>>>        physical id: 0
>>>>>>>>>>>        bus info: pci@0000:03:00.0
>>>>>>>>>>>        logical name: enp3s0
>>>>>>>>>>>        version: 0c
>>>>>>>>>>>        serial:
>>>>>>>>>>>        size: 1Gbit/s
>>>>>>>>>>>        capacity: 1Gbit/s
>>>>>>>>>>>        width: 64 bits
>>>>>>>>>>>        clock: 33MHz
>>>>>>>>>>>        capabilities: pm msi pciexpress msix vpd bus_master cap_list
>>>>>>>>>>> ethernet physical tp aui bnc mii fibre 10bt 10bt-fd 100bt 100bt-fd
>>>>>>>>>>> 1000bt-fd autonegotiation
>>>>>>>>>>>        configuration: autonegotiation=on broadcast=yes driver=r8169
>>>>>>>>>>> duplex=full firmware=rtl8168g-2_0.0.1 02/06/13 ip=192.168.1.25
>>>>>>>>>>> latency=0 link=yes multicast=yes port=MII speed=1Gbit/s
>>>>>>>>>>>        resources: irq:19 ioport:d000(size=256)
>>>>>>>>>>> memory:f7b00000-f7b00fff memory:f2100000-f2103fff
>>>>>>>>>>>
>>>>>>>>>>> Kind Regards,
>>>>>>>>>>>
>>>>>>>>>>> Peter.
>>>>>>>>>>>
>>>>>>>>>> Hi Peter,
>>>>>>>>>>
>>>>>>>>>> the description "poor network performance" is quite vague, therefore:
>>>>>>>>>>
>>>>>>>>>> - Can you provide any measurements?
>>>>>>>>>> - iperf results before and after
>>>>>>>>>> - statistics about dropped packets (rx and/or tx)
>>>>>>>>>> - Do you use jumbo packets?
>>>>>>>>>>
>>>>>>>>>> Also help would be a "lspci -vv" output for the network card and
>>>>>>>>>> the dmesg output line with the chip XID.
>>>>>>>>>>
>>>>>>>>>> Heiner
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 


^ permalink raw reply

* [PATCH net-next] r8169: improve WoL handling
From: Heiner Kallweit @ 2019-01-31  6:43 UTC (permalink / raw)
  To: Realtek linux nic maintainers, David Miller; +Cc: netdev@vger.kernel.org

WoL handling for the RTL8168 family is a little bit tricky because of
different types of broken BIOS and/or chip quirks.

Two known issues:
1. Network properly resumes from suspend only if WoL is enabled in the chip.
2. Some notebooks wake up immediately if system is suspended and network
   device is wakeup-enabled.

Few patches tried to deal with this:
7edf6d314cd0 ("r8169: disable WOL per default")
18041b523692 ("r8169: restore previous behavior to accept BIOS WoL
settings")

Currently we have the situation that the chip WoL settings as set by
the BIOS are respected (to prevent issue 1), but the device doesn't get
wakeup-enabled (to prevent issue 2).

This leads to another issue:
If systemd is told to set WoL it first checks whether the requested
settings are active already (and does nothing if yes). Due to the chip
WoL flags being set properly systemd assumes that WoL is configured
properly in our case. Result is that device doesn't get wakeup-enabled
and WoL doesn't work (until it's set e.g. by ethtool).

This patch now:
- leaves the chip WoL settings as is (to prevent issue 1)
- keeps the behavior to not wakeup-enable the device initially
  (to prevent issue 2)
- In addition we report WoL as being disabled in get_wol, matching
  that device isn't wakeup-enabled. If systemd is told to enable WoL,
  it will therefore detect that it has to do something and will
  call set_wol.

Of course the user still has the option to override this with
e.g. ethtool.

Because there are lots of consumer mainboards with members of the
RTL8168 family as onboard network, there's a certain chance that also
the new approach may trigger a BIOS bug and cause issues. Therefore
don't remove __rtl8169_get_wol() completely.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 3e650bd9e..2dab28115 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -1371,6 +1371,8 @@ static void rtl_link_chg_patch(struct rtl8169_private *tp)
 
 #define WAKE_ANY (WAKE_PHY | WAKE_MAGIC | WAKE_UCAST | WAKE_BCAST | WAKE_MCAST)
 
+/* Don't delete it completely, in case we need to re-enable it */
+#if 0
 static u32 __rtl8169_get_wol(struct rtl8169_private *tp)
 {
 	u8 options;
@@ -1405,6 +1407,7 @@ static u32 __rtl8169_get_wol(struct rtl8169_private *tp)
 
 	return wolopts;
 }
+#endif
 
 static void rtl8169_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 {
@@ -4284,7 +4287,7 @@ static void rtl_wol_suspend_quirk(struct rtl8169_private *tp)
 
 static bool rtl_wol_pll_power_down(struct rtl8169_private *tp)
 {
-	if (!__rtl8169_get_wol(tp))
+	if (!device_may_wakeup(tp_to_dev(tp)))
 		return false;
 
 	phy_speed_down(tp->phydev, false);
@@ -7441,8 +7444,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		return rc;
 	}
 
-	tp->saved_wolopts = __rtl8169_get_wol(tp);
-
 	mutex_init(&tp->wk.mutex);
 	INIT_WORK(&tp->wk.work, rtl_task);
 	u64_stats_init(&tp->rx_stats.syncp);
-- 
2.20.1


^ permalink raw reply related

* Re: r8169 Driver - Poor Network Performance Since Kernel 4.19
From: Heiner Kallweit @ 2019-01-31  6:35 UTC (permalink / raw)
  To: David Chang; +Cc: Peter Ceiley, Realtek linux nic maintainers, netdev
In-Reply-To: <b8ed124a-fa79-e242-af03-4d24e4d6c56e@gmail.com>

Hi David, two more things:

1. Could you please test a recent linux-next kernel?
2. Please get a register dump (ethtool -d <if>) from 4.18 and 4.19
   and compare them.

Heiner


On 31.01.2019 07:21, Heiner Kallweit wrote:
> David, thanks for the link to the bug ticket.
> I think only a proper bisect can help to find the offending commit.
> 
> Heiner
> 
> 
> On 31.01.2019 03:32, David Chang wrote:
>> Hi,
>>
>> We had a similr case here.
>> - Realtek r8169 receive performance regression in kernel 4.19
>>   https://bugzilla.suse.com/show_bug.cgi?id=1119649
>>
>> kernel: r8169 0000:01:00.0 eth0: RTL8168h/8111h, XID 54100880
>> The major symptom is there are many rx_missed count.
>>
>>
>> On Jan 30, 2019 at 20:15:45 +0100, Heiner Kallweit wrote:
>>> Hi Peter,
>>>
>>> recently I had somebody where pcie_aspm=off for whatever reason didn't
>>> do the trick, can you also check with pcie_aspm.policy=performance.
>>
>> We will give it a try later.
>>
>>> And please check with "ethtool -S <if>" whether the chip statistics
>>> show a significant number of errors.
>>>
>>> If this doesn't help you may have to bisect to find the offending commit.
>>
>> We had tried fallback driver to a few previous commits as following,
>> but with no luck.
>>
>> 9675931e6b65 r8169: re-enable MSI-X on RTL8168g (v4.19)
>> 098b01ad9837 r8169: don't include asm headers directly (v4.19-rc1)
>> a2965f12fde6 r8169: remove rtl8169_set_speed_xmii (v4.19-rc1)
>> 6fcf9b1d4d6c r8169: fix runtime suspend (v4.19-rc1)
>> e397286b8e89 r8169: remove TBI 1000BaseX support (v4.19-rc1)
>>
>> Thanks,
>> David Chang
>>
>>>
>>> Heiner
>>>
>>>
>>> On 30.01.2019 10:59, Peter Ceiley wrote:
>>>> Hi Heiner,
>>>>
>>>> I tried disabling the ASPM using the pcie_aspm=off kernel parameter
>>>> and this made no difference.
>>>>
>>>> I tried compiling the 4.18.16 r8169.c with the 4.19.18 source and
>>>> subsequently loaded the module in the running 4.19.18 kernel. I can
>>>> confirm that this immediately resolved the issue and access to the NFS
>>>> shares operated as expected.
>>>>
>>>> I presume this means it is an issue with the r8169 driver included in
>>>> 4.19 onwards?
>>>>
>>>> To answer your last questions:
>>>>
>>>> Base Board Information
>>>>     Manufacturer: Alienware
>>>>     Product Name: 0PGRP5
>>>>     Version: A02
>>>>
>>>> ... and yes, the RTL8168 is the onboard network chip.
>>>>
>>>> Regards,
>>>>
>>>> Peter.
>>>>
>>>> On Tue, 29 Jan 2019 at 17:44, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>
>>>>> Hi Peter,
>>>>>
>>>>> I think the vendor driver doesn't enable ASPM per default.
>>>>> So it's worth a try to disable ASPM in the BIOS or via sysfs.
>>>>> Few older systems seem to have issues with ASPM, what kind of
>>>>> system / mainboard are you using? The RTL8168 is the onboard
>>>>> network chip?
>>>>>
>>>>> Rgds, Heiner
>>>>>
>>>>>
>>>>> On 29.01.2019 07:20, Peter Ceiley wrote:
>>>>>> Hi Heiner,
>>>>>>
>>>>>> Thanks, I'll do some more testing. It might not be the driver - I
>>>>>> assumed it was due to the fact that using the r8168 driver 'resolves'
>>>>>> the issue. I'll see if I can test the r8169.c on top of 4.19 - this is
>>>>>> a good idea.
>>>>>>
>>>>>> Cheers,
>>>>>>
>>>>>> Peter.
>>>>>>
>>>>>> On Tue, 29 Jan 2019 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>
>>>>>>> Hi Peter,
>>>>>>>
>>>>>>> at a first glance it doesn't look like a typical driver issue.
>>>>>>> What you could do:
>>>>>>>
>>>>>>> - Test the r8169.c from 4.18 on top of 4.19.
>>>>>>>
>>>>>>> - Check whether disabling ASPM (/sys/module/pcie_aspm) has an effect.
>>>>>>>
>>>>>>> - Bisect between 4.18 and 4.19 to find the offending commit.
>>>>>>>
>>>>>>> Any specific reason why you think root cause is in the driver and not
>>>>>>> elsewhere in the network subsystem?
>>>>>>>
>>>>>>> Heiner
>>>>>>>
>>>>>>>
>>>>>>> On 28.01.2019 23:10, Peter Ceiley wrote:
>>>>>>>> Hi Heiner,
>>>>>>>>
>>>>>>>> Thanks for getting back to me.
>>>>>>>>
>>>>>>>> No, I don't use jumbo packets.
>>>>>>>>
>>>>>>>> Bandwidth is *generally* good, and iperf results to my NAS provide
>>>>>>>> over 900 Mbits/s in both circumstances. The issue seems to appear when
>>>>>>>> establishing a connection and is most notable, for example, on my
>>>>>>>> mounted NFS shares where it takes seconds (up to 10's of seconds on
>>>>>>>> larger directories) to list the contents of each directory. Once a
>>>>>>>> transfer begins on a file, I appear to get good bandwidth.
>>>>>>>>
>>>>>>>> I'm unsure of the best scientific data to provide you in order to
>>>>>>>> troubleshoot this issue. Running the following
>>>>>>>>
>>>>>>>>     netstat -s |grep retransmitted
>>>>>>>>
>>>>>>>> shows a steady increase in retransmitted segments each time I list the
>>>>>>>> contents of a remote directory, for example, running 'ls' on a
>>>>>>>> directory containing 345 media files did the following using kernel
>>>>>>>> 4.19.18:
>>>>>>>>
>>>>>>>> increased retransmitted segments by 21 and the 'time' command showed
>>>>>>>> the following:
>>>>>>>>     real    0m19.867s
>>>>>>>>     user    0m0.012s
>>>>>>>>     sys    0m0.036s
>>>>>>>>
>>>>>>>> The same command shows no retransmitted segments running kernel
>>>>>>>> 4.18.16 and 'time' showed:
>>>>>>>>     real    0m0.300s
>>>>>>>>     user    0m0.004s
>>>>>>>>     sys    0m0.007s
>>>>>>>>
>>>>>>>> ifconfig does not show any RX/TX errors nor dropped packets in either case.
>>>>>>>>
>>>>>>>> dmesg XID:
>>>>>>>> [    2.979984] r8169 0000:03:00.0 eth0: RTL8168g/8111g,
>>>>>>>> f8:b1:56:fe:67:e0, XID 4c000800, IRQ 32
>>>>>>>>
>>>>>>>> # lspci -vv
>>>>>>>> 03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
>>>>>>>> RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 0c)
>>>>>>>>     Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>     Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>>>>>>>> ParErr- Stepping- SERR- FastB2B- DisINTx+
>>>>>>>>     Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
>>>>>>>> <TAbort- <MAbort- >SERR- <PERR- INTx-
>>>>>>>>     Latency: 0, Cache Line Size: 64 bytes
>>>>>>>>     Interrupt: pin A routed to IRQ 19
>>>>>>>>     Region 0: I/O ports at d000 [size=256]
>>>>>>>>     Region 2: Memory at f7b00000 (64-bit, non-prefetchable) [size=4K]
>>>>>>>>     Region 4: Memory at f2100000 (64-bit, prefetchable) [size=16K]
>>>>>>>>     Capabilities: [40] Power Management version 3
>>>>>>>>         Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
>>>>>>>> PME(D0+,D1+,D2+,D3hot+,D3cold+)
>>>>>>>>         Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
>>>>>>>>     Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>>>>>>>>         Address: 0000000000000000  Data: 0000
>>>>>>>>     Capabilities: [70] Express (v2) Endpoint, MSI 01
>>>>>>>>         DevCap:    MaxPayload 128 bytes, PhantFunc 0, Latency L0s
>>>>>>>> <512ns, L1 <64us
>>>>>>>>             ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
>>>>>>>> SlotPowerLimit 10.000W
>>>>>>>>         DevCtl:    CorrErr- NonFatalErr- FatalErr- UnsupReq-
>>>>>>>>             RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
>>>>>>>>             MaxPayload 128 bytes, MaxReadReq 4096 bytes
>>>>>>>>         DevSta:    CorrErr+ NonFatalErr- FatalErr- UnsupReq- AuxPwr+ TransPend-
>>>>>>>>         LnkCap:    Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, Exit
>>>>>>>> Latency L0s unlimited, L1 <64us
>>>>>>>>             ClockPM+ Surprise- LLActRep- BwNot- ASPMOptComp+
>>>>>>>>         LnkCtl:    ASPM L1 Enabled; RCB 64 bytes Disabled- CommClk+
>>>>>>>>             ExtSynch- ClockPM+ AutWidDis- BWInt- AutBWInt-
>>>>>>>>         LnkSta:    Speed 2.5GT/s (ok), Width x1 (ok)
>>>>>>>>             TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
>>>>>>>>         DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR+,
>>>>>>>> OBFF Via message/WAKE#
>>>>>>>>              AtomicOpsCap: 32bit- 64bit- 128bitCAS-
>>>>>>>>         DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-, LTR+,
>>>>>>>> OBFF Disabled
>>>>>>>>              AtomicOpsCtl: ReqEn-
>>>>>>>>         LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance- SpeedDis-
>>>>>>>>              Transmit Margin: Normal Operating Range,
>>>>>>>> EnterModifiedCompliance- ComplianceSOS-
>>>>>>>>              Compliance De-emphasis: -6dB
>>>>>>>>         LnkSta2: Current De-emphasis Level: -6dB,
>>>>>>>> EqualizationComplete-, EqualizationPhase1-
>>>>>>>>              EqualizationPhase2-, EqualizationPhase3-, LinkEqualizationRequest-
>>>>>>>>     Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>>>>>>>>         Vector table: BAR=4 offset=00000000
>>>>>>>>         PBA: BAR=4 offset=00000800
>>>>>>>>     Capabilities: [d0] Vital Product Data
>>>>>>>> pcilib: sysfs_read_vpd: read failed: Input/output error
>>>>>>>>         Not readable
>>>>>>>>     Capabilities: [100 v1] Advanced Error Reporting
>>>>>>>>         UESta:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>>         UEMsk:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>>         UESvrt:    DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>>> RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
>>>>>>>>         CESta:    RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout+ AdvNonFatalErr-
>>>>>>>>         CEMsk:    RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
>>>>>>>>         AERCap:    First Error Pointer: 00, ECRCGenCap+ ECRCGenEn-
>>>>>>>> ECRCChkCap+ ECRCChkEn-
>>>>>>>>             MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
>>>>>>>>         HeaderLog: 00000000 00000000 00000000 00000000
>>>>>>>>     Capabilities: [140 v1] Virtual Channel
>>>>>>>>         Caps:    LPEVC=0 RefClk=100ns PATEntryBits=1
>>>>>>>>         Arb:    Fixed- WRR32- WRR64- WRR128-
>>>>>>>>         Ctrl:    ArbSelect=Fixed
>>>>>>>>         Status:    InProgress-
>>>>>>>>         VC0:    Caps:    PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
>>>>>>>>             Arb:    Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
>>>>>>>>             Ctrl:    Enable+ ID=0 ArbSelect=Fixed TC/VC=01
>>>>>>>>             Status:    NegoPending- InProgress-
>>>>>>>>     Capabilities: [160 v1] Device Serial Number 01-00-00-00-68-4c-e0-00
>>>>>>>>     Capabilities: [170 v1] Latency Tolerance Reporting
>>>>>>>>         Max snoop latency: 71680ns
>>>>>>>>         Max no snoop latency: 71680ns
>>>>>>>>     Kernel driver in use: r8169
>>>>>>>>     Kernel modules: r8169
>>>>>>>>
>>>>>>>> Please let me know if you have any other ideas in terms of testing.
>>>>>>>>
>>>>>>>> Thanks!
>>>>>>>>
>>>>>>>> Peter.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On Tue, 29 Jan 2019 at 05:28, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> On 28.01.2019 12:13, Peter Ceiley wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> I have been experiencing very poor network performance since Kernel
>>>>>>>>>> 4.19 and I'm confident it's related to the r8169 driver.
>>>>>>>>>>
>>>>>>>>>> I have no issue with kernel versions 4.18 and prior. I am experiencing
>>>>>>>>>> this issue in kernels 4.19 and 4.20 (currently running/testing with
>>>>>>>>>> 4.20.4 & 4.19.18).
>>>>>>>>>>
>>>>>>>>>> If someone could guide me in the right direction, I'm happy to help
>>>>>>>>>> troubleshoot this issue. Note that I have been keeping an eye on one
>>>>>>>>>> issue related to loading of the PHY driver, however, my symptoms
>>>>>>>>>> differ in that I still have a network connection. I have attempted to
>>>>>>>>>> reload the driver on a running system, but this does not improve the
>>>>>>>>>> situation.
>>>>>>>>>>
>>>>>>>>>> Using the proprietary r8168 driver returns my device to proper working order.
>>>>>>>>>>
>>>>>>>>>> lshw shows:
>>>>>>>>>>        description: Ethernet interface
>>>>>>>>>>        product: RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>>>        vendor: Realtek Semiconductor Co., Ltd.
>>>>>>>>>>        physical id: 0
>>>>>>>>>>        bus info: pci@0000:03:00.0
>>>>>>>>>>        logical name: enp3s0
>>>>>>>>>>        version: 0c
>>>>>>>>>>        serial:
>>>>>>>>>>        size: 1Gbit/s
>>>>>>>>>>        capacity: 1Gbit/s
>>>>>>>>>>        width: 64 bits
>>>>>>>>>>        clock: 33MHz
>>>>>>>>>>        capabilities: pm msi pciexpress msix vpd bus_master cap_list
>>>>>>>>>> ethernet physical tp aui bnc mii fibre 10bt 10bt-fd 100bt 100bt-fd
>>>>>>>>>> 1000bt-fd autonegotiation
>>>>>>>>>>        configuration: autonegotiation=on broadcast=yes driver=r8169
>>>>>>>>>> duplex=full firmware=rtl8168g-2_0.0.1 02/06/13 ip=192.168.1.25
>>>>>>>>>> latency=0 link=yes multicast=yes port=MII speed=1Gbit/s
>>>>>>>>>>        resources: irq:19 ioport:d000(size=256)
>>>>>>>>>> memory:f7b00000-f7b00fff memory:f2100000-f2103fff
>>>>>>>>>>
>>>>>>>>>> Kind Regards,
>>>>>>>>>>
>>>>>>>>>> Peter.
>>>>>>>>>>
>>>>>>>>> Hi Peter,
>>>>>>>>>
>>>>>>>>> the description "poor network performance" is quite vague, therefore:
>>>>>>>>>
>>>>>>>>> - Can you provide any measurements?
>>>>>>>>> - iperf results before and after
>>>>>>>>> - statistics about dropped packets (rx and/or tx)
>>>>>>>>> - Do you use jumbo packets?
>>>>>>>>>
>>>>>>>>> Also help would be a "lspci -vv" output for the network card and
>>>>>>>>> the dmesg output line with the chip XID.
>>>>>>>>>
>>>>>>>>> Heiner
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 


^ permalink raw reply

* Re: [ethtool 1/6] ethtool: move option parsing related code into function
From: Jeff Kirsher @ 2019-01-31  6:30 UTC (permalink / raw)
  To: John W. Linville, Nicholas Nunley; +Cc: netdev, nhorman, sassmann
In-Reply-To: <20190130210831.GD19981@tuxdriver.com>

[-- Attachment #1: Type: text/plain, Size: 3069 bytes --]

On Wed, 2019-01-30 at 16:08 -0500, John W. Linville wrote:
> On Thu, Jan 17, 2019 at 03:03:08PM -0800, Nicholas Nunley wrote:
> > Move option parsing code into find_option function.
> > 
> > No behavior changes.
> > 
> > Based on patch by Kan Liang <kan.liang@intel.com>
> > 
> > Signed-off-by: Nicholas Nunley <nicholas.d.nunley@intel.com>
> 
> Well, after looking at this series for a while I had decided to apply it.
> But when I applied it and did a 'make distcheck', I got this:
> 
> ...
> 
> gcc -DTEST_ETHTOOL -g -O2   -o test-features test_features-test-
> features.o test_features-test-common.o test_features-ethtool.o
> test_features-rxclass.o test_features-amd8111e.o test_features-de2104x.o
> test_features-dsa.o test_features-e100.o test_features-e1000.o
> test_features-et131x.o test_features-igb.o test_features-fec_8xx.o
> test_features-ibm_emac.o test_features-ixgb.o test_features-ixgbe.o
> test_features-natsemi.o test_features-pcnet32.o test_features-realtek.o
> test_features-tg3.o test_features-marvell.o test_features-vioc.o
> test_features-smsc911x.o test_features-at76c50x-usb.o test_features-sfc.o 
> test_features-stmmac.o test_features-sff-common.o test_features-sfpid.o
> test_features-sfpdiag.o test_features-ixgbevf.o test_features-tse.o
> test_features-vmxnet3.o test_features-qsfp.o test_features-fjes.o
> test_features-lan78xx.o -lm 
> make[2]: Leaving directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> make  check-TESTS
> make[2]: Entering directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> make[3]: Entering directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> FAIL: test-cmdline
> PASS: test-features
> =========================================================================
> ===
> Testsuite summary for ethtool 4.19
> =========================================================================
> ===
> # TOTAL: 2
> # PASS:  1
> # SKIP:  0
> # XFAIL: 0
> # FAIL:  1
> # XPASS: 0
> # ERROR: 0
> =========================================================================
> ===
> See ./test-suite.log
> Please report to netdev@vger.kernel.org
> =========================================================================
> ===
> make[3]: *** [Makefile:1942: test-suite.log] Error 1
> make[3]: Leaving directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> make[2]: *** [Makefile:2050: check-TESTS] Error 2
> make[2]: Leaving directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> make[1]: *** [Makefile:2264: check-am] Error 2
> make[1]: Leaving directory '/home/linville/git/ethtool/ethtool-
> 4.19/_build/sub'
> make: *** [Makefile:2186: distcheck] Error 1
> 
> ...
> 
> I'll attach ./ethtool-4.19/_build/sub/test-suite.log to this
> message. Obviously we need whatever additional changes are needed to
> get 'make check-TESTS' to pass legitimately.

Interesting that we did not see this in testing.  I will work with Nick to
provide an updated patch, resolves your distcheck failure.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net 0/3] net: stmmac: Misc fixes
From: David Miller @ 2019-01-31  6:29 UTC (permalink / raw)
  To: jose.abreu; +Cc: netdev, joao.pinto, peppe.cavallaro, alexandre.torgue
In-Reply-To: <cover.1548859967.git.joabreu@synopsys.com>

From: Jose Abreu <jose.abreu@synopsys.com>
Date: Wed, 30 Jan 2019 15:54:18 +0100

> Some misc fixes for stmmac targeting -net.

Series applied.

^ permalink raw reply

* RE: [PATCH net-next] strparser: Return if socket does not have required number of bytes
From: Vakul Garg @ 2019-01-31  6:25 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, borisp@mellanox.com, aviadye@mellanox.com,
	davejwatson@fb.com, doronrk@fb.com
In-Reply-To: <20190130.215950.1443919738399463157.davem@davemloft.net>



> -----Original Message-----
> From: David Miller <davem@davemloft.net>
> Sent: Thursday, January 31, 2019 11:30 AM
> To: Vakul Garg <vakul.garg@nxp.com>
> Cc: netdev@vger.kernel.org; borisp@mellanox.com;
> aviadye@mellanox.com; davejwatson@fb.com; doronrk@fb.com
> Subject: Re: [PATCH net-next] strparser: Return if socket does not have
> required number of bytes
> 
> From: Vakul Garg <vakul.garg@nxp.com>
> Date: Wed, 30 Jan 2019 07:31:44 +0000
> 
> > Function strp_data_ready() should peek the associated socket to check
> > whether it has the required number of bytes available before queueing
> > work or initiating socket read via strp_read_sock(). This saves cpu
> > cycles because strp_read_sock() is called only when required amount of
> > data is available.
> >
> > Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
> 
> You can't do this, I think.  It's racy and the user socket owned check is
> absolutely necessary before you do the need bytes check.

Do you mean to say that 'peek_len' operation can't be invoked if the socket is already owned by some other context?

My understanding by reading following link is that we can to peek_len operation without acquiring sock lock.

https://elixir.bootlin.com/linux/v5.0-rc4/source/include/linux/net.h#L191

Can you please kindly elaborate the problem?


^ permalink raw reply

* Re: r8169 Driver - Poor Network Performance Since Kernel 4.19
From: Heiner Kallweit @ 2019-01-31  6:21 UTC (permalink / raw)
  To: David Chang; +Cc: Peter Ceiley, Realtek linux nic maintainers, netdev
In-Reply-To: <20190131023240.GF25745@linux-kyyb.suse>

David, thanks for the link to the bug ticket.
I think only a proper bisect can help to find the offending commit.

Heiner


On 31.01.2019 03:32, David Chang wrote:
> Hi,
> 
> We had a similr case here.
> - Realtek r8169 receive performance regression in kernel 4.19
>   https://bugzilla.suse.com/show_bug.cgi?id=1119649
> 
> kernel: r8169 0000:01:00.0 eth0: RTL8168h/8111h, XID 54100880
> The major symptom is there are many rx_missed count.
> 
> 
> On Jan 30, 2019 at 20:15:45 +0100, Heiner Kallweit wrote:
>> Hi Peter,
>>
>> recently I had somebody where pcie_aspm=off for whatever reason didn't
>> do the trick, can you also check with pcie_aspm.policy=performance.
> 
> We will give it a try later.
> 
>> And please check with "ethtool -S <if>" whether the chip statistics
>> show a significant number of errors.
>>
>> If this doesn't help you may have to bisect to find the offending commit.
> 
> We had tried fallback driver to a few previous commits as following,
> but with no luck.
> 
> 9675931e6b65 r8169: re-enable MSI-X on RTL8168g (v4.19)
> 098b01ad9837 r8169: don't include asm headers directly (v4.19-rc1)
> a2965f12fde6 r8169: remove rtl8169_set_speed_xmii (v4.19-rc1)
> 6fcf9b1d4d6c r8169: fix runtime suspend (v4.19-rc1)
> e397286b8e89 r8169: remove TBI 1000BaseX support (v4.19-rc1)
> 
> Thanks,
> David Chang
> 
>>
>> Heiner
>>
>>
>> On 30.01.2019 10:59, Peter Ceiley wrote:
>>> Hi Heiner,
>>>
>>> I tried disabling the ASPM using the pcie_aspm=off kernel parameter
>>> and this made no difference.
>>>
>>> I tried compiling the 4.18.16 r8169.c with the 4.19.18 source and
>>> subsequently loaded the module in the running 4.19.18 kernel. I can
>>> confirm that this immediately resolved the issue and access to the NFS
>>> shares operated as expected.
>>>
>>> I presume this means it is an issue with the r8169 driver included in
>>> 4.19 onwards?
>>>
>>> To answer your last questions:
>>>
>>> Base Board Information
>>>     Manufacturer: Alienware
>>>     Product Name: 0PGRP5
>>>     Version: A02
>>>
>>> ... and yes, the RTL8168 is the onboard network chip.
>>>
>>> Regards,
>>>
>>> Peter.
>>>
>>> On Tue, 29 Jan 2019 at 17:44, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>
>>>> Hi Peter,
>>>>
>>>> I think the vendor driver doesn't enable ASPM per default.
>>>> So it's worth a try to disable ASPM in the BIOS or via sysfs.
>>>> Few older systems seem to have issues with ASPM, what kind of
>>>> system / mainboard are you using? The RTL8168 is the onboard
>>>> network chip?
>>>>
>>>> Rgds, Heiner
>>>>
>>>>
>>>> On 29.01.2019 07:20, Peter Ceiley wrote:
>>>>> Hi Heiner,
>>>>>
>>>>> Thanks, I'll do some more testing. It might not be the driver - I
>>>>> assumed it was due to the fact that using the r8168 driver 'resolves'
>>>>> the issue. I'll see if I can test the r8169.c on top of 4.19 - this is
>>>>> a good idea.
>>>>>
>>>>> Cheers,
>>>>>
>>>>> Peter.
>>>>>
>>>>> On Tue, 29 Jan 2019 at 17:16, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>
>>>>>> Hi Peter,
>>>>>>
>>>>>> at a first glance it doesn't look like a typical driver issue.
>>>>>> What you could do:
>>>>>>
>>>>>> - Test the r8169.c from 4.18 on top of 4.19.
>>>>>>
>>>>>> - Check whether disabling ASPM (/sys/module/pcie_aspm) has an effect.
>>>>>>
>>>>>> - Bisect between 4.18 and 4.19 to find the offending commit.
>>>>>>
>>>>>> Any specific reason why you think root cause is in the driver and not
>>>>>> elsewhere in the network subsystem?
>>>>>>
>>>>>> Heiner
>>>>>>
>>>>>>
>>>>>> On 28.01.2019 23:10, Peter Ceiley wrote:
>>>>>>> Hi Heiner,
>>>>>>>
>>>>>>> Thanks for getting back to me.
>>>>>>>
>>>>>>> No, I don't use jumbo packets.
>>>>>>>
>>>>>>> Bandwidth is *generally* good, and iperf results to my NAS provide
>>>>>>> over 900 Mbits/s in both circumstances. The issue seems to appear when
>>>>>>> establishing a connection and is most notable, for example, on my
>>>>>>> mounted NFS shares where it takes seconds (up to 10's of seconds on
>>>>>>> larger directories) to list the contents of each directory. Once a
>>>>>>> transfer begins on a file, I appear to get good bandwidth.
>>>>>>>
>>>>>>> I'm unsure of the best scientific data to provide you in order to
>>>>>>> troubleshoot this issue. Running the following
>>>>>>>
>>>>>>>     netstat -s |grep retransmitted
>>>>>>>
>>>>>>> shows a steady increase in retransmitted segments each time I list the
>>>>>>> contents of a remote directory, for example, running 'ls' on a
>>>>>>> directory containing 345 media files did the following using kernel
>>>>>>> 4.19.18:
>>>>>>>
>>>>>>> increased retransmitted segments by 21 and the 'time' command showed
>>>>>>> the following:
>>>>>>>     real    0m19.867s
>>>>>>>     user    0m0.012s
>>>>>>>     sys    0m0.036s
>>>>>>>
>>>>>>> The same command shows no retransmitted segments running kernel
>>>>>>> 4.18.16 and 'time' showed:
>>>>>>>     real    0m0.300s
>>>>>>>     user    0m0.004s
>>>>>>>     sys    0m0.007s
>>>>>>>
>>>>>>> ifconfig does not show any RX/TX errors nor dropped packets in either case.
>>>>>>>
>>>>>>> dmesg XID:
>>>>>>> [    2.979984] r8169 0000:03:00.0 eth0: RTL8168g/8111g,
>>>>>>> f8:b1:56:fe:67:e0, XID 4c000800, IRQ 32
>>>>>>>
>>>>>>> # lspci -vv
>>>>>>> 03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd.
>>>>>>> RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 0c)
>>>>>>>     Subsystem: Dell RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>     Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop-
>>>>>>> ParErr- Stepping- SERR- FastB2B- DisINTx+
>>>>>>>     Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-
>>>>>>> <TAbort- <MAbort- >SERR- <PERR- INTx-
>>>>>>>     Latency: 0, Cache Line Size: 64 bytes
>>>>>>>     Interrupt: pin A routed to IRQ 19
>>>>>>>     Region 0: I/O ports at d000 [size=256]
>>>>>>>     Region 2: Memory at f7b00000 (64-bit, non-prefetchable) [size=4K]
>>>>>>>     Region 4: Memory at f2100000 (64-bit, prefetchable) [size=16K]
>>>>>>>     Capabilities: [40] Power Management version 3
>>>>>>>         Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA
>>>>>>> PME(D0+,D1+,D2+,D3hot+,D3cold+)
>>>>>>>         Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
>>>>>>>     Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+
>>>>>>>         Address: 0000000000000000  Data: 0000
>>>>>>>     Capabilities: [70] Express (v2) Endpoint, MSI 01
>>>>>>>         DevCap:    MaxPayload 128 bytes, PhantFunc 0, Latency L0s
>>>>>>> <512ns, L1 <64us
>>>>>>>             ExtTag- AttnBtn- AttnInd- PwrInd- RBE+ FLReset-
>>>>>>> SlotPowerLimit 10.000W
>>>>>>>         DevCtl:    CorrErr- NonFatalErr- FatalErr- UnsupReq-
>>>>>>>             RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
>>>>>>>             MaxPayload 128 bytes, MaxReadReq 4096 bytes
>>>>>>>         DevSta:    CorrErr+ NonFatalErr- FatalErr- UnsupReq- AuxPwr+ TransPend-
>>>>>>>         LnkCap:    Port #0, Speed 2.5GT/s, Width x1, ASPM L0s L1, Exit
>>>>>>> Latency L0s unlimited, L1 <64us
>>>>>>>             ClockPM+ Surprise- LLActRep- BwNot- ASPMOptComp+
>>>>>>>         LnkCtl:    ASPM L1 Enabled; RCB 64 bytes Disabled- CommClk+
>>>>>>>             ExtSynch- ClockPM+ AutWidDis- BWInt- AutBWInt-
>>>>>>>         LnkSta:    Speed 2.5GT/s (ok), Width x1 (ok)
>>>>>>>             TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
>>>>>>>         DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR+,
>>>>>>> OBFF Via message/WAKE#
>>>>>>>              AtomicOpsCap: 32bit- 64bit- 128bitCAS-
>>>>>>>         DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis-, LTR+,
>>>>>>> OBFF Disabled
>>>>>>>              AtomicOpsCtl: ReqEn-
>>>>>>>         LnkCtl2: Target Link Speed: 2.5GT/s, EnterCompliance- SpeedDis-
>>>>>>>              Transmit Margin: Normal Operating Range,
>>>>>>> EnterModifiedCompliance- ComplianceSOS-
>>>>>>>              Compliance De-emphasis: -6dB
>>>>>>>         LnkSta2: Current De-emphasis Level: -6dB,
>>>>>>> EqualizationComplete-, EqualizationPhase1-
>>>>>>>              EqualizationPhase2-, EqualizationPhase3-, LinkEqualizationRequest-
>>>>>>>     Capabilities: [b0] MSI-X: Enable+ Count=4 Masked-
>>>>>>>         Vector table: BAR=4 offset=00000000
>>>>>>>         PBA: BAR=4 offset=00000800
>>>>>>>     Capabilities: [d0] Vital Product Data
>>>>>>> pcilib: sysfs_read_vpd: read failed: Input/output error
>>>>>>>         Not readable
>>>>>>>     Capabilities: [100 v1] Advanced Error Reporting
>>>>>>>         UESta:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>         UEMsk:    DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
>>>>>>>         UESvrt:    DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt-
>>>>>>> RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
>>>>>>>         CESta:    RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout+ AdvNonFatalErr-
>>>>>>>         CEMsk:    RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
>>>>>>>         AERCap:    First Error Pointer: 00, ECRCGenCap+ ECRCGenEn-
>>>>>>> ECRCChkCap+ ECRCChkEn-
>>>>>>>             MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
>>>>>>>         HeaderLog: 00000000 00000000 00000000 00000000
>>>>>>>     Capabilities: [140 v1] Virtual Channel
>>>>>>>         Caps:    LPEVC=0 RefClk=100ns PATEntryBits=1
>>>>>>>         Arb:    Fixed- WRR32- WRR64- WRR128-
>>>>>>>         Ctrl:    ArbSelect=Fixed
>>>>>>>         Status:    InProgress-
>>>>>>>         VC0:    Caps:    PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
>>>>>>>             Arb:    Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
>>>>>>>             Ctrl:    Enable+ ID=0 ArbSelect=Fixed TC/VC=01
>>>>>>>             Status:    NegoPending- InProgress-
>>>>>>>     Capabilities: [160 v1] Device Serial Number 01-00-00-00-68-4c-e0-00
>>>>>>>     Capabilities: [170 v1] Latency Tolerance Reporting
>>>>>>>         Max snoop latency: 71680ns
>>>>>>>         Max no snoop latency: 71680ns
>>>>>>>     Kernel driver in use: r8169
>>>>>>>     Kernel modules: r8169
>>>>>>>
>>>>>>> Please let me know if you have any other ideas in terms of testing.
>>>>>>>
>>>>>>> Thanks!
>>>>>>>
>>>>>>> Peter.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On Tue, 29 Jan 2019 at 05:28, Heiner Kallweit <hkallweit1@gmail.com> wrote:
>>>>>>>>
>>>>>>>> On 28.01.2019 12:13, Peter Ceiley wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> I have been experiencing very poor network performance since Kernel
>>>>>>>>> 4.19 and I'm confident it's related to the r8169 driver.
>>>>>>>>>
>>>>>>>>> I have no issue with kernel versions 4.18 and prior. I am experiencing
>>>>>>>>> this issue in kernels 4.19 and 4.20 (currently running/testing with
>>>>>>>>> 4.20.4 & 4.19.18).
>>>>>>>>>
>>>>>>>>> If someone could guide me in the right direction, I'm happy to help
>>>>>>>>> troubleshoot this issue. Note that I have been keeping an eye on one
>>>>>>>>> issue related to loading of the PHY driver, however, my symptoms
>>>>>>>>> differ in that I still have a network connection. I have attempted to
>>>>>>>>> reload the driver on a running system, but this does not improve the
>>>>>>>>> situation.
>>>>>>>>>
>>>>>>>>> Using the proprietary r8168 driver returns my device to proper working order.
>>>>>>>>>
>>>>>>>>> lshw shows:
>>>>>>>>>        description: Ethernet interface
>>>>>>>>>        product: RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller
>>>>>>>>>        vendor: Realtek Semiconductor Co., Ltd.
>>>>>>>>>        physical id: 0
>>>>>>>>>        bus info: pci@0000:03:00.0
>>>>>>>>>        logical name: enp3s0
>>>>>>>>>        version: 0c
>>>>>>>>>        serial:
>>>>>>>>>        size: 1Gbit/s
>>>>>>>>>        capacity: 1Gbit/s
>>>>>>>>>        width: 64 bits
>>>>>>>>>        clock: 33MHz
>>>>>>>>>        capabilities: pm msi pciexpress msix vpd bus_master cap_list
>>>>>>>>> ethernet physical tp aui bnc mii fibre 10bt 10bt-fd 100bt 100bt-fd
>>>>>>>>> 1000bt-fd autonegotiation
>>>>>>>>>        configuration: autonegotiation=on broadcast=yes driver=r8169
>>>>>>>>> duplex=full firmware=rtl8168g-2_0.0.1 02/06/13 ip=192.168.1.25
>>>>>>>>> latency=0 link=yes multicast=yes port=MII speed=1Gbit/s
>>>>>>>>>        resources: irq:19 ioport:d000(size=256)
>>>>>>>>> memory:f7b00000-f7b00fff memory:f2100000-f2103fff
>>>>>>>>>
>>>>>>>>> Kind Regards,
>>>>>>>>>
>>>>>>>>> Peter.
>>>>>>>>>
>>>>>>>> Hi Peter,
>>>>>>>>
>>>>>>>> the description "poor network performance" is quite vague, therefore:
>>>>>>>>
>>>>>>>> - Can you provide any measurements?
>>>>>>>> - iperf results before and after
>>>>>>>> - statistics about dropped packets (rx and/or tx)
>>>>>>>> - Do you use jumbo packets?
>>>>>>>>
>>>>>>>> Also help would be a "lspci -vv" output for the network card and
>>>>>>>> the dmesg output line with the chip XID.
>>>>>>>>
>>>>>>>> Heiner
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 


^ permalink raw reply

* Re: [PATCH net] ipvlan, l3mdev: fix broken l3s mode wrt local routes
From: David Miller @ 2019-01-31  6:14 UTC (permalink / raw)
  To: daniel; +Cc: netdev, maheshb, dsa, fw, m
In-Reply-To: <20190130114948.24227-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Jan 2019 12:49:48 +0100

> While implementing ipvlan l3 and l3s mode for kubernetes CNI plugin,
> I ran into the issue that while l3 mode is working fine, l3s mode
> does not have any connectivity to kube-apiserver and hence all pods
> end up in Error state as well. The ipvlan master device sits on
> top of a bond device and hostns traffic to kube-apiserver (also running
> in hostns) is DNATed from 10.152.183.1:443 to 139.178.29.207:37573
> where the latter is the address of the bond0. While in l3 mode, a
> curl to https://10.152.183.1:443 or to https://139.178.29.207:37573
> works fine from hostns, neither of them do in case of l3s. In the
> latter only a curl to https://127.0.0.1:37573 appeared to work where
> for local addresses of bond0 I saw kernel suddenly starting to emit
> ARP requests to query HW address of bond0 which remained unanswered
> and neighbor entries in INCOMPLETE state. These ARP requests only
> happen while in l3s.
> 
> Debugging this further, I found the issue is that l3s mode is piggy-
> backing on l3 master device, and in this case local routes are using
> l3mdev_master_dev_rcu(dev) instead of net->loopback_dev as per commit
> f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev
> if relevant") and 5f02ce24c269 ("net: l3mdev: Allow the l3mdev to be
> a loopback"). I found that reverting them back into using the
> net->loopback_dev fixed ipvlan l3s connectivity and got everything
> working for the CNI.
> 
> Now judging from 4fbae7d83c98 ("ipvlan: Introduce l3s mode") and the
> l3mdev paper in [0] the only sole reason why ipvlan l3s is relying
> on l3 master device is to get the l3mdev_ip_rcv() receive hook for
> setting the dst entry of the input route without adding its own
> ipvlan specific hacks into the receive path, however, any l3 domain
> semantics beyond just that are breaking l3s operation. Note that
> ipvlan also has the ability to dynamically switch its internal
> operation from l3 to l3s for all ports via ipvlan_set_port_mode()
> at runtime. In any case, l3 vs l3s soley distinguishes itself by
> 'de-confusing' netfilter through switching skb->dev to ipvlan slave
> device late in NF_INET_LOCAL_IN before handing the skb to L4.
> 
> Minimal fix taken here is to add a IFF_L3MDEV_RX_HANDLER flag which,
> if set from ipvlan setup, gets us only the wanted l3mdev_l3_rcv() hook
> without any additional l3mdev semantics on top. This should also have
> minimal impact since dev->priv_flags is already hot in cache. With
> this set, l3s mode is working fine and I also get things like
> masquerading pod traffic on the ipvlan master properly working.
> 
>   [0] https://netdevconf.org/1.2/papers/ahern-what-is-l3mdev-paper.pdf
> 
> Fixes: f5a0aab84b74 ("net: ipv4: dst for local input routes should use l3mdev if relevant")
> Fixes: 5f02ce24c269 ("net: l3mdev: Allow the l3mdev to be a loopback")
> Fixes: 4fbae7d83c98 ("ipvlan: Introduce l3s mode")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net-next] strparser: Return if socket does not have required number of bytes
From: David Miller @ 2019-01-31  5:59 UTC (permalink / raw)
  To: vakul.garg; +Cc: netdev, borisp, aviadye, davejwatson, doronrk
In-Reply-To: <20190130072933.23281-1-vakul.garg@nxp.com>

From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 30 Jan 2019 07:31:44 +0000

> Function strp_data_ready() should peek the associated socket to check
> whether it has the required number of bytes available before queueing
> work or initiating socket read via strp_read_sock(). This saves cpu
> cycles because strp_read_sock() is called only when required amount of
> data is available.
> 
> Signed-off-by: Vakul Garg <vakul.garg@nxp.com>

You can't do this, I think.  It's racy and the user socket owned check
is absolutely necessary before you do the need bytes check.

^ permalink raw reply

* Re: [PATCH net v4] l2tp: fix reading optional fields of L2TPv3
From: David Miller @ 2019-01-31  5:45 UTC (permalink / raw)
  To: jian.w.wen; +Cc: netdev, eric.dumazet, gnault
In-Reply-To: <20190130065514.13076-1-jian.w.wen@oracle.com>

From: Jacob Wen <jian.w.wen@oracle.com>
Date: Wed, 30 Jan 2019 14:55:14 +0800

> Use pskb_may_pull() to make sure the optional fields are in skb linear
> parts, so we can safely read them later.
> 
> It's easy to reproduce the issue with a net driver that supports paged
> skb data. Just create a L2TPv3 over IP tunnel and then generates some
> network traffic.
> Once reproduced, rx err in /sys/kernel/debug/l2tp/tunnels will increase.
> 
> Changes in v4:
> 1. s/l2tp_v3_pull_opt/l2tp_v3_ensure_opt_in_linear/
> 2. s/tunnel->version != L2TP_HDR_VER_2/tunnel->version == L2TP_HDR_VER_3/
> 3. Add 'Fixes' in commit messages.
> 
> Changes in v3:
> 1. To keep consistency, move the code out of l2tp_recv_common.
> 2. Use "net" instead of "net-next", since this is a bug fix.
> 
> Changes in v2:
> 1. Only fix L2TPv3 to make code simple.
>    To fix both L2TPv3 and L2TPv2, we'd better refactor l2tp_recv_common.
>    It's complicated to do so.
> 2. Reloading pointers after pskb_may_pull
> 
> Fixes: f7faffa3ff8e ("l2tp: Add L2TPv3 protocol support")
> Fixes: 0d76751fad77 ("l2tp: Add L2TPv3 IP encapsulation (no UDP) support")
> Fixes: a32e0eec7042 ("l2tp: introduce L2TPv3 IP encapsulation support for IPv6")
> 
> Signed-off-by: Jacob Wen <jian.w.wen@oracle.com>
> Acked-by: Guillaume Nault <gnault@redhat.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH] ipconfig: make the wait for carrier timeout configurable
From: David Miller @ 2019-01-31  5:36 UTC (permalink / raw)
  To: martink
  Cc: kuznet, yoshfuji, netdev, linux-kernel, manfred.schlaegl,
	martin.kepplinger
In-Reply-To: <20190128113005.7683-1-martink@posteo.de>

From: Martin Kepplinger <martink@posteo.de>
Date: Mon, 28 Jan 2019 12:30:05 +0100

> From: Manfred Schlaegl <manfred.schlaegl@ginzinger.com>
> 
> commit 3fb72f1e6e61 ("ipconfig wait for carrier") added a
> "wait for carrier" policy, with a fixed worst case maximum wait
> of two minutes.
> 
> This makes the wait for carrier timeout configurable (0 - 240 seconds).
> 
> The informative timeout messages introduced with
> commit 5e404cd65860 ("ipconfig: add informative timeout messages while
> waiting for carrier") were adapted. Message output is done in a fixed
> interval of 20 seconds, just like before (240/12).
> 
> Signed-off-by: Manfred Schlaegl <manfred.schlaegl@ginzinger.com>
> Signed-off-by: Martin Kepplinger <martin.kepplinger@ginzinger.com>

I would prefer that this be implemented using a kernel command line
parameter.

That way you don't need to rebuild the kernel to adjust the value.

Thanks.

^ permalink raw reply

* Re: [PATCH] bpf: selftests: handle sparse CPU allocations
From: Y Song @ 2019-01-31  5:25 UTC (permalink / raw)
  To: Martynas Pumputis; +Cc: netdev, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20190130091900.4044-1-m@lambda.lt>

[ My reply somehow rejected by netdev, this is to send it again. ]

On Wed, Jan 30, 2019 at 1:19 AM Martynas Pumputis <m@lambda.lt> wrote:
>
> Previously, bpf_num_possible_cpus() had a bug when calculating a
> number of possible CPUs in the case of sparse CPU allocations, as
> it was considering only the first range or element of
> /sys/devices/system/cpu/possible.
>
> E.g. in the case of "0,2-3" (CPU 1 is not available), the function
> returned 1 instead of 3.
>
> This patch fixes the function by making it parse all CPU ranges and
> elements.
>
> Signed-off-by: Martynas Pumputis <m@lambda.lt>
> ---
>  tools/testing/selftests/bpf/bpf_util.h | 29 +++++++++++++++++---------
>  1 file changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
> index 315a44fa32af..8cab50408204 100644
> --- a/tools/testing/selftests/bpf/bpf_util.h
> +++ b/tools/testing/selftests/bpf/bpf_util.h
> @@ -13,7 +13,7 @@ static inline unsigned int bpf_num_possible_cpus(void)
>         unsigned int start, end, possible_cpus = 0;
>         char buff[128];
>         FILE *fp;
> -       int n;
> +       int n, i, j = 0;
>
>         fp = fopen(fcpu, "r");
>         if (!fp) {
> @@ -21,17 +21,26 @@ static inline unsigned int bpf_num_possible_cpus(void)
>                 exit(1);
>         }
>
> -       while (fgets(buff, sizeof(buff), fp)) {
> -               n = sscanf(buff, "%u-%u", &start, &end);
> -               if (n == 0) {
> -                       printf("Failed to retrieve # possible CPUs!\n");
> -                       exit(1);
> -               } else if (n == 1) {
> -                       end = start;
> +       if (!fgets(buff, sizeof(buff), fp)) {
> +               printf("Failed to read %s!\n", fcpu);
> +               exit(1);
> +       }
> +
> +       for (i = 0; i <= strlen(buff); i++) {
> +               if (buff[i] == ',' || buff[i] == '\0') {
> +                       buff[i] = '\0';

This does not sound right. For example, the cpu list "0,2-3",
you will change "," to '\0" so buffer becomes "0\02-3".
The next iteration you will get strlen(buff) = 1.
The "2-3" will be skipped.

> +                       n = sscanf(&buff[j], "%u-%u", &start, &end);
> +                       if (n <= 0) {
> +                               printf("Failed to retrieve # possible CPUs!\n");
> +                               exit(1);
> +                       } else if (n == 1) {
> +                               end = start;
> +                       }
> +                       possible_cpus += end - start + 1;
> +                       j = i + 1;
>                 }
> -               possible_cpus = start == 0 ? end + 1 : 0;
> -               break;
>         }
> +
>         fclose(fp);
>
>         return possible_cpus;
> --
> 2.20.1
>

^ permalink raw reply

* Re: Question on ptr_ring linux header
From: fei phung @ 2019-01-31  5:16 UTC (permalink / raw)
  To: mst, feiphung, netdev
In-Reply-To: <CAF8QhUg9oSBy14tZUkRtxFQP6_Y_4CbFttt6HOCG2pgnQY1SaA@mail.gmail.com>

Hi,

/*
 * Filename: circ_ring.c
 * Version: 1.0
 * Description: A circular buffer using API from
 * https://github.com/torvalds/linux/blob/master/include/linux/ptr_ring.h
 */

ptr_ring's void** queue is just giving data race problem, running
consume() together with [assignment of pointers+produce()] will
definitely give rise to data race

mutex or lock cannot help in this case. Please correct me if wrong

Regards,
Phung

^ permalink raw reply

* Re: [PATCH iproute2-next] Introduce ip-brctl shell script
From: David Ahern @ 2019-01-31  5:12 UTC (permalink / raw)
  To: Stefano Brivio
  Cc: Phil Sutter, Eric Garver, Tomas Dolezal, Stephen Hemminger,
	Lennert Buytenhek, netdev
In-Reply-To: <20190130115555.61868ab9@redhat.com>

On 1/30/19 3:55 AM, Stefano Brivio wrote:
>> I get your intent, but this seems more appropriate for you / Red Hat to
>> carry than something we want to distribute as part of iproute2.
> 
> Sure, I could also do that, but:
> 
> - me creating another project: similar maintenance burden for
>   distribution maintainers as keeping bridge-utils around,
>   for something that won't have any active development
> 
> - carrying it in a single distribution downstream: I would have gone
>   that way if I thought it wouldn't be useful for others. I myself use
>   (also) distributions other than Fedora/RHEL and this would feel
>   just... wrong
> 
> Why do you think it's not appropriate to distribute this as part of
> iproute2? Too ugly? Bloated? Anything I can improve?
> 
> I think it would be appropriate because it intimately depends on
> ip-link -- it's really nothing more than a helper for iproute2 tools.
> 

Again, I understand your point ... I still, too often, type ifconfig
from long in-grained muscle memory.

This is a convenience wrapper around commands packaged in iproute2. If
iproute2 adds this wrapper, it will have to carry it and maintain it
forever. Distributions (Fedora, RHEL, Debian, etc) may see it
differently and decide to add this patch onto iproute2 that they
distribute as a means for dropping bridge-utils. That's a reasonable
migration choice. It is just not something upstream iproute2 should carry.

^ permalink raw reply

* [PATCH v6 bpf-next 9/9] selftests/bpf: test for BPF_F_LOCK
From: Alexei Starovoitov @ 2019-01-31  5:05 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, jannh, netdev, kernel-team
In-Reply-To: <20190131050546.3634009-1-ast@kernel.org>

Add C based test that runs 4 bpf programs in parallel
that update the same hash and array maps.
And another 2 threads that read from these two maps
via lookup(key, value, BPF_F_LOCK) api
to make sure the user space sees consistent value in both
hash and array elements while user space races with kernel bpf progs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile        |  2 +-
 tools/testing/selftests/bpf/test_map_lock.c | 66 ++++++++++++++++++
 tools/testing/selftests/bpf/test_progs.c    | 74 +++++++++++++++++++++
 3 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_map_lock.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 302b8e70dec9..9de0a1ae8d65 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ BPF_OBJ_FILES = \
 	sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \
-	xdp_dummy.o test_map_in_map.o test_spin_lock.o
+	xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o
 
 # Objects are built with default compilation flags and with sub-register
 # code-gen enabled.
diff --git a/tools/testing/selftests/bpf/test_map_lock.c b/tools/testing/selftests/bpf/test_map_lock.c
new file mode 100644
index 000000000000..af8cc68ed2f9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_map_lock.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+#define VAR_NUM 16
+
+struct hmap_elem {
+	struct bpf_spin_lock lock;
+	int var[VAR_NUM];
+};
+
+struct bpf_map_def SEC("maps") hash_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct hmap_elem),
+	.max_entries = 1,
+};
+
+BPF_ANNOTATE_KV_PAIR(hash_map, int, struct hmap_elem);
+
+struct array_elem {
+	struct bpf_spin_lock lock;
+	int var[VAR_NUM];
+};
+
+struct bpf_map_def SEC("maps") array_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct array_elem),
+	.max_entries = 1,
+};
+
+BPF_ANNOTATE_KV_PAIR(array_map, int, struct array_elem);
+
+SEC("map_lock_demo")
+int bpf_map_lock_test(struct __sk_buff *skb)
+{
+	struct hmap_elem zero = {}, *val;
+	int rnd = bpf_get_prandom_u32();
+	int key = 0, err = 1, i;
+	struct array_elem *q;
+
+	val = bpf_map_lookup_elem(&hash_map, &key);
+	if (!val)
+		goto err;
+	/* spin_lock in hash map */
+	bpf_spin_lock(&val->lock);
+	for (i = 0; i < VAR_NUM; i++)
+		val->var[i] = rnd;
+	bpf_spin_unlock(&val->lock);
+
+	/* spin_lock in array */
+	q = bpf_map_lookup_elem(&array_map, &key);
+	if (!q)
+		goto err;
+	bpf_spin_lock(&q->lock);
+	for (i = 0; i < VAR_NUM; i++)
+		q->var[i] = rnd;
+	bpf_spin_unlock(&q->lock);
+	err = 0;
+err:
+	return err;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index d2e71d697340..a08d026ac396 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -2025,6 +2025,79 @@ static void test_spinlock(void)
 	bpf_object__close(obj);
 }
 
+static void *parallel_map_access(void *arg)
+{
+	int err, map_fd = *(u32 *) arg;
+	int vars[17], i, j, rnd, key = 0;
+
+	for (i = 0; i < 10000; i++) {
+		err = bpf_map_lookup_elem_flags(map_fd, &key, vars, BPF_F_LOCK);
+		if (err) {
+			printf("lookup failed\n");
+			error_cnt++;
+			goto out;
+		}
+		if (vars[0] != 0) {
+			printf("lookup #%d var[0]=%d\n", i, vars[0]);
+			error_cnt++;
+			goto out;
+		}
+		rnd = vars[1];
+		for (j = 2; j < 17; j++) {
+			if (vars[j] == rnd)
+				continue;
+			printf("lookup #%d var[1]=%d var[%d]=%d\n",
+			       i, rnd, j, vars[j]);
+			error_cnt++;
+			goto out;
+		}
+	}
+out:
+	pthread_exit(arg);
+}
+
+static void test_map_lock(void)
+{
+	const char *file = "./test_map_lock.o";
+	int prog_fd, map_fd[2], vars[17] = {};
+	pthread_t thread_id[6];
+	struct bpf_object *obj;
+	int err = 0, key = 0, i;
+	void *ret;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	if (err) {
+		printf("test_map_lock:bpf_prog_load errno %d\n", errno);
+		goto close_prog;
+	}
+	map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
+	if (map_fd[0] < 0)
+		goto close_prog;
+	map_fd[1] = bpf_find_map(__func__, obj, "array_map");
+	if (map_fd[1] < 0)
+		goto close_prog;
+
+	bpf_map_update_elem(map_fd[0], &key, vars, BPF_F_LOCK);
+
+	for (i = 0; i < 4; i++)
+		assert(pthread_create(&thread_id[i], NULL,
+				      &test_spin_lock, &prog_fd) == 0);
+	for (i = 4; i < 6; i++)
+		assert(pthread_create(&thread_id[i], NULL,
+				      &parallel_map_access, &map_fd[i - 4]) == 0);
+	for (i = 0; i < 4; i++)
+		assert(pthread_join(thread_id[i], &ret) == 0 &&
+		       ret == (void *)&prog_fd);
+	for (i = 4; i < 6; i++)
+		assert(pthread_join(thread_id[i], &ret) == 0 &&
+		       ret == (void *)&map_fd[i - 4]);
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	srand(time(NULL));
@@ -2054,6 +2127,7 @@ int main(void)
 	test_queue_stack_map(STACK);
 	test_flow_dissector();
 	test_spinlock();
+	test_map_lock();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.20.0


^ permalink raw reply related

* [PATCH v6 bpf-next 8/9] libbpf: introduce bpf_map_lookup_elem_flags()
From: Alexei Starovoitov @ 2019-01-31  5:05 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, jannh, netdev, kernel-team
In-Reply-To: <20190131050546.3634009-1-ast@kernel.org>

Introduce
int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags)
helper to lookup array/hash/cgroup_local_storage elements with BPF_F_LOCK flag.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf.c      | 13 +++++++++++++
 tools/lib/bpf/bpf.h      |  2 ++
 tools/lib/bpf/libbpf.map |  1 +
 3 files changed, 16 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 88cbd110ae58..3defad77dc7a 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -368,6 +368,19 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value)
 	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 }
 
+int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.map_fd = fd;
+	attr.key = ptr_to_u64(key);
+	attr.value = ptr_to_u64(value);
+	attr.flags = flags;
+
+	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
 int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
 {
 	union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 8f09de482839..ed09eed2dc3b 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -110,6 +110,8 @@ LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value,
 				   __u64 flags);
 
 LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value);
+LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value,
+					 __u64 flags);
 LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
 					      void *value);
 LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 266bc95d0142..f6f96fc38c50 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -130,4 +130,5 @@ LIBBPF_0.0.2 {
 		bpf_probe_helper;
 		bpf_probe_map_type;
 		bpf_probe_prog_type;
+		bpf_map_lookup_elem_flags;
 } LIBBPF_0.0.1;
-- 
2.20.0


^ permalink raw reply related

* [PATCH v6 bpf-next 5/9] selftests/bpf: add bpf_spin_lock C test
From: Alexei Starovoitov @ 2019-01-31  5:05 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, jannh, netdev, kernel-team
In-Reply-To: <20190131050546.3634009-1-ast@kernel.org>

add bpf_spin_lock C based test that requires latest llvm with BTF support

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile         |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h    |   4 +
 tools/testing/selftests/bpf/test_progs.c     |  43 +++++++-
 tools/testing/selftests/bpf/test_spin_lock.c | 108 +++++++++++++++++++
 4 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_spin_lock.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 8993e9c8f410..302b8e70dec9 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ BPF_OBJ_FILES = \
 	sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
 	get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
 	test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \
-	xdp_dummy.o test_map_in_map.o
+	xdp_dummy.o test_map_in_map.o test_spin_lock.o
 
 # Objects are built with default compilation flags and with sub-register
 # code-gen enabled.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6c77cf7bedce..6a0ce0f055c5 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -172,6 +172,10 @@ static int (*bpf_skb_vlan_pop)(void *ctx) =
 	(void *) BPF_FUNC_skb_vlan_pop;
 static int (*bpf_rc_pointer_rel)(void *ctx, int rel_x, int rel_y) =
 	(void *) BPF_FUNC_rc_pointer_rel;
+static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) =
+	(void *) BPF_FUNC_spin_lock;
+static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) =
+	(void *) BPF_FUNC_spin_unlock;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index d8940b8b2f8d..d2e71d697340 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -28,7 +28,7 @@ typedef __u16 __sum16;
 #include <sys/wait.h>
 #include <sys/types.h>
 #include <fcntl.h>
-
+#include <pthread.h>
 #include <linux/bpf.h>
 #include <linux/err.h>
 #include <bpf/bpf.h>
@@ -1985,6 +1985,46 @@ static void test_flow_dissector(void)
 	bpf_object__close(obj);
 }
 
+static void *test_spin_lock(void *arg)
+{
+	__u32 duration, retval;
+	int err, prog_fd = *(u32 *) arg;
+
+	err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, &retval, &duration);
+	CHECK(err || retval, "",
+	      "err %d errno %d retval %d duration %d\n",
+	      err, errno, retval, duration);
+	pthread_exit(arg);
+}
+
+static void test_spinlock(void)
+{
+	const char *file = "./test_spin_lock.o";
+	pthread_t thread_id[4];
+	struct bpf_object *obj;
+	int prog_fd;
+	int err = 0, i;
+	void *ret;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	if (err) {
+		printf("test_spin_lock:bpf_prog_load errno %d\n", errno);
+		goto close_prog;
+	}
+	for (i = 0; i < 4; i++)
+		assert(pthread_create(&thread_id[i], NULL,
+				      &test_spin_lock, &prog_fd) == 0);
+	for (i = 0; i < 4; i++)
+		assert(pthread_join(thread_id[i], &ret) == 0 &&
+		       ret == (void *)&prog_fd);
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	srand(time(NULL));
@@ -2013,6 +2053,7 @@ int main(void)
 	test_queue_stack_map(QUEUE);
 	test_queue_stack_map(STACK);
 	test_flow_dissector();
+	test_spinlock();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
diff --git a/tools/testing/selftests/bpf/test_spin_lock.c b/tools/testing/selftests/bpf/test_spin_lock.c
new file mode 100644
index 000000000000..40f904312090
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_spin_lock.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+struct hmap_elem {
+	volatile int cnt;
+	struct bpf_spin_lock lock;
+	int test_padding;
+};
+
+struct bpf_map_def SEC("maps") hmap = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct hmap_elem),
+	.max_entries = 1,
+};
+
+BPF_ANNOTATE_KV_PAIR(hmap, int, struct hmap_elem);
+
+
+struct cls_elem {
+	struct bpf_spin_lock lock;
+	volatile int cnt;
+};
+
+struct bpf_map_def SEC("maps") cls_map = {
+	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
+	.key_size = sizeof(struct bpf_cgroup_storage_key),
+	.value_size = sizeof(struct cls_elem),
+};
+
+BPF_ANNOTATE_KV_PAIR(cls_map, struct bpf_cgroup_storage_key,
+		     struct cls_elem);
+
+struct bpf_vqueue {
+	struct bpf_spin_lock lock;
+	/* 4 byte hole */
+	unsigned long long lasttime;
+	int credit;
+	unsigned int rate;
+};
+
+struct bpf_map_def SEC("maps") vqueue = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_vqueue),
+	.max_entries = 1,
+};
+
+BPF_ANNOTATE_KV_PAIR(vqueue, int, struct bpf_vqueue);
+#define CREDIT_PER_NS(delta, rate) (((delta) * rate) >> 20)
+
+SEC("spin_lock_demo")
+int bpf_sping_lock_test(struct __sk_buff *skb)
+{
+	volatile int credit = 0, max_credit = 100, pkt_len = 64;
+	struct hmap_elem zero = {}, *val;
+	unsigned long long curtime;
+	struct bpf_vqueue *q;
+	struct cls_elem *cls;
+	int key = 0;
+	int err = 0;
+
+	val = bpf_map_lookup_elem(&hmap, &key);
+	if (!val) {
+		bpf_map_update_elem(&hmap, &key, &zero, 0);
+		val = bpf_map_lookup_elem(&hmap, &key);
+		if (!val) {
+			err = 1;
+			goto err;
+		}
+	}
+	/* spin_lock in hash map run time test */
+	bpf_spin_lock(&val->lock);
+	if (val->cnt)
+		val->cnt--;
+	else
+		val->cnt++;
+	if (val->cnt != 0 && val->cnt != 1)
+		err = 1;
+	bpf_spin_unlock(&val->lock);
+
+	/* spin_lock in array. virtual queue demo */
+	q = bpf_map_lookup_elem(&vqueue, &key);
+	if (!q)
+		goto err;
+	curtime = bpf_ktime_get_ns();
+	bpf_spin_lock(&q->lock);
+	q->credit += CREDIT_PER_NS(curtime - q->lasttime, q->rate);
+	q->lasttime = curtime;
+	if (q->credit > max_credit)
+		q->credit = max_credit;
+	q->credit -= pkt_len;
+	credit = q->credit;
+	bpf_spin_unlock(&q->lock);
+
+	/* spin_lock in cgroup local storage */
+	cls = bpf_get_local_storage(&cls_map, 0);
+	bpf_spin_lock(&cls->lock);
+	cls->cnt++;
+	bpf_spin_unlock(&cls->lock);
+
+err:
+	return err;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.20.0


^ permalink raw reply related

* [PATCH v6 bpf-next 6/9] bpf: introduce BPF_F_LOCK flag
From: Alexei Starovoitov @ 2019-01-31  5:05 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, jannh, netdev, kernel-team
In-Reply-To: <20190131050546.3634009-1-ast@kernel.org>

Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands
and for map_update() helper function.
In all these cases take a lock of existing element (which was provided
in BTF description) before copying (in or out) the rest of map value.

Implementation details that are part of uapi:

Array:
The array map takes the element lock for lookup/update.

Hash:
hash map also takes the lock for lookup/update and tries to avoid the bucket lock.
If old element exists it takes the element lock and updates the element in place.
If element doesn't exist it allocates new one and inserts into hash table
while holding the bucket lock.
In rare case the hashmap has to take both the bucket lock and the element lock
to update old value in place.

Cgroup local storage:
It is similar to array. update in place and lookup are done with lock taken.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h        |  2 ++
 include/uapi/linux/bpf.h   |  1 +
 kernel/bpf/arraymap.c      | 24 ++++++++++++++--------
 kernel/bpf/hashtab.c       | 42 +++++++++++++++++++++++++++++++++++---
 kernel/bpf/helpers.c       | 16 +++++++++++++++
 kernel/bpf/local_storage.c | 14 ++++++++++++-
 kernel/bpf/syscall.c       | 25 +++++++++++++++++++++--
 7 files changed, 110 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2ae615b48bb8..bd169a7bcc93 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -119,6 +119,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 		memcpy(dst, src, map->value_size);
 	}
 }
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+			   bool lock_src);
 
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 86f7c438d40f..1777fa0c61e4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -267,6 +267,7 @@ enum bpf_attach_type {
 #define BPF_ANY		0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
+#define BPF_F_LOCK	4 /* spin_lock-ed map_lookup/map_update */
 
 /* flags for BPF_MAP_CREATE command */
 #define BPF_F_NO_PREALLOC	(1U << 0)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index d6d979910a2a..c72e0d8e1e65 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
+	char *val;
 
-	if (unlikely(map_flags > BPF_EXIST))
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
@@ -262,18 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;
 
-	if (unlikely(map_flags == BPF_NOEXIST))
+	if (unlikely(map_flags & BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;
 
-	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+	if (unlikely((map_flags & BPF_F_LOCK) &&
+		     !map_value_has_spin_lock(map)))
+		return -EINVAL;
+
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
-	else
-		copy_map_value(map,
-			       array->value +
-			       array->elem_size * (index & array->index_mask),
-			       value);
+	} else {
+		val = array->value +
+			array->elem_size * (index & array->index_mask);
+		if (map_flags & BPF_F_LOCK)
+			copy_map_value_locked(map, val, value, false);
+		else
+			copy_map_value(map, val, value);
+	}
 	return 0;
 }
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6d3b22c5ad68..937776531998 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -804,11 +804,11 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 		       u64 map_flags)
 {
-	if (l_old && map_flags == BPF_NOEXIST)
+	if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
 		/* elem already exists */
 		return -EEXIST;
 
-	if (!l_old && map_flags == BPF_EXIST)
+	if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
 		/* elem doesn't exist, cannot update it */
 		return -ENOENT;
 
@@ -827,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	u32 key_size, hash;
 	int ret;
 
-	if (unlikely(map_flags > BPF_EXIST))
+	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
@@ -840,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
+	if (unlikely(map_flags & BPF_F_LOCK)) {
+		if (unlikely(!map_value_has_spin_lock(map)))
+			return -EINVAL;
+		/* find an element without taking the bucket lock */
+		l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+					      htab->n_buckets);
+		ret = check_flags(htab, l_old, map_flags);
+		if (ret)
+			return ret;
+		if (l_old) {
+			/* grab the element lock and update value in place */
+			copy_map_value_locked(map,
+					      l_old->key + round_up(key_size, 8),
+					      value, false);
+			return 0;
+		}
+		/* fall through, grab the bucket lock and lookup again.
+		 * 99.9% chance that the element won't be found,
+		 * but second lookup under lock has to be done.
+		 */
+	}
+
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
@@ -849,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
+	if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+		/* first lookup without the bucket lock didn't find the element,
+		 * but second lookup with the bucket lock found it.
+		 * This case is highly unlikely, but has to be dealt with:
+		 * grab the element lock in addition to the bucket lock
+		 * and update element in place
+		 */
+		copy_map_value_locked(map,
+				      l_old->key + round_up(key_size, 8),
+				      value, false);
+		ret = 0;
+		goto err;
+	}
+
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
 				l_old);
 	if (IS_ERR(l_new)) {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 29a8a4e62b8a..8218be1ee6ef 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -301,6 +301,22 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
 	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
 };
 
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+			   bool lock_src)
+{
+	struct bpf_spin_lock *lock;
+
+	if (lock_src)
+		lock = src + map->spin_lock_off;
+	else
+		lock = dst + map->spin_lock_off;
+	preempt_disable();
+	____bpf_spin_lock(lock);
+	copy_map_value(map, dst, src);
+	____bpf_spin_unlock(lock);
+	preempt_enable();
+}
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 0295427f06e2..6b572e2de7fb 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
 
-	if (flags != BPF_ANY && flags != BPF_EXIST)
+	if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+		return -EINVAL;
+
+	if (unlikely(flags & BPF_NOEXIST))
+		return -EINVAL;
+
+	if (unlikely((flags & BPF_F_LOCK) &&
+		     !map_value_has_spin_lock(map)))
 		return -EINVAL;
 
 	storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
 	if (!storage)
 		return -ENOENT;
 
+	if (flags & BPF_F_LOCK) {
+		copy_map_value_locked(map, storage->buf->data, value, false);
+		return 0;
+	}
+
 	new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
 			   map->value_size,
 			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b29e6dc44650..0834958f1dc4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
 
 static int map_lookup_elem(union bpf_attr *attr)
 {
@@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
 		return -EINVAL;
 
+	if (attr->flags & ~BPF_F_LOCK)
+		return -EINVAL;
+
 	f = fdget(ufd);
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
@@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -758,7 +767,13 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			copy_map_value(map, value, ptr);
+			if (attr->flags & BPF_F_LOCK)
+				/* lock 'ptr' and copy everything but lock */
+				copy_map_value_locked(map, value, ptr, true);
+			else
+				copy_map_value(map, value, ptr);
+			/* mask lock, since value wasn't zero inited */
+			check_and_init_map_lock(map, value);
 		}
 		rcu_read_unlock();
 	}
@@ -818,6 +833,12 @@ static int map_update_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if ((attr->flags & BPF_F_LOCK) &&
+	    !map_value_has_spin_lock(map)) {
+		err = -EINVAL;
+		goto err_put;
+	}
+
 	key = __bpf_copy_key(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
-- 
2.20.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox