Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v4 bpf-next 02/14] libbpf: convert libbpf code to use new btf helpers
From: Andrii Nakryiko @ 2019-08-07  5:37 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko
In-Reply-To: <20190807053806.1534571-1-andriin@fb.com>

Simplify code by relying on newly added BTF helper functions.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/lib/bpf/btf.c      | 181 ++++++++++++++++++---------------------
 tools/lib/bpf/btf_dump.c | 136 ++++++++++-------------------
 tools/lib/bpf/libbpf.c   |  60 +++++++------
 3 files changed, 157 insertions(+), 220 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 467224feb43b..f49a54c2575f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -19,13 +19,6 @@
 #define BTF_MAX_NR_TYPES 0x7fffffff
 #define BTF_MAX_STR_OFFSET 0x7fffffff
 
-#define IS_MODIFIER(k) (((k) == BTF_KIND_TYPEDEF) || \
-		((k) == BTF_KIND_VOLATILE) || \
-		((k) == BTF_KIND_CONST) || \
-		((k) == BTF_KIND_RESTRICT))
-
-#define IS_VAR(k) ((k) == BTF_KIND_VAR)
-
 static struct btf_type btf_void;
 
 struct btf {
@@ -192,9 +185,9 @@ static int btf_parse_str_sec(struct btf *btf)
 static int btf_type_size(struct btf_type *t)
 {
 	int base_size = sizeof(struct btf_type);
-	__u16 vlen = BTF_INFO_VLEN(t->info);
+	__u16 vlen = btf_vlen(t);
 
-	switch (BTF_INFO_KIND(t->info)) {
+	switch (btf_kind(t)) {
 	case BTF_KIND_FWD:
 	case BTF_KIND_CONST:
 	case BTF_KIND_VOLATILE:
@@ -219,7 +212,7 @@ static int btf_type_size(struct btf_type *t)
 	case BTF_KIND_DATASEC:
 		return base_size + vlen * sizeof(struct btf_var_secinfo);
 	default:
-		pr_debug("Unsupported BTF_KIND:%u\n", BTF_INFO_KIND(t->info));
+		pr_debug("Unsupported BTF_KIND:%u\n", btf_kind(t));
 		return -EINVAL;
 	}
 }
@@ -263,7 +256,7 @@ const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id)
 
 static bool btf_type_is_void(const struct btf_type *t)
 {
-	return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+	return t == &btf_void || btf_is_fwd(t);
 }
 
 static bool btf_type_is_void_or_null(const struct btf_type *t)
@@ -284,7 +277,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 	t = btf__type_by_id(btf, type_id);
 	for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
 	     i++) {
-		switch (BTF_INFO_KIND(t->info)) {
+		switch (btf_kind(t)) {
 		case BTF_KIND_INT:
 		case BTF_KIND_STRUCT:
 		case BTF_KIND_UNION:
@@ -303,7 +296,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id)
 			type_id = t->type;
 			break;
 		case BTF_KIND_ARRAY:
-			array = (const struct btf_array *)(t + 1);
+			array = btf_array(t);
 			if (nelems && array->nelems > UINT32_MAX / nelems)
 				return -E2BIG;
 			nelems *= array->nelems;
@@ -334,8 +327,7 @@ int btf__resolve_type(const struct btf *btf, __u32 type_id)
 	t = btf__type_by_id(btf, type_id);
 	while (depth < MAX_RESOLVE_DEPTH &&
 	       !btf_type_is_void_or_null(t) &&
-	       (IS_MODIFIER(BTF_INFO_KIND(t->info)) ||
-		IS_VAR(BTF_INFO_KIND(t->info)))) {
+	       (btf_is_mod(t) || btf_is_typedef(t) || btf_is_var(t))) {
 		type_id = t->type;
 		t = btf__type_by_id(btf, type_id);
 		depth++;
@@ -554,11 +546,11 @@ static int compare_vsi_off(const void *_a, const void *_b)
 static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf,
 			     struct btf_type *t)
 {
-	__u32 size = 0, off = 0, i, vars = BTF_INFO_VLEN(t->info);
+	__u32 size = 0, off = 0, i, vars = btf_vlen(t);
 	const char *name = btf__name_by_offset(btf, t->name_off);
 	const struct btf_type *t_var;
 	struct btf_var_secinfo *vsi;
-	struct btf_var *var;
+	const struct btf_var *var;
 	int ret;
 
 	if (!name) {
@@ -574,12 +566,11 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf,
 
 	t->size = size;
 
-	for (i = 0, vsi = (struct btf_var_secinfo *)(t + 1);
-	     i < vars; i++, vsi++) {
+	for (i = 0, vsi = (void *)btf_var_secinfos(t); i < vars; i++, vsi++) {
 		t_var = btf__type_by_id(btf, vsi->type);
-		var = (struct btf_var *)(t_var + 1);
+		var = btf_var(t_var);
 
-		if (BTF_INFO_KIND(t_var->info) != BTF_KIND_VAR) {
+		if (!btf_is_var(t_var)) {
 			pr_debug("Non-VAR type seen in section %s\n", name);
 			return -EINVAL;
 		}
@@ -595,7 +586,8 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf,
 
 		ret = bpf_object__variable_offset(obj, name, &off);
 		if (ret) {
-			pr_debug("No offset found in symbol table for VAR %s\n", name);
+			pr_debug("No offset found in symbol table for VAR %s\n",
+				 name);
 			return -ENOENT;
 		}
 
@@ -619,7 +611,7 @@ int btf__finalize_data(struct bpf_object *obj, struct btf *btf)
 		 * is section size and global variable offset. We use
 		 * the info from the ELF itself for this purpose.
 		 */
-		if (BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC) {
+		if (btf_is_datasec(t)) {
 			err = btf_fixup_datasec(obj, btf, t);
 			if (err)
 				break;
@@ -774,14 +766,13 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name,
 		return -EINVAL;
 	}
 
-	if (BTF_INFO_KIND(container_type->info) != BTF_KIND_STRUCT ||
-	    BTF_INFO_VLEN(container_type->info) < 2) {
+	if (!btf_is_struct(container_type) || btf_vlen(container_type) < 2) {
 		pr_warning("map:%s container_name:%s is an invalid container struct\n",
 			   map_name, container_name);
 		return -EINVAL;
 	}
 
-	key = (struct btf_member *)(container_type + 1);
+	key = btf_members(container_type);
 	value = key + 1;
 
 	key_size = btf__resolve_size(btf, key->type);
@@ -1440,10 +1431,9 @@ static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext,
 	d->map[0] = 0;
 	for (i = 1; i <= btf->nr_types; i++) {
 		struct btf_type *t = d->btf->types[i];
-		__u16 kind = BTF_INFO_KIND(t->info);
 
 		/* VAR and DATASEC are never deduped and are self-canonical */
-		if (kind == BTF_KIND_VAR || kind == BTF_KIND_DATASEC)
+		if (btf_is_var(t) || btf_is_datasec(t))
 			d->map[i] = i;
 		else
 			d->map[i] = BTF_UNPROCESSED_ID;
@@ -1484,11 +1474,11 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx)
 		if (r)
 			return r;
 
-		switch (BTF_INFO_KIND(t->info)) {
+		switch (btf_kind(t)) {
 		case BTF_KIND_STRUCT:
 		case BTF_KIND_UNION: {
-			struct btf_member *m = (struct btf_member *)(t + 1);
-			__u16 vlen = BTF_INFO_VLEN(t->info);
+			struct btf_member *m = (void *)btf_members(t);
+			__u16 vlen = btf_vlen(t);
 
 			for (j = 0; j < vlen; j++) {
 				r = fn(&m->name_off, ctx);
@@ -1499,8 +1489,8 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx)
 			break;
 		}
 		case BTF_KIND_ENUM: {
-			struct btf_enum *m = (struct btf_enum *)(t + 1);
-			__u16 vlen = BTF_INFO_VLEN(t->info);
+			struct btf_enum *m = (void *)btf_enum(t);
+			__u16 vlen = btf_vlen(t);
 
 			for (j = 0; j < vlen; j++) {
 				r = fn(&m->name_off, ctx);
@@ -1511,8 +1501,8 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx)
 			break;
 		}
 		case BTF_KIND_FUNC_PROTO: {
-			struct btf_param *m = (struct btf_param *)(t + 1);
-			__u16 vlen = BTF_INFO_VLEN(t->info);
+			struct btf_param *m = (void *)btf_params(t);
+			__u16 vlen = btf_vlen(t);
 
 			for (j = 0; j < vlen; j++) {
 				r = fn(&m->name_off, ctx);
@@ -1801,16 +1791,16 @@ static long btf_hash_enum(struct btf_type *t)
 /* Check structural equality of two ENUMs. */
 static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_enum *m1, *m2;
+	const struct btf_enum *m1, *m2;
 	__u16 vlen;
 	int i;
 
 	if (!btf_equal_common(t1, t2))
 		return false;
 
-	vlen = BTF_INFO_VLEN(t1->info);
-	m1 = (struct btf_enum *)(t1 + 1);
-	m2 = (struct btf_enum *)(t2 + 1);
+	vlen = btf_vlen(t1);
+	m1 = btf_enum(t1);
+	m2 = btf_enum(t2);
 	for (i = 0; i < vlen; i++) {
 		if (m1->name_off != m2->name_off || m1->val != m2->val)
 			return false;
@@ -1822,8 +1812,7 @@ static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2)
 
 static inline bool btf_is_enum_fwd(struct btf_type *t)
 {
-	return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM &&
-	       BTF_INFO_VLEN(t->info) == 0;
+	return btf_is_enum(t) && btf_vlen(t) == 0;
 }
 
 static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2)
@@ -1843,8 +1832,8 @@ static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2)
  */
 static long btf_hash_struct(struct btf_type *t)
 {
-	struct btf_member *member = (struct btf_member *)(t + 1);
-	__u32 vlen = BTF_INFO_VLEN(t->info);
+	const struct btf_member *member = btf_members(t);
+	__u32 vlen = btf_vlen(t);
 	long h = btf_hash_common(t);
 	int i;
 
@@ -1864,16 +1853,16 @@ static long btf_hash_struct(struct btf_type *t)
  */
 static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_member *m1, *m2;
+	const struct btf_member *m1, *m2;
 	__u16 vlen;
 	int i;
 
 	if (!btf_equal_common(t1, t2))
 		return false;
 
-	vlen = BTF_INFO_VLEN(t1->info);
-	m1 = (struct btf_member *)(t1 + 1);
-	m2 = (struct btf_member *)(t2 + 1);
+	vlen = btf_vlen(t1);
+	m1 = btf_members(t1);
+	m2 = btf_members(t2);
 	for (i = 0; i < vlen; i++) {
 		if (m1->name_off != m2->name_off || m1->offset != m2->offset)
 			return false;
@@ -1890,7 +1879,7 @@ static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2)
  */
 static long btf_hash_array(struct btf_type *t)
 {
-	struct btf_array *info = (struct btf_array *)(t + 1);
+	const struct btf_array *info = btf_array(t);
 	long h = btf_hash_common(t);
 
 	h = hash_combine(h, info->type);
@@ -1908,13 +1897,13 @@ static long btf_hash_array(struct btf_type *t)
  */
 static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_array *info1, *info2;
+	const struct btf_array *info1, *info2;
 
 	if (!btf_equal_common(t1, t2))
 		return false;
 
-	info1 = (struct btf_array *)(t1 + 1);
-	info2 = (struct btf_array *)(t2 + 1);
+	info1 = btf_array(t1);
+	info2 = btf_array(t2);
 	return info1->type == info2->type &&
 	       info1->index_type == info2->index_type &&
 	       info1->nelems == info2->nelems;
@@ -1927,14 +1916,10 @@ static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2)
  */
 static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_array *info1, *info2;
-
 	if (!btf_equal_common(t1, t2))
 		return false;
 
-	info1 = (struct btf_array *)(t1 + 1);
-	info2 = (struct btf_array *)(t2 + 1);
-	return info1->nelems == info2->nelems;
+	return btf_array(t1)->nelems == btf_array(t2)->nelems;
 }
 
 /*
@@ -1944,8 +1929,8 @@ static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2)
  */
 static long btf_hash_fnproto(struct btf_type *t)
 {
-	struct btf_param *member = (struct btf_param *)(t + 1);
-	__u16 vlen = BTF_INFO_VLEN(t->info);
+	const struct btf_param *member = btf_params(t);
+	__u16 vlen = btf_vlen(t);
 	long h = btf_hash_common(t);
 	int i;
 
@@ -1966,16 +1951,16 @@ static long btf_hash_fnproto(struct btf_type *t)
  */
 static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_param *m1, *m2;
+	const struct btf_param *m1, *m2;
 	__u16 vlen;
 	int i;
 
 	if (!btf_equal_common(t1, t2))
 		return false;
 
-	vlen = BTF_INFO_VLEN(t1->info);
-	m1 = (struct btf_param *)(t1 + 1);
-	m2 = (struct btf_param *)(t2 + 1);
+	vlen = btf_vlen(t1);
+	m1 = btf_params(t1);
+	m2 = btf_params(t2);
 	for (i = 0; i < vlen; i++) {
 		if (m1->name_off != m2->name_off || m1->type != m2->type)
 			return false;
@@ -1992,7 +1977,7 @@ static bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2)
  */
 static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2)
 {
-	struct btf_param *m1, *m2;
+	const struct btf_param *m1, *m2;
 	__u16 vlen;
 	int i;
 
@@ -2000,9 +1985,9 @@ static bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2)
 	if (t1->name_off != t2->name_off || t1->info != t2->info)
 		return false;
 
-	vlen = BTF_INFO_VLEN(t1->info);
-	m1 = (struct btf_param *)(t1 + 1);
-	m2 = (struct btf_param *)(t2 + 1);
+	vlen = btf_vlen(t1);
+	m1 = btf_params(t1);
+	m2 = btf_params(t2);
 	for (i = 0; i < vlen; i++) {
 		if (m1->name_off != m2->name_off)
 			return false;
@@ -2028,7 +2013,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id)
 	__u32 cand_id;
 	long h;
 
-	switch (BTF_INFO_KIND(t->info)) {
+	switch (btf_kind(t)) {
 	case BTF_KIND_CONST:
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_RESTRICT:
@@ -2141,13 +2126,13 @@ static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id)
 {
 	__u32 orig_type_id = type_id;
 
-	if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD)
+	if (!btf_is_fwd(d->btf->types[type_id]))
 		return type_id;
 
 	while (is_type_mapped(d, type_id) && d->map[type_id] != type_id)
 		type_id = d->map[type_id];
 
-	if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD)
+	if (!btf_is_fwd(d->btf->types[type_id]))
 		return type_id;
 
 	return orig_type_id;
@@ -2156,7 +2141,7 @@ static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id)
 
 static inline __u16 btf_fwd_kind(struct btf_type *t)
 {
-	return BTF_INFO_KFLAG(t->info) ? BTF_KIND_UNION : BTF_KIND_STRUCT;
+	return btf_kflag(t) ? BTF_KIND_UNION : BTF_KIND_STRUCT;
 }
 
 /*
@@ -2277,8 +2262,8 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 
 	cand_type = d->btf->types[cand_id];
 	canon_type = d->btf->types[canon_id];
-	cand_kind = BTF_INFO_KIND(cand_type->info);
-	canon_kind = BTF_INFO_KIND(canon_type->info);
+	cand_kind = btf_kind(cand_type);
+	canon_kind = btf_kind(canon_type);
 
 	if (cand_type->name_off != canon_type->name_off)
 		return 0;
@@ -2327,12 +2312,12 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		return btf_dedup_is_equiv(d, cand_type->type, canon_type->type);
 
 	case BTF_KIND_ARRAY: {
-		struct btf_array *cand_arr, *canon_arr;
+		const struct btf_array *cand_arr, *canon_arr;
 
 		if (!btf_compat_array(cand_type, canon_type))
 			return 0;
-		cand_arr = (struct btf_array *)(cand_type + 1);
-		canon_arr = (struct btf_array *)(canon_type + 1);
+		cand_arr = btf_array(cand_type);
+		canon_arr = btf_array(canon_type);
 		eq = btf_dedup_is_equiv(d,
 			cand_arr->index_type, canon_arr->index_type);
 		if (eq <= 0)
@@ -2342,14 +2327,14 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
-		struct btf_member *cand_m, *canon_m;
+		const struct btf_member *cand_m, *canon_m;
 		__u16 vlen;
 
 		if (!btf_shallow_equal_struct(cand_type, canon_type))
 			return 0;
-		vlen = BTF_INFO_VLEN(cand_type->info);
-		cand_m = (struct btf_member *)(cand_type + 1);
-		canon_m = (struct btf_member *)(canon_type + 1);
+		vlen = btf_vlen(cand_type);
+		cand_m = btf_members(cand_type);
+		canon_m = btf_members(canon_type);
 		for (i = 0; i < vlen; i++) {
 			eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type);
 			if (eq <= 0)
@@ -2362,7 +2347,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 	}
 
 	case BTF_KIND_FUNC_PROTO: {
-		struct btf_param *cand_p, *canon_p;
+		const struct btf_param *cand_p, *canon_p;
 		__u16 vlen;
 
 		if (!btf_compat_fnproto(cand_type, canon_type))
@@ -2370,9 +2355,9 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id,
 		eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type);
 		if (eq <= 0)
 			return eq;
-		vlen = BTF_INFO_VLEN(cand_type->info);
-		cand_p = (struct btf_param *)(cand_type + 1);
-		canon_p = (struct btf_param *)(canon_type + 1);
+		vlen = btf_vlen(cand_type);
+		cand_p = btf_params(cand_type);
+		canon_p = btf_params(canon_type);
 		for (i = 0; i < vlen; i++) {
 			eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type);
 			if (eq <= 0)
@@ -2427,8 +2412,8 @@ static void btf_dedup_merge_hypot_map(struct btf_dedup *d)
 		targ_type_id = d->hypot_map[cand_type_id];
 		t_id = resolve_type_id(d, targ_type_id);
 		c_id = resolve_type_id(d, cand_type_id);
-		t_kind = BTF_INFO_KIND(d->btf->types[t_id]->info);
-		c_kind = BTF_INFO_KIND(d->btf->types[c_id]->info);
+		t_kind = btf_kind(d->btf->types[t_id]);
+		c_kind = btf_kind(d->btf->types[c_id]);
 		/*
 		 * Resolve FWD into STRUCT/UNION.
 		 * It's ok to resolve FWD into STRUCT/UNION that's not yet
@@ -2497,7 +2482,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id)
 		return 0;
 
 	t = d->btf->types[type_id];
-	kind = BTF_INFO_KIND(t->info);
+	kind = btf_kind(t);
 
 	if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION)
 		return 0;
@@ -2592,7 +2577,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
 	t = d->btf->types[type_id];
 	d->map[type_id] = BTF_IN_PROGRESS_ID;
 
-	switch (BTF_INFO_KIND(t->info)) {
+	switch (btf_kind(t)) {
 	case BTF_KIND_CONST:
 	case BTF_KIND_VOLATILE:
 	case BTF_KIND_RESTRICT:
@@ -2616,7 +2601,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
 		break;
 
 	case BTF_KIND_ARRAY: {
-		struct btf_array *info = (struct btf_array *)(t + 1);
+		struct btf_array *info = (void *)btf_array(t);
 
 		ref_type_id = btf_dedup_ref_type(d, info->type);
 		if (ref_type_id < 0)
@@ -2650,8 +2635,8 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id)
 			return ref_type_id;
 		t->type = ref_type_id;
 
-		vlen = BTF_INFO_VLEN(t->info);
-		param = (struct btf_param *)(t + 1);
+		vlen = btf_vlen(t);
+		param = (void *)btf_params(t);
 		for (i = 0; i < vlen; i++) {
 			ref_type_id = btf_dedup_ref_type(d, param->type);
 			if (ref_type_id < 0)
@@ -2791,7 +2776,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
 	struct btf_type *t = d->btf->types[type_id];
 	int i, r;
 
-	switch (BTF_INFO_KIND(t->info)) {
+	switch (btf_kind(t)) {
 	case BTF_KIND_INT:
 	case BTF_KIND_ENUM:
 		break;
@@ -2811,7 +2796,7 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
 		break;
 
 	case BTF_KIND_ARRAY: {
-		struct btf_array *arr_info = (struct btf_array *)(t + 1);
+		struct btf_array *arr_info = (void *)btf_array(t);
 
 		r = btf_dedup_remap_type_id(d, arr_info->type);
 		if (r < 0)
@@ -2826,8 +2811,8 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
 
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
-		struct btf_member *member = (struct btf_member *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
+		struct btf_member *member = (void *)btf_members(t);
+		__u16 vlen = btf_vlen(t);
 
 		for (i = 0; i < vlen; i++) {
 			r = btf_dedup_remap_type_id(d, member->type);
@@ -2840,8 +2825,8 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
 	}
 
 	case BTF_KIND_FUNC_PROTO: {
-		struct btf_param *param = (struct btf_param *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
+		struct btf_param *param = (void *)btf_params(t);
+		__u16 vlen = btf_vlen(t);
 
 		r = btf_dedup_remap_type_id(d, t->type);
 		if (r < 0)
@@ -2859,8 +2844,8 @@ static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id)
 	}
 
 	case BTF_KIND_DATASEC: {
-		struct btf_var_secinfo *var = (struct btf_var_secinfo *)(t + 1);
-		__u16 vlen = BTF_INFO_VLEN(t->info);
+		struct btf_var_secinfo *var = (void *)btf_var_secinfos(t);
+		__u16 vlen = btf_vlen(t);
 
 		for (i = 0; i < vlen; i++) {
 			r = btf_dedup_remap_type_id(d, var->type);
diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 7065bb5b2752..e8987bfa4d0e 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -100,21 +100,6 @@ static bool str_equal_fn(const void *a, const void *b, void *ctx)
 	return strcmp(a, b) == 0;
 }
 
-static __u16 btf_kind_of(const struct btf_type *t)
-{
-	return BTF_INFO_KIND(t->info);
-}
-
-static __u16 btf_vlen_of(const struct btf_type *t)
-{
-	return BTF_INFO_VLEN(t->info);
-}
-
-static bool btf_kflag_of(const struct btf_type *t)
-{
-	return BTF_INFO_KFLAG(t->info);
-}
-
 static const char *btf_name_of(const struct btf_dump *d, __u32 name_off)
 {
 	return btf__name_by_offset(d->btf, name_off);
@@ -349,7 +334,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 	 */
 	struct btf_dump_type_aux_state *tstate = &d->type_states[id];
 	const struct btf_type *t;
-	__u16 kind, vlen;
+	__u16 vlen;
 	int err, i;
 
 	/* return true, letting typedefs know that it's ok to be emitted */
@@ -357,18 +342,16 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 		return 1;
 
 	t = btf__type_by_id(d->btf, id);
-	kind = btf_kind_of(t);
 
 	if (tstate->order_state == ORDERING) {
 		/* type loop, but resolvable through fwd declaration */
-		if ((kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION) &&
-		    through_ptr && t->name_off != 0)
+		if (btf_is_composite(t) && through_ptr && t->name_off != 0)
 			return 0;
 		pr_warning("unsatisfiable type cycle, id:[%u]\n", id);
 		return -ELOOP;
 	}
 
-	switch (kind) {
+	switch (btf_kind(t)) {
 	case BTF_KIND_INT:
 		tstate->order_state = ORDERED;
 		return 0;
@@ -378,14 +361,12 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 		tstate->order_state = ORDERED;
 		return err;
 
-	case BTF_KIND_ARRAY: {
-		const struct btf_array *a = (void *)(t + 1);
+	case BTF_KIND_ARRAY:
+		return btf_dump_order_type(d, btf_array(t)->type, through_ptr);
 
-		return btf_dump_order_type(d, a->type, through_ptr);
-	}
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
-		const struct btf_member *m = (void *)(t + 1);
+		const struct btf_member *m = btf_members(t);
 		/*
 		 * struct/union is part of strong link, only if it's embedded
 		 * (so no ptr in a path) or it's anonymous (so has to be
@@ -396,7 +377,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 
 		tstate->order_state = ORDERING;
 
-		vlen = btf_vlen_of(t);
+		vlen = btf_vlen(t);
 		for (i = 0; i < vlen; i++, m++) {
 			err = btf_dump_order_type(d, m->type, false);
 			if (err < 0)
@@ -447,7 +428,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 		return btf_dump_order_type(d, t->type, through_ptr);
 
 	case BTF_KIND_FUNC_PROTO: {
-		const struct btf_param *p = (void *)(t + 1);
+		const struct btf_param *p = btf_params(t);
 		bool is_strong;
 
 		err = btf_dump_order_type(d, t->type, through_ptr);
@@ -455,7 +436,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr)
 			return err;
 		is_strong = err > 0;
 
-		vlen = btf_vlen_of(t);
+		vlen = btf_vlen(t);
 		for (i = 0; i < vlen; i++, p++) {
 			err = btf_dump_order_type(d, p->type, through_ptr);
 			if (err < 0)
@@ -553,7 +534,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 		return;
 
 	t = btf__type_by_id(d->btf, id);
-	kind = btf_kind_of(t);
+	kind = btf_kind(t);
 
 	if (top_level_def && t->name_off == 0) {
 		pr_warning("unexpected nameless definition, id:[%u]\n", id);
@@ -618,12 +599,9 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 	case BTF_KIND_RESTRICT:
 		btf_dump_emit_type(d, t->type, cont_id);
 		break;
-	case BTF_KIND_ARRAY: {
-		const struct btf_array *a = (void *)(t + 1);
-
-		btf_dump_emit_type(d, a->type, cont_id);
+	case BTF_KIND_ARRAY:
+		btf_dump_emit_type(d, btf_array(t)->type, cont_id);
 		break;
-	}
 	case BTF_KIND_FWD:
 		btf_dump_emit_fwd_def(d, id, t);
 		btf_dump_printf(d, ";\n\n");
@@ -656,8 +634,8 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 		 * applicable
 		 */
 		if (top_level_def || t->name_off == 0) {
-			const struct btf_member *m = (void *)(t + 1);
-			__u16 vlen = btf_vlen_of(t);
+			const struct btf_member *m = btf_members(t);
+			__u16 vlen = btf_vlen(t);
 			int i, new_cont_id;
 
 			new_cont_id = t->name_off == 0 ? cont_id : id;
@@ -678,8 +656,8 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 		}
 		break;
 	case BTF_KIND_FUNC_PROTO: {
-		const struct btf_param *p = (void *)(t + 1);
-		__u16 vlen = btf_vlen_of(t);
+		const struct btf_param *p = btf_params(t);
+		__u16 vlen = btf_vlen(t);
 		int i;
 
 		btf_dump_emit_type(d, t->type, cont_id);
@@ -696,7 +674,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id)
 static int btf_align_of(const struct btf *btf, __u32 id)
 {
 	const struct btf_type *t = btf__type_by_id(btf, id);
-	__u16 kind = btf_kind_of(t);
+	__u16 kind = btf_kind(t);
 
 	switch (kind) {
 	case BTF_KIND_INT:
@@ -709,15 +687,12 @@ static int btf_align_of(const struct btf *btf, __u32 id)
 	case BTF_KIND_CONST:
 	case BTF_KIND_RESTRICT:
 		return btf_align_of(btf, t->type);
-	case BTF_KIND_ARRAY: {
-		const struct btf_array *a = (void *)(t + 1);
-
-		return btf_align_of(btf, a->type);
-	}
+	case BTF_KIND_ARRAY:
+		return btf_align_of(btf, btf_array(t)->type);
 	case BTF_KIND_STRUCT:
 	case BTF_KIND_UNION: {
-		const struct btf_member *m = (void *)(t + 1);
-		__u16 vlen = btf_vlen_of(t);
+		const struct btf_member *m = btf_members(t);
+		__u16 vlen = btf_vlen(t);
 		int i, align = 1;
 
 		for (i = 0; i < vlen; i++, m++)
@@ -726,7 +701,7 @@ static int btf_align_of(const struct btf *btf, __u32 id)
 		return align;
 	}
 	default:
-		pr_warning("unsupported BTF_KIND:%u\n", btf_kind_of(t));
+		pr_warning("unsupported BTF_KIND:%u\n", btf_kind(t));
 		return 1;
 	}
 }
@@ -737,20 +712,18 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id,
 	const struct btf_member *m;
 	int align, i, bit_sz;
 	__u16 vlen;
-	bool kflag;
 
 	align = btf_align_of(btf, id);
 	/* size of a non-packed struct has to be a multiple of its alignment*/
 	if (t->size % align)
 		return true;
 
-	m = (void *)(t + 1);
-	kflag = btf_kflag_of(t);
-	vlen = btf_vlen_of(t);
+	m = btf_members(t);
+	vlen = btf_vlen(t);
 	/* all non-bitfield fields have to be naturally aligned */
 	for (i = 0; i < vlen; i++, m++) {
 		align = btf_align_of(btf, m->type);
-		bit_sz = kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0;
+		bit_sz = btf_member_bitfield_size(t, i);
 		if (bit_sz == 0 && m->offset % (8 * align) != 0)
 			return true;
 	}
@@ -807,7 +780,7 @@ static void btf_dump_emit_struct_fwd(struct btf_dump *d, __u32 id,
 				     const struct btf_type *t)
 {
 	btf_dump_printf(d, "%s %s",
-			btf_kind_of(t) == BTF_KIND_STRUCT ? "struct" : "union",
+			btf_is_struct(t) ? "struct" : "union",
 			btf_dump_type_name(d, id));
 }
 
@@ -816,12 +789,11 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 				     const struct btf_type *t,
 				     int lvl)
 {
-	const struct btf_member *m = (void *)(t + 1);
-	bool kflag = btf_kflag_of(t), is_struct;
+	const struct btf_member *m = btf_members(t);
+	bool is_struct = btf_is_struct(t);
 	int align, i, packed, off = 0;
-	__u16 vlen = btf_vlen_of(t);
+	__u16 vlen = btf_vlen(t);
 
-	is_struct = btf_kind_of(t) == BTF_KIND_STRUCT;
 	packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0;
 	align = packed ? 1 : btf_align_of(d->btf, id);
 
@@ -835,8 +807,8 @@ static void btf_dump_emit_struct_def(struct btf_dump *d,
 		int m_off, m_sz;
 
 		fname = btf_name_of(d, m->name_off);
-		m_sz = kflag ? BTF_MEMBER_BITFIELD_SIZE(m->offset) : 0;
-		m_off = kflag ? BTF_MEMBER_BIT_OFFSET(m->offset) : m->offset;
+		m_sz = btf_member_bitfield_size(t, i);
+		m_off = btf_member_bit_offset(t, i);
 		align = packed ? 1 : btf_align_of(d->btf, m->type);
 
 		btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1);
@@ -871,7 +843,7 @@ static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id,
 				   int lvl)
 {
 	const struct btf_enum *v = (void *)(t+1);
-	__u16 vlen = btf_vlen_of(t);
+	__u16 vlen = btf_vlen(t);
 	const char *name;
 	size_t dup_cnt;
 	int i;
@@ -905,7 +877,7 @@ static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id,
 {
 	const char *name = btf_dump_type_name(d, id);
 
-	if (btf_kflag_of(t))
+	if (btf_kflag(t))
 		btf_dump_printf(d, "union %s", name);
 	else
 		btf_dump_printf(d, "struct %s", name);
@@ -987,7 +959,6 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id,
 	struct id_stack decl_stack;
 	const struct btf_type *t;
 	int err, stack_start;
-	__u16 kind;
 
 	stack_start = d->decl_stack_cnt;
 	for (;;) {
@@ -1008,8 +979,7 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id,
 			break;
 
 		t = btf__type_by_id(d->btf, id);
-		kind = btf_kind_of(t);
-		switch (kind) {
+		switch (btf_kind(t)) {
 		case BTF_KIND_PTR:
 		case BTF_KIND_VOLATILE:
 		case BTF_KIND_CONST:
@@ -1017,12 +987,9 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id,
 		case BTF_KIND_FUNC_PROTO:
 			id = t->type;
 			break;
-		case BTF_KIND_ARRAY: {
-			const struct btf_array *a = (void *)(t + 1);
-
-			id = a->type;
+		case BTF_KIND_ARRAY:
+			id = btf_array(t)->type;
 			break;
-		}
 		case BTF_KIND_INT:
 		case BTF_KIND_ENUM:
 		case BTF_KIND_FWD:
@@ -1032,7 +999,7 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id,
 			goto done;
 		default:
 			pr_warning("unexpected type in decl chain, kind:%u, id:[%u]\n",
-				   kind, id);
+				   btf_kind(t), id);
 			goto done;
 		}
 	}
@@ -1070,7 +1037,7 @@ static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack)
 		id = decl_stack->ids[decl_stack->cnt - 1];
 		t = btf__type_by_id(d->btf, id);
 
-		switch (btf_kind_of(t)) {
+		switch (btf_kind(t)) {
 		case BTF_KIND_VOLATILE:
 			btf_dump_printf(d, "volatile ");
 			break;
@@ -1087,20 +1054,6 @@ static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack)
 	}
 }
 
-static bool btf_is_mod_kind(const struct btf *btf, __u32 id)
-{
-	const struct btf_type *t = btf__type_by_id(btf, id);
-
-	switch (btf_kind_of(t)) {
-	case BTF_KIND_VOLATILE:
-	case BTF_KIND_CONST:
-	case BTF_KIND_RESTRICT:
-		return true;
-	default:
-		return false;
-	}
-}
-
 static void btf_dump_emit_name(const struct btf_dump *d,
 			       const char *name, bool last_was_ptr)
 {
@@ -1139,7 +1092,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 		}
 
 		t = btf__type_by_id(d->btf, id);
-		kind = btf_kind_of(t);
+		kind = btf_kind(t);
 
 		switch (kind) {
 		case BTF_KIND_INT:
@@ -1185,7 +1138,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 			btf_dump_printf(d, " restrict");
 			break;
 		case BTF_KIND_ARRAY: {
-			const struct btf_array *a = (void *)(t + 1);
+			const struct btf_array *a = btf_array(t);
 			const struct btf_type *next_t;
 			__u32 next_id;
 			bool multidim;
@@ -1201,7 +1154,8 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 			 */
 			while (decls->cnt) {
 				next_id = decls->ids[decls->cnt - 1];
-				if (btf_is_mod_kind(d->btf, next_id))
+				next_t = btf__type_by_id(d->btf, next_id);
+				if (btf_is_mod(next_t))
 					decls->cnt--;
 				else
 					break;
@@ -1214,7 +1168,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 			}
 
 			next_t = btf__type_by_id(d->btf, next_id);
-			multidim = btf_kind_of(next_t) == BTF_KIND_ARRAY;
+			multidim = btf_is_array(next_t);
 			/* we need space if we have named non-pointer */
 			if (fname[0] && !last_was_ptr)
 				btf_dump_printf(d, " ");
@@ -1228,8 +1182,8 @@ static void btf_dump_emit_type_chain(struct btf_dump *d,
 			return;
 		}
 		case BTF_KIND_FUNC_PROTO: {
-			const struct btf_param *p = (void *)(t + 1);
-			__u16 vlen = btf_vlen_of(t);
+			const struct btf_param *p = btf_params(t);
+			__u16 vlen = btf_vlen(t);
 			int i;
 
 			btf_dump_emit_mods(d, decls);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ead915aec349..fafcc75b33b1 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1049,9 +1049,9 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf,
 	const struct btf_array *arr_info;
 	const struct btf_type *arr_t;
 
-	if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) {
+	if (!btf_is_ptr(t)) {
 		pr_warning("map '%s': attr '%s': expected PTR, got %u.\n",
-			   map_name, name, BTF_INFO_KIND(t->info));
+			   map_name, name, btf_kind(t));
 		return false;
 	}
 
@@ -1061,12 +1061,12 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf,
 			   map_name, name, t->type);
 		return false;
 	}
-	if (BTF_INFO_KIND(arr_t->info) != BTF_KIND_ARRAY) {
+	if (!btf_is_array(arr_t)) {
 		pr_warning("map '%s': attr '%s': expected ARRAY, got %u.\n",
-			   map_name, name, BTF_INFO_KIND(arr_t->info));
+			   map_name, name, btf_kind(arr_t));
 		return false;
 	}
-	arr_info = (const void *)(arr_t + 1);
+	arr_info = btf_array(arr_t);
 	*res = arr_info->nelems;
 	return true;
 }
@@ -1084,11 +1084,11 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 	struct bpf_map *map;
 	int vlen, i;
 
-	vi = (const struct btf_var_secinfo *)(const void *)(sec + 1) + var_idx;
+	vi = btf_var_secinfos(sec) + var_idx;
 	var = btf__type_by_id(obj->btf, vi->type);
-	var_extra = (const void *)(var + 1);
+	var_extra = btf_var(var);
 	map_name = btf__name_by_offset(obj->btf, var->name_off);
-	vlen = BTF_INFO_VLEN(var->info);
+	vlen = btf_vlen(var);
 
 	if (map_name == NULL || map_name[0] == '\0') {
 		pr_warning("map #%d: empty name.\n", var_idx);
@@ -1098,9 +1098,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 		pr_warning("map '%s' BTF data is corrupted.\n", map_name);
 		return -EINVAL;
 	}
-	if (BTF_INFO_KIND(var->info) != BTF_KIND_VAR) {
+	if (!btf_is_var(var)) {
 		pr_warning("map '%s': unexpected var kind %u.\n",
-			   map_name, BTF_INFO_KIND(var->info));
+			   map_name, btf_kind(var));
 		return -EINVAL;
 	}
 	if (var_extra->linkage != BTF_VAR_GLOBAL_ALLOCATED &&
@@ -1111,9 +1111,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 	}
 
 	def = skip_mods_and_typedefs(obj->btf, var->type);
-	if (BTF_INFO_KIND(def->info) != BTF_KIND_STRUCT) {
+	if (!btf_is_struct(def)) {
 		pr_warning("map '%s': unexpected def kind %u.\n",
-			   map_name, BTF_INFO_KIND(var->info));
+			   map_name, btf_kind(var));
 		return -EINVAL;
 	}
 	if (def->size > vi->size) {
@@ -1136,8 +1136,8 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 	pr_debug("map '%s': at sec_idx %d, offset %zu.\n",
 		 map_name, map->sec_idx, map->sec_offset);
 
-	vlen = BTF_INFO_VLEN(def->info);
-	m = (const void *)(def + 1);
+	vlen = btf_vlen(def);
+	m = btf_members(def);
 	for (i = 0; i < vlen; i++, m++) {
 		const char *name = btf__name_by_offset(obj->btf, m->name_off);
 
@@ -1187,9 +1187,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 					   map_name, m->type);
 				return -EINVAL;
 			}
-			if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) {
+			if (!btf_is_ptr(t)) {
 				pr_warning("map '%s': key spec is not PTR: %u.\n",
-					   map_name, BTF_INFO_KIND(t->info));
+					   map_name, btf_kind(t));
 				return -EINVAL;
 			}
 			sz = btf__resolve_size(obj->btf, t->type);
@@ -1230,9 +1230,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj,
 					   map_name, m->type);
 				return -EINVAL;
 			}
-			if (BTF_INFO_KIND(t->info) != BTF_KIND_PTR) {
+			if (!btf_is_ptr(t)) {
 				pr_warning("map '%s': value spec is not PTR: %u.\n",
-					   map_name, BTF_INFO_KIND(t->info));
+					   map_name, btf_kind(t));
 				return -EINVAL;
 			}
 			sz = btf__resolve_size(obj->btf, t->type);
@@ -1293,7 +1293,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict)
 	nr_types = btf__get_nr_types(obj->btf);
 	for (i = 1; i <= nr_types; i++) {
 		t = btf__type_by_id(obj->btf, i);
-		if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC)
+		if (!btf_is_datasec(t))
 			continue;
 		name = btf__name_by_offset(obj->btf, t->name_off);
 		if (strcmp(name, MAPS_ELF_SEC) == 0) {
@@ -1307,7 +1307,7 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict)
 		return -ENOENT;
 	}
 
-	vlen = BTF_INFO_VLEN(sec->info);
+	vlen = btf_vlen(sec);
 	for (i = 0; i < vlen; i++) {
 		err = bpf_object__init_user_btf_map(obj, sec, i,
 						    obj->efile.btf_maps_shndx,
@@ -1368,24 +1368,22 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj)
 	struct btf *btf = obj->btf;
 	struct btf_type *t;
 	int i, j, vlen;
-	__u16 kind;
 
 	if (!obj->btf || (has_func && has_datasec))
 		return;
 
 	for (i = 1; i <= btf__get_nr_types(btf); i++) {
 		t = (struct btf_type *)btf__type_by_id(btf, i);
-		kind = BTF_INFO_KIND(t->info);
 
-		if (!has_datasec && kind == BTF_KIND_VAR) {
+		if (!has_datasec && btf_is_var(t)) {
 			/* replace VAR with INT */
 			t->info = BTF_INFO_ENC(BTF_KIND_INT, 0, 0);
 			t->size = sizeof(int);
-			*(int *)(t+1) = BTF_INT_ENC(0, 0, 32);
-		} else if (!has_datasec && kind == BTF_KIND_DATASEC) {
+			*(int *)(t + 1) = BTF_INT_ENC(0, 0, 32);
+		} else if (!has_datasec && btf_is_datasec(t)) {
 			/* replace DATASEC with STRUCT */
-			struct btf_var_secinfo *v = (void *)(t + 1);
-			struct btf_member *m = (void *)(t + 1);
+			const struct btf_var_secinfo *v = btf_var_secinfos(t);
+			struct btf_member *m = (void *)btf_members(t);
 			struct btf_type *vt;
 			char *name;
 
@@ -1396,7 +1394,7 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj)
 				name++;
 			}
 
-			vlen = BTF_INFO_VLEN(t->info);
+			vlen = btf_vlen(t);
 			t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, vlen);
 			for (j = 0; j < vlen; j++, v++, m++) {
 				/* order of field assignments is important */
@@ -1406,12 +1404,12 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj)
 				vt = (void *)btf__type_by_id(btf, v->type);
 				m->name_off = vt->name_off;
 			}
-		} else if (!has_func && kind == BTF_KIND_FUNC_PROTO) {
+		} else if (!has_func && btf_is_func_proto(t)) {
 			/* replace FUNC_PROTO with ENUM */
-			vlen = BTF_INFO_VLEN(t->info);
+			vlen = btf_vlen(t);
 			t->info = BTF_INFO_ENC(BTF_KIND_ENUM, 0, vlen);
 			t->size = sizeof(__u32); /* kernel enforced */
-		} else if (!has_func && kind == BTF_KIND_FUNC) {
+		} else if (!has_func && btf_is_func(t)) {
 			/* replace FUNC with TYPEDEF */
 			t->info = BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0);
 		}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 bpf-next 10/14] selftests/bpf: add CO-RE relocs enum/ptr/func_proto tests
From: Andrii Nakryiko @ 2019-08-07  5:38 UTC (permalink / raw)
  To: bpf, netdev, ast, daniel, yhs
  Cc: andrii.nakryiko, kernel-team, Andrii Nakryiko
In-Reply-To: <20190807053806.1534571-1-andriin@fb.com>

Test CO-RE relocation handling of ints, enums, pointers, func protos, etc.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
---
 .../selftests/bpf/prog_tests/core_reloc.c     | 36 ++++++++++
 .../bpf/progs/btf__core_reloc_primitives.c    |  3 +
 ...f__core_reloc_primitives___diff_enum_def.c |  3 +
 ..._core_reloc_primitives___diff_func_proto.c |  3 +
 ...f__core_reloc_primitives___diff_ptr_type.c |  3 +
 ...tf__core_reloc_primitives___err_non_enum.c |  3 +
 ...btf__core_reloc_primitives___err_non_int.c |  3 +
 ...btf__core_reloc_primitives___err_non_ptr.c |  3 +
 .../selftests/bpf/progs/core_reloc_types.h    | 67 +++++++++++++++++++
 .../bpf/progs/test_core_reloc_primitives.c    | 43 ++++++++++++
 10 files changed, 167 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c
 create mode 100644 tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c

diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
index 30e2f5c5d3b5..c924c8ae67af 100644
--- a/tools/testing/selftests/bpf/prog_tests/core_reloc.c
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -81,6 +81,32 @@
 	.fails = true,							\
 }
 
+#define PRIMITIVES_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) {	\
+	.a = 1,								\
+	.b = 2,								\
+	.c = 3,								\
+	.d = (void *)4,							\
+	.f = (void *)5,							\
+}
+
+#define PRIMITIVES_CASE_COMMON(name)					\
+	.case_name = #name,						\
+	.bpf_obj_file = "test_core_reloc_primitives.o",			\
+	.btf_src_file = "btf__core_reloc_" #name ".o"
+
+#define PRIMITIVES_CASE(name) {						\
+	PRIMITIVES_CASE_COMMON(name),					\
+	.input = PRIMITIVES_DATA(core_reloc_##name),			\
+	.input_len = sizeof(struct core_reloc_##name),			\
+	.output = PRIMITIVES_DATA(core_reloc_primitives),		\
+	.output_len = sizeof(struct core_reloc_primitives),		\
+}
+
+#define PRIMITIVES_ERR_CASE(name) {					\
+	PRIMITIVES_CASE_COMMON(name),					\
+	.fails = true,							\
+}
+
 struct core_reloc_test_case {
 	const char *case_name;
 	const char *bpf_obj_file;
@@ -137,6 +163,16 @@ static struct core_reloc_test_case test_cases[] = {
 	ARRAYS_ERR_CASE(arrays___err_non_array),
 	ARRAYS_ERR_CASE(arrays___err_wrong_val_type1),
 	ARRAYS_ERR_CASE(arrays___err_wrong_val_type2),
+
+	/* enum/ptr/int handling scenarios */
+	PRIMITIVES_CASE(primitives),
+	PRIMITIVES_CASE(primitives___diff_enum_def),
+	PRIMITIVES_CASE(primitives___diff_func_proto),
+	PRIMITIVES_CASE(primitives___diff_ptr_type),
+
+	PRIMITIVES_ERR_CASE(primitives___err_non_enum),
+	PRIMITIVES_ERR_CASE(primitives___err_non_int),
+	PRIMITIVES_ERR_CASE(primitives___err_non_ptr),
 };
 
 struct data {
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c
new file mode 100644
index 000000000000..96b90e39242a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c
new file mode 100644
index 000000000000..6e87233a3ed0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_enum_def x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c
new file mode 100644
index 000000000000..d9f48e80b9d9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_func_proto x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c
new file mode 100644
index 000000000000..c718f75f8f3b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_ptr_type x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c
new file mode 100644
index 000000000000..b8a120830891
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_enum x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c
new file mode 100644
index 000000000000..ad8b3c9aa76f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_int x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c
new file mode 100644
index 000000000000..e20bc1d42d0a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_ptr x) {}
diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h
index 45de7986ea2e..7526a5f5755b 100644
--- a/tools/testing/selftests/bpf/progs/core_reloc_types.h
+++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h
@@ -387,3 +387,70 @@ struct core_reloc_arrays___err_wrong_val_type2 {
 	int c[3]; /* value is not a struct */
 	struct core_reloc_arrays_substruct d[1][2];
 };
+
+/*
+ * PRIMITIVES
+ */
+enum core_reloc_primitives_enum {
+	A = 0,
+	B = 1,
+};
+
+struct core_reloc_primitives {
+	char a;
+	int b;
+	enum core_reloc_primitives_enum c;
+	void *d;
+	int (*f)(const char *);
+};
+
+struct core_reloc_primitives___diff_enum_def {
+	char a;
+	int b;
+	void *d;
+	int (*f)(const char *);
+	enum {
+		X = 100,
+		Y = 200,
+	} c; /* inline enum def with differing set of values */
+};
+
+struct core_reloc_primitives___diff_func_proto {
+	void (*f)(int); /* incompatible function prototype */
+	void *d;
+	enum core_reloc_primitives_enum c;
+	int b;
+	char a;
+};
+
+struct core_reloc_primitives___diff_ptr_type {
+	const char * const d; /* different pointee type + modifiers */
+	char a;
+	int b;
+	enum core_reloc_primitives_enum c;
+	int (*f)(const char *);
+};
+
+struct core_reloc_primitives___err_non_enum {
+	char a[1];
+	int b;
+	int c; /* int instead of enum */
+	void *d;
+	int (*f)(const char *);
+};
+
+struct core_reloc_primitives___err_non_int {
+	char a[1];
+	int *b; /* ptr instead of int */
+	enum core_reloc_primitives_enum c;
+	void *d;
+	int (*f)(const char *);
+};
+
+struct core_reloc_primitives___err_non_ptr {
+	char a[1];
+	int b;
+	enum core_reloc_primitives_enum c;
+	int d; /* int instead of ptr */
+	int (*f)(const char *);
+};
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c
new file mode 100644
index 000000000000..add52f23ab35
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+static volatile struct data {
+	char in[256];
+	char out[256];
+} data;
+
+enum core_reloc_primitives_enum {
+	A = 0,
+	B = 1,
+};
+
+struct core_reloc_primitives {
+	char a;
+	int b;
+	enum core_reloc_primitives_enum c;
+	void *d;
+	int (*f)(const char *);
+};
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_primitives(void *ctx)
+{
+	struct core_reloc_primitives *in = (void *)&data.in;
+	struct core_reloc_primitives *out = (void *)&data.out;
+
+	if (BPF_CORE_READ(&out->a, &in->a) ||
+	    BPF_CORE_READ(&out->b, &in->b) ||
+	    BPF_CORE_READ(&out->c, &in->c) ||
+	    BPF_CORE_READ(&out->d, &in->d) ||
+	    BPF_CORE_READ(&out->f, &in->f))
+		return 1;
+
+	return 0;
+}
+
-- 
2.17.1


^ permalink raw reply related

* [PATCH net v2] net/tls: prevent skb_orphan() from leaking TLS plain text with offload
From: Jakub Kicinski @ 2019-08-07  6:06 UTC (permalink / raw)
  To: davem
  Cc: netdev, davejwatson, borisp, aviadye, john.fastabend, daniel,
	willemb, edumazet, alexei.starovoitov, oss-drivers,
	Jakub Kicinski

sk_validate_xmit_skb() and drivers depend on the sk member of
struct sk_buff to identify segments requiring encryption.
Any operation which removes or does not preserve the original TLS
socket such as skb_orphan() or skb_clone() will cause clear text
leaks.

Make the TCP socket underlying an offloaded TLS connection
mark all skbs as decrypted, if TLS TX is in offload mode.
Then in sk_validate_xmit_skb() catch skbs which have no socket
(or a socket with no validation) and decrypted flag set.

Note that CONFIG_SOCK_VALIDATE_XMIT, CONFIG_TLS_DEVICE and
sk->sk_validate_xmit_skb are slightly interchangeable right now,
they all imply TLS offload. The new checks are guarded by
CONFIG_TLS_DEVICE because that's the option guarding the
sk_buff->decrypted member.

Second, smaller issue with orphaning is that it breaks
the guarantee that packets will be delivered to device
queues in-order. All TLS offload drivers depend on that
scheduling property. This means skb_orphan_partial()'s
trick of preserving partial socket references will cause
issues in the drivers. We need a full orphan, and as a
result netem delay/throttling will cause all TLS offload
skbs to be dropped.

Reusing the sk_buff->decrypted flag also protects from
leaking clear text when incoming, decrypted skb is redirected
(e.g. by TC).

See commit 0608c69c9a80 ("bpf: sk_msg, sock{map|hash} redirect
through ULP") for justification why the internal flag is safe.

v2:
 - remove superfluous decrypted mark copy (Willem);
 - remove the stale doc entry (Boris);
 - rely entirely on EOR marking to prevent coalescing (Boris);
 - use an internal sendpages flag instead of marking the socket
   (Boris).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 Documentation/networking/tls-offload.rst | 18 ------------------
 include/linux/skbuff.h                   |  8 ++++++++
 include/linux/socket.h                   |  3 +++
 include/net/sock.h                       | 10 +++++++++-
 net/core/sock.c                          | 20 +++++++++++++++-----
 net/ipv4/tcp.c                           |  3 +++
 net/ipv4/tcp_output.c                    |  3 +++
 net/tls/tls_device.c                     |  9 +++++++--
 8 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
index b70b70dc4524..0dd3f748239f 100644
--- a/Documentation/networking/tls-offload.rst
+++ b/Documentation/networking/tls-offload.rst
@@ -506,21 +506,3 @@ Drivers should ignore the changes to TLS the device feature flags.
 These flags will be acted upon accordingly by the core ``ktls`` code.
 TLS device feature flags only control adding of new TLS connection
 offloads, old connections will remain active after flags are cleared.
-
-Known bugs
-==========
-
-skb_orphan() leaks clear text
------------------------------
-
-Currently drivers depend on the :c:member:`sk` member of
-:c:type:`struct sk_buff <sk_buff>` to identify segments requiring
-encryption. Any operation which removes or does not preserve the socket
-association such as :c:func:`skb_orphan` or :c:func:`skb_clone`
-will cause the driver to miss the packets and lead to clear text leaks.
-
-Redirects leak clear text
--------------------------
-
-In the RX direction, if segment has already been decrypted by the device
-and it gets redirected or mirrored - clear text will be transmitted out.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d8af86d995d6..ba5583522d24 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1374,6 +1374,14 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 	to->l4_hash = from->l4_hash;
 };
 
+static inline void skb_copy_decrypted(struct sk_buff *to,
+				      const struct sk_buff *from)
+{
+#ifdef CONFIG_TLS_DEVICE
+	to->decrypted = from->decrypted;
+#endif
+}
+
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
 static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
 {
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 97523818cb14..fc0bed59fc84 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -292,6 +292,9 @@ struct ucred {
 #define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
 #define MSG_EOF         MSG_FIN
 #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */
+#define MSG_SENDPAGE_DECRYPTED	0x100000 /* sendpage() internal : page may carry
+					  * plain text and require encryption
+					  */
 
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
diff --git a/include/net/sock.h b/include/net/sock.h
index 228db3998e46..2c53f1a1d905 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2482,6 +2482,7 @@ static inline bool sk_fullsock(const struct sock *sk)
 
 /* Checks if this SKB belongs to an HW offloaded socket
  * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
  */
 static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 						   struct net_device *dev)
@@ -2489,8 +2490,15 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 #ifdef CONFIG_SOCK_VALIDATE_XMIT
 	struct sock *sk = skb->sk;
 
-	if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb)
+	if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
 		skb = sk->sk_validate_xmit_skb(sk, dev, skb);
+#ifdef CONFIG_TLS_DEVICE
+	} else if (unlikely(skb->decrypted)) {
+		pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+		kfree_skb(skb);
+		skb = NULL;
+#endif
+	}
 #endif
 
 	return skb;
diff --git a/net/core/sock.c b/net/core/sock.c
index d57b0cc995a0..0f9619b0892f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1992,6 +1992,20 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 }
 EXPORT_SYMBOL(skb_set_owner_w);
 
+static bool can_skb_orphan_partial(const struct sk_buff *skb)
+{
+#ifdef CONFIG_TLS_DEVICE
+	/* Drivers depend on in-order delivery for crypto offload,
+	 * partial orphan breaks out-of-order-OK logic.
+	 */
+	if (skb->decrypted)
+		return false;
+#endif
+	return (IS_ENABLED(CONFIG_INET) &&
+		skb->destructor == tcp_wfree) ||
+		skb->destructor == sock_wfree;
+}
+
 /* This helper is used by netem, as it can hold packets in its
  * delay queue. We want to allow the owner socket to send more
  * packets, as if they were already TX completed by a typical driver.
@@ -2003,11 +2017,7 @@ void skb_orphan_partial(struct sk_buff *skb)
 	if (skb_is_tcp_pure_ack(skb))
 		return;
 
-	if (skb->destructor == sock_wfree
-#ifdef CONFIG_INET
-	    || skb->destructor == tcp_wfree
-#endif
-		) {
+	if (can_skb_orphan_partial(skb)) {
 		struct sock *sk = skb->sk;
 
 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 776905899ac0..77b485d60b9d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -984,6 +984,9 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			if (!skb)
 				goto wait_for_memory;
 
+#ifdef CONFIG_TLS_DEVICE
+			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
 			skb_entail(sk, skb);
 			copy = size_goal;
 		}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 6e4afc48d7bb..979520e46e33 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1320,6 +1320,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
 	if (!buff)
 		return -ENOMEM; /* We'll just try again later. */
+	skb_copy_decrypted(buff, skb);
 
 	sk->sk_wmem_queued += buff->truesize;
 	sk_mem_charge(sk, buff->truesize);
@@ -1874,6 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
 	if (unlikely(!buff))
 		return -ENOMEM;
+	skb_copy_decrypted(buff, skb);
 
 	sk->sk_wmem_queued += buff->truesize;
 	sk_mem_charge(sk, buff->truesize);
@@ -2143,6 +2145,7 @@ static int tcp_mtu_probe(struct sock *sk)
 	sk_mem_charge(sk, nskb->truesize);
 
 	skb = tcp_send_head(sk);
+	skb_copy_decrypted(nskb, skb);
 
 	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 7c0b2b778703..43922d86e510 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -373,9 +373,9 @@ static int tls_push_data(struct sock *sk,
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_prot_info *prot = &tls_ctx->prot_info;
 	struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
-	int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
 	int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
 	struct tls_record_info *record = ctx->open_record;
+	int tls_push_record_flags;
 	struct page_frag *pfrag;
 	size_t orig_size = size;
 	u32 max_open_record_len;
@@ -390,6 +390,9 @@ static int tls_push_data(struct sock *sk,
 	if (sk->sk_err)
 		return -sk->sk_err;
 
+	flags |= MSG_SENDPAGE_DECRYPTED;
+	tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
+
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 	if (tls_is_partially_sent_record(tls_ctx)) {
 		rc = tls_push_partial_record(sk, tls_ctx, flags);
@@ -576,7 +579,9 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
 		gfp_t sk_allocation = sk->sk_allocation;
 
 		sk->sk_allocation = GFP_ATOMIC;
-		tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL);
+		tls_push_partial_record(sk, ctx,
+					MSG_DONTWAIT | MSG_NOSIGNAL |
+					MSG_SENDPAGE_DECRYPTED);
 		sk->sk_allocation = sk_allocation;
 	}
 }
-- 
2.21.0


^ permalink raw reply related

* Re: [net] ixgbe: fix possible deadlock in ixgbe_service_task()
From: Taehee Yoo @ 2019-08-07  6:08 UTC (permalink / raw)
  To: David Miller; +Cc: jeffrey.t.kirsher, Netdev, nhorman, sassmann, andrewx.bowers
In-Reply-To: <20190806.145104.1044990165298646882.davem@davemloft.net>

On Wed, 7 Aug 2019 at 08:36, David Miller <davem@davemloft.net> wrote:
>

Hi David
Thank you for the review!

> From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> Date: Mon,  5 Aug 2019 13:04:03 -0700
>
> > diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > index cbaf712d6529..3386e752e458 100644
> > --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> > @@ -7898,9 +7898,7 @@ static void ixgbe_service_task(struct work_struct *work)
> >       }
> >       if (ixgbe_check_fw_error(adapter)) {
> >               if (!test_bit(__IXGBE_DOWN, &adapter->state)) {
> > -                     rtnl_lock();
> >                       unregister_netdev(adapter->netdev);
> > -                     rtnl_unlock();
> >               }
>
> Please remove the (now unnecessary) curly braces for this basic block.
>

I will send a v2 patch.
Thank you!

> Thank you.

^ permalink raw reply

* Re: [PATCH bpf 0/2] tools: bpftool: fix pinning error messages
From: Y Song @ 2019-08-07  6:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Alexei Starovoitov, Daniel Borkmann, netdev, bpf, oss-drivers
In-Reply-To: <20190807001923.19483-1-jakub.kicinski@netronome.com>

On Tue, Aug 6, 2019 at 5:20 PM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> Hi!
>
> First make sure we don't use "prog" in error messages because
> the pinning operation could be performed on a map. Second add
> back missing error message if pin syscall failed.
>
> Jakub Kicinski (2):
>   tools: bpftool: fix error message (prog -> object)
>   tools: bpftool: add error message on pin failure

Looks good to me. Ack the whole series.
Acked-by: Yonghong Song <yhs@fb.com>

>
>  tools/bpf/bpftool/common.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
>
> --
> 2.21.0
>

^ permalink raw reply

* Re: [PATCH net] netdevsim: Restore per-network namespace accounting for fib entries
From: Jiri Pirko @ 2019-08-07  6:27 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: David Ahern, davem, netdev, David Ahern
In-Reply-To: <20190806153214.25203a68@cakuba.netronome.com>

Wed, Aug 07, 2019 at 12:32:14AM CEST, jakub.kicinski@netronome.com wrote:
>On Tue,  6 Aug 2019 12:15:17 -0700, David Ahern wrote:
>> From: David Ahern <dsahern@gmail.com>
>> 
>> Prior to the commit in the fixes tag, the resource controller in netdevsim
>> tracked fib entries and rules per network namespace. Restore that behavior.
>> 
>> Fixes: 5fc494225c1e ("netdevsim: create devlink instance per netdevsim instance")
>> Signed-off-by: David Ahern <dsahern@gmail.com>
>
>Thanks.
>
>Let's see what Jiri says, but to me this patch seems to indeed restore
>the original per-namespace accounting when the more natural way forward
>may perhaps be to make nsim only count the fib entries where

I think that:
1) netdevsim is a glorified dummy device for testing kernel api, not for
   configuring per-namespace resource limitation.
2) If the conclusion os to use devlink instead of cgroups for resourse
   limitations, it should be done in a separate code, not in netdevsim.

I would definitelly want to wait what is the result of discussion around 2)
first. But one way or another netdevsim code should not do this, I would
like to cutout the fib limitations from it instead, just to expose the
setup resource limits through debugfs like the rest of the configuration
of netdevsim.


>
>	fib_info->net == devlink_net(devlink)
>
>> -void nsim_fib_destroy(struct nsim_fib_data *data)
>> +int nsim_fib_init(void)
>>  {
>> -	unregister_fib_notifier(&data->fib_nb);
>> -	kfree(data);
>> +	int err;
>> +
>> +	err = register_pernet_subsys(&nsim_fib_net_ops);
>> +	if (err < 0) {
>> +		pr_err("Failed to register pernet subsystem\n");
>> +		goto err_out;
>> +	}
>> +
>> +	err = register_fib_notifier(&nsim_fib_nb, nsim_fib_dump_inconsistent);
>> +	if (err < 0) {
>> +		pr_err("Failed to register fib notifier\n");
>
>		unregister_pernet_subsys(&nsim_fib_net_ops);
>?
>
>> +		goto err_out;
>> +	}
>> +
>> +err_out:
>> +	return err;
>>  }

^ permalink raw reply

* Re: [PATCH ethtool] ethtool: dump nested registers
From: Jiri Pirko @ 2019-08-07  6:34 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: netdev, Vivien Didelot, f.fainelli, andrew, davem, linville,
	cphealy
In-Reply-To: <20190806052002.GD31971@unicorn.suse.cz>

Tue, Aug 06, 2019 at 07:20:02AM CEST, mkubecek@suse.cz wrote:
>On Mon, Aug 05, 2019 at 10:52:16AM -0400, Vivien Didelot wrote:
>> Hi Michal!
>> 
>> On Mon, 5 Aug 2019 10:04:48 +0200, Michal Kubecek <mkubecek@suse.cz> wrote:
>> > On Fri, Aug 02, 2019 at 03:34:54PM -0400, Vivien Didelot wrote:
>> > > Usually kernel drivers set the regs->len value to the same length as
>> > > info->regdump_len, which was used for the allocation. In case where
>> > > regs->len is smaller than the allocated info->regdump_len length,
>> > > we may assume that the dump contains a nested set of registers.
>> > > 
>> > > This becomes handy for kernel drivers to expose registers of an
>> > > underlying network conduit unfortunately not exposed to userspace,
>> > > as found in network switching equipment for example.
>> > > 
>> > > This patch adds support for recursing into the dump operation if there
>> > > is enough room for a nested ethtool_drvinfo structure containing a
>> > > valid driver name, followed by a ethtool_regs structure like this:
>> > > 
>> > >     0      regs->len                        info->regdump_len
>> > >     v              v                                        v
>> > >     +--------------+-----------------+--------------+-- - --+
>> > >     | ethtool_regs | ethtool_drvinfo | ethtool_regs |       |
>> > >     +--------------+-----------------+--------------+-- - --+
>> > > 
>> > > Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>
>> > > ---
>> > 
>> > I'm not sure about this approach. If these additional objects with their
>> > own registers are represented by a network device, we can query their
>> > registers directly. If they are not (which, IIUC, is the case in your
>> > use case), we should use an appropriate interface. AFAIK the CPU ports
>> > are already represented in devlink, shouldn't devlink be also used to
>> > query their registers?
>> 
>> Yet another interface wasn't that much appropriate for DSA, making the
>> stack unnecessarily complex.
>
>AFAICS, there is already devlink support in dsa and CPU ports are
>presented as devlink ports. So it wouldn't be a completely new
>interface.

I agree that since we have cpu devlink-port object, we should use this
object to carry info of it. Not to "abuse" netdevice of front panel
ports. We already have devlink regions for the purpose of binary dumps.
Currently they are per-devlink, but it should be easy to extend them
per-devlink-port.

Similar to the statistics. I think that they should go to devlink-port
object too.


>
>> In fact we are already glueing the statistics of the CPU port into the
>> master's ethtool ops (both physical ports are wired together).
>
>The ethtool statistics (in general) are one big mess, I don't think it's
>an example worth following; rather one showing us what to avoid.
>
>> Adding support for nested registers dump in ethtool makes it simple to
>> (pretty) dump CPU port's registers without too much userspace
>> addition.
>
>It is indeed convenient for pretty print - but very hard to use for any
>automated processing. My point is that CPU port is not represented by
>a network device but it is already represented by a devlink port so that
>it makes much more sense to tie its register dump to that object than to
>add add it to register dump of port's master.
>
>In the future, I would like to transform the ethtool register dump from
>current opaque block of data to an actual dump of registers. It is
>unfortunate that drivers are already mixing information specific to
>a network device with information common for the whole physical device.
>Adding more data which is actually related to a different object would
>only make things worse.
>
>Michal Kubecek

^ permalink raw reply

* Re: [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()
From: Christoph Hellwig @ 2019-08-07  6:34 UTC (permalink / raw)
  To: John Hubbard
  Cc: Christoph Hellwig, john.hubbard, Andrew Morton, Alexander Viro,
	Anna Schumaker, David S . Miller, Dominique Martinet,
	Eric Van Hensbergen, Jason Gunthorpe, Jason Wang, Jens Axboe,
	Latchesar Ionkov, Michael S . Tsirkin, Miklos Szeredi,
	Trond Myklebust, Christoph Hellwig, Matthew Wilcox, linux-mm,
	LKML, ceph-devel, kvm, linux-block, linux-cifs, linux-fsdevel,
	linux-nfs, linux-rdma, netdev, samba-technical, v9fs-developer,
	virtualization
In-Reply-To: <c35aa2bf-c830-9e57-78ca-9ce6fb6cb53b@nvidia.com>

On Mon, Aug 05, 2019 at 03:54:35PM -0700, John Hubbard wrote:
> On 7/23/19 11:17 PM, Christoph Hellwig wrote:
> > On Tue, Jul 23, 2019 at 09:25:06PM -0700, john.hubbard@gmail.com wrote:
> >> * Store, in the iov_iter, a "came from gup (get_user_pages)" parameter.
> >>   Then, use the new iov_iter_get_pages_use_gup() to retrieve it when
> >>   it is time to release the pages. That allows choosing between put_page()
> >>   and put_user_page*().
> >>
> >> * Pass in one more piece of information to bio_release_pages: a "from_gup"
> >>   parameter. Similar use as above.
> >>
> >> * Change the block layer, and several file systems, to use
> >>   put_user_page*().
> > 
> > I think we can do this in a simple and better way.  We have 5 ITER_*
> > types.  Of those ITER_DISCARD as the name suggests never uses pages, so
> > we can skip handling it.  ITER_PIPE is rejected іn the direct I/O path,
> > which leaves us with three.
> > 
> 
> Hi Christoph,
> 
> Are you working on anything like this?

I was hoping I could steer you towards it.  But if you don't want to do
it yourself I'll add it to my ever growing todo list.

> Or on the put_user_bvec() idea?

I have a prototype from two month ago:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/gup-bvec

but that only survived the most basic testing, so it'll need more work,
which I'm not sure when I'll find time for.

^ permalink raw reply

* Re: [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()
From: John Hubbard @ 2019-08-07  6:38 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: john.hubbard, Andrew Morton, Alexander Viro, Anna Schumaker,
	David S . Miller, Dominique Martinet, Eric Van Hensbergen,
	Jason Gunthorpe, Jason Wang, Jens Axboe, Latchesar Ionkov,
	Michael S . Tsirkin, Miklos Szeredi, Trond Myklebust,
	Christoph Hellwig, Matthew Wilcox, linux-mm, LKML, ceph-devel,
	kvm, linux-block, linux-cifs, linux-fsdevel, linux-nfs,
	linux-rdma, netdev, samba-technical, v9fs-developer,
	virtualization
In-Reply-To: <20190807063448.GA6002@infradead.org>

On 8/6/19 11:34 PM, Christoph Hellwig wrote:
> On Mon, Aug 05, 2019 at 03:54:35PM -0700, John Hubbard wrote:
>> On 7/23/19 11:17 PM, Christoph Hellwig wrote:
...
>>> I think we can do this in a simple and better way.  We have 5 ITER_*
>>> types.  Of those ITER_DISCARD as the name suggests never uses pages, so
>>> we can skip handling it.  ITER_PIPE is rejected іn the direct I/O path,
>>> which leaves us with three.
>>>
>>
>> Hi Christoph,
>>
>> Are you working on anything like this?
> 
> I was hoping I could steer you towards it.  But if you don't want to do
> it yourself I'll add it to my ever growing todo list.
> 

Sure, I'm up for this. The bvec-related items are the next logical part
of the gup/dma conversions to work on, and I just wanted to avoid solving the
same problem if you were already in the code.


>> Or on the put_user_bvec() idea?
> 
> I have a prototype from two month ago:
> 
> http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/gup-bvec
> 
> but that only survived the most basic testing, so it'll need more work,
> which I'm not sure when I'll find time for.
> 

I'll take a peek, and probably pester you with a few questions if I get
confused. :)

thanks,
-- 
John Hubbard
NVIDIA

^ permalink raw reply

* Re: [PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
From: Jason Wang @ 2019-08-07  6:49 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: mst, kvm, virtualization, netdev, linux-kernel, linux-mm
In-Reply-To: <20190806120416.GB11627@ziepe.ca>


On 2019/8/6 下午8:04, Jason Gunthorpe wrote:
> On Mon, Aug 05, 2019 at 12:20:45PM +0800, Jason Wang wrote:
>> On 2019/8/2 下午8:46, Jason Gunthorpe wrote:
>>> On Fri, Aug 02, 2019 at 05:40:07PM +0800, Jason Wang wrote:
>>>>> This must be a proper barrier, like a spinlock, mutex, or
>>>>> synchronize_rcu.
>>>> I start with synchronize_rcu() but both you and Michael raise some
>>>> concern.
>>> I've also idly wondered if calling synchronize_rcu() under the various
>>> mm locks is a deadlock situation.
>>
>> Maybe, that's why I suggest to use vhost_work_flush() which is much
>> lightweight can can achieve the same function. It can guarantee all previous
>> work has been processed after vhost_work_flush() return.
> If things are already running in a work, then yes, you can piggyback
> on the existing spinlocks inside the workqueue and be Ok
>
> However, if that work is doing any copy_from_user, then the flush
> becomes dependent on swap and it won't work again...


Yes it do copy_from_user(), so we can't do this.


>
>>>> 1) spinlock: add lots of overhead on datapath, this leads 0 performance
>>>> improvement.
>>> I think the topic here is correctness not performance improvement>
>   
>> But the whole series is to speed up vhost.
> So? Starting with a whole bunch of crazy, possibly broken, locking and
> claiming a performance win is not reasonable.


Yes, I admit this patch is tricky, I'm not going to push this. Will post 
a V3.


>
>> Spinlock is correct but make the whole series meaningless consider it won't
>> bring any performance improvement.
> You can't invent a faster spinlock by opencoding some wild
> scheme. There is nothing special about the usage here, it needs a
> blocking lock, plain and simple.
>
> Jason


Will post V3. Let's see if you are happy with that version.

Thanks



^ permalink raw reply

* [PATCH V3 00/10] Fixes for metadata accelreation
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang

Hi all:

This series try to fix several issues introduced by meta data
accelreation series. Please review.

Changes from V2:
- use seqlck helper to synchronize MMU notifier with vhost worker

Changes from V1:

- try not use RCU to syncrhonize MMU notifier with vhost worker
- set dirty pages after no readers
- return -EAGAIN only when we find the range is overlapped with
  metadata

Jason Wang (9):
  vhost: don't set uaddr for invalid address
  vhost: validate MMU notifier registration
  vhost: fix vhost map leak
  vhost: reset invalidate_count in vhost_set_vring_num_addr()
  vhost: mark dirty pages during map uninit
  vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
  vhost: do not use RCU to synchronize MMU notifier with worker
  vhost: correctly set dirty pages in MMU notifiers callback
  vhost: do not return -EAGAIN for non blocking invalidation too early

Michael S. Tsirkin (1):
  vhost: disable metadata prefetch optimization

 drivers/vhost/vhost.c | 228 +++++++++++++++++++++++++++---------------
 drivers/vhost/vhost.h |  10 +-
 2 files changed, 151 insertions(+), 87 deletions(-)

-- 
2.18.1


^ permalink raw reply

* [PATCH V3 01/10] vhost: disable metadata prefetch optimization
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>

This seems to cause guest and host memory corruption.
Disable for now until we get a better handle on that.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 819296332913..42a8c2a13ab1 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -96,7 +96,7 @@ struct vhost_uaddr {
 };
 
 #if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
-#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
+#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
 #else
 #define VHOST_ARCH_CAN_ACCEL_UACCESS 0
 #endif
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 03/10] vhost: validate MMU notifier registration
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

The return value of mmu_notifier_register() is not checked in
vhost_vring_set_num_addr(). This will cause an out of sync between mm
and MMU notifier thus a double free. To solve this, introduce a
boolean flag to track whether MMU notifier is registered and only do
unregistering when it was true.

Reported-and-tested-by:
syzbot+e58112d71f77113ddb7b@syzkaller.appspotmail.com
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 19 +++++++++++++++----
 drivers/vhost/vhost.h |  1 +
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 488380a581dc..17f6abea192e 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -629,6 +629,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->iov_limit = iov_limit;
 	dev->weight = weight;
 	dev->byte_weight = byte_weight;
+	dev->has_notifier = false;
 	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
@@ -730,6 +731,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 	if (err)
 		goto err_mmu_notifier;
 #endif
+	dev->has_notifier = true;
 
 	return 0;
 
@@ -959,7 +961,11 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	}
 	if (dev->mm) {
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
-		mmu_notifier_unregister(&dev->mmu_notifier, dev->mm);
+		if (dev->has_notifier) {
+			mmu_notifier_unregister(&dev->mmu_notifier,
+						dev->mm);
+			dev->has_notifier = false;
+		}
 #endif
 		mmput(dev->mm);
 	}
@@ -2064,8 +2070,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 	/* Unregister MMU notifer to allow invalidation callback
 	 * can access vq->uaddrs[] without holding a lock.
 	 */
-	if (d->mm)
+	if (d->has_notifier) {
 		mmu_notifier_unregister(&d->mmu_notifier, d->mm);
+		d->has_notifier = false;
+	}
 
 	vhost_uninit_vq_maps(vq);
 #endif
@@ -2085,8 +2093,11 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 	if (r == 0)
 		vhost_setup_vq_uaddr(vq);
 
-	if (d->mm)
-		mmu_notifier_register(&d->mmu_notifier, d->mm);
+	if (d->mm) {
+		r = mmu_notifier_register(&d->mmu_notifier, d->mm);
+		if (!r)
+			d->has_notifier = true;
+	}
 #endif
 
 	mutex_unlock(&vq->mutex);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 42a8c2a13ab1..a9a2a93857d2 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -214,6 +214,7 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
+	bool has_notifier;
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 04/10] vhost: fix vhost map leak
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

We don't free map during vhost_map_unprefetch(). This means it could
be leaked. Fixing by free the map.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 17f6abea192e..2a3154976277 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -302,9 +302,7 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 static void vhost_map_unprefetch(struct vhost_map *map)
 {
 	kfree(map->pages);
-	map->pages = NULL;
-	map->npages = 0;
-	map->addr = NULL;
+	kfree(map);
 }
 
 static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 06/10] vhost: mark dirty pages during map uninit
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

We don't mark dirty pages if the map was teared down outside MMU
notifier. This will lead untracked dirty pages. Fixing by marking
dirty pages during map uninit.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2a7217c33668..c12cdadb0855 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -305,6 +305,18 @@ static void vhost_map_unprefetch(struct vhost_map *map)
 	kfree(map);
 }
 
+static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
+				struct vhost_map *map, int index)
+{
+	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
+	int i;
+
+	if (uaddr->write) {
+		for (i = 0; i < map->npages; i++)
+			set_page_dirty(map->pages[i]);
+	}
+}
+
 static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
 {
 	struct vhost_map *map[VHOST_NUM_ADDRS];
@@ -314,8 +326,10 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
 	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
 		map[i] = rcu_dereference_protected(vq->maps[i],
 				  lockdep_is_held(&vq->mmu_lock));
-		if (map[i])
+		if (map[i]) {
+			vhost_set_map_dirty(vq, map[i], i);
 			rcu_assign_pointer(vq->maps[i], NULL);
+		}
 	}
 	spin_unlock(&vq->mmu_lock);
 
@@ -353,7 +367,6 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 {
 	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
 	struct vhost_map *map;
-	int i;
 
 	if (!vhost_map_range_overlap(uaddr, start, end))
 		return;
@@ -364,10 +377,7 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 	map = rcu_dereference_protected(vq->maps[index],
 					lockdep_is_held(&vq->mmu_lock));
 	if (map) {
-		if (uaddr->write) {
-			for (i = 0; i < map->npages; i++)
-				set_page_dirty(map->pages[i]);
-		}
+		vhost_set_map_dirty(vq, map, index);
 		rcu_assign_pointer(vq->maps[index], NULL);
 	}
 	spin_unlock(&vq->mmu_lock);
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 08/10] vhost: do not use RCU to synchronize MMU notifier with worker
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

We used to use RCU to synchronize MMU notifier with worker. This leads
calling synchronize_rcu() in invalidate_range_start(). But on a busy
system, there would be many factors that may slow down the
synchronize_rcu() which makes it unsuitable to be called in MMU
notifier.

So this patch switches use seqlock counter to track whether or not the
map was used. The counter was increased when vq try to start or finish
uses the map. This means, when it was even, we're sure there's no
readers and MMU notifier is synchronized. When it was odd, it means
there's a reader we need to wait it to be even again then we are
synchronized. Consider the read critical section is pretty small the
synchronization should be done very fast.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 141 ++++++++++++++++++++++++++----------------
 drivers/vhost/vhost.h |   7 ++-
 2 files changed, 90 insertions(+), 58 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index cfc11f9ed9c9..57bfbb60d960 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -324,17 +324,16 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
 
 	spin_lock(&vq->mmu_lock);
 	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		map[i] = rcu_dereference_protected(vq->maps[i],
-				  lockdep_is_held(&vq->mmu_lock));
+		map[i] = vq->maps[i];
 		if (map[i]) {
 			vhost_set_map_dirty(vq, map[i], i);
-			rcu_assign_pointer(vq->maps[i], NULL);
+			vq->maps[i] = NULL;
 		}
 	}
 	spin_unlock(&vq->mmu_lock);
 
-	/* No need for synchronize_rcu() or kfree_rcu() since we are
-	 * serialized with memory accessors (e.g vq mutex held).
+	/* No need for synchronization since we are serialized with
+	 * memory accessors (e.g vq mutex held).
 	 */
 
 	for (i = 0; i < VHOST_NUM_ADDRS; i++)
@@ -362,6 +361,40 @@ static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
 	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
 }
 
+static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
+{
+	write_seqcount_begin(&vq->seq);
+}
+
+static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
+{
+	write_seqcount_end(&vq->seq);
+}
+
+static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
+{
+	unsigned int seq;
+
+	/* Make sure any changes to map was done before checking seq
+	 * counter. Paired with smp_wmb() in write_seqcount_begin().
+	 */
+	smp_mb();
+	seq = raw_read_seqcount(&vq->seq);
+	/* Odd means the map was currently accessed by vhost worker */
+	if (seq & 0x1) {
+		/* When seq changes, we are sure no reader can see
+		 * previous map */
+		while (raw_read_seqcount(&vq->seq) == seq) {
+			if (need_resched())
+				schedule();
+		}
+	}
+	/* Make sure seq counter was checked before map is
+	 * freed. Paired with smp_wmb() in write_seqcount_end().
+	 */
+	smp_mb();
+}
+
 static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 				      int index,
 				      unsigned long start,
@@ -376,16 +409,15 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 	spin_lock(&vq->mmu_lock);
 	++vq->invalidate_count;
 
-	map = rcu_dereference_protected(vq->maps[index],
-					lockdep_is_held(&vq->mmu_lock));
+	map = vq->maps[index];
 	if (map) {
 		vhost_set_map_dirty(vq, map, index);
-		rcu_assign_pointer(vq->maps[index], NULL);
+		vq->maps[index] = NULL;
 	}
 	spin_unlock(&vq->mmu_lock);
 
 	if (map) {
-		synchronize_rcu();
+		vhost_vq_sync_access(vq);
 		vhost_map_unprefetch(map);
 	}
 }
@@ -457,7 +489,7 @@ static void vhost_init_maps(struct vhost_dev *dev)
 	for (i = 0; i < dev->nvqs; ++i) {
 		vq = dev->vqs[i];
 		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			RCU_INIT_POINTER(vq->maps[j], NULL);
+			vq->maps[j] = NULL;
 	}
 }
 #endif
@@ -655,6 +687,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vq->indirect = NULL;
 		vq->heads = NULL;
 		vq->dev = dev;
+		seqcount_init(&vq->seq);
 		mutex_init(&vq->mutex);
 		spin_lock_init(&vq->mmu_lock);
 		vhost_vq_reset(dev, vq);
@@ -921,7 +954,7 @@ static int vhost_map_prefetch(struct vhost_virtqueue *vq,
 	map->npages = npages;
 	map->pages = pages;
 
-	rcu_assign_pointer(vq->maps[index], map);
+	vq->maps[index] = map;
 	/* No need for a synchronize_rcu(). This function should be
 	 * called by dev->worker so we are serialized with all
 	 * readers.
@@ -1216,18 +1249,18 @@ static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			*((__virtio16 *)&used->ring[vq->num]) =
 				cpu_to_vhost16(vq, vq->avail_idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1245,18 +1278,18 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
 	size_t size;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			size = count * sizeof(*head);
 			memcpy(used->ring + idx, head, size);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1272,17 +1305,17 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			used->flags = cpu_to_vhost16(vq, vq->used_flags);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1298,17 +1331,17 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1362,17 +1395,17 @@ static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*idx = avail->idx;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1387,17 +1420,17 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*head = avail->ring[idx & (vq->num - 1)];
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1413,17 +1446,17 @@ static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*flags = avail->flags;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1438,15 +1471,15 @@ static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
 	struct vring_avail *avail;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[VHOST_ADDR_AVAIL]);
+		vhost_vq_access_map_begin(vq);
+		map = vq->maps[VHOST_ADDR_AVAIL];
 		if (likely(map)) {
 			avail = map->addr;
 			*event = (__virtio16)avail->ring[vq->num];
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1461,17 +1494,17 @@ static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
 	struct vring_used *used;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_USED]);
+		map = vq->maps[VHOST_ADDR_USED];
 		if (likely(map)) {
 			used = map->addr;
 			*idx = used->idx;
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1486,17 +1519,17 @@ static inline int vhost_get_desc(struct vhost_virtqueue *vq,
 	struct vring_desc *d;
 
 	if (!vq->iotlb) {
-		rcu_read_lock();
+		vhost_vq_access_map_begin(vq);
 
-		map = rcu_dereference(vq->maps[VHOST_ADDR_DESC]);
+		map = vq->maps[VHOST_ADDR_DESC];
 		if (likely(map)) {
 			d = map->addr;
 			*desc = *(d + idx);
-			rcu_read_unlock();
+			vhost_vq_access_map_end(vq);
 			return 0;
 		}
 
-		rcu_read_unlock();
+		vhost_vq_access_map_end(vq);
 	}
 #endif
 
@@ -1843,13 +1876,11 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 static void vhost_vq_map_prefetch(struct vhost_virtqueue *vq)
 {
-	struct vhost_map __rcu *map;
+	struct vhost_map *map;
 	int i;
 
 	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
-		rcu_read_lock();
-		map = rcu_dereference(vq->maps[i]);
-		rcu_read_unlock();
+		map = vq->maps[i];
 		if (unlikely(!map))
 			vhost_map_prefetch(vq, i);
 	}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a9a2a93857d2..12399e7c7a61 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -115,16 +115,17 @@ struct vhost_virtqueue {
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
 	/* Read by memory accessors, modified by meta data
 	 * prefetching, MMU notifier and vring ioctl().
-	 * Synchonrized through mmu_lock (writers) and RCU (writers
-	 * and readers).
+	 * Synchonrized through mmu_lock (writers) and seqlock
+         * counters,  see vhost_vq_access_map_{begin|end}().
 	 */
-	struct vhost_map __rcu *maps[VHOST_NUM_ADDRS];
+	struct vhost_map *maps[VHOST_NUM_ADDRS];
 	/* Read by MMU notifier, modified by vring ioctl(),
 	 * synchronized through MMU notifier
 	 * registering/unregistering.
 	 */
 	struct vhost_uaddr uaddrs[VHOST_NUM_ADDRS];
 #endif
+	seqcount_t seq;
 	const struct vhost_umem_node *meta_iotlb[VHOST_NUM_ADDRS];
 
 	struct file *kick;
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 09/10] vhost: correctly set dirty pages in MMU notifiers callback
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

We need make sure there's no reference on the map before trying to
mark set dirty pages.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 57bfbb60d960..6650a3ff88c1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -410,14 +410,13 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 	++vq->invalidate_count;
 
 	map = vq->maps[index];
-	if (map) {
-		vhost_set_map_dirty(vq, map, index);
+	if (map)
 		vq->maps[index] = NULL;
-	}
 	spin_unlock(&vq->mmu_lock);
 
 	if (map) {
 		vhost_vq_sync_access(vq);
+		vhost_set_map_dirty(vq, map, index);
 		vhost_map_unprefetch(map);
 	}
 }
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 10/10] vhost: do not return -EAGAIN for non blocking invalidation too early
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

Instead of returning -EAGAIN unconditionally, we'd better do that only
we're sure the range is overlapped with the metadata area.

Reported-by: Jason Gunthorpe <jgg@ziepe.ca>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6650a3ff88c1..0271f853fa9c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -395,16 +395,19 @@ static void inline vhost_vq_sync_access(struct vhost_virtqueue *vq)
 	smp_mb();
 }
 
-static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
-				      int index,
-				      unsigned long start,
-				      unsigned long end)
+static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
+				     int index,
+				     unsigned long start,
+				     unsigned long end,
+				     bool blockable)
 {
 	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
 	struct vhost_map *map;
 
 	if (!vhost_map_range_overlap(uaddr, start, end))
-		return;
+		return 0;
+	else if (!blockable)
+		return -EAGAIN;
 
 	spin_lock(&vq->mmu_lock);
 	++vq->invalidate_count;
@@ -419,6 +422,8 @@ static void vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
 		vhost_set_map_dirty(vq, map, index);
 		vhost_map_unprefetch(map);
 	}
+
+	return 0;
 }
 
 static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
@@ -439,18 +444,19 @@ static int vhost_invalidate_range_start(struct mmu_notifier *mn,
 {
 	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
 					     mmu_notifier);
-	int i, j;
-
-	if (!mmu_notifier_range_blockable(range))
-		return -EAGAIN;
+	bool blockable = mmu_notifier_range_blockable(range);
+	int i, j, ret;
 
 	for (i = 0; i < dev->nvqs; i++) {
 		struct vhost_virtqueue *vq = dev->vqs[i];
 
-		for (j = 0; j < VHOST_NUM_ADDRS; j++)
-			vhost_invalidate_vq_start(vq, j,
-						  range->start,
-						  range->end);
+		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
+			ret = vhost_invalidate_vq_start(vq, j,
+							range->start,
+							range->end, blockable);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 07/10] vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

There's no need for RCU synchronization in vhost_uninit_vq_maps()
since we've already serialized with readers (memory accessors). This
also avoid the possible userspace DOS through ioctl() because of the
possible high latency caused by synchronize_rcu().

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c12cdadb0855..cfc11f9ed9c9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -333,7 +333,9 @@ static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
 	}
 	spin_unlock(&vq->mmu_lock);
 
-	synchronize_rcu();
+	/* No need for synchronize_rcu() or kfree_rcu() since we are
+	 * serialized with memory accessors (e.g vq mutex held).
+	 */
 
 	for (i = 0; i < VHOST_NUM_ADDRS; i++)
 		if (map[i])
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 05/10] vhost: reset invalidate_count in vhost_set_vring_num_addr()
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

The vhost_set_vring_num_addr() could be called in the middle of
invalidate_range_start() and invalidate_range_end(). If we don't reset
invalidate_count after the un-registering of MMU notifier, the
invalidate_cont will run out of sync (e.g never reach zero). This will
in fact disable the fast accessor path. Fixing by reset the count to
zero.

Reported-by: Michael S. Tsirkin <mst@redhat.com>
Reported-by: Jason Gunthorpe <jgg@mellanox.com>
Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2a3154976277..2a7217c33668 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2073,6 +2073,10 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 		d->has_notifier = false;
 	}
 
+	/* reset invalidate_count in case we are in the middle of
+	 * invalidate_start() and invalidate_end().
+	 */
+	vq->invalidate_count = 0;
 	vhost_uninit_vq_maps(vq);
 #endif
 
-- 
2.18.1


^ permalink raw reply related

* [PATCH V3 02/10] vhost: don't set uaddr for invalid address
From: Jason Wang @ 2019-08-07  6:54 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807065449.23373-1-jasowang@redhat.com>

We should not setup uaddr for the invalid address, otherwise we may
try to pin or prefetch mapping of wrong pages.

Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0536f8526359..488380a581dc 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2082,7 +2082,8 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 	}
 
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_setup_vq_uaddr(vq);
+	if (r == 0)
+		vhost_setup_vq_uaddr(vq);
 
 	if (d->mm)
 		mmu_notifier_register(&d->mmu_notifier, d->mm);
-- 
2.18.1


^ permalink raw reply related

* RE: [PATCH net] hv_netvsc: Fix a warning of suspicious RCU usage
From: Dexuan Cui @ 2019-08-07  6:56 UTC (permalink / raw)
  To: Jakub Kicinski, Stephen Hemminger
  Cc: netdev@vger.kernel.org, David S. Miller, Haiyang Zhang,
	sashal@kernel.org, KY Srinivasan, Michael Kelley,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	olaf@aepfle.de, apw@canonical.com, jasowang@redhat.com, vkuznets,
	marcelo.cerri@canonical.com
In-Reply-To: <20190806121242.141c2324@cakuba.netronome.com>

> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> Sent: Tuesday, August 6, 2019 12:13 PM
> To: Dexuan Cui <decui@microsoft.com>
> 
> On Tue, 6 Aug 2019 05:17:44 +0000, Dexuan Cui wrote:
> > This fixes a warning of "suspicious rcu_dereference_check() usage"
> > when nload runs.
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > Signed-off-by: Dexuan Cui <decui@microsoft.com>
> 
> Minor change in behaviour would perhaps be worth acknowledging in the
> commit message (since you check ndev for NULL later now), and a Fixes
> tag would be good.
> 
> But the looks pretty straightforward and correct!

Hi,
Yeah, it looks the minor behavior change doesn't matter, because IMO the 
'nvdev' can only be NULL when the NIC is being removed, or the MTU is
being changed, etc.

The Fixes tag is:
Fixes: 776e726bfb34 ("netvsc: fix RCU warning in get_stats")

If I should send a v2, please let me know.

Thanks,
-- Dexuan


^ permalink raw reply

* Re: [PATCH V3 01/10] vhost: disable metadata prefetch optimization
From: Jason Wang @ 2019-08-07  7:05 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg
In-Reply-To: <20190807065449.23373-2-jasowang@redhat.com>


On 2019/8/7 下午2:54, Jason Wang wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
>
> This seems to cause guest and host memory corruption.
> Disable for now until we get a better handle on that.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>   drivers/vhost/vhost.h | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 819296332913..42a8c2a13ab1 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -96,7 +96,7 @@ struct vhost_uaddr {
>   };
>   
>   #if defined(CONFIG_MMU_NOTIFIER) && ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 0
> -#define VHOST_ARCH_CAN_ACCEL_UACCESS 1
> +#define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>   #else
>   #define VHOST_ARCH_CAN_ACCEL_UACCESS 0
>   #endif


Oops, this is unnecessary.

Will post V4.

Thanks


^ permalink raw reply

* [PATCH V4 0/9] Fixes for metadata accelreation
From: Jason Wang @ 2019-08-07  7:06 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang

Hi all:

This series try to fix several issues introduced by meta data
accelreation series. Please review.

Changes from V3:
- remove the unnecessary patch

Changes from V2:
- use seqlck helper to synchronize MMU notifier with vhost worker

Changes from V1:
- try not use RCU to syncrhonize MMU notifier with vhost worker
- set dirty pages after no readers
- return -EAGAIN only when we find the range is overlapped with
  metadata

Jason Wang (9):
  vhost: don't set uaddr for invalid address
  vhost: validate MMU notifier registration
  vhost: fix vhost map leak
  vhost: reset invalidate_count in vhost_set_vring_num_addr()
  vhost: mark dirty pages during map uninit
  vhost: don't do synchronize_rcu() in vhost_uninit_vq_maps()
  vhost: do not use RCU to synchronize MMU notifier with worker
  vhost: correctly set dirty pages in MMU notifiers callback
  vhost: do not return -EAGAIN for non blocking invalidation too early

 drivers/vhost/vhost.c | 228 +++++++++++++++++++++++++++---------------
 drivers/vhost/vhost.h |   8 +-
 2 files changed, 150 insertions(+), 86 deletions(-)

-- 
2.18.1


^ permalink raw reply

* [PATCH V4 1/9] vhost: don't set uaddr for invalid address
From: Jason Wang @ 2019-08-07  7:06 UTC (permalink / raw)
  To: mst, kvm, virtualization, netdev; +Cc: linux-kernel, linux-mm, jgg, Jason Wang
In-Reply-To: <20190807070617.23716-1-jasowang@redhat.com>

We should not setup uaddr for the invalid address, otherwise we may
try to pin or prefetch mapping of wrong pages.

Fixes: 7f466032dc9e ("vhost: access vq metadata through kernel virtual address")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/vhost.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0536f8526359..488380a581dc 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2082,7 +2082,8 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 	}
 
 #if VHOST_ARCH_CAN_ACCEL_UACCESS
-	vhost_setup_vq_uaddr(vq);
+	if (r == 0)
+		vhost_setup_vq_uaddr(vq);
 
 	if (d->mm)
 		mmu_notifier_register(&d->mmu_notifier, d->mm);
-- 
2.18.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox