Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
From: Alexei Starovoitov @ 2019-08-27 20:52 UTC (permalink / raw)
  To: luto; +Cc: davem, daniel, netdev, bpf, kernel-team, linux-api

Introduce CAP_BPF that allows loading all types of BPF programs,
create most map types, load BTF, iterate programs and maps.
CAP_BPF alone is not enough to attach or run programs.

Networking:

CAP_BPF and CAP_NET_ADMIN are necessary to:
- attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
- run networking bpf programs (like xdp, skb, flow_dissector)

Tracing:

CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
are necessary to:
- attach bpf program to raw tracepoint
- use bpf_trace_printk() in all program types (not only tracing programs)
- create bpf stackmap

To attach bpf to perf_events perf_event_open() needs to succeed as usual.

CAP_BPF controls BPF side.
CAP_NET_ADMIN controls intersection where BPF calls into networking.
perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.

In the future CAP_TRACING could be introduced to control
creation of kprobe/uprobe and attaching bpf to perf_events.
In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
Whereas probe_read() would be controlled by CAP_TRACING.
CAP_TRACING would also control generic kprobe+probe_read.
CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
that want to use bpf_probe_read.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
I would prefer to introduce CAP_TRACING soon, since it
will make tracing and networking permission model symmetrical.

 include/linux/filter.h                      |  1 +
 include/uapi/linux/capability.h             |  5 ++-
 kernel/bpf/arraymap.c                       |  2 +-
 kernel/bpf/cgroup.c                         |  2 +-
 kernel/bpf/core.c                           | 10 ++++--
 kernel/bpf/cpumap.c                         |  2 +-
 kernel/bpf/hashtab.c                        |  4 +--
 kernel/bpf/lpm_trie.c                       |  2 +-
 kernel/bpf/queue_stack_maps.c               |  2 +-
 kernel/bpf/reuseport_array.c                |  2 +-
 kernel/bpf/stackmap.c                       |  2 +-
 kernel/bpf/syscall.c                        | 32 ++++++++++-------
 kernel/bpf/verifier.c                       |  4 +--
 kernel/trace/bpf_trace.c                    |  2 +-
 net/core/bpf_sk_storage.c                   |  2 +-
 net/core/filter.c                           | 10 +++---
 security/selinux/include/classmap.h         |  4 +--
 tools/testing/selftests/bpf/test_verifier.c | 39 ++++++++++++++++-----
 18 files changed, 84 insertions(+), 43 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..16cea50af014 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -857,6 +857,7 @@ static inline bool bpf_dump_raw_ok(void)
 	return kallsyms_show_value() == 1;
 }
 
+bool cap_bpf_tracing(void);
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..b3390f34c9f5 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -366,8 +366,11 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_READ		37
 
+/* Allow bpf() syscall except attach and tracing */
 
-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_BPF			38
+
+#define CAP_LAST_CAP         CAP_BPF
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..045e30b7160d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
+	bool unpriv = !capable(CAP_BPF);
 	u64 cost, array_size, mask64;
 	struct bpf_map_memory mem;
 	struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..97f733354421 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
+		if (cap_bpf_tracing())
 			return bpf_get_trace_printk_proto();
 		/* fall through */
 	default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..5756c8a56f44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return;
 
 	spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
 {
 	if (atomic_long_add_return(pages, &bpf_jit_current) >
 	    (bpf_jit_limit >> PAGE_SHIFT)) {
-		if (!capable(CAP_SYS_ADMIN)) {
+		if (!capable(CAP_BPF)) {
 			atomic_long_sub(pages, &bpf_jit_current);
 			return -EPERM;
 		}
@@ -2104,6 +2104,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
+bool cap_bpf_tracing(void)
+{
+	return capable(CAP_SYS_ADMIN) ||
+	       (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
+}
+
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
 #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..ca483c9a9c2e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -84,7 +84,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	int ret, cpu;
 	u64 cost;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..f459315625ac 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
 
-	if (lru && !capable(CAP_SYS_ADMIN))
+	if (lru && !capable(CAP_BPF))
 		/* LRU implementation is much complicated than other
-		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
+		 * maps.  Hence, limit to CAP_BPF.
 		 */
 		return -EPERM;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..a45fa5464d98 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..ca0ba9edca86 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
 /* Called from syscall */
 static int queue_stack_map_alloc_check(union bpf_attr *attr)
 {
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..bfad7d41a061 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	struct bpf_map_memory mem;
 	u64 array_size;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..c540b2b3fc4a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u64 cost, n_buckets;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return ERR_PTR(-EPERM);
 
 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0f62fd67c6b..ef7b06ca30e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
 		err = -EBUSY;
 		goto err_put;
 	}
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -1634,7 +1634,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	/* copy eBPF program license from user space */
@@ -1647,11 +1647,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	is_gpl = license_is_gpl_compatible(license);
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+	    attr->insn_cnt > (capable(CAP_BPF) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	bpf_prog_load_fixup_attach_type(attr);
@@ -1802,6 +1802,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	char tp_name[128];
 	int tp_fd, err;
 
+	if (!cap_bpf_tracing())
+		return -EPERM;
+
 	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
 			      sizeof(tp_name) - 1) < 0)
 		return -EFAULT;
@@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
+		/* test_run callback is available for networking progs only.
+		 * Add cap_bpf_tracing() above when tracing progs become runable.
+		 */
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
@@ -2117,7 +2123,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	next_id++;
@@ -2143,7 +2149,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	spin_lock_bh(&prog_idr_lock);
@@ -2177,7 +2183,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2352,7 +2358,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.run_time_ns = stats.nsecs;
 	info.run_cnt = stats.cnt;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
@@ -2670,7 +2676,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_new_fd(attr);
@@ -2683,7 +2689,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_get_fd_by_id(attr->btf_id);
@@ -2752,7 +2758,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 
 	if (attr->task_fd_query.flags != 0)
@@ -2820,7 +2826,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_BPF))
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10c0ff93f52b..5810e8cc9342 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -987,7 +987,7 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->umax_value = U64_MAX;
 
 	/* constant backtracking is enabled for root only for now */
-	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
+	reg->precise = capable(CAP_BPF) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -9233,7 +9233,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
-	is_priv = capable(CAP_SYS_ADMIN);
+	is_priv = capable(CAP_BPF);
 
 	/* grab the mutex to protect few globals used by verifier */
 	if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..2bf58ff5bf75 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..0b29f6abbeba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c1059cdad3d..986277abfde2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_spin_unlock:
 		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		return bpf_get_trace_printk_proto();
+		if (cap_bpf_tracing())
+			return bpf_get_trace_printk_proto();
+		/* fall through */
 	default:
 		return NULL;
 	}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_BPF))
 			return false;
 		break;
 	}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!capable(CAP_SYS_ADMIN))
+			if (!capable(CAP_BPF))
 				return false;
 			break;
 		default:
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e588a29..1c925bc04072 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -26,9 +26,9 @@
 	    "audit_control", "setfcap"
 
 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-		"wake_alarm", "block_suspend", "audit_read"
+		"wake_alarm", "block_suspend", "audit_read", "bpf"
 
-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_BPF
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 44e2d640b088..b31b961f1020 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -805,10 +805,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 	}
 }
 
+struct libcap {
+	struct __user_cap_header_struct hdr;
+	struct __user_cap_data_struct data[2];
+};
+
 static int set_admin(bool admin)
 {
 	cap_t caps;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs */
+	const cap_value_t cap_val[] = {38/*CAP_BPF*/, CAP_NET_ADMIN};
+	const cap_value_t cap_val_admin = CAP_SYS_ADMIN;
+	struct libcap *cap;
 	int ret = -1;
 
 	caps = cap_get_proc();
@@ -816,11 +824,23 @@ static int set_admin(bool admin)
 		perror("cap_get_proc");
 		return -1;
 	}
-	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+	cap = (struct libcap *)caps;
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val_admin, CAP_CLEAR)) {
+		perror("cap_set_flag clear admin");
+		goto out;
+	}
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_val,
 				admin ? CAP_SET : CAP_CLEAR)) {
-		perror("cap_set_flag");
+		perror("cap_set_flag set_or_clear bpf+net");
 		goto out;
 	}
+	/* libcap is likely old and simply ignores CAP_BPF,
+	 * so update effective bits manually
+	 */
+	if (admin)
+		cap->data[1].effective |= 1 << (38 - 32);
+	else
+		cap->data[1].effective &= ~(1 << (38 - 32));
 	if (cap_set_proc(caps)) {
 		perror("cap_set_proc");
 		goto out;
@@ -1013,8 +1033,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 static bool is_admin(void)
 {
 	cap_t caps;
-	cap_flag_value_t sysadmin = CAP_CLEAR;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	cap_flag_value_t bpf_priv = CAP_CLEAR;
+	cap_flag_value_t net_priv = CAP_CLEAR;
+	struct libcap *cap;
 
 #ifdef CAP_IS_SUPPORTED
 	if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1027,11 +1048,13 @@ static bool is_admin(void)
 		perror("cap_get_proc");
 		return false;
 	}
-	if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
-		perror("cap_get_flag");
+	cap = (struct libcap *)caps;
+	bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+		perror("cap_get_flag NET");
 	if (cap_free(caps))
 		perror("cap_free");
-	return (sysadmin == CAP_SET);
+	return bpf_priv == CAP_SET && net_priv == CAP_SET;
 }
 
 static void get_unpriv_disabled()
-- 
2.20.0


^ permalink raw reply related

* Re: Unable to create htb tc classes more than 64K
From: Dave Taht @ 2019-08-27 20:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Cong Wang, Akshat Kakkar, Anton Danilov, NetFilter, lartc, netdev
In-Reply-To: <9cbefe10-b172-ae2a-0ac7-d972468eb7a2@gmail.com>

On Sun, Aug 25, 2019 at 11:47 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 8/25/19 7:52 PM, Cong Wang wrote:
> > On Wed, Aug 21, 2019 at 11:00 PM Akshat Kakkar <akshat.1984@gmail.com> wrote:
> >>
> >> On Thu, Aug 22, 2019 at 3:37 AM Cong Wang <xiyou.wangcong@gmail.com> wrote:
> >>>> I am using ipset +  iptables to classify and not filters. Besides, if
> >>>> tc is allowing me to define qdisc -> classes -> qdsic -> classes
> >>>> (1,2,3 ...) sort of structure (ie like the one shown in ascii tree)
> >>>> then how can those lowest child classes be actually used or consumed?
> >>>
> >>> Just install tc filters on the lower level too.
> >>
> >> If I understand correctly, you are saying,
> >> instead of :
> >> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
> >> 0x00000001 fw flowid 1:10
> >> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
> >> 0x00000002 fw flowid 1:20
> >> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
> >> 0x00000003 fw flowid 2:10
> >> tc filter add dev eno2 parent 100: protocol ip prio 1 handle
> >> 0x00000004 fw flowid 2:20
> >>
> >>
> >> I should do this: (i.e. changing parent to just immediate qdisc)
> >> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000001
> >> fw flowid 1:10
> >> tc filter add dev eno2 parent 1: protocol ip prio 1 handle 0x00000002
> >> fw flowid 1:20
> >> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000003
> >> fw flowid 2:10
> >> tc filter add dev eno2 parent 2: protocol ip prio 1 handle 0x00000004
> >> fw flowid 2:20
> >
> >
> > Yes, this is what I meant.
> >
> >
> >>
> >> I tried this previously. But there is not change in the result.
> >> Behaviour is exactly same, i.e. I am still getting 100Mbps and not
> >> 100kbps or 300kbps
> >>
> >> Besides, as I mentioned previously I am using ipset + skbprio and not
> >> filters stuff. Filters I used just to test.
> >>
> >> ipset  -N foo hash:ip,mark skbinfo
> >>
> >> ipset -A foo 10.10.10.10, 0x0x00000001 skbprio 1:10
> >> ipset -A foo 10.10.10.20, 0x0x00000002 skbprio 1:20
> >> ipset -A foo 10.10.10.30, 0x0x00000003 skbprio 2:10
> >> ipset -A foo 10.10.10.40, 0x0x00000004 skbprio 2:20
> >>
> >> iptables -A POSTROUTING -j SET --map-set foo dst,dst --map-prio
> >
> > Hmm..
> >
> > I am not familiar with ipset, but it seems to save the skbprio into
> > skb->priority, so it doesn't need TC filter to classify it again.
> >
> > I guess your packets might go to the direct queue of HTB, which
> > bypasses the token bucket. Can you dump the stats and check?
>
> With more than 64K 'classes' I suggest to use a single FQ qdisc [1], and
> an eBPF program using EDT model (Earliest Departure Time)

Although this is very cool, I think in this case the OP is being
a router, not server?

> The BPF program would perform the classification, then find a data structure
> based on the 'class', and then update/maintain class virtual times and skb->tstamp
>
> TBF = bpf_map_lookup_elem(&map, &classid);
>
> uint64_t now = bpf_ktime_get_ns();
> uint64_t time_to_send = max(TBF->time_to_send, now);
>
> time_to_send += (u64)qdisc_pkt_len(skb) * NSEC_PER_SEC / TBF->rate;
> if (time_to_send > TBF->max_horizon) {
>     return TC_ACT_SHOT;
> }
> TBF->time_to_send = time_to_send;
> skb->tstamp = max(time_to_send, skb->tstamp);
> if (time_to_send - now > TBF->ecn_horizon)
>     bpf_skb_ecn_set_ce(skb);
> return TC_ACT_OK;
>
> tools/testing/selftests/bpf/progs/test_tc_edt.c shows something similar.
>
>
> [1]  MQ + FQ if the device is multi-queues.
>
>    Note that this setup scales very well on SMP, since we no longer are forced
>  to use a single HTB hierarchy (protected by a single spinlock)
>


-- 

Dave Täht
CTO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-831-205-9740

^ permalink raw reply

* Re: [PATCH] ipv6: Not to probe neighbourless routes
From: David Miller @ 2019-08-27 20:49 UTC (permalink / raw)
  To: wang.yi59
  Cc: kuznet, yoshfuji, netdev, linux-kernel, xue.zhihong, wang.liang82,
	cheng.lin130
In-Reply-To: <1566896907-5121-1-git-send-email-wang.yi59@zte.com.cn>


Nothing says "low quality patch submission" like:

net/ipv6/route.c: In function ‘rt6_probe’:
net/ipv6/route.c:660:10: error: ‘struct fib6_nh’ has no member named ‘last_probe’
   fib6_nh->last_probe = jiffies;

"I did not even try to compile this"

^ permalink raw reply

* [PATCH] checkpatch: Allow consecutive close braces
From: Joe Perches @ 2019-08-27 20:49 UTC (permalink / raw)
  To: Jeff Kirsher, Forrest Fleming, Andrew Morton
  Cc: David S. Miller, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <c2279a78904b581924894b712403299903eacbfc.camel@intel.com>

checkpatch allows consecutive open braces, so it
should also allow consecutive close braces.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 287fe73688f0..ac5e0f06e1af 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4687,7 +4687,7 @@ sub process {

 # closing brace should have a space following it when it has anything
 # on the line
-		if ($line =~ /}(?!(?:,|;|\)))\S/) {
+		if ($line =~ /}(?!(?:,|;|\)|\}))\S/) {
 			if (ERROR("SPACING",
 				  "space required after that close brace '}'\n" . $herecurr) &&
 			    $fix) {

^ permalink raw reply related

* Re: [PATCH 1/2] rtnetlink: gate MAC address with an LSM hook
From: Paul Moore @ 2019-08-27 20:47 UTC (permalink / raw)
  To: Jeffrey Vander Stoep; +Cc: David Miller, netdev, LSM List, selinux
In-Reply-To: <CABXk95BF=RfqFSHU_---DRHDoKyFON5kS_vYJbc4ns2OS=_t0w@mail.gmail.com>

On Fri, Aug 23, 2019 at 7:41 AM Jeffrey Vander Stoep <jeffv@google.com> wrote:
> On Fri, Aug 23, 2019 at 1:19 AM David Miller <davem@davemloft.net> wrote:
> > From: Jeff Vander Stoep <jeffv@google.com>
> > Date: Wed, 21 Aug 2019 15:45:47 +0200
> >
> > > MAC addresses are often considered sensitive because they are
> > > usually unique and can be used to identify/track a device or
> > > user [1].
> > >
> > > The MAC address is accessible via the RTM_NEWLINK message type of a
> > > netlink route socket[2]. Ideally we could grant/deny access to the
> > > MAC address on a case-by-case basis without blocking the entire
> > > RTM_NEWLINK message type which contains a lot of other useful
> > > information. This can be achieved using a new LSM hook on the netlink
> > > message receive path. Using this new hook, individual LSMs can select
> > > which processes are allowed access to the real MAC, otherwise a
> > > default value of zeros is returned. Offloading access control
> > > decisions like this to an LSM is convenient because it preserves the
> > > status quo for most Linux users while giving the various LSMs
> > > flexibility to make finer grained decisions on access to sensitive
> > > data based on policy.
> > >
> > > [1] https://adamdrake.com/mac-addresses-udids-and-privacy.html
> > > [2] Other access vectors like ioctl(SIOCGIFHWADDR) are already covered
> > > by existing LSM hooks.
> > >
> > > Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
> >
> > I'm sure the MAC address will escape into userspace via other means,
> > dumping pieces of networking config in other contexts, etc.  I mean,
> > if I can get a link dump, I can dump the neighbor table as well.
>
> These are already gated by existing LSM hooks and capability checks.
> They are not allowed on mandatory access control systems unless explicitly
> granted.
>
> > I kinda think this is all very silly whack-a-mole kind of stuff, to
> > be quite honest.
>
> We evaluated mechanisms for the MAC to reach unprivileged apps.
> A number of researchers have published on this as well such as:
> https://www.usenix.org/conference/usenixsecurity19/presentation/reardon
>
> Three "leaks" were identified, two have already been fixed.
> -ioctl(SIOCGIFHWADDR). Fixed using finer grained LSM checks
> on socket ioctls (similar to this change).
> -IPv6 IP addresses. Fixed by no longer including the MAC as part
> of the IP address.
> -RTM_NEWLINK netlink route messages. The last mole to be whacked.
>
> > And like others have said, tomorrow you'll be like "oh crap, we should
> > block X too" and we'll get another hook, another config knob, another
> > rulset update, etc.
>
> This seems like an issue inherent with permissions/capabilities. I don’t
> think we should abandon the concept of permissions because someone
> can forget to add a check.  Likewise, if someone adds new code to the
> kernel and omits a capable(CAP_NET_*) check, I would expect it to be
> fixed like any other bug without the idea of capability checks being tossed
> out.
>
> We need to do something because this information is being abused. Any
> recommendations? This seemed like the simplest approach, but I can
> definitely appreciate that it has downsides.
>
> I could make this really generic by adding a single hook to the end of
> sock_msgrecv() which would allow an LSM to modify the message to omit
> the MAC address and any other information that we deem as sensitive in the
> future. Basically what Casey was suggesting. Thoughts on that approach?

I apologize for the delay in responding; I'm blaming LSS-NA travel.

I'm also not a big fan of inserting the hook in rtnl_fill_ifinfo(); as
presented it is way too specific for a LSM hook for me to be happy.
However, I do agree that giving the LSMs some control over netlink
messages makes sense.  As others have pointed out, it's all a matter
of where to place the hook.

If we only care about netlink messages which leverage nlattrs I
suppose one option that I haven't seen mentioned would be to place a
hook in nla_put().  While it is a bit of an odd place for a hook, it
would allow the LSM easy access to the skb and attribute type to make
decisions, and all of the callers should already be checking the
return code (although we would need to verify this).  One notable
drawback (not the only one) is that the hook is going to get hit
multiple times for each message.

--
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH net-next] phy: mdio-bcm-iproc: use devm_platform_ioremap_resource() to simplify code
From: Ray Jui @ 2019-08-27 20:40 UTC (permalink / raw)
  To: YueHaibing, andrew, f.fainelli, hkallweit1, davem, rjui, sbranden
  Cc: bcm-kernel-feedback-list, netdev, linux-arm-kernel, linux-kernel
In-Reply-To: <20190827134616.11396-1-yuehaibing@huawei.com>



On 2019-08-27 6:46 a.m., YueHaibing wrote:
> Use devm_platform_ioremap_resource() to simplify the code a bit.
> This is detected by coccinelle.
> 
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> ---
>   drivers/net/phy/mdio-bcm-iproc.c | 4 +---
>   1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/net/phy/mdio-bcm-iproc.c b/drivers/net/phy/mdio-bcm-iproc.c
> index 7d0f388..7e9975d 100644
> --- a/drivers/net/phy/mdio-bcm-iproc.c
> +++ b/drivers/net/phy/mdio-bcm-iproc.c
> @@ -123,15 +123,13 @@ static int iproc_mdio_probe(struct platform_device *pdev)
>   {
>   	struct iproc_mdio_priv *priv;
>   	struct mii_bus *bus;
> -	struct resource *res;
>   	int rc;
>   
>   	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
>   	if (!priv)
>   		return -ENOMEM;
>   
> -	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> -	priv->base = devm_ioremap_resource(&pdev->dev, res);
> +	priv->base = devm_platform_ioremap_resource(pdev, 0);
>   	if (IS_ERR(priv->base)) {
>   		dev_err(&pdev->dev, "failed to ioremap register\n");
>   		return PTR_ERR(priv->base);
> 

Looks good to me. Thanks.

Reviewed-by: Ray Jui <ray.jui@broadcom.com>

^ permalink raw reply

* Re: [PATCH] net: intel: Cleanup e1000 - add space between }}
From: Jeff Kirsher @ 2019-08-27 20:27 UTC (permalink / raw)
  To: Forrest Fleming, Joe Perches
  Cc: Andrew Morton, David S. Miller, intel-wired-lan, netdev,
	linux-kernel
In-Reply-To: <CAE7kSDuHi3e_b0qyvXqocSVaNJrj3X7PPiawBWa68ZyrLSAZyA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2020 bytes --]

On Tue, 2019-08-27 at 12:45 -0700, Forrest Fleming wrote:
> On Tue, Aug 27, 2019 at 12:07 PM Joe Perches <joe@perches.com> wrote:
> > On Tue, 2019-08-27 at 12:02 -0700, Jeff Kirsher wrote:
> > > On Mon, 2019-08-26 at 20:41 -0700, Joe Perches wrote:
> > > > On Mon, 2019-08-26 at 01:03 -0700, Jeff Kirsher wrote:
> > > > > On Fri, 2019-08-23 at 19:14 +0000, Forrest Fleming wrote:
> > > > > > suggested by checkpatch
> > > > > > 
> > > > > > Signed-off-by: Forrest Fleming <ffleming@gmail.com>
> > > > > > ---
> > > > > >  .../net/ethernet/intel/e1000/e1000_param.c    | 28
> > > > > > +++++++++--
> > > > > > --------
> > > > > >  1 file changed, 14 insertions(+), 14 deletions(-)
> > > > > 
> > > > > While I do not see an issue with this change, I wonder how
> > > > > important it is
> > > > > to make such a change.  Especially since most of the hardware
> > > > > supported by
> > > > > this driver is not available for testing.  In addition, this
> > > > > is one
> > > > > suggested change by checkpatch.pl that I personally do not
> > > > > agree
> > > > > with.
> > > > 
> > > > I think checkpatch should allow consecutive }}.
> > > 
> > > Agreed, have you already submitted a formal patch Joe with the
> > > suggested change below?
> > 
> > No.
> > 
> > >   If so, I will ACK it.
> > 
> > Of course you can add an Acked-by:
> > 
> 
> Totally fair - I don't have strong feelings regarding the particular
> rule. I do
> feel strongly that we should avoid violating our rules as encoded by
> checkpatch,
> but I'm perfectly happy for the change to take the form of modifying
> checkpatch
> to allow a perfectly sensible (and readable) construct.
> 
> I'm happy to withdraw this patch from consideration; I couldn't find
> anything
> about there being a formal procedure for so doing, so please let me
> know if
> there's anything more I need to do (or point me to the relevant
> docs).
> 
> Thanks to everyone!

Nothing for you to do, I will drop the patch.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH] net: intel: Cleanup e1000 - add space between }}
From: Jeff Kirsher @ 2019-08-27 20:27 UTC (permalink / raw)
  To: Joe Perches, Forrest Fleming, Andrew Morton
  Cc: David S. Miller, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <877726fc009ee5ffde50e589d332db90c9695f06.camel@perches.com>

[-- Attachment #1: Type: text/plain, Size: 1419 bytes --]

On Mon, 2019-08-26 at 20:41 -0700, Joe Perches wrote:
> On Mon, 2019-08-26 at 01:03 -0700, Jeff Kirsher wrote:
> > On Fri, 2019-08-23 at 19:14 +0000, Forrest Fleming wrote:
> > > suggested by checkpatch
> > > 
> > > Signed-off-by: Forrest Fleming <ffleming@gmail.com>
> > > ---
> > >  .../net/ethernet/intel/e1000/e1000_param.c    | 28 +++++++++--
> > > --------
> > >  1 file changed, 14 insertions(+), 14 deletions(-)
> > 
> > While I do not see an issue with this change, I wonder how
> > important it is
> > to make such a change.  Especially since most of the hardware
> > supported by
> > this driver is not available for testing.  In addition, this is one
> > suggested change by checkpatch.pl that I personally do not agree
> > with.
> 
> I think checkpatch should allow consecutive }}.

Acked-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

> 
> Maybe:
> ---
> diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
> index 287fe73688f0..ac5e0f06e1af 100755
> --- a/scripts/checkpatch.pl
> +++ b/scripts/checkpatch.pl
> @@ -4687,7 +4687,7 @@ sub process {
>  
>  # closing brace should have a space following it when it has
> anything
>  # on the line
> -		if ($line =~ /}(?!(?:,|;|\)))\S/) {
> +		if ($line =~ /}(?!(?:,|;|\)|\}))\S/) {
>  			if (ERROR("SPACING",
>  				  "space required after that close
> brace '}'\n" . $herecurr) &&
>  			    $fix) {
> 
> 


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH -next] net: mlx5: Kconfig: Fix MLX5_CORE_EN dependencies
From: Saeed Mahameed @ 2019-08-27 20:14 UTC (permalink / raw)
  To: davem@davemloft.net, maowenan@huawei.com, leon@kernel.org
  Cc: kernel-janitors@vger.kernel.org, netdev@vger.kernel.org,
	linux-rdma@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20190827031251.98881-1-maowenan@huawei.com>

On Tue, 2019-08-27 at 11:12 +0800, Mao Wenan wrote:
> When MLX5_CORE_EN=y and PCI_HYPERV_INTERFACE is not set, below errors
> are found:
> drivers/net/ethernet/mellanox/mlx5/core/en_main.o: In function
> `mlx5e_nic_enable':
> en_main.c:(.text+0xb649): undefined reference to
> `mlx5e_hv_vhca_stats_create'
> drivers/net/ethernet/mellanox/mlx5/core/en_main.o: In function
> `mlx5e_nic_disable':
> en_main.c:(.text+0xb8c4): undefined reference to
> `mlx5e_hv_vhca_stats_destroy'
> 
> This because CONFIG_PCI_HYPERV_INTERFACE is newly introduced by
> 'commit 348dd93e40c1
> ("PCI: hv: Add a Hyper-V PCI interface driver for software
> backchannel interface"),
> Fix this by making MLX5_CORE_EN imply PCI_HYPERV_INTERFACE.
> 

let's not imply anything.. 
mlx5e_hv_vhca_* should already have stubs  in 
mlx5/core/en/hv_vhca_stat.h when PCI_HYPERV_INTERFACE is off/undef !

I Just tried:

$ ./scripts/config -s PCI_HYPERV_INTERFACE
$ ./scripts/config -s MLX5_CORE
$ ./scripts/config -s MLX5_CORE_EN
undef
y
y

$ make

And build passed just fine.



^ permalink raw reply

* Re: [PATCH] net/mlx5: fix a -Wstringop-truncation warning
From: Qian Cai @ 2019-08-27 20:12 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: linux-rdma@vger.kernel.org, davem@davemloft.net, Moshe Shemesh,
	Feras Daoud, linux-kernel@vger.kernel.org, Eran Ben Elisha,
	netdev@vger.kernel.org, leon@kernel.org
In-Reply-To: <21994e7e141ee5453c6814de025e083eeb651127.camel@mellanox.com>

On Mon, 2019-08-26 at 21:11 +0000, Saeed Mahameed wrote:
> On Fri, 2019-08-23 at 15:56 -0400, Qian Cai wrote:
> > In file included from ./arch/powerpc/include/asm/paca.h:15,
> >                  from ./arch/powerpc/include/asm/current.h:13,
> >                  from ./include/linux/thread_info.h:21,
> >                  from ./include/asm-generic/preempt.h:5,
> >                  from
> > ./arch/powerpc/include/generated/asm/preempt.h:1,
> >                  from ./include/linux/preempt.h:78,
> >                  from ./include/linux/spinlock.h:51,
> >                  from ./include/linux/wait.h:9,
> >                  from ./include/linux/completion.h:12,
> >                  from ./include/linux/mlx5/driver.h:37,
> >                  from
> > drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h:6,
> >                  from
> > drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c:33:
> > In function 'strncpy',
> >     inlined from 'mlx5_fw_tracer_save_trace' at
> > drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c:549:2,
> >     inlined from 'mlx5_tracer_print_trace' at
> > drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c:574:2:
> > ./include/linux/string.h:305:9: warning: '__builtin_strncpy' output
> > may
> > be truncated copying 256 bytes from a string of length 511
> > [-Wstringop-truncation]
> >   return __builtin_strncpy(p, q, size);
> >          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > 
> > Fix it by using the new strscpy_pad() since the commit 458a3bf82df4
> > ("lib/string: Add strscpy_pad() function") which will always
> > NUL-terminate the string, and avoid possibly leak data through the
> > ring
> > buffer where non-admin account might enable these events through
> > perf.
> > 
> > Fixes: fd1483fe1f9f ("net/mlx5: Add support for FW reporter dump")
> > Signed-off-by: Qian Cai <cai@lca.pw>
> 
> 
> Hi Qian and thanks for your patch,
> 
> We already have a patch that handles this issue, please check it out:
> https://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git/commit/?h=net-
> next-mlx5
> 

That commit will make "struct mlx5_fw_tracer" too large and trigger a warning in
__alloc_pages_nodemask(),

        /*
         * There are several places where we assume that the order value is sane
         * so bail out early if the request is out of bound.
         */
        if (unlikely(order >= MAX_ORDER)) {
                WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
                return NULL;
        }

[   98.339576][  T914] WARNING: CPU: 0 PID: 914 at mm/page_alloc.c:4705
__alloc_pages_nodemask+0x441/0x1bb0
[   98.349174][  T914] Modules linked in: smartpqi(+) scsi_transport_sas tg3
mlx5_core(+) libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod
efivarfs
[   98.363495][  T914] CPU: 0 PID: 914 Comm: kworker/0:2 Not tainted 5.3.0-rc6-
next-20190827+ #14
[   98.372243][  T914] Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385
Gen10, BIOS A40 07/10/2019
[   98.381627][  T914] Workqueue: events work_for_cpu_fn
[   98.386720][  T914] RIP: 0010:__alloc_pages_nodemask+0x441/0x1bb0
[   98.392917][  T914] Code: 17 00 00 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f 5d
c3 89 85 3c fe ff ff bb 01 00 00 00 e9 96 fd ff ff 81 e7 00 20 00 00 75 02 <0f>
0b 48 c7 85 50 fe ff ff 00 00 00 00 eb 82 31 d2 be 36 12 00 00
[   98.412740][  T914] RSP: 0018:ffff88853418f948 EFLAGS: 00010246
[   98.418704][  T914] RAX: 0000000000000000 RBX: ffffffff9571a860 RCX:
1ffff110a6831f3e
[   98.426652][  T914] RDX: 0000000000000000 RSI: 000000000000000b RDI:
0000000000000000
[   98.434661][  T914] RBP: ffff88853418fb58 R08: ffffed1108808465 R09:
ffffed1108808465
[   98.442613][  T914] R10: ffffed1108808464 R11: ffff888844042323 R12:
0000000000000000
[   98.450548][  T914] R13: 000000000000000b R14: 0000000000000000 R15:
0000000000000001
[   98.458434][  T914] FS:  0000000000000000(0000) GS:ffff888844000000(0000)
knlGS:0000000000000000
[   98.467350][  T914] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   98.473911][  T914] CR2: 0000555c64680148 CR3: 0000000550412000 CR4:
00000000003406b0
[   98.481838][  T914] Call Trace:
[   98.485011][  T914]  ? find_next_bit+0x2c/0xa0
[   98.489490][  T914]  ? __kasan_check_write+0x14/0x20
[   98.494506][  T914]  ? graph_lock+0xb8/0x120
[   98.498811][  T914]  ? __free_zapped_classes+0x740/0x740
[   98.504239][  T914]  ? gfp_pfmemalloc_allowed+0xc0/0xc0
[   98.509504][  T914]  ? __kasan_check_read+0x11/0x20
[   98.514443][  T914]  ? register_lock_class+0x5ef/0x960
[   98.519624][  T914]  ? rcu_read_lock_sched_held+0xac/0xe0
[   98.525152][  T914]  ? rcu_read_lock_any_held.part.5+0x20/0x20
[   98.531130][  T914]  ? find_next_bit+0x2c/0xa0
[   98.535610][  T914]  alloc_pages_current+0x9c/0x110
[   98.540638][  T914]  kmalloc_order+0x22/0x70
[   98.544943][  T914]  kmalloc_order_trace+0x23/0x100
[   98.550072][  T914]  mlx5_fw_tracer_create+0x51/0x870 [mlx5_core]
[   98.556213][  T914]  ? __mutex_init+0x94/0xa0
[   98.560744][  T914]  ? mlx5_init_rl_table+0x144/0x210 [mlx5_core]
[   98.566929][  T914]  mlx5_load_one+0x199/0x980 [mlx5_core]
[   98.572637][  T914]  init_one+0x494/0x760 [mlx5_core]
[   98.577771][  T914]  ? mlx5_pci_resume+0xd0/0xd0 [mlx5_core]
[   98.583574][  T914]  local_pci_probe+0x7a/0xc0
[   98.588054][  T914]  ? pci_dma_configure+0xa0/0xa0
[   98.592938][  T914]  work_for_cpu_fn+0x2e/0x50
[   98.597416][  T914]  process_one_work+0x53b/0xa70
[   98.602220][  T914]  ? pwq_dec_nr_in_flight+0x170/0x170
[   98.607485][  T914]  ? move_linked_works+0x113/0x150
[   98.612497][  T914]  worker_thread+0x363/0x5b0
[   98.616976][  T914]  kthread+0x1df/0x200
[   98.620932][  T914]  ? process_one_work+0xa70/0xa70
[   98.625847][  T914]  ? kthread_park+0xd0/0xd0
[   98.630240][  T914]  ret_from_fork+0x22/0x40

^ permalink raw reply

* Re: [PATCH net] tcp: inherit timestamp on mtu probe
From: Eric Dumazet @ 2019-08-27 20:07 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: netdev, David Miller, Jakub Kicinski, Willem de Bruijn
In-Reply-To: <20190827190933.227725-1-willemdebruijn.kernel@gmail.com>

On Tue, Aug 27, 2019 at 9:09 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> From: Willem de Bruijn <willemb@google.com>
>
> TCP associates tx timestamp requests with a byte in the bytestream.
> If merging skbs in tcp_mtu_probe, migrate the tstamp request.
>
> Similar to MSG_EOR, do not allow moving a timestamp from any segment
> in the probe but the last. This to avoid merging multiple timestamps.
>
> Tested with the packetdrill script at
> https://github.com/wdebruij/packetdrill/commits/mtu_probe-1
>
> Link: http://patchwork.ozlabs.org/patch/1143278/#2232897
> Fixes: 4ed2d765dfac ("net-timestamp: TCP timestamping")
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  net/ipv4/tcp_output.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 5c46bc4c7e8d..42abc9bd687a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2053,7 +2053,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
>                 if (len <= skb->len)
>                         break;
>
> -               if (unlikely(TCP_SKB_CB(skb)->eor))
> +               if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
>                         return false;
>
>                 len -= skb->len;
> @@ -2170,6 +2170,7 @@ static int tcp_mtu_probe(struct sock *sk)
>                          * we need to propagate it to the new skb.
>                          */
>                         TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
> +                       tcp_skb_collapse_tstamp(nskb, skb);

nit: maybe rename tcp_skb_collapse_tstamp() to tcp_skb_tstamp_copy()
or something ?

Its name came from the fact that it was only used from
tcp_collapse_retrans(), but it will no
longer be the case after your fix.

>                         tcp_unlink_write_queue(skb, sk);
>                         sk_wmem_free_skb(sk, skb);
>                 } else {
> --
> 2.23.0.187.g17f5b7556c-goog
>

^ permalink raw reply

* Re: [PATCH v5 net-next 02/18] ionic: Add hardware init and device commands
From: Andrew Lunn @ 2019-08-27 19:50 UTC (permalink / raw)
  To: Shannon Nelson; +Cc: netdev, davem
In-Reply-To: <ab2d6525-e1e1-ef87-7150-dabfaee5b6ff@pensando.io>

On Tue, Aug 27, 2019 at 10:39:20AM -0700, Shannon Nelson wrote:
> On 8/26/19 7:26 PM, Andrew Lunn wrote:
> >On Mon, Aug 26, 2019 at 02:33:23PM -0700, Shannon Nelson wrote:
> >>+void ionic_debugfs_add_dev(struct ionic *ionic)
> >>+{
> >>+	struct dentry *dentry;
> >>+
> >>+	dentry = debugfs_create_dir(ionic_bus_info(ionic), ionic_dir);
> >>+	if (IS_ERR_OR_NULL(dentry))
> >>+		return;
> >>+
> >>+	ionic->dentry = dentry;
> >>+}
> >Hi Shannon
> >
> >There was recently a big patchset from GregKH which removed all error
> >checking from drivers calling debugfs calls. I'm pretty sure you don't
> >need this check here.
> 
> With this check I end up either with a valid dentry value or NULL in
> ionic->dentry.  Without this check I possibly get an error value in
> ionic->dentry, which can get used later as the parent dentry to try to make
> a new debugfs node.

Hi Shannon

What you should find is that every debugfs function will have
something like:

	if (IS_ERR(dentry))
	   return dentry;
or
	if (IS_ERR(parent))
	   return parent;

If you know of a API which is missing such protection, i'm sure GregKH
would like to know. Especially since he just ripped all such
protection in driver out. Meaning he just broken some drivers if such
IS_ERR() calls are missing in the debugfs core.

> >I would be happier if there was a privacy statement, right here,
> >saying what this information is used for, and an agreement it is not
> >used for anything else. If that gets violated, you can then only blame
> >yourself when we ripe this out and hard code it to static values.
> 
> That makes perfect sense.
> 
> I can add a full description here of how the information will be used, which
> should help most folks, but I'm sure there will still be some that don't
> want this info released.
> 
> What I'd like to propose here is that I do the hardcoded strings myself for
> now, and I work up a way for the users to enable the feature as desired,
> with a reasonable comment here in the code and in the
> Documentation/.../ionic.rst file.  This might end up as an ethtool priv-flag
> that defaults to off and can set a NIC value that is remembered for later.
> 
> Does that sound reasonable?

Yes, that sounds reasonable.

Thanks
	Andrew

^ permalink raw reply

* Re: [PATCH] net: intel: Cleanup e1000 - add space between }}
From: Forrest Fleming @ 2019-08-27 19:45 UTC (permalink / raw)
  To: Joe Perches
  Cc: Jeff Kirsher, Andrew Morton, David S. Miller, intel-wired-lan,
	netdev, linux-kernel
In-Reply-To: <b1ea77866e8736fa691cf4658a87ca2c1bf642d6.camel@perches.com>

On Tue, Aug 27, 2019 at 12:07 PM Joe Perches <joe@perches.com> wrote:
>
> On Tue, 2019-08-27 at 12:02 -0700, Jeff Kirsher wrote:
> > On Mon, 2019-08-26 at 20:41 -0700, Joe Perches wrote:
> > > On Mon, 2019-08-26 at 01:03 -0700, Jeff Kirsher wrote:
> > > > On Fri, 2019-08-23 at 19:14 +0000, Forrest Fleming wrote:
> > > > > suggested by checkpatch
> > > > >
> > > > > Signed-off-by: Forrest Fleming <ffleming@gmail.com>
> > > > > ---
> > > > >  .../net/ethernet/intel/e1000/e1000_param.c    | 28 +++++++++--
> > > > > --------
> > > > >  1 file changed, 14 insertions(+), 14 deletions(-)
> > > >
> > > > While I do not see an issue with this change, I wonder how
> > > > important it is
> > > > to make such a change.  Especially since most of the hardware
> > > > supported by
> > > > this driver is not available for testing.  In addition, this is one
> > > > suggested change by checkpatch.pl that I personally do not agree
> > > > with.
> > >
> > > I think checkpatch should allow consecutive }}.
> >
> > Agreed, have you already submitted a formal patch Joe with the
> > suggested change below?
>
> No.
>
> >   If so, I will ACK it.
>
> Of course you can add an Acked-by:
>

Totally fair - I don't have strong feelings regarding the particular rule. I do
feel strongly that we should avoid violating our rules as encoded by checkpatch,
but I'm perfectly happy for the change to take the form of modifying checkpatch
to allow a perfectly sensible (and readable) construct.

I'm happy to withdraw this patch from consideration; I couldn't find anything
about there being a formal procedure for so doing, so please let me know if
there's anything more I need to do (or point me to the relevant docs).

Thanks to everyone!

^ permalink raw reply

* Re: [RFC PATCH net-next] net: phy: force phy suspend when calling phy_stop
From: Heiner Kallweit @ 2019-08-27 19:41 UTC (permalink / raw)
  To: shenjian (K), andrew, f.fainelli, davem
  Cc: netdev, forest.zhouchang, linuxarm
In-Reply-To: <d3cd1ef1-8add-84bb-c4d9-801b65d0fba1@huawei.com>

On 27.08.2019 10:29, shenjian (K) wrote:
> 
> 
> 在 2019/8/27 13:51, Heiner Kallweit 写道:
>> On 27.08.2019 04:47, Jian Shen wrote:
>>> Some ethernet drivers may call phy_start() and phy_stop() from
>>> ndo_open and ndo_close() respectively.
>>>
>>> When network cable is unconnected, and operate like below:
>>> step 1: ifconfig ethX up -> ndo_open -> phy_start ->start
>>> autoneg, and phy is no link.
>>> step 2: ifconfig ethX down -> ndo_close -> phy_stop -> just stop
>>> phy state machine.
>>> step 3: plugin the network cable, and autoneg complete, then
>>> LED for link status will be on.
>>> step 4: ethtool ethX --> see the result of "Link detected" is no.
>>>
>> Step 3 and 4 seem to be unrelated to the actual issue.
>> With which MAC + PHY driver did you observe this?
>>
> Thanks Heiner,
> 
> I tested this on HNS3 driver, with two phy, Marvell 88E1512 and RTL8211.
> 
> Step 3 and Step 4 is just to describe that the LED of link shows link up,
> but the port information shows no link.
> 
ethtool refers to the link at MAC level. Therefore default implementation
ethtool_op_get_link just returns the result of netif_carrier_ok().
Also using PHY link status if interface is down doesn't really make sense:
- phylib state machine isn't running, therefore PHY status doesn't get updated
- often MAC drivers shut down parts of the MAC on ndo_close, this typically
  makes the internal MDIO bus unaccessible
So just remove steps 3 and 4. The patch itself is fine with me.

> 
>>> This patch forces phy suspend even phydev->link is off.
>>>
>>> Signed-off-by: Jian Shen <shenjian15@huawei.com>
>>> ---
>>>  drivers/net/phy/phy.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
>>> index f3adea9..0acd5b4 100644
>>> --- a/drivers/net/phy/phy.c
>>> +++ b/drivers/net/phy/phy.c
>>> @@ -911,8 +911,8 @@ void phy_state_machine(struct work_struct *work)
>>>  		if (phydev->link) {
>>>  			phydev->link = 0;
>>>  			phy_link_down(phydev, true);
>>> -			do_suspend = true;
>>>  		}
>>> +		do_suspend = true;
>>>  		break;
>>>  	}
>>>  
>>>
>> Heiner
>>
>>
> 
> 


^ permalink raw reply

* Re: [PATCH 04/38] mlx5: Convert cq_table to XArray
From: Saeed Mahameed @ 2019-08-27 19:22 UTC (permalink / raw)
  To: willy@infradead.org, netdev@vger.kernel.org
In-Reply-To: <20190820223259.22348-5-willy@infradead.org>

On Tue, 2019-08-20 at 15:32 -0700, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> Since mlx5_cq_table would have shrunk down to just the xarray,
> eliminate
> it and embed the xarray directly into mlx5_eq.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/eq.c  | 27 ++++-------------
> --
>  .../net/ethernet/mellanox/mlx5/core/lib/eq.h  |  7 +----
>  2 files changed, 6 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> index 09d4c64b6e73..c5953f6e0a69 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
> @@ -113,11 +113,10 @@ static int mlx5_cmd_destroy_eq(struct
> mlx5_core_dev *dev, u8 eqn)
>  /* caller must eventually call mlx5_cq_put on the returned cq */
>  static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32
> cqn)
>  {
> -	struct mlx5_cq_table *table = &eq->cq_table;
> -	struct mlx5_core_cq *cq = NULL;
> +	struct mlx5_core_cq *cq;
>  
>  	rcu_read_lock();
> -	cq = radix_tree_lookup(&table->tree, cqn);
> +	cq = xa_load(&eq->cq_table, cqn);
>  	if (likely(cq))
>  		mlx5_cq_hold(cq);
>  	rcu_read_unlock();
> @@ -243,7 +242,6 @@ static int
>  create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
>  	      struct mlx5_eq_param *param)
>  {
> -	struct mlx5_cq_table *cq_table = &eq->cq_table;
>  	u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0};
>  	struct mlx5_priv *priv = &dev->priv;
>  	u8 vecidx = param->irq_index;
> @@ -254,11 +252,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct
> mlx5_eq *eq,
>  	int err;
>  	int i;
>  
> -	/* Init CQ table */
> -	memset(cq_table, 0, sizeof(*cq_table));
> -	spin_lock_init(&cq_table->lock);
> -	INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
> -
> +	xa_init_flags(&eq->cq_table, XA_FLAGS_LOCK_IRQ);

Why the IRQ flag ? we are not going to modify the xarray in irq context
all we do is hold a ref count on the entry we looked up
mlx5_cq_hold(cq); and this is protected by rcu_lock. 

>  	eq->nent = roundup_pow_of_two(param->nent +
> MLX5_NUM_SPARE_EQE);
>  	eq->cons_index = 0;
>  	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf);
> @@ -378,25 +372,14 @@ static int destroy_unmap_eq(struct
> mlx5_core_dev *dev, struct mlx5_eq *eq)
>  
>  int mlx5_eq_add_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
>  {
> -	struct mlx5_cq_table *table = &eq->cq_table;
> -	int err;
> -
> -	spin_lock(&table->lock);
> -	err = radix_tree_insert(&table->tree, cq->cqn, cq);
> -	spin_unlock(&table->lock);
> -
> -	return err;
> +	return xa_err(xa_store(&eq->cq_table, cq->cqn, cq,
> GFP_KERNEL));
>  }
>  
>  void mlx5_eq_del_cq(struct mlx5_eq *eq, struct mlx5_core_cq *cq)
>  {
> -	struct mlx5_cq_table *table = &eq->cq_table;
>  	struct mlx5_core_cq *tmp;
>  
> -	spin_lock(&table->lock);
> -	tmp = radix_tree_delete(&table->tree, cq->cqn);
> -	spin_unlock(&table->lock);
> -
> +	tmp = xa_erase(&eq->cq_table, cq->cqn);
>  	if (!tmp) {
>  		mlx5_core_dbg(eq->dev, "cq 0x%x not found in eq 0x%x
> tree\n",
>  			      eq->eqn, cq->cqn);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
> b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
> index 4be4d2d36218..a342cf78120e 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
> @@ -16,14 +16,9 @@ struct mlx5_eq_tasklet {
>  	spinlock_t            lock; /* lock completion tasklet list */
>  };
>  
> -struct mlx5_cq_table {
> -	spinlock_t              lock;	/* protect radix tree */
> -	struct radix_tree_root  tree;
> -};
> -
>  struct mlx5_eq {
>  	struct mlx5_core_dev    *dev;
> -	struct mlx5_cq_table    cq_table;
> +	struct xarray		cq_table;
>  	__be32 __iomem	        *doorbell;
>  	u32                     cons_index;
>  	struct mlx5_frag_buf    buf;

^ permalink raw reply

* Re: [PATCH 03/38] mlx4: Convert qp_table_tree to XArray
From: Saeed Mahameed @ 2019-08-27 19:18 UTC (permalink / raw)
  To: willy@infradead.org, netdev@vger.kernel.org
In-Reply-To: <20190820223259.22348-4-willy@infradead.org>

On Tue, 2019-08-20 at 15:32 -0700, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> 
> This XArray appears to be modifiable from interrupt context, so we
> have
> to be a little more careful with the locking.  However, the lookup
> can
> be done without the spinlock held.  I cannot determine whether
> mlx4_qp_alloc() is allowed to sleep, so I've retained the GFP_ATOMIC
> there, but it could be turned into GFP_KERNEL if the callers can
> tolerate it sleeping.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 +-
>  drivers/net/ethernet/mellanox/mlx4/qp.c   | 37 ++++++---------------
> --
>  include/linux/mlx4/device.h               |  4 +--
>  include/linux/mlx4/qp.h                   |  2 +-
>  4 files changed, 14 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> index b6fe22bee9f4..aaece8480da7 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> @@ -38,7 +38,7 @@
>  #define MLX4_H
>  
>  #include <linux/mutex.h>
> -#include <linux/radix-tree.h>
> +#include <linux/xarray.h>
>  #include <linux/rbtree.h>
>  #include <linux/timer.h>
>  #include <linux/semaphore.h>
> @@ -716,7 +716,6 @@ struct mlx4_qp_table {
>  	u32			zones_uids[MLX4_QP_TABLE_ZONE_NUM];
>  	u32			rdmarc_base;
>  	int			rdmarc_shift;
> -	spinlock_t		lock;
>  	struct mlx4_icm_table	qp_table;
>  	struct mlx4_icm_table	auxc_table;
>  	struct mlx4_icm_table	altc_table;
> diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c
> b/drivers/net/ethernet/mellanox/mlx4/qp.c
> index 427e7a31862c..4659ecec12c1 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/qp.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
> @@ -48,16 +48,13 @@
>  
>  void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type)
>  {
> -	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
>  	struct mlx4_qp *qp;
>  
> -	spin_lock(&qp_table->lock);
> -
> +	xa_lock(&dev->qp_table);
>  	qp = __mlx4_qp_lookup(dev, qpn);
>  	if (qp)
>  		refcount_inc(&qp->refcount);
> -
> -	spin_unlock(&qp_table->lock);
> +	xa_unlock(&dev->qp_table);
>  
>  	if (!qp) {
>  		mlx4_dbg(dev, "Async event for none existent QP
> %08x\n", qpn);
> @@ -390,21 +387,11 @@ static void mlx4_qp_free_icm(struct mlx4_dev
> *dev, int qpn)
>  
>  struct mlx4_qp *mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
>  {
> -	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
> -	struct mlx4_qp *qp;
> -
> -	spin_lock_irq(&qp_table->lock);
> -
> -	qp = __mlx4_qp_lookup(dev, qpn);
> -
> -	spin_unlock_irq(&qp_table->lock);
> -	return qp;
> +	return __mlx4_qp_lookup(dev, qpn);
>  }
>  
>  int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp)
>  {
> -	struct mlx4_priv *priv = mlx4_priv(dev);
> -	struct mlx4_qp_table *qp_table = &priv->qp_table;
>  	int err;
>  
>  	if (!qpn)
> @@ -416,10 +403,9 @@ int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn,
> struct mlx4_qp *qp)
>  	if (err)
>  		return err;
>  
> -	spin_lock_irq(&qp_table->lock);
> -	err = radix_tree_insert(&dev->qp_table_tree, qp->qpn &
> -				(dev->caps.num_qps - 1), qp);
> -	spin_unlock_irq(&qp_table->lock);
> +	err = xa_err(xa_store_irq(&dev->qp_table,
> +				qp->qpn & (dev->caps.num_qps - 1),
> +				qp, GFP_ATOMIC));

mlx4_qp_alloc might sleep, so GFP_KERNEL here.

>  	if (err)
>  		goto err_icm;
>  
> @@ -512,12 +498,11 @@ EXPORT_SYMBOL_GPL(mlx4_update_qp);
>  
>  void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp)
>  {
> -	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&qp_table->lock, flags);
> -	radix_tree_delete(&dev->qp_table_tree, qp->qpn & (dev-
> >caps.num_qps - 1));
> -	spin_unlock_irqrestore(&qp_table->lock, flags);
> +	xa_lock_irqsave(&dev->qp_table, flags);
> +	__xa_erase(&dev->qp_table, qp->qpn & (dev->caps.num_qps - 1));
> +	xa_unlock_irqrestore(&dev->qp_table, flags);
>  }
>  EXPORT_SYMBOL_GPL(mlx4_qp_remove);
>  
> @@ -760,7 +745,6 @@ static void mlx4_cleanup_qp_zones(struct mlx4_dev
> *dev)
>  
>  int mlx4_init_qp_table(struct mlx4_dev *dev)
>  {
> -	struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table;
>  	int err;
>  	int reserved_from_top = 0;
>  	int reserved_from_bot;
> @@ -770,8 +754,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
>  	u32 max_table_offset = dev->caps.dmfs_high_rate_qpn_base +
>  			dev->caps.dmfs_high_rate_qpn_range;
>  
> -	spin_lock_init(&qp_table->lock);
> -	INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC);
> +	xa_init_flags(&dev->qp_table, XA_FLAGS_LOCK_IRQ);
>  	if (mlx4_is_slave(dev))
>  		return 0;
>  
> diff --git a/include/linux/mlx4/device.h
> b/include/linux/mlx4/device.h
> index 36e412c3d657..acffca7d9f00 100644
> --- a/include/linux/mlx4/device.h
> +++ b/include/linux/mlx4/device.h
> @@ -36,7 +36,7 @@
>  #include <linux/if_ether.h>
>  #include <linux/pci.h>
>  #include <linux/completion.h>
> -#include <linux/radix-tree.h>
> +#include <linux/xarray.h>
>  #include <linux/cpu_rmap.h>
>  #include <linux/crash_dump.h>
>  
> @@ -889,7 +889,7 @@ struct mlx4_dev {
>  	struct mlx4_caps	caps;
>  	struct mlx4_phys_caps	phys_caps;
>  	struct mlx4_quotas	quotas;
> -	struct radix_tree_root	qp_table_tree;
> +	struct xarray		qp_table;
>  	u8			rev_id;
>  	u8			port_random_macs;
>  	char			board_id[MLX4_BOARD_ID_LEN];
> diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
> index 8e2828d48d7f..6c3ec3197a10 100644
> --- a/include/linux/mlx4/qp.h
> +++ b/include/linux/mlx4/qp.h
> @@ -488,7 +488,7 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct
> mlx4_mtt *mtt,
>  
>  static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev,
> u32 qpn)
>  {
> -	return radix_tree_lookup(&dev->qp_table_tree, qpn & (dev-
> >caps.num_qps - 1));
> +	return xa_load(&dev->qp_table, qpn & (dev->caps.num_qps - 1));
>  }
>  
>  void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);

^ permalink raw reply

* [PATCH v1 2/5] mdev: Make mdev alias unique among all mdevs
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190827191654.41161-1-parav@mellanox.com>

Mdev alias should be unique among all the mdevs, so that when such alias
is used by the mdev users to derive other objects, there is no
collision in a given system.

Signed-off-by: Parav Pandit <parav@mellanox.com>

---
Changelog:
v0->v1:
 - Fixed inclusiong of alias for NULL check
 - Added ratelimited debug print for sha1 hash collision error
---
 drivers/vfio/mdev/mdev_core.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 62d29f57fe0c..4b9899e40665 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -375,6 +375,13 @@ int mdev_device_create(struct kobject *kobj, struct device *dev,
 			ret = -EEXIST;
 			goto mdev_fail;
 		}
+		if (tmp->alias && alias && strcmp(tmp->alias, alias) == 0) {
+			mutex_unlock(&mdev_list_lock);
+			ret = -EEXIST;
+			dev_dbg_ratelimited(dev, "Hash collision in alias creation for UUID %pUl\n",
+					    uuid);
+			goto mdev_fail;
+		}
 	}
 
 	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
-- 
2.19.2


^ permalink raw reply related

* [PATCH v1 1/5] mdev: Introduce sha1 based mdev alias
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190827191654.41161-1-parav@mellanox.com>

Some vendor drivers want an identifier for an mdev device that is
shorter than the UUID, due to length restrictions in the consumers of
that identifier.

Add a callback that allows a vendor driver to request an alias of a
specified length to be generated for an mdev device. If generated,
that alias is checked for collisions.

It is an optional attribute.
mdev alias is generated using sha1 from the mdev name.

Signed-off-by: Parav Pandit <parav@mellanox.com>

---
Changelog:

v0->v1:
 - Moved alias length check outside of the parent lock
 - Moved alias and digest allocation from kvzalloc to kzalloc
 - &alias[0] changed to alias
 - alias_length check is nested under get_alias_length callback check
 - Changed comments to start with an empty line
 - Fixed cleaunup of hash if mdev_bus_register() fails
 - Added comment where alias memory ownership is handed over to mdev device
 - Updated commit log to indicate motivation for this feature
---
 drivers/vfio/mdev/mdev_core.c    | 110 ++++++++++++++++++++++++++++++-
 drivers/vfio/mdev/mdev_private.h |   5 +-
 drivers/vfio/mdev/mdev_sysfs.c   |  13 ++--
 include/linux/mdev.h             |   4 ++
 4 files changed, 122 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index b558d4cfd082..62d29f57fe0c 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -10,9 +10,11 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/uuid.h>
 #include <linux/sysfs.h>
 #include <linux/mdev.h>
+#include <crypto/hash.h>
 
 #include "mdev_private.h"
 
@@ -27,6 +29,8 @@ static struct class_compat *mdev_bus_compat_class;
 static LIST_HEAD(mdev_list);
 static DEFINE_MUTEX(mdev_list_lock);
 
+static struct crypto_shash *alias_hash;
+
 struct device *mdev_parent_dev(struct mdev_device *mdev)
 {
 	return mdev->parent->dev;
@@ -150,6 +154,16 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops)
 	if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups)
 		return -EINVAL;
 
+	if (ops->get_alias_length) {
+		unsigned int digest_size;
+		unsigned int aligned_len;
+
+		aligned_len = roundup(ops->get_alias_length(), 2);
+		digest_size = crypto_shash_digestsize(alias_hash);
+		if (aligned_len / 2 > digest_size)
+			return -EINVAL;
+	}
+
 	dev = get_device(dev);
 	if (!dev)
 		return -EINVAL;
@@ -259,6 +273,7 @@ static void mdev_device_free(struct mdev_device *mdev)
 	mutex_unlock(&mdev_list_lock);
 
 	dev_dbg(&mdev->dev, "MDEV: destroying\n");
+	kfree(mdev->alias);
 	kfree(mdev);
 }
 
@@ -269,18 +284,88 @@ static void mdev_device_release(struct device *dev)
 	mdev_device_free(mdev);
 }
 
-int mdev_device_create(struct kobject *kobj,
-		       struct device *dev, const guid_t *uuid)
+static const char *
+generate_alias(const char *uuid, unsigned int max_alias_len)
+{
+	struct shash_desc *hash_desc;
+	unsigned int digest_size;
+	unsigned char *digest;
+	unsigned int alias_len;
+	char *alias;
+	int ret = 0;
+
+	/*
+	 * Align to multiple of 2 as bin2hex will generate
+	 * even number of bytes.
+	 */
+	alias_len = roundup(max_alias_len, 2);
+	alias = kzalloc(alias_len + 1, GFP_KERNEL);
+	if (!alias)
+		return NULL;
+
+	/* Allocate and init descriptor */
+	hash_desc = kvzalloc(sizeof(*hash_desc) +
+			     crypto_shash_descsize(alias_hash),
+			     GFP_KERNEL);
+	if (!hash_desc)
+		goto desc_err;
+
+	hash_desc->tfm = alias_hash;
+
+	digest_size = crypto_shash_digestsize(alias_hash);
+
+	digest = kzalloc(digest_size, GFP_KERNEL);
+	if (!digest) {
+		ret = -ENOMEM;
+		goto digest_err;
+	}
+	crypto_shash_init(hash_desc);
+	crypto_shash_update(hash_desc, uuid, UUID_STRING_LEN);
+	crypto_shash_final(hash_desc, digest);
+	bin2hex(alias, digest, min_t(unsigned int, digest_size, alias_len / 2));
+	/*
+	 * When alias length is odd, zero out and additional last byte
+	 * that bin2hex has copied.
+	 */
+	if (max_alias_len % 2)
+		alias[max_alias_len] = 0;
+
+	kfree(digest);
+	kvfree(hash_desc);
+	return alias;
+
+digest_err:
+	kvfree(hash_desc);
+desc_err:
+	kfree(alias);
+	return NULL;
+}
+
+int mdev_device_create(struct kobject *kobj, struct device *dev,
+		       const char *uuid_str, const guid_t *uuid)
 {
 	int ret;
 	struct mdev_device *mdev, *tmp;
 	struct mdev_parent *parent;
 	struct mdev_type *type = to_mdev_type(kobj);
+	const char *alias = NULL;
 
 	parent = mdev_get_parent(type->parent);
 	if (!parent)
 		return -EINVAL;
 
+	if (parent->ops->get_alias_length) {
+		unsigned int alias_len;
+
+		alias_len = parent->ops->get_alias_length();
+		if (alias_len) {
+			alias = generate_alias(uuid_str, alias_len);
+			if (!alias) {
+				ret = -ENOMEM;
+				goto alias_fail;
+			}
+		}
+	}
 	mutex_lock(&mdev_list_lock);
 
 	/* Check for duplicate */
@@ -300,6 +385,12 @@ int mdev_device_create(struct kobject *kobj,
 	}
 
 	guid_copy(&mdev->uuid, uuid);
+	mdev->alias = alias;
+	/*
+	 * At this point alias memory is owned by the mdev.
+	 * Mark it NULL, so that only mdev can free it.
+	 */
+	alias = NULL;
 	list_add(&mdev->next, &mdev_list);
 	mutex_unlock(&mdev_list_lock);
 
@@ -346,6 +437,8 @@ int mdev_device_create(struct kobject *kobj,
 	up_read(&parent->unreg_sem);
 	put_device(&mdev->dev);
 mdev_fail:
+	kfree(alias);
+alias_fail:
 	mdev_put_parent(parent);
 	return ret;
 }
@@ -406,7 +499,17 @@ EXPORT_SYMBOL(mdev_get_iommu_device);
 
 static int __init mdev_init(void)
 {
-	return mdev_bus_register();
+	int ret;
+
+	alias_hash = crypto_alloc_shash("sha1", 0, 0);
+	if (!alias_hash)
+		return -ENOMEM;
+
+	ret = mdev_bus_register();
+	if (ret)
+		crypto_free_shash(alias_hash);
+
+	return ret;
 }
 
 static void __exit mdev_exit(void)
@@ -415,6 +518,7 @@ static void __exit mdev_exit(void)
 		class_compat_unregister(mdev_bus_compat_class);
 
 	mdev_bus_unregister();
+	crypto_free_shash(alias_hash);
 }
 
 module_init(mdev_init)
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index 7d922950caaf..cf1c0d9842c6 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -33,6 +33,7 @@ struct mdev_device {
 	struct kobject *type_kobj;
 	struct device *iommu_device;
 	bool active;
+	const char *alias;
 };
 
 #define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)
@@ -57,8 +58,8 @@ void parent_remove_sysfs_files(struct mdev_parent *parent);
 int  mdev_create_sysfs_files(struct device *dev, struct mdev_type *type);
 void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type);
 
-int  mdev_device_create(struct kobject *kobj,
-			struct device *dev, const guid_t *uuid);
+int mdev_device_create(struct kobject *kobj, struct device *dev,
+		       const char *uuid_str, const guid_t *uuid);
 int  mdev_device_remove(struct device *dev);
 
 #endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 7570c7602ab4..43afe0e80b76 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -63,15 +63,18 @@ static ssize_t create_store(struct kobject *kobj, struct device *dev,
 		return -ENOMEM;
 
 	ret = guid_parse(str, &uuid);
-	kfree(str);
 	if (ret)
-		return ret;
+		goto err;
 
-	ret = mdev_device_create(kobj, dev, &uuid);
+	ret = mdev_device_create(kobj, dev, str, &uuid);
 	if (ret)
-		return ret;
+		goto err;
 
-	return count;
+	ret = count;
+
+err:
+	kfree(str);
+	return ret;
 }
 
 MDEV_TYPE_ATTR_WO(create);
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 0ce30ca78db0..f036fe9854ee 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -72,6 +72,9 @@ struct device *mdev_get_iommu_device(struct device *dev);
  * @mmap:		mmap callback
  *			@mdev: mediated device structure
  *			@vma: vma structure
+ * @get_alias_length:	Generate alias for the mdevs of this parent based on the
+ *			mdev device name when it returns non zero alias length.
+ *			It is optional.
  * Parent device that support mediated device should be registered with mdev
  * module with mdev_parent_ops structure.
  **/
@@ -92,6 +95,7 @@ struct mdev_parent_ops {
 	long	(*ioctl)(struct mdev_device *mdev, unsigned int cmd,
 			 unsigned long arg);
 	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
+	unsigned int (*get_alias_length)(void);
 };
 
 /* interface for exporting mdev supported type attributes */
-- 
2.19.2


^ permalink raw reply related

* [PATCH v1 5/5] mtty: Optionally support mtty alias
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190827191654.41161-1-parav@mellanox.com>

Provide a module parameter to set alias length to optionally generate
mdev alias.

Example to request mdev alias.
$ modprobe mtty alias_length=12

Signed-off-by: Parav Pandit <parav@mellanox.com>
---
 samples/vfio-mdev/mtty.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index 92e770a06ea2..92208245b057 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1410,6 +1410,15 @@ static struct attribute_group *mdev_type_groups[] = {
 	NULL,
 };
 
+static unsigned int mtty_alias_length;
+module_param_named(alias_length, mtty_alias_length, uint, 0444);
+MODULE_PARM_DESC(alias_length, "mdev alias length; default=0");
+
+static unsigned int mtty_get_alias_length(void)
+{
+	return mtty_alias_length;
+}
+
 static const struct mdev_parent_ops mdev_fops = {
 	.owner                  = THIS_MODULE,
 	.dev_attr_groups        = mtty_dev_groups,
@@ -1422,6 +1431,7 @@ static const struct mdev_parent_ops mdev_fops = {
 	.read                   = mtty_read,
 	.write                  = mtty_write,
 	.ioctl		        = mtty_ioctl,
+	.get_alias_length	= mtty_get_alias_length
 };
 
 static void mtty_device_release(struct device *dev)
-- 
2.19.2


^ permalink raw reply related

* [PATCH v1 4/5] mdev: Update sysfs documentation
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190827191654.41161-1-parav@mellanox.com>

Updated documentation for optional read only sysfs attribute.

Signed-off-by: Parav Pandit <parav@mellanox.com>
---
 Documentation/driver-api/vfio-mediated-device.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
index 25eb7d5b834b..0ab03d3f5629 100644
--- a/Documentation/driver-api/vfio-mediated-device.rst
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -270,6 +270,7 @@ Directories and Files Under the sysfs for Each mdev Device
          |--- remove
          |--- mdev_type {link to its type}
          |--- vendor-specific-attributes [optional]
+         |--- alias [optional]
 
 * remove (write only)
 
@@ -281,6 +282,10 @@ Example::
 
 	# echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
 
+* alias (read only)
+Whenever a parent requested to generate an alias, each mdev is assigned a unique
+alias by the mdev core. This file shows the alias of the mdev device.
+
 Mediated device Hot plug
 ------------------------
 
-- 
2.19.2


^ permalink raw reply related

* [PATCH v1 3/5] mdev: Expose mdev alias in sysfs tree
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190827191654.41161-1-parav@mellanox.com>

Expose the optional alias for an mdev device as a sysfs attribute.
This way, userspace tools such as udev may make use of the alias, for
example to create a netdevice name for the mdev.

Signed-off-by: Parav Pandit <parav@mellanox.com>

---
Changelog:
v0->v1:
 - Addressed comments from Cornelia Huck
 - Updated commit description
---
 drivers/vfio/mdev/mdev_sysfs.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 43afe0e80b76..59f4e3cc5233 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -246,7 +246,20 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR_WO(remove);
 
+static ssize_t alias_show(struct device *device,
+			  struct device_attribute *attr, char *buf)
+{
+	struct mdev_device *dev = mdev_from_dev(device);
+
+	if (!dev->alias)
+		return -EOPNOTSUPP;
+
+	return sprintf(buf, "%s\n", dev->alias);
+}
+static DEVICE_ATTR_RO(alias);
+
 static const struct attribute *mdev_device_attrs[] = {
+	&dev_attr_alias.attr,
 	&dev_attr_remove.attr,
 	NULL,
 };
-- 
2.19.2


^ permalink raw reply related

* [PATCH v1 0/5] Introduce variable length mdev alias
From: Parav Pandit @ 2019-08-27 19:16 UTC (permalink / raw)
  To: alex.williamson, jiri, kwankhede, cohuck, davem
  Cc: kvm, linux-kernel, netdev, Parav Pandit
In-Reply-To: <20190826204119.54386-1-parav@mellanox.com>

To have consistent naming for the netdevice of a mdev and to have
consistent naming of the devlink port [1] of a mdev, which is formed using
phys_port_name of the devlink port, current UUID is not usable because
UUID is too long.

UUID in string format is 36-characters long and in binary 128-bit.
Both formats are not able to fit within 15 characters limit of netdev
name.

It is desired to have mdev device naming consistent using UUID.
So that widely used user space framework such as ovs [2] can make use
of mdev representor in similar way as PCIe SR-IOV VF and PF representors.

Hence,
(a) mdev alias is created which is derived using sha1 from the mdev name.
(b) Vendor driver describes how long an alias should be for the child mdev
created for a given parent.
(c) Mdev aliases are unique at system level.
(d) alias is created optionally whenever parent requested.
This ensures that non networking mdev parents can function without alias
creation overhead.

This design is discussed at [3].

An example systemd/udev extension will have,

1. netdev name created using mdev alias available in sysfs.

mdev UUID=83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
mdev 12 character alias=cd5b146a80a5

netdev name of this mdev = enmcd5b146a80a5
Here en = Ethernet link
m = mediated device

2. devlink port phys_port_name created using mdev alias.
devlink phys_port_name=pcd5b146a80a5

This patchset enables mdev core to maintain unique alias for a mdev.

Patch-1 Introduces mdev alias using sha1.
Patch-2 Ensures that mdev alias is unique in a system.
Patch-3 Exposes mdev alias in a sysfs hirerchy.
Patch-4 Extends mtty driver to optionally provide alias generation.
This also enables to test UUID based sha1 collision and trigger
error handling for duplicate sha1 results.

In future when networking driver wants to use mdev alias, mdev_alias()
API will be added to derive devlink port name.

[1] http://man7.org/linux/man-pages/man8/devlink-port.8.html
[2] https://docs.openstack.org/os-vif/latest/user/plugins/ovs.html
[3] https://patchwork.kernel.org/cover/11084231/

---
Changelog:

v0->v1:
 - Addressed comments from Alex Williamson, Cornelia Hunk and Mark Bloch
 - Moved alias length check outside of the parent lock
 - Moved alias and digest allocation from kvzalloc to kzalloc
 - &alias[0] changed to alias
 - alias_length check is nested under get_alias_length callback check
 - Changed comments to start with an empty line
 - Added comment where alias memory ownership is handed over to mdev device
 - Fixed cleaunup of hash if mdev_bus_register() fails
 - Updated documentation for new sysfs alias file
 - Improved commit logs to make description more clear
 - Fixed inclusiong of alias for NULL check
 - Added ratelimited debug print for sha1 hash collision error

Parav Pandit (5):
  mdev: Introduce sha1 based mdev alias
  mdev: Make mdev alias unique among all mdevs
  mdev: Expose mdev alias in sysfs tree
  mdev: Update sysfs documentation
  mtty: Optionally support mtty alias

 .../driver-api/vfio-mediated-device.rst       |   5 +
 drivers/vfio/mdev/mdev_core.c                 | 117 +++++++++++++++++-
 drivers/vfio/mdev/mdev_private.h              |   5 +-
 drivers/vfio/mdev/mdev_sysfs.c                |  26 +++-
 include/linux/mdev.h                          |   4 +
 samples/vfio-mdev/mtty.c                      |  10 ++
 6 files changed, 157 insertions(+), 10 deletions(-)

-- 
2.19.2

^ permalink raw reply

* [PATCH net] tcp: inherit timestamp on mtu probe
From: Willem de Bruijn @ 2019-08-27 19:09 UTC (permalink / raw)
  To: netdev; +Cc: davem, edumazet, jakub.kicinski, Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

TCP associates tx timestamp requests with a byte in the bytestream.
If merging skbs in tcp_mtu_probe, migrate the tstamp request.

Similar to MSG_EOR, do not allow moving a timestamp from any segment
in the probe but the last. This to avoid merging multiple timestamps.

Tested with the packetdrill script at
https://github.com/wdebruij/packetdrill/commits/mtu_probe-1

Link: http://patchwork.ozlabs.org/patch/1143278/#2232897
Fixes: 4ed2d765dfac ("net-timestamp: TCP timestamping")
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/tcp_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5c46bc4c7e8d..42abc9bd687a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2053,7 +2053,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
 		if (len <= skb->len)
 			break;
 
-		if (unlikely(TCP_SKB_CB(skb)->eor))
+		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
 			return false;
 
 		len -= skb->len;
@@ -2170,6 +2170,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			 * we need to propagate it to the new skb.
 			 */
 			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+			tcp_skb_collapse_tstamp(nskb, skb);
 			tcp_unlink_write_queue(skb, sk);
 			sk_wmem_free_skb(sk, skb);
 		} else {
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related

* Re: [PATCH] net: intel: Cleanup e1000 - add space between }}
From: Joe Perches @ 2019-08-27 19:07 UTC (permalink / raw)
  To: jeffrey.t.kirsher, Forrest Fleming, Andrew Morton
  Cc: David S. Miller, intel-wired-lan, netdev, linux-kernel
In-Reply-To: <c40b4043424055fc4dae97771bb46c8ab15c6230.camel@intel.com>

On Tue, 2019-08-27 at 12:02 -0700, Jeff Kirsher wrote:
> On Mon, 2019-08-26 at 20:41 -0700, Joe Perches wrote:
> > On Mon, 2019-08-26 at 01:03 -0700, Jeff Kirsher wrote:
> > > On Fri, 2019-08-23 at 19:14 +0000, Forrest Fleming wrote:
> > > > suggested by checkpatch
> > > > 
> > > > Signed-off-by: Forrest Fleming <ffleming@gmail.com>
> > > > ---
> > > >  .../net/ethernet/intel/e1000/e1000_param.c    | 28 +++++++++--
> > > > --------
> > > >  1 file changed, 14 insertions(+), 14 deletions(-)
> > > 
> > > While I do not see an issue with this change, I wonder how
> > > important it is
> > > to make such a change.  Especially since most of the hardware
> > > supported by
> > > this driver is not available for testing.  In addition, this is one
> > > suggested change by checkpatch.pl that I personally do not agree
> > > with.
> > 
> > I think checkpatch should allow consecutive }}.
> 
> Agreed, have you already submitted a formal patch Joe with the
> suggested change below?

No.

>   If so, I will ACK it.

Of course you can add an Acked-by:

> > Maybe:
> > ---
> > diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
> > index 287fe73688f0..ac5e0f06e1af 100755
> > --- a/scripts/checkpatch.pl
> > +++ b/scripts/checkpatch.pl
> > @@ -4687,7 +4687,7 @@ sub process {
> >  
> >  # closing brace should have a space following it when it has
> > anything
> >  # on the line
> > -		if ($line =~ /}(?!(?:,|;|\)))\S/) {
> > +		if ($line =~ /}(?!(?:,|;|\)|\}))\S/) {
> >  			if (ERROR("SPACING",
> >  				  "space required after that close
> > brace '}'\n" . $herecurr) &&
> >  			    $fix) {
> > 
> > 


^ permalink raw reply

* [PATCH v3 1/1] netfilter: nf_tables: fib: Drop IPV6 packets if IPv6 is disabled on boot
From: Leonardo Bras @ 2019-08-27 18:57 UTC (permalink / raw)
  To: netfilter-devel, coreteam, netdev, linux-kernel
  Cc: Leonardo Bras, Pablo Neira Ayuso, Jozsef Kadlecsik,
	Florian Westphal, David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI

If IPv6 is disabled on boot (ipv6.disable=1), but nft_fib_inet ends up
dealing with a IPv6 packet, it causes a kernel panic in
fib6_node_lookup_1(), crashing in bad_page_fault.

The panic is caused by trying to deference a very low address (0x38
in ppc64le), due to ipv6.fib6_main_tbl = NULL.
BUG: Kernel NULL pointer dereference at 0x00000038

Fix this behavior by dropping IPv6 packets if !ipv6_mod_enabled().

Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
---
Changes from v2:
- Replace veredict.code from NF_DROP to NFT_BREAK
- Updated commit message (s/package/packet)

Changes from v1:
- Move drop logic from nft_fib_inet_eval() to nft_fib6_eval{,_type}
so it can affect other usages of these functions.

 net/ipv6/netfilter/nft_fib_ipv6.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 7ece86afd079..8496e43b73bd 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -125,6 +125,11 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
 	u32 *dest = &regs->data[priv->dreg];
 	struct ipv6hdr *iph, _iph;
 
+	if (!ipv6_mod_enabled()) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
 	iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
 	if (!iph) {
 		regs->verdict.code = NFT_BREAK;
@@ -150,6 +155,11 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	struct rt6_info *rt;
 	int lookup_flags;
 
+	if (!ipv6_mod_enabled()) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+
 	if (priv->flags & NFTA_FIB_F_IIF)
 		oif = nft_in(pkt);
 	else if (priv->flags & NFTA_FIB_F_OIF)
-- 
2.20.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox