Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v4 bpf-next 2/4] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>

Implement permissions as stated in uapi/linux/capability.h

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c         |  2 +-
 kernel/bpf/cgroup.c           |  2 +-
 kernel/bpf/core.c             |  4 ++--
 kernel/bpf/hashtab.c          |  4 ++--
 kernel/bpf/lpm_trie.c         |  2 +-
 kernel/bpf/queue_stack_maps.c |  2 +-
 kernel/bpf/reuseport_array.c  |  2 +-
 kernel/bpf/stackmap.c         |  2 +-
 kernel/bpf/syscall.c          | 32 +++++++++++++++++++-------------
 kernel/bpf/verifier.c         |  2 +-
 kernel/trace/bpf_trace.c      |  2 +-
 net/core/bpf_sk_storage.c     |  2 +-
 net/core/filter.c             | 10 ++++++----
 13 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..149f868a02dc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
+	bool unpriv = !capable_bpf();
 	u64 cost, array_size, mask64;
 	struct bpf_map_memory mem;
 	struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..9c659ba5c146 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
+		if (capable_bpf_tracing())
 			return bpf_get_trace_printk_proto();
 		/* fall through */
 	default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 66088a9e9b9e..6643099bc64b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !capable(CAP_SYS_ADMIN))
+	    !capable_bpf())
 		return;
 
 	spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
 {
 	if (atomic_long_add_return(pages, &bpf_jit_current) >
 	    (bpf_jit_limit >> PAGE_SHIFT)) {
-		if (!capable(CAP_SYS_ADMIN)) {
+		if (!capable_bpf()) {
 			atomic_long_sub(pages, &bpf_jit_current);
 			return -EPERM;
 		}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..0fae5c45f425 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
 
-	if (lru && !capable(CAP_SYS_ADMIN))
+	if (lru && !capable_bpf())
 		/* LRU implementation is much complicated than other
-		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
+		 * maps.  Hence, limit to CAP_BPF.
 		 */
 		return -EPERM;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..11da3be8a4e5 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..d83afac32863 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
 /* Called from syscall */
 static int queue_stack_map_alloc_check(union bpf_attr *attr)
 {
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..b268fe4b2972 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	struct bpf_map_memory mem;
 	u64 array_size;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return ERR_PTR(-EPERM);
 
 	array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..477063c63b27 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u64 cost, n_buckets;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_tracing())
 		return ERR_PTR(-EPERM);
 
 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..cd2d1b21f0f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
 		err = -EBUSY;
 		goto err_put;
 	}
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable_bpf()) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -1635,7 +1635,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable_bpf())
 		return -EPERM;
 
 	/* copy eBPF program license from user space */
@@ -1648,11 +1648,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	is_gpl = license_is_gpl_compatible(license);
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+	    attr->insn_cnt > (capable_bpf() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable_bpf())
 		return -EPERM;
 
 	bpf_prog_load_fixup_attach_type(attr);
@@ -1809,6 +1809,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	char tp_name[128];
 	int tp_fd, err;
 
+	if (!capable_bpf_tracing())
+		return -EPERM;
+
 	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
 			      sizeof(tp_name) - 1) < 0)
 		return -EFAULT;
@@ -2087,7 +2090,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf_net_admin())
+		/* test_run callback is available for networking progs only.
+		 * Add capable_bpf_tracing() above when tracing progs become runnable.
+		 */
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
@@ -2124,7 +2130,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	next_id++;
@@ -2150,7 +2156,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	spin_lock_bh(&prog_idr_lock);
@@ -2184,7 +2190,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2359,7 +2365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.run_time_ns = stats.nsecs;
 	info.run_cnt = stats.cnt;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable_bpf()) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
@@ -2677,7 +2683,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	return btf_new_fd(attr);
@@ -2690,7 +2696,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	return btf_get_fd_by_id(attr->btf_id);
@@ -2759,7 +2765,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf_tracing())
 		return -EPERM;
 
 	if (attr->task_fd_query.flags != 0)
@@ -2827,7 +2833,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+	if (sysctl_unprivileged_bpf_disabled && !capable_bpf())
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb50757e812..7e519711c689 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9234,7 +9234,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
-	is_priv = capable(CAP_SYS_ADMIN);
+	is_priv = capable_bpf();
 
 	/* grab the mutex to protect few globals used by verifier */
 	if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..cdf8d6c8a430 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf_tracing())
 		return -EPERM;
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..aa74be21f5b6 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return -EPERM;
 
 	if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..b233ed8438f1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable_bpf())
 		return NULL;
 
 	switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_spin_unlock:
 		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		return bpf_get_trace_printk_proto();
+		if (capable_bpf_tracing())
+			return bpf_get_trace_printk_proto();
+		/* fall through */
 	default:
 		return NULL;
 	}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable_bpf())
 			return false;
 		break;
 	}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!capable(CAP_SYS_ADMIN))
+			if (!capable_bpf())
 				return false;
 			break;
 		default:
-- 
2.20.0


^ permalink raw reply related

* [PATCH v4 bpf-next 4/4] selftests/bpf: use CAP_BPF and CAP_TRACING in tests
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>

Make all test_verifier tests exercise CAP_BPF and CAP_TRACING

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++----
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d27fd929abb9..0d5567962c4e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -807,10 +807,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 	}
 }
 
+struct libcap {
+	struct __user_cap_header_struct hdr;
+	struct __user_cap_data_struct data[2];
+};
+
 static int set_admin(bool admin)
 {
 	cap_t caps;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
+	 * and CAP_TRACING to create stackmap
+	 */
+	const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+	const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+	struct libcap *cap;
 	int ret = -1;
 
 	caps = cap_get_proc();
@@ -818,11 +828,26 @@ static int set_admin(bool admin)
 		perror("cap_get_proc");
 		return -1;
 	}
-	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+	cap = (struct libcap *)caps;
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+		perror("cap_set_flag clear admin");
+		goto out;
+	}
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
 				admin ? CAP_SET : CAP_CLEAR)) {
-		perror("cap_set_flag");
+		perror("cap_set_flag set_or_clear net");
 		goto out;
 	}
+	/* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
+	 * so update effective bits manually
+	 */
+	if (admin) {
+		cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
+		cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
+	} else {
+		cap->data[1].effective &= ~(1 << (38 - 32));
+		cap->data[1].effective &= ~(1 << (39 - 32));
+	}
 	if (cap_set_proc(caps)) {
 		perror("cap_set_proc");
 		goto out;
@@ -1051,9 +1076,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 
 static bool is_admin(void)
 {
+	cap_flag_value_t net_priv = CAP_CLEAR;
+	bool tracing_priv = false;
+	bool bpf_priv = false;
+	struct libcap *cap;
 	cap_t caps;
-	cap_flag_value_t sysadmin = CAP_CLEAR;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
 
 #ifdef CAP_IS_SUPPORTED
 	if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1066,11 +1093,14 @@ static bool is_admin(void)
 		perror("cap_get_proc");
 		return false;
 	}
-	if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
-		perror("cap_get_flag");
+	cap = (struct libcap *)caps;
+	bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+	tracing_priv = cap->data[1].effective & (1 << (39/* CAP_TRACING */ - 32));
+	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+		perror("cap_get_flag NET");
 	if (cap_free(caps))
 		perror("cap_free");
-	return (sysadmin == CAP_SET);
+	return bpf_priv && tracing_priv && net_priv == CAP_SET;
 }
 
 static void get_unpriv_disabled()
-- 
2.20.0


^ permalink raw reply related

* [PATCH v4 bpf-next 1/4] capability: introduce CAP_BPF and CAP_TRACING
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
  To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>

Split BPF and perf/tracing operations that are allowed under
CAP_SYS_ADMIN into corresponding CAP_BPF and CAP_TRACING.
For backward compatibility include them in CAP_SYS_ADMIN as well.

The end result provides a simple safety model for applications that use BPF:
- for tracing program types
  BPF_PROG_TYPE_{KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT, etc}
  use CAP_BPF and CAP_TRACING
- for networking program types
  BPF_PROG_TYPE_{SCHED_CLS, XDP, CGROUP_SKB, SK_SKB, etc}
  use CAP_BPF and CAP_NET_ADMIN

There are a few exceptions to this simple rule:
- bpf_trace_printk() is allowed in networking programs, but it uses the
  ftrace mechanism, hence this helper needs additional CAP_TRACING.
- cpumap is used by XDP programs. Currently it's kept under CAP_SYS_ADMIN,
  but could be relaxed to CAP_NET_ADMIN in the future.
- BPF_F_ZERO_SEED flag for hash/lru map is allowed under CAP_SYS_ADMIN only
  to discourage production use.
- BPF HW offload is allowed under CAP_SYS_ADMIN.
- cg_sysctl, cg_device, lirc program types are neither networking nor tracing.
  They can be loaded under CAP_BPF, but attach is allowed under CAP_NET_ADMIN.
  This will be cleaned up in the future.

userid=nobody + (CAP_TRACING | CAP_NET_ADMIN) + CAP_BPF is safer than the
typical setup with userid=root and sudo used by existing bpf applications.
It's still not secure, since these capabilities:
- allow bpf progs to access arbitrary memory
- let tasks access any bpf map
- let tasks attach/detach any bpf prog

bpftool, bpftrace, bcc tools binaries should not be installed with
cap_bpf+cap_tracing, since unpriv users will be able to read kernel secrets.

CAP_BPF, CAP_NET_ADMIN, CAP_TRACING are roughly equal in terms of
damage they can make to the system.
Example:
CAP_NET_ADMIN can stop network traffic. CAP_BPF can write into map
and if that map is used by firewall-like bpf prog the network traffic
may stop.
CAP_BPF allows many bpf prog_load commands in parallel. The verifier
may consume a large amount of memory and significantly slow down the system.
CAP_TRACING allows many kprobes that can slow down the system.

In the future more fine-grained bpf permissions may be added.

Existing unprivileged BPF operations are not affected.
In particular unprivileged users are allowed to load socket_filter and cg_skb
program types and to create array, hash, prog_array, map-in-map map types.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/capability.h          | 18 +++++++++++
 include/uapi/linux/capability.h     | 49 ++++++++++++++++++++++++++++-
 security/selinux/include/classmap.h |  4 +--
 3 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..13eb49c75797 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -247,6 +247,24 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
 	return true;
 }
 #endif /* CONFIG_MULTIUSER */
+
+static inline bool capable_bpf(void)
+{
+	return capable(CAP_SYS_ADMIN) || capable(CAP_BPF);
+}
+static inline bool capable_tracing(void)
+{
+	return capable(CAP_SYS_ADMIN) || capable(CAP_TRACING);
+}
+static inline bool capable_bpf_tracing(void)
+{
+	return capable(CAP_SYS_ADMIN) || (capable(CAP_BPF) && capable(CAP_TRACING));
+}
+static inline bool capable_bpf_net_admin(void)
+{
+	return (capable(CAP_SYS_ADMIN) || capable(CAP_BPF)) && capable(CAP_NET_ADMIN);
+}
+
 extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..fe01d8235e1e 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -274,6 +274,7 @@ struct vfs_ns_cap_data {
    arbitrary SCSI commands */
 /* Allow setting encryption key on loopback filesystem */
 /* Allow setting zone reclaim policy */
+/* Allow everything under CAP_BPF and CAP_TRACING for backward compatibility */
 
 #define CAP_SYS_ADMIN        21
 
@@ -366,8 +367,54 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_READ		37
 
+/*
+ * CAP_BPF allows the following BPF operations:
+ * - Loading all types of BPF programs
+ * - Creating all types of BPF maps except:
+ *    - stackmap that needs CAP_TRACING
+ *    - devmap that needs CAP_NET_ADMIN
+ *    - cpumap that needs CAP_SYS_ADMIN
+ * - Advanced verifier features
+ *   - Indirect variable access
+ *   - Bounded loops
+ *   - BPF to BPF function calls
+ *   - Scalar precision tracking
+ *   - Larger complexity limits
+ *   - Dead code elimination
+ *   - And potentially other features
+ * - Use of pointer-to-integer conversions in BPF programs
+ * - Bypassing of speculation attack hardening measures
+ * - Loading BPF Type Format (BTF) data
+ * - Iterate system wide loaded programs, maps, BTF objects
+ * - Retrieve xlated and JITed code of BPF programs
+ * - Access maps and programs via id
+ * - Use bpf_spin_lock() helper
+ *
+ * CAP_BPF and CAP_TRACING together allow the following:
+ * - bpf_probe_read to read arbitrary kernel memory
+ * - bpf_trace_printk to print data to ftrace ring buffer
+ * - Attach to raw_tracepoint
+ * - Query association between kprobe/tracepoint and bpf program
+ *
+ * CAP_BPF and CAP_NET_ADMIN together allow the following:
+ * - Attach to cgroup-bpf hooks and query
+ * - skb, xdp, flow_dissector test_run command
+ *
+ * CAP_NET_ADMIN allows:
+ * - Attach networking bpf programs to xdp, tc, lwt, flow dissector
+ */
+#define CAP_BPF			38
+
+/*
+ * CAP_TRACING allows:
+ * - Full use of perf_event_open(), similarly to the effect of
+ *   kernel.perf_event_paranoid == -1
+ * - Creation of [ku][ret]probe
+ * - Attach tracing bpf programs to perf events
+ */
+#define CAP_TRACING		39
 
-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_LAST_CAP         CAP_TRACING
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e588a29..0b364e245163 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -26,9 +26,9 @@
 	    "audit_control", "setfcap"
 
 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-		"wake_alarm", "block_suspend", "audit_read"
+		"wake_alarm", "block_suspend", "audit_read", "bpf", "tracing"
 
-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_TRACING
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif
 
-- 
2.20.0


^ permalink raw reply related

* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Yonghong Song @ 2019-09-06 23:21 UTC (permalink / raw)
  To: Al Viro, Carlos Neira
  Cc: netdev@vger.kernel.org, ebiederm@xmission.com, brouer@redhat.com,
	bpf@vger.kernel.org
In-Reply-To: <20190906160020.GX1131@ZenIV.linux.org.uk>



On 9/6/19 9:00 AM, Al Viro wrote:
> On Fri, Sep 06, 2019 at 04:46:47PM +0100, Al Viro wrote:
> 
>>> Where do I begin?
>>> 	* getname_kernel() is there for purpose
>>> 	* so's kern_path(), damnit
>>
>> Oh, and filename_lookup() *CAN* sleep, obviously.  So that
>> GFP_ATOMIC above is completely pointless.
>>
>>>> +
>>>> +	inode = d_backing_inode(kp.dentry);
>>>> +	pidns_info->dev = (u32)inode->i_rdev;
> 
> In the original variant of patchset it used to be ->i_sb->s_dev,
> which is also bloody strange - you are not asking filename_lookup()
> to follow symlinks, so you'd get that of whatever filesystem
> /proc/self/ns resides on.
> 
> ->i_rdev use makes no sense whatsoever - it's a symlink and
> neither it nor its target are device nodes; ->i_rdev will be
> left zero for both.
> 
> What data are you really trying to get there?

Let me explain a little bit of background here.
The ultimate goal is for bpf program to filter over
(pid_namespace, tgid/pid inside pid_namespace)
so bpf based tools can run inside the container.

Typically, the pid namespace is obtained by looking at
/proc/self/ns/pid:
-bash-4.4$ lsns
         NS TYPE   NPROCS   PID USER COMMAND
4026531835 cgroup     44  8261 yhs  /usr/lib/systemd/systemd --user
4026531836 pid        44  8261 yhs  /usr/lib/systemd/systemd --user
4026531837 user       44  8261 yhs  /usr/lib/systemd/systemd --user
4026531838 uts        44  8261 yhs  /usr/lib/systemd/systemd --user
4026531839 ipc        44  8261 yhs  /usr/lib/systemd/systemd --user
4026531840 mnt        44  8261 yhs  /usr/lib/systemd/systemd --user
4026532008 net        44  8261 yhs  /usr/lib/systemd/systemd --user
-bash-4.4$ readlink /proc/self/ns/pid
pid:[4026531836]
-bash-4.4$ stat /proc/self/ns/pid
   File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
   Size: 0               Blocks: 0          IO Block: 1024   symbolic link
Device: 4h/4d   Inode: 344795989   Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (128203/     yhs)   Gid: (  100/   users)
Context: user_u:base_r:base_t
Access: 2019-09-06 16:06:09.431616380 -0700
Modify: 2019-09-06 16:06:09.431616380 -0700
Change: 2019-09-06 16:06:09.431616380 -0700
  Birth: -
-bash-4.4$

Based on a discussion with Eric Biederman back in 2019 Linux
Plumbers, Eric suggested that, to uniquely identify a
namespace, the device id (major/minor numbers) should also
be included. Although today's kernel implementation
uses the same device for all namespace pseudo files,
from a uapi perspective the device id should be included.

That is the reason why we try to get the id of the device that holds
the pid namespace pseudo file.

Do you have a better suggestion on how to get
the device id for the 'current' pid namespace? Or, by design,
should we really not care about the device id at all?

^ permalink raw reply

* Re: [PATCH v2] net: enable wireless core features with LEGACY_WEXT_ALLCONFIG
From: Greg KH @ 2019-09-06 23:30 UTC (permalink / raw)
  To: Mark Salyzyn
  Cc: linux-kernel, kernel-team, Johannes Berg, David S. Miller,
	Marcel Holtmann, linux-wireless, netdev, stable
In-Reply-To: <20190906192403.195620-1-salyzyn@android.com>

On Fri, Sep 06, 2019 at 12:24:00PM -0700, Mark Salyzyn wrote:
> In embedded environments the requirements are to be able to pick and
> choose which features one requires built into the kernel.  If an
> embedded environment wants to support loading modules that have been
> kbuilt out of tree, there is a need to enable hidden configurations
> for legacy wireless core features to provide the API surface for
> them to load.
> 
> Introduce CONFIG_LEGACY_WEXT_ALLCONFIG to select all legacy wireless
> extension core features by activating in turn all the associated
> hidden configuration options, without having to specifically select
> any wireless module(s).
> 
> Signed-off-by: Mark Salyzyn <salyzyn@android.com>
> Cc: kernel-team@android.com
> Cc: Johannes Berg <johannes@sipsolutions.net>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Marcel Holtmann <marcel@holtmann.org>
> Cc: linux-wireless@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: stable@vger.kernel.org # 4.19
> ---
> v2: change name and documentation to CONFIG_LEGACY_WEXT_ALLCONFIG
> ---
>  net/wireless/Kconfig | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
> index 67f8360dfcee..0d646cf28de5 100644
> --- a/net/wireless/Kconfig
> +++ b/net/wireless/Kconfig
> @@ -17,6 +17,20 @@ config WEXT_SPY
>  config WEXT_PRIV
>  	bool
>  
> +config LEGACY_WEXT_ALLCONFIG
> +	bool "allconfig for legacy wireless extensions"
> +	select WIRELESS_EXT
> +	select WEXT_CORE
> +	select WEXT_PROC
> +	select WEXT_SPY
> +	select WEXT_PRIV
> +	help
> +	  Config option used to enable all the legacy wireless extensions to
> +	  the core functionality used by add-in modules.
> +
> +	  If you are not building a kernel to be used for a variety of
> +	  out-of-kernel built wireless modules, say N here.
> +
>  config CFG80211
>  	tristate "cfg80211 - wireless configuration API"
>  	depends on RFKILL || !RFKILL
> -- 
> 2.23.0.187.g17f5b7556c-goog
> 

How is this patch applicable to stable kernels???

^ permalink raw reply

* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Alexei Starovoitov @ 2019-09-06 23:31 UTC (permalink / raw)
  To: Ivan Khoronzhuk
  Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
	linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190904212212.13052-3-ivan.khoronzhuk@linaro.org>

On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
> No need to set --target for native build, at least for arm, the
> default target will be used anyway. In case of arm, for at least
> clang 5 - 10 it causes error like:
> 
> clang: warning: unknown platform, assuming -mfloat-abi=soft
> LLVM ERROR: Unsupported calling convention
> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
> 
> Only set to real triple helps: --target=arm-linux-gnueabihf
> or just drop the target key to use default one. Decision to just
> drop it and thus default target will be used (which is native),
> looks better.
> 
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> ---
>  samples/bpf/Makefile | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 61b7394b811e..a2953357927e 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
>  ifdef CROSS_COMPILE
>  HOSTCC = $(CROSS_COMPILE)gcc
>  CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
> -else
> -CLANG_ARCH_ARGS = -target $(ARCH)
>  endif

I don't follow here.
Didn't you introduce this bug in patch 1 and now fixing it in patch 2?


^ permalink raw reply

* Re: [PATCH bpf-next 8/8] samples: bpf: Makefile: base progs build on Makefile.progs
From: Alexei Starovoitov @ 2019-09-06 23:34 UTC (permalink / raw)
  To: Ivan Khoronzhuk
  Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
	linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190904212212.13052-9-ivan.khoronzhuk@linaro.org>

On Thu, Sep 05, 2019 at 12:22:12AM +0300, Ivan Khoronzhuk wrote:
> +
> +If you need to use the environment of the target board, SYSROOT can also be set,
> +pointing to the FS of the target board:
> +
> +make samples/bpf/ LLC=~/git/llvm/build/bin/llc \
> +     CLANG=~/git/llvm/build/bin/clang \
> +     SYSROOT=~/some_sdk/linux-devkit/sysroots/aarch64-linux-gnu

Patches 7 and 8 look quite heavy. I don't have a way to test them
which makes me a bit uneasy to accept them as-is.
Would be great if somebody could give Tested-by.


^ permalink raw reply

* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Ivan Khoronzhuk @ 2019-09-06 23:52 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
	linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190906233138.4d4fqdnlbikemhau@ast-mbp.dhcp.thefacebook.com>

On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
>On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
>> No need to set --target for native build, at least for arm, the
>> default target will be used anyway. In case of arm, for at least
>> clang 5 - 10 it causes error like:
>>
>> clang: warning: unknown platform, assuming -mfloat-abi=soft
>> LLVM ERROR: Unsupported calling convention
>> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
>> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
>>
>> Only set to real triple helps: --target=arm-linux-gnueabihf
>> or just drop the target key to use default one. Decision to just
>> drop it and thus default target will be used (which is native),
>> looks better.
>>
>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> ---
>>  samples/bpf/Makefile | 2 --
>>  1 file changed, 2 deletions(-)
>>
>> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
>> index 61b7394b811e..a2953357927e 100644
>> --- a/samples/bpf/Makefile
>> +++ b/samples/bpf/Makefile
>> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
>>  ifdef CROSS_COMPILE
>>  HOSTCC = $(CROSS_COMPILE)gcc
>>  CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
>> -else
>> -CLANG_ARCH_ARGS = -target $(ARCH)
>>  endif
>
>I don't follow here.
>Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
>

It looks like that, but that's not true.
Previous patch adds target only for cross compiling,
before the patch the target was used for both, cross compiling and w/o cc.

This patch removes target only for native build (it's not cross compiling).

In fact, these are two separate significant changes.

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Alexei Starovoitov @ 2019-09-07  0:04 UTC (permalink / raw)
  To: Ivan Khoronzhuk
  Cc: Alexei Starovoitov, Daniel Borkmann, Yonghong Song,
	David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	John Fastabend, LKML, Network Development, bpf,
	Clang-Built-Linux ML
In-Reply-To: <20190906235207.GA3053@khorivan>

On Fri, Sep 6, 2019 at 4:52 PM Ivan Khoronzhuk
<ivan.khoronzhuk@linaro.org> wrote:
>
> On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
> >On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
> >> No need to set --target for native build, at least for arm, the
> >> default target will be used anyway. In case of arm, for at least
> >> clang 5 - 10 it causes error like:
> >>
> >> clang: warning: unknown platform, assuming -mfloat-abi=soft
> >> LLVM ERROR: Unsupported calling convention
> >> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
> >> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
> >>
> >> Only set to real triple helps: --target=arm-linux-gnueabihf
> >> or just drop the target key to use default one. Decision to just
> >> drop it and thus default target will be used (which is native),
> >> looks better.
> >>
> >> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> >> ---
> >>  samples/bpf/Makefile | 2 --
> >>  1 file changed, 2 deletions(-)
> >>
> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> >> index 61b7394b811e..a2953357927e 100644
> >> --- a/samples/bpf/Makefile
> >> +++ b/samples/bpf/Makefile
> >> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
> >>  ifdef CROSS_COMPILE
> >>  HOSTCC = $(CROSS_COMPILE)gcc
> >>  CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
> >> -else
> >> -CLANG_ARCH_ARGS = -target $(ARCH)
> >>  endif
> >
> >I don't follow here.
> >Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
> >
>
> It looks like but that's not true.
> Previous patch adds target only for cross compiling,
> before the patch the target was used for both, cross compiling and w/o cc.
>
> This patch removes target only for native build (it's not cross compiling).
>
> By fact, it's two separate significant changes.

How so?
before first patch CLANG_ARCH_ARGS is only used under CROSS_COMPILE.
After the first patch CLANG_ARCH_ARGS is now suddenly defined w/o CROSS_COMPILE
and second patch brings it to the state before first patch.

^ permalink raw reply

* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Al Viro @ 2019-09-07  0:10 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Carlos Neira, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, bpf@vger.kernel.org
In-Reply-To: <c0e67fc7-be66-c4c6-6aad-316cbba18757@fb.com>

On Fri, Sep 06, 2019 at 11:21:14PM +0000, Yonghong Song wrote:

> -bash-4.4$ readlink /proc/self/ns/pid
> pid:[4026531836]
> -bash-4.4$ stat /proc/self/ns/pid
>    File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
>    Size: 0               Blocks: 0          IO Block: 1024   symbolic link
> Device: 4h/4d   Inode: 344795989   Links: 1
> Access: (0777/lrwxrwxrwx)  Uid: (128203/     yhs)   Gid: (  100/   users)
> Context: user_u:base_r:base_t
> Access: 2019-09-06 16:06:09.431616380 -0700
> Modify: 2019-09-06 16:06:09.431616380 -0700
> Change: 2019-09-06 16:06:09.431616380 -0700
>   Birth: -
> -bash-4.4$
> 
> Based on a discussion with Eric Biederman back in 2019 Linux
> Plumbers, Eric suggested that to uniquely identify a
> namespace, device id (major/minor) number should also
> be included. Although today's kernel implementation
> has the same device for all namespace pseudo files,
> from a uapi perspective, the device id should be included.
> 
> That is the reason why we try to get device id which holds
> pid namespace pseudo file.
> 
> Do you have a better suggestion on how to get
> the device id for 'current' pid namespace? Or from design, we
> really should not care about device id at all?

What the hell is "device id for pid namespace"?  This is the
first time I've heard about that mystery object, so it's
hard to tell where it could be found.

I can tell you what device numbers are involved in the areas
you seem to be looking in.

1) there's whatever device number that gets assigned to
(this) procfs instance.  That, ironically, _is_ per-pidns, but
that of the procfs instance, not that of your process (and
those can be different).  That's what you get in ->st_dev
when doing lstat() of anything in /proc (assuming that
procfs is mounted there, in the first place).  NOTE:
that's lstat(2), not stat(2).  stat(1) uses lstat(2),
unless given -L (in which case it's stat(2) time).  The
difference:

root@kvm1:~# stat /proc/self/ns/pid 
  File: /proc/self/ns/pid -> pid:[4026531836]
  Size: 0               Blocks: 0          IO Block: 1024   symbolic link
Device: 4h/4d   Inode: 17396       Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2019-09-06 19:43:11.871312319 -0400
Modify: 2019-09-06 19:43:11.871312319 -0400
Change: 2019-09-06 19:43:11.871312319 -0400
 Birth: -
root@kvm1:~# stat -L /proc/self/ns/pid 
  File: /proc/self/ns/pid
  Size: 0               Blocks: 0          IO Block: 4096   regular empty file
Device: 3h/3d   Inode: 4026531836  Links: 1
Access: (0444/-r--r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2019-09-06 19:43:15.955313293 -0400
Modify: 2019-09-06 19:43:15.955313293 -0400
Change: 2019-09-06 19:43:15.955313293 -0400
 Birth: -

The former is lstat, the latter - stat.

2) device number of the filesystem where the symlink target lives.
In this case, it's nsfs and there's only one instance on the entire
system.  _That_ would be obtained by looking at st_dev in stat(2) on
/proc/self/ns/pid (0:3 above).

3) device number *OF* the symlink.  That would be st_rdev in lstat(2).
There's none - it's a symlink, not a character or block device.  It's
always zero and always will be zero.

4) the same for the target; st_rdev in stat(2) results and again,
there's no such beast - it's neither character nor block device.

Your code is looking at (3).  Please, reread any textbook on Unix
in the section that would cover stat(2) and discussion of the
difference between st_dev and st_rdev.

I have no idea what Eric had been talking about - it's hard to
reconstruct by what you said so far.  Making nsfs per-userns,
perhaps?  But that makes no sense whatsoever, not that userns
ever had...  Cheap shots aside, I really can't guess what that's
about.  Sorry.

In any case, pathname resolution is *NOT* for the situations where
you can't block.  Even if it's procfs (and from the same pidns as
the process) mounted there, there is no promise that the target
of /proc/self has already been looked up and not evicted from
memory since then.  And in case of cache miss pathwalk will
have to call ->lookup(), which requires locking the directory
(rw_sem, shared).  You can't do that in such context.

And that doesn't even go into the possibility that process has
something very different mounted on /proc.

Again, I don't know what it is that you want to get to, but
I would strongly recommend finding a way to get to that data
that would not involve going anywhere near pathname resolution.

How would you expect the userland to work with that value,
whatever it might be?  If it's just a 32bit field that will
never be read, you might as well store there the same value
you store now (0, that is) in much cheaper and safer way ;-)

^ permalink raw reply

* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Ivan Khoronzhuk @ 2019-09-07  0:19 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, Yonghong Song,
	David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
	John Fastabend, LKML, Network Development, bpf,
	Clang-Built-Linux ML
In-Reply-To: <CAADnVQKOT8D9156p49AQ0q0z5Zks5te4Ofi6DrBfpnitmRBgmg@mail.gmail.com>

On Fri, Sep 06, 2019 at 05:04:08PM -0700, Alexei Starovoitov wrote:
>On Fri, Sep 6, 2019 at 4:52 PM Ivan Khoronzhuk
><ivan.khoronzhuk@linaro.org> wrote:
>>
>> On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
>> >On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
>> >> No need to set --target for native build, at least for arm, the
>> >> default target will be used anyway. In case of arm, for at least
>> >> clang 5 - 10 it causes error like:
>> >>
>> >> clang: warning: unknown platform, assuming -mfloat-abi=soft
>> >> LLVM ERROR: Unsupported calling convention
>> >> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
>> >> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
>> >>
>> >> Only set to real triple helps: --target=arm-linux-gnueabihf
>> >> or just drop the target key to use default one. Decision to just
>> >> drop it and thus default target will be used (which is native),
>> >> looks better.
>> >>
>> >> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> >> ---
>> >>  samples/bpf/Makefile | 2 --
>> >>  1 file changed, 2 deletions(-)
>> >>
>> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
>> >> index 61b7394b811e..a2953357927e 100644
>> >> --- a/samples/bpf/Makefile
>> >> +++ b/samples/bpf/Makefile
>> >> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
>> >>  ifdef CROSS_COMPILE
>> >>  HOSTCC = $(CROSS_COMPILE)gcc
>> >>  CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
>> >> -else
>> >> -CLANG_ARCH_ARGS = -target $(ARCH)
>> >>  endif
>> >
>> >I don't follow here.
>> >Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
>> >
>>
>> It looks like but that's not true.
>> Previous patch adds target only for cross compiling,
>> before the patch the target was used for both, cross compiling and w/o cc.
>>
>> This patch removes target only for native build (it's not cross compiling).
>>
>> By fact, it's two separate significant changes.
>
>How so?
>before first patch CLANG_ARCH_ARGS is only used under CROSS_COMPILE.
>After the first patch CLANG_ARCH_ARGS is now suddenly defined w/o CROSS_COMPILE
>and second patch brings it to the state before first patch.

Oh sorry ), messed with my local exp with target bpf, after rebase, even forgot
that's mine. Will drop it, with removing "else" for previous patch.

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH bpf-next 8/8] samples: bpf: Makefile: base progs build on Makefile.progs
From: Ivan Khoronzhuk @ 2019-09-07  1:24 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
	linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190906233429.6ass5x5inaypvbpr@ast-mbp.dhcp.thefacebook.com>

On Fri, Sep 06, 2019 at 04:34:31PM -0700, Alexei Starovoitov wrote:
>On Thu, Sep 05, 2019 at 12:22:12AM +0300, Ivan Khoronzhuk wrote:
>> +
>> +If need to use environment of target board, the SYSROOT also can be set,
>> +pointing on FS of target board:
>> +
>> +make samples/bpf/ LLC=~/git/llvm/build/bin/llc \
>> +     CLANG=~/git/llvm/build/bin/clang \
>> +     SYSROOT=~/some_sdk/linux-devkit/sysroots/aarch64-linux-gnu
>
>Patches 7 and 8 look quite heavy. I don't have a way to test them
>which makes me a bit uneasy to accept them as-is.
>Would be great if somebody could give Tested-by.
>
I can try to split patch 8 in v2, but not significantly.

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* Re: [PATCH] ethernet: micrel: Use DIV_ROUND_CLOSEST directly to make it readable
From: zhong jiang @ 2019-09-07  3:14 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: davem, kstewart, gregkh, netdev, linux-kernel
In-Reply-To: <20190906194050.GB2339@lunn.ch>

On 2019/9/7 3:40, Andrew Lunn wrote:
> On Thu, Sep 05, 2019 at 11:53:48PM +0800, zhong jiang wrote:
>> The kernel.h macro DIV_ROUND_CLOSEST performs the computation (x + d/2)/d
>> but is perhaps more readable.
> Hi Zhong
>
> Did you find this by hand, or did you use a tool. If a tool is used,
> it is normal to give some credit to the tool.
With the help of the following Coccinelle rule:
-(((x) + ((__divisor) / 2)) / (__divisor))
+ DIV_ROUND_CLOSEST(x,__divisor)

Sometimes, I will add the information in the description. Sometimes, I don't do that.

I will certainly add the description when I send a series of patches to modify the case.

Thanks,
zhong jiang

> Thanks
> 	Andrew
>
> .
>



^ permalink raw reply

* Re: [PATCH v3 bpf-next 2/3] bpf: implement CAP_BPF
From: kbuild test robot @ 2019-09-07  4:09 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: kbuild-all, davem, daniel, peterz, luto, netdev, bpf, kernel-team,
	linux-api
In-Reply-To: <20190904184335.360074-2-ast@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 4029 bytes --]

Hi Alexei,

I love your patch! Perhaps something to improve:

[auto build test WARNING on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/capability-introduce-CAP_BPF-and-CAP_TRACING/20190906-215814
base:   https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-c003-201935 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-11) 7.4.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/export.h:45:0,
                    from include/linux/linkage.h:7,
                    from include/linux/kernel.h:8,
                    from include/linux/list.h:9,
                    from include/linux/timer.h:5,
                    from include/linux/workqueue.h:9,
                    from include/linux/bpf.h:9,
                    from kernel/bpf/syscall.c:4:
   kernel/bpf/syscall.c: In function 'bpf_prog_test_run':
   kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
     if (!capable_bpf_net_admin)
         ^
   include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                       ^~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
     if (!capable_bpf_net_admin)
     ^~
   kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
     if (!capable_bpf_net_admin)
         ^
   include/linux/compiler.h:58:61: note: in definition of macro '__trace_if_var'
    #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
                                                                ^~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
     if (!capable_bpf_net_admin)
     ^~
   kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
     if (!capable_bpf_net_admin)
         ^
   include/linux/compiler.h:69:3: note: in definition of macro '__trace_if_value'
     (cond) ?     \
      ^~~~
   include/linux/compiler.h:56:28: note: in expansion of macro '__trace_if_var'
    #define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
                               ^~~~~~~~~~~~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
     if (!capable_bpf_net_admin)
     ^~

vim +/if +2087 kernel/bpf/syscall.c

  2080	
  2081	static int bpf_prog_test_run(const union bpf_attr *attr,
  2082				     union bpf_attr __user *uattr)
  2083	{
  2084		struct bpf_prog *prog;
  2085		int ret = -ENOTSUPP;
  2086	
> 2087		if (!capable_bpf_net_admin)
  2088			/* test_run callback is available for networking progs only.
  2089			 * Add capable_bpf_tracing() above when tracing progs become runable.
  2090			 */
  2091			return -EPERM;
  2092		if (CHECK_ATTR(BPF_PROG_TEST_RUN))
  2093			return -EINVAL;
  2094	
  2095		if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
  2096		    (!attr->test.ctx_size_in && attr->test.ctx_in))
  2097			return -EINVAL;
  2098	
  2099		if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
  2100		    (!attr->test.ctx_size_out && attr->test.ctx_out))
  2101			return -EINVAL;
  2102	
  2103		prog = bpf_prog_get(attr->test.prog_fd);
  2104		if (IS_ERR(prog))
  2105			return PTR_ERR(prog);
  2106	
  2107		if (prog->aux->ops->test_run)
  2108			ret = prog->aux->ops->test_run(prog, attr, uattr);
  2109	
  2110		bpf_prog_put(prog);
  2111		return ret;
  2112	}
  2113	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29761 bytes --]

^ permalink raw reply

* Re: [net-next 02/11] devlink: add 'reset_dev_on_drv_probe' param
From: Jakub Kicinski @ 2019-09-07  4:17 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Dirk van der Merwe, Simon Horman, David Miller, netdev,
	oss-drivers
In-Reply-To: <8066ba35-2f9b-c175-100f-e754b4ca65be@netronome.com>

On Fri, 6 Sep 2019 11:40:54 -0700, Dirk van der Merwe wrote:
> >> DEVLINK_PARAM_RESET_DEV_VALUE_UNKNOWN (0)
> >> +			  Unknown or invalid value.  
> > Why do you need this? Do you have usecase for this value?  
> 
> I added this in to avoid having the entire netlink dump fail when there 
> are invalid values read from hardware.
> 
> This way, it can report an unknown or invalid value instead of failing 
> the operation.

That's the first reason, the second is that we also want to report 
the unknown value if it's not recognized by the driver. For u8/enum
parameters the value may possibly be set to a value older driver
doesn't understand, but users should still be able to set them to one
of the known ones.

We'd also like to add that to 'fw_load_policy'. WDYT?

^ permalink raw reply

* Re: [PATCH] net/ibmvnic: free reset work of removed device from queue
From: kbuild test robot @ 2019-09-07  4:24 UTC (permalink / raw)
  To: Juliet Kim; +Cc: kbuild-all, netdev, julietk, linuxppc-dev
In-Reply-To: <20190905213001.19818-1-julietk@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 7041 bytes --]

Hi Juliet,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc7 next-20190904]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Juliet-Kim/net-ibmvnic-free-reset-work-of-removed-device-from-queue/20190906-195317
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   drivers/net/ethernet/ibm/ibmvnic.c: In function '__ibmvnic_reset':
>> drivers/net/ethernet/ibm/ibmvnic.c:1986:3: warning: this 'if' clause does not guard... [-Wmisleading-indentation]
      if (adapter->state == VNIC_REMOVING ||
      ^~
   drivers/net/ethernet/ibm/ibmvnic.c:1989:4: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'
       rc = EBUSY;
       ^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2002:4: error: break statement not within loop or switch
       break;
       ^~~~~
   drivers/net/ethernet/ibm/ibmvnic.c: At top level:
>> drivers/net/ethernet/ibm/ibmvnic.c:2007:2: error: expected identifier or '(' before 'if'
     if (adapter->wait_for_reset) {
     ^~
   drivers/net/ethernet/ibm/ibmvnic.c:2013:2: error: expected identifier or '(' before 'if'
     if (rc) {
     ^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2018:9: error: expected '=', ',', ';', 'asm' or '__attribute__' before '->' token
     adapter->resetting = false;
            ^~
   drivers/net/ethernet/ibm/ibmvnic.c:2019:2: error: expected identifier or '(' before 'if'
     if (we_lock_rtnl)
     ^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2021:1: error: expected identifier or '(' before '}' token
    }
    ^
   drivers/net/ethernet/ibm/ibmvnic.c:1953:13: warning: 'free_all_rwi' defined but not used [-Wunused-function]
    static void free_all_rwi(struct ibmvnic_adapter *adapter)
                ^~~~~~~~~~~~

vim +2002 drivers/net/ethernet/ibm/ibmvnic.c

ed651a10875f13 Nathan Fontenot 2017-05-03  1963  
ed651a10875f13 Nathan Fontenot 2017-05-03  1964  static void __ibmvnic_reset(struct work_struct *work)
ed651a10875f13 Nathan Fontenot 2017-05-03  1965  {
ed651a10875f13 Nathan Fontenot 2017-05-03  1966  	struct ibmvnic_rwi *rwi;
ed651a10875f13 Nathan Fontenot 2017-05-03  1967  	struct ibmvnic_adapter *adapter;
a5681e20b541a5 Juliet Kim      2018-11-19  1968  	bool we_lock_rtnl = false;
ed651a10875f13 Nathan Fontenot 2017-05-03  1969  	u32 reset_state;
c26eba03e4073b John Allen      2017-10-26  1970  	int rc = 0;
ed651a10875f13 Nathan Fontenot 2017-05-03  1971  
ed651a10875f13 Nathan Fontenot 2017-05-03  1972  	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
ed651a10875f13 Nathan Fontenot 2017-05-03  1973  
a5681e20b541a5 Juliet Kim      2018-11-19  1974  	/* netif_set_real_num_xx_queues needs to take rtnl lock here
a5681e20b541a5 Juliet Kim      2018-11-19  1975  	 * unless wait_for_reset is set, in which case the rtnl lock
a5681e20b541a5 Juliet Kim      2018-11-19  1976  	 * has already been taken before initializing the reset
a5681e20b541a5 Juliet Kim      2018-11-19  1977  	 */
a5681e20b541a5 Juliet Kim      2018-11-19  1978  	if (!adapter->wait_for_reset) {
a5681e20b541a5 Juliet Kim      2018-11-19  1979  		rtnl_lock();
a5681e20b541a5 Juliet Kim      2018-11-19  1980  		we_lock_rtnl = true;
a5681e20b541a5 Juliet Kim      2018-11-19  1981  	}
ed651a10875f13 Nathan Fontenot 2017-05-03  1982  	reset_state = adapter->state;
ed651a10875f13 Nathan Fontenot 2017-05-03  1983  
ed651a10875f13 Nathan Fontenot 2017-05-03  1984  	rwi = get_next_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03  1985  	while (rwi) {
36f1031c51a253 Thomas Falcon   2019-08-27 @1986  		if (adapter->state == VNIC_REMOVING ||
36f1031c51a253 Thomas Falcon   2019-08-27  1987  		    adapter->state == VNIC_REMOVED)
42a863ed7971cb Juliet Kim      2019-09-05  1988  			kfree(rwi);
42a863ed7971cb Juliet Kim      2019-09-05 @1989  			rc = EBUSY;
42a863ed7971cb Juliet Kim      2019-09-05  1990  			break;
42a863ed7971cb Juliet Kim      2019-09-05  1991  		}
36f1031c51a253 Thomas Falcon   2019-08-27  1992  
2770a7984db588 Thomas Falcon   2018-05-23  1993  		if (adapter->force_reset_recovery) {
2770a7984db588 Thomas Falcon   2018-05-23  1994  			adapter->force_reset_recovery = false;
2770a7984db588 Thomas Falcon   2018-05-23  1995  			rc = do_hard_reset(adapter, rwi, reset_state);
2770a7984db588 Thomas Falcon   2018-05-23  1996  		} else {
ed651a10875f13 Nathan Fontenot 2017-05-03  1997  			rc = do_reset(adapter, rwi, reset_state);
2770a7984db588 Thomas Falcon   2018-05-23  1998  		}
ed651a10875f13 Nathan Fontenot 2017-05-03  1999  		kfree(rwi);
2770a7984db588 Thomas Falcon   2018-05-23  2000  		if (rc && rc != IBMVNIC_INIT_FAILED &&
2770a7984db588 Thomas Falcon   2018-05-23  2001  		    !adapter->force_reset_recovery)
ed651a10875f13 Nathan Fontenot 2017-05-03 @2002  			break;
ed651a10875f13 Nathan Fontenot 2017-05-03  2003  
ed651a10875f13 Nathan Fontenot 2017-05-03  2004  		rwi = get_next_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03  2005  	}
ed651a10875f13 Nathan Fontenot 2017-05-03  2006  
c26eba03e4073b John Allen      2017-10-26 @2007  	if (adapter->wait_for_reset) {
c26eba03e4073b John Allen      2017-10-26  2008  		adapter->wait_for_reset = false;
c26eba03e4073b John Allen      2017-10-26  2009  		adapter->reset_done_rc = rc;
c26eba03e4073b John Allen      2017-10-26  2010  		complete(&adapter->reset_done);
c26eba03e4073b John Allen      2017-10-26  2011  	}
c26eba03e4073b John Allen      2017-10-26  2012  
ed651a10875f13 Nathan Fontenot 2017-05-03 @2013  	if (rc) {
d1cf33d93166f1 Nathan Fontenot 2017-08-08  2014  		netdev_dbg(adapter->netdev, "Reset failed\n");
ed651a10875f13 Nathan Fontenot 2017-05-03  2015  		free_all_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03  2016  	}
42a863ed7971cb Juliet Kim      2019-09-05  2017  
ed651a10875f13 Nathan Fontenot 2017-05-03 @2018  	adapter->resetting = false;
a5681e20b541a5 Juliet Kim      2018-11-19  2019  	if (we_lock_rtnl)
a5681e20b541a5 Juliet Kim      2018-11-19  2020  		rtnl_unlock();
ed651a10875f13 Nathan Fontenot 2017-05-03 @2021  }
ed651a10875f13 Nathan Fontenot 2017-05-03  2022  

:::::: The code at line 2002 was first introduced by commit
:::::: ed651a10875f13135a5f59c1bae4d51b377b3925 ibmvnic: Updated reset handling

:::::: TO: Nathan Fontenot <nfont@linux.vnet.ibm.com>
:::::: CC: David S. Miller <davem@davemloft.net>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 62432 bytes --]

^ permalink raw reply

* Re: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC
From: Jakub Kicinski @ 2019-09-07  4:25 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: sashal@kernel.org, linux-hyperv@vger.kernel.org,
	netdev@vger.kernel.org, KY Srinivasan, Stephen Hemminger,
	olaf@aepfle.de, vkuznets, davem@davemloft.net,
	linux-kernel@vger.kernel.org, Mark Bloch
In-Reply-To: <DM6PR21MB13373166435FD2FC5543D349CABB0@DM6PR21MB1337.namprd21.prod.outlook.com>

On Thu, 5 Sep 2019 23:07:32 +0000, Haiyang Zhang wrote:
> > On Fri, 30 Aug 2019 03:45:38 +0000, Haiyang Zhang wrote:  
> > > VF NIC may go down then come up during host servicing events. This
> > > causes the VF NIC offloading feature settings to roll back to the
> > > defaults. This patch can synchronize features from synthetic NIC to
> > > the VF NIC during ndo_set_features (ethtool -K), and
> > > netvsc_register_vf when VF comes back after host events.
> > >
> > > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > > Cc: Mark Bloch <markb@mellanox.com>  
> > 
> > If we want to make this change in behaviour we should change net_failover
> > at the same time.  
> 
> After checking the net_failover, I found it's for virtio based SRIOV, and very 
> different from what we did for Hyper-V based SRIOV.
> 
> We let the netvsc driver act as both the synthetic (PV) driver and the transparent 
> bonding master for the VF NIC. But net_failover acts as a master device on top 
> of both virtio PV NIC, and VF NIC. And the net_failover doesn't implement 
> operations, like ndo_set_features.
> So the code change for our netvsc driver cannot be applied to net_failover driver.
> 
> I will re-submit my two patches (fixing the extra tab in the 1st one as you pointed 
> out). Thanks!

I think it stands to reason that two modules which implement the same
functionality behave the same.

^ permalink raw reply

* test
From: Rain River @ 2019-09-07  5:01 UTC (permalink / raw)
  To: netdev

Please ignore it.

^ permalink raw reply

* [PATCH net-next 0/4] net/tls: small TX offload optimizations
From: Jakub Kicinski @ 2019-09-07  5:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
	daniel, Jakub Kicinski

Hi!

This set brings small TLS TX device optimizations. The biggest
gain comes from fixing a misuse of non temporal copy instructions.
On a synthetic workload modelled after customer's RFC application
I see a 3-5% gain.

Jakub Kicinski (4):
  net/tls: unref frags in order
  net/tls: use RCU for the adder to the offload record list
  net/tls: remove the record tail optimization
  net/tls: align non temporal copy to cache lines

 net/tls/tls_device.c | 121 ++++++++++++++++++++++++++++++-------------
 1 file changed, 84 insertions(+), 37 deletions(-)

-- 
2.21.0


^ permalink raw reply

* [PATCH net-next 1/4] net/tls: unref frags in order
From: Jakub Kicinski @ 2019-09-07  5:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
	daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>

It's generally more cache friendly to walk arrays in order,
especially those which are likely not in cache.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 net/tls/tls_device.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 41c106e45f01..285c9f9e94e4 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -122,13 +122,10 @@ static struct net_device *get_netdev_for_sock(struct sock *sk)
 
 static void destroy_record(struct tls_record_info *record)
 {
-	int nr_frags = record->num_frags;
-	skb_frag_t *frag;
+	int i;
 
-	while (nr_frags-- > 0) {
-		frag = &record->frags[nr_frags];
-		__skb_frag_unref(frag);
-	}
+	for (i = 0; i < record->num_frags; i++)
+		__skb_frag_unref(&record->frags[i]);
 	kfree(record);
 }
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next 2/4] net/tls: use RCU for the adder to the offload record list
From: Jakub Kicinski @ 2019-09-07  5:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
	daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>

All modifications to TLS record list happen under the socket
lock. Since records form an ordered queue readers are only
concerned about elements being removed, additions can happen
concurrently.

Use RCU primitives to ensure the correct access types
(READ_ONCE/WRITE_ONCE).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 net/tls/tls_device.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 285c9f9e94e4..b11355e00514 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -280,9 +280,7 @@ static int tls_push_record(struct sock *sk,
 
 	tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
 	record->end_seq = tp->write_seq + record->len;
-	spin_lock_irq(&offload_ctx->lock);
-	list_add_tail(&record->list, &offload_ctx->records_list);
-	spin_unlock_irq(&offload_ctx->lock);
+	list_add_tail_rcu(&record->list, &offload_ctx->records_list);
 	offload_ctx->open_record = NULL;
 
 	if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
@@ -535,12 +533,16 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
 		/* if retransmit_hint is irrelevant start
 		 * from the beggining of the list
 		 */
-		info = list_first_entry(&context->records_list,
-					struct tls_record_info, list);
+		info = list_first_entry_or_null(&context->records_list,
+						struct tls_record_info, list);
+		if (!info)
+			return NULL;
 		record_sn = context->unacked_record_sn;
 	}
 
-	list_for_each_entry_from(info, &context->records_list, list) {
+	/* We just need the _rcu for the READ_ONCE() */
+	rcu_read_lock();
+	list_for_each_entry_from_rcu(info, &context->records_list, list) {
 		if (before(seq, info->end_seq)) {
 			if (!context->retransmit_hint ||
 			    after(info->end_seq,
@@ -549,12 +551,15 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
 				context->retransmit_hint = info;
 			}
 			*p_record_sn = record_sn;
-			return info;
+			goto exit_rcu_unlock;
 		}
 		record_sn++;
 	}
+	info = NULL;
 
-	return NULL;
+exit_rcu_unlock:
+	rcu_read_unlock();
+	return info;
 }
 EXPORT_SYMBOL(tls_get_record);
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next 3/4] net/tls: remove the record tail optimization
From: Jakub Kicinski @ 2019-09-07  5:29 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
	daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>

For TLS device offload the tag/message authentication code are
filled in by the device. The kernel merely reserves space for
them. Because the device overwrites it, the contents of the tag
do not matter. Current code tries to save space by reusing the
header as the tag. This, however, leads to an additional frag
being created and defeats buffer coalescing (which trickles
all the way down to the drivers).

Remove this optimization, and try to allocate the space for
the tag in the usual way, leave the memory uninitialized.
If memory allocation fails rewind the record pointer so that
we use the already copied user data as tag.

Note that the optimization was actually buggy, as the tag
for TLS 1.2 is 16 bytes, but the header is just 13, so the reuse
may have looked past the end of the page.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 net/tls/tls_device.c | 67 +++++++++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 20 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b11355e00514..916c3c0a99f0 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -256,29 +256,13 @@ static int tls_push_record(struct sock *sk,
 			   struct tls_context *ctx,
 			   struct tls_offload_context_tx *offload_ctx,
 			   struct tls_record_info *record,
-			   struct page_frag *pfrag,
-			   int flags,
-			   unsigned char record_type)
+			   int flags)
 {
 	struct tls_prot_info *prot = &ctx->prot_info;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct page_frag dummy_tag_frag;
 	skb_frag_t *frag;
 	int i;
 
-	/* fill prepend */
-	frag = &record->frags[0];
-	tls_fill_prepend(ctx,
-			 skb_frag_address(frag),
-			 record->len - prot->prepend_size,
-			 record_type,
-			 prot->version);
-
-	/* HW doesn't care about the data in the tag, because it fills it. */
-	dummy_tag_frag.page = skb_frag_page(frag);
-	dummy_tag_frag.offset = 0;
-
-	tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
 	record->end_seq = tp->write_seq + record->len;
 	list_add_tail_rcu(&record->list, &offload_ctx->records_list);
 	offload_ctx->open_record = NULL;
@@ -302,6 +286,38 @@ static int tls_push_record(struct sock *sk,
 	return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
 }
 
+static int tls_device_record_close(struct sock *sk,
+				   struct tls_context *ctx,
+				   struct tls_record_info *record,
+				   struct page_frag *pfrag,
+				   unsigned char record_type)
+{
+	struct tls_prot_info *prot = &ctx->prot_info;
+	int ret;
+
+	/* append tag
+	 * device will fill in the tag, we just need to append a placeholder
+	 * use socket memory to improve coalescing (re-using a single buffer
+	 * increases frag count)
+	 * if we can't allocate memory now, steal some back from data
+	 */
+	if (likely(skb_page_frag_refill(prot->tag_size, pfrag,
+					sk->sk_allocation))) {
+		ret = 0;
+		tls_append_frag(record, pfrag, prot->tag_size);
+	} else {
+		ret = prot->tag_size;
+		if (record->len <= prot->overhead_size)
+			return -ENOMEM;
+	}
+
+	/* fill prepend */
+	tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
+			 record->len - prot->overhead_size,
+			 record_type, prot->version);
+	return ret;
+}
+
 static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
 				 struct page_frag *pfrag,
 				 size_t prepend_size)
@@ -452,13 +468,24 @@ static int tls_push_data(struct sock *sk,
 
 		if (done || record->len >= max_open_record_len ||
 		    (record->num_frags >= MAX_SKB_FRAGS - 1)) {
+			rc = tls_device_record_close(sk, tls_ctx, record,
+						     pfrag, record_type);
+			if (rc) {
+				if (rc > 0) {
+					size += rc;
+				} else {
+					size = orig_size;
+					destroy_record(record);
+					ctx->open_record = NULL;
+					break;
+				}
+			}
+
 			rc = tls_push_record(sk,
 					     tls_ctx,
 					     ctx,
 					     record,
-					     pfrag,
-					     tls_push_record_flags,
-					     record_type);
+					     tls_push_record_flags);
 			if (rc < 0)
 				break;
 		}
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next 4/4] net/tls: align non temporal copy to cache lines
From: Jakub Kicinski @ 2019-09-07  5:30 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
	daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>

Unlike normal TCP code TLS has to touch the cache lines
it copies into to fill header info. On memory-heavy workloads
having non temporal stores and normal accesses targeting
the same cache line leads to significant overhead.

Measured 3% overhead running 3600 round robin connections
with additional memory heavy workload.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
 net/tls/tls_device.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 916c3c0a99f0..f959487c5cd1 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -372,6 +372,31 @@ static int tls_do_allocation(struct sock *sk,
 	return 0;
 }
 
+static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
+{
+	size_t pre_copy, nocache;
+
+	pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1);
+	if (pre_copy) {
+		pre_copy = min(pre_copy, bytes);
+		if (copy_from_iter(addr, pre_copy, i) != pre_copy)
+			return -EFAULT;
+		bytes -= pre_copy;
+		addr += pre_copy;
+	}
+
+	nocache = round_down(bytes, SMP_CACHE_BYTES);
+	if (copy_from_iter_nocache(addr, nocache, i) != nocache)
+		return -EFAULT;
+	bytes -= nocache;
+	addr += nocache;
+
+	if (bytes && copy_from_iter(addr, bytes, i) != bytes)
+		return -EFAULT;
+
+	return 0;
+}
+
 static int tls_push_data(struct sock *sk,
 			 struct iov_iter *msg_iter,
 			 size_t size, int flags,
@@ -445,12 +470,10 @@ static int tls_push_data(struct sock *sk,
 		copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
 		copy = min_t(size_t, copy, (max_open_record_len - record->len));
 
-		if (copy_from_iter_nocache(page_address(pfrag->page) +
-					       pfrag->offset,
-					   copy, msg_iter) != copy) {
-			rc = -EFAULT;
+		rc = tls_device_copy_data(page_address(pfrag->page) +
+					  pfrag->offset, copy, msg_iter);
+		if (rc)
 			goto handle_error;
-		}
 		tls_append_frag(record, pfrag, copy);
 
 		size -= copy;
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Yonghong Song @ 2019-09-07  6:34 UTC (permalink / raw)
  To: Al Viro
  Cc: Carlos Neira, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, bpf@vger.kernel.org
In-Reply-To: <20190907001056.GA1131@ZenIV.linux.org.uk>



On 9/6/19 5:10 PM, Al Viro wrote:
> On Fri, Sep 06, 2019 at 11:21:14PM +0000, Yonghong Song wrote:
> 
>> -bash-4.4$ readlink /proc/self/ns/pid
>> pid:[4026531836]
>> -bash-4.4$ stat /proc/self/ns/pid
>>     File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
>>     Size: 0               Blocks: 0          IO Block: 1024   symbolic link
>> Device: 4h/4d   Inode: 344795989   Links: 1
>> Access: (0777/lrwxrwxrwx)  Uid: (128203/     yhs)   Gid: (  100/   users)
>> Context: user_u:base_r:base_t
>> Access: 2019-09-06 16:06:09.431616380 -0700
>> Modify: 2019-09-06 16:06:09.431616380 -0700
>> Change: 2019-09-06 16:06:09.431616380 -0700
>>    Birth: -
>> -bash-4.4$
>>
>> Based on a discussion with Eric Biederman back in 2019 Linux
>> Plumbers, Eric suggested that to uniquely identify a
>> namespace, device id (major/minor) number should also
>> be included. Although today's kernel implementation
>> has the same device for all namespace pseudo files,
>> but from uapi perspective, device id should be included.
>>
>> That is the reason why we try to get device id which holds
>> pid namespace pseudo file.
>>
>> Do you have a better suggestion on how to get
>> the device id for 'current' pid namespace? Or from design, we
>> really should not care about device id at all?
> 
> What the hell is "device id for pid namespace"?  This is the
> first time I've heard about that mystery object, so it's
> hard to tell where it could be found.
> 
> I can tell you what device numbers are involved in the areas
> you seem to be looking in.
> 
> 1) there's whatever device number that gets assigned to
> (this) procfs instance.  That, ironically, _is_ per-pidns, but
> that of the procfs instance, not that of your process (and
> those can be different).  That's what you get in ->st_dev
> when doing lstat() of anything in /proc (assuming that
> procfs is mounted there, in the first place).  NOTE:
> that's lstat(2), not stat(2).  stat(1) uses lstat(2),
> unless given -L (in which case it's stat(2) time).  The
> difference:
> 
> root@kvm1:~# stat /proc/self/ns/pid
>    File: /proc/self/ns/pid -> pid:[4026531836]
>    Size: 0               Blocks: 0          IO Block: 1024   symbolic link
> Device: 4h/4d   Inode: 17396       Links: 1
> Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
> Access: 2019-09-06 19:43:11.871312319 -0400
> Modify: 2019-09-06 19:43:11.871312319 -0400
> Change: 2019-09-06 19:43:11.871312319 -0400
>   Birth: -
> root@kvm1:~# stat -L /proc/self/ns/pid
>    File: /proc/self/ns/pid
>    Size: 0               Blocks: 0          IO Block: 4096   regular empty file
> Device: 3h/3d   Inode: 4026531836  Links: 1
> Access: (0444/-r--r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
> Access: 2019-09-06 19:43:15.955313293 -0400
> Modify: 2019-09-06 19:43:15.955313293 -0400
> Change: 2019-09-06 19:43:15.955313293 -0400
>   Birth: -
> 
> The former is lstat, the latter - stat.
> 
> 2) device number of the filesystem where the symlink target lives.
> In this case, it's nsfs and there's only one instance on the entire
> system.  _That_ would be obtained by looking at st_dev in stat(2) on
> /proc/self/ns/pid (0:3 above).
> 
> 3) device number *OF* the symlink.  That would be st_rdev in lstat(2).
> There's none - it's a symlink, not a character or block device.  It's
> always zero and always will be zero.
> 
> 4) the same for the target; st_rdev in stat(2) results and again,
> there's no such beast - it's neither character nor block device.
> 
> Your code is looking at (3).  Please, reread any textbook on Unix
> in the section that would cover stat(2) and discussion of the
> difference between st_dev and st_rdev.
> 
> I have no idea what Eric had been talking about - it's hard to
> reconstruct by what you said so far.  Making nsfs per-userns,
> perhaps?  But that makes no sense whatsoever, not that userns
> ever had...  Cheap shots aside, I really can't guess what that's
> about.  Sorry.

Thanks for the detailed information. The device number we want
is that of nsfs. Indeed, currently, there is only one instance
on the entire system. But I am not exactly sure how likely it is
that there will be more than one nsfs device in the future. Maybe
per-userns or some other criterion?

> 
> In any case, pathname resolution is *NOT* for the situations where
> you can't block.  Even if it's procfs (and from the same pidns as
> the process) mounted there, there is no promise that the target
> of /proc/self has already been looked up and not evicted from
> memory since then.  And in case of cache miss pathwalk will
> have to call ->lookup(), which requires locking the directory
> (rw_sem, shared).  You can't do that in such context.
> 
> And that doesn't even go into the possibility that process has
> something very different mounted on /proc.
> 
> Again, I don't know what it is that you want to get to, but
> I would strongly recommend finding a way to get to that data
> that would not involve going anywhere near pathname resolution.
> 
> How would you expect the userland to work with that value,
> whatever it might be?  If it's just a 32bit field that will
> never be read, you might as well store there the same value
> you store now (0, that is) in much cheaper and safer way ;-)

Suppose inside pid namespace, user can pass the device number,
say n1, (`stat -L /proc/self/ns/pid`) to bpf program (through map
or JIT). At runtime, bpf program will try to get device number,
say n2, for the 'current' process. If n1 is not the same as
n2, that means they are not in the same namespace. 'current'
is in the same pid namespace as the user iff
n1 == n2 and also pidns id is the same for 'current' and
the one with `lsns -t pid`.

Are you aware of any way to get the pidns device number
for 'current' without going through the pathname
lookup?


^ permalink raw reply

* Re: [PATCH 0/7] libbpf: Fix cast away const qualifiers in btf.h
From: Jiri Olsa @ 2019-09-07  6:54 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann,
	netdev@vger.kernel.org, bpf@vger.kernel.org, Yonghong Song,
	Martin Lau
In-Reply-To: <62e760de-e746-c512-350a-c2188a2bb3ed@fb.com>

On Fri, Sep 06, 2019 at 09:09:17AM +0000, Andrii Nakryiko wrote:
> On 9/6/19 8:31 AM, Jiri Olsa wrote:
> > hi,
> > when including btf.h in bpftrace, I'm getting -Wcast-qual warnings like:
> > 
> >    bpf/btf.h: In function ‘btf_var_secinfo* btf_var_secinfos(const btf_type*)’:
> >    bpf/btf.h:302:41: warning: cast from type ‘const btf_type*’ to type
> >    ‘btf_var_secinfo*’ casts away qualifiers [-Wcast-qual]
> >      302 |  return (struct btf_var_secinfo *)(t + 1);
> >          |                                         ^
> > 
> > I changed the btf.h header to comply with -Wcast-qual checks
> > and used const cast away casting in libbpf objects, where it's
> 
> Hey Jiri,
> 
> We made all those helper funcs return non-const structs intentionally to 
> improve their usability and avoid all those casts that you added back.
> 
> Also, those helpers are now part of public API, so we can't just change 
> them to const, as it can break existing users easily.
> 
> If there is a need to run with -Wcast-qual, we should probably disable 
> those checks where appropriate in libbpf code.
> 
> So this will be a NACK from me, sorry.

ok, I'll disable it in bpftrace code then

thanks,
jirka

> 
> > all related to deduplication code, so I believe losing const
> > is fine there.
> > 
> > thanks,
> > jirka
> > 
> > 
> > ---
> > Jiri Olsa (7):
> >        libbpf: Use const cast for btf_int_* functions
> >        libbpf: Return const btf_array from btf_array inline function
> >        libbpf: Return const btf_enum from btf_enum inline function
> >        libbpf: Return const btf_member from btf_members inline function
> >        libbpf: Return const btf_param from btf_params inline function
> >        libbpf: Return const btf_var from btf_var inline function
> >        libbpf: Return const struct btf_var_secinfo from btf_var_secinfos inline function
> > 
> >   tools/lib/bpf/btf.c    | 21 +++++++++++----------
> >   tools/lib/bpf/btf.h    | 30 +++++++++++++++---------------
> >   tools/lib/bpf/libbpf.c |  2 +-
> >   3 files changed, 27 insertions(+), 26 deletions(-)
> > 
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox