* [PATCH v4 bpf-next 2/4] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Implement permissions as stated in uapi/linux/capability.h
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
kernel/bpf/arraymap.c | 2 +-
kernel/bpf/cgroup.c | 2 +-
kernel/bpf/core.c | 4 ++--
kernel/bpf/hashtab.c | 4 ++--
kernel/bpf/lpm_trie.c | 2 +-
kernel/bpf/queue_stack_maps.c | 2 +-
kernel/bpf/reuseport_array.c | 2 +-
kernel/bpf/stackmap.c | 2 +-
kernel/bpf/syscall.c | 32 +++++++++++++++++++-------------
kernel/bpf/verifier.c | 2 +-
kernel/trace/bpf_trace.c | 2 +-
net/core/bpf_sk_storage.c | 2 +-
net/core/filter.c | 10 ++++++----
13 files changed, 38 insertions(+), 30 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..149f868a02dc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
+ bool unpriv = !capable_bpf();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..9c659ba5c146 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
+ if (capable_bpf_tracing())
return bpf_get_trace_printk_proto();
/* fall through */
default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 66088a9e9b9e..6643099bc64b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return;
spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..0fae5c45f425 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !capable_bpf())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..11da3be8a4e5 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
u64 cost = sizeof(*trie), cost_per_node;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..d83afac32863 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..b268fe4b2972 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
u64 array_size;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..477063c63b27 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return ERR_PTR(-EPERM);
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..cd2d1b21f0f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
err = -EPERM;
goto err_put;
}
@@ -1635,7 +1635,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return -EPERM;
/* copy eBPF program license from user space */
@@ -1648,11 +1648,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+ attr->insn_cnt > (capable_bpf() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
@@ -1809,6 +1809,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
char tp_name[128];
int tp_fd, err;
+ if (!capable_bpf_tracing())
+ return -EPERM;
+
if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
sizeof(tp_name) - 1) < 0)
return -EFAULT;
@@ -2087,7 +2090,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_net_admin())
+ /* test_run callback is available for networking progs only.
+ * Add capable_bpf_tracing() above when tracing progs become runable.
+ */
return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -2124,7 +2130,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
next_id++;
@@ -2150,7 +2156,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
spin_lock_bh(&prog_idr_lock);
@@ -2184,7 +2190,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2359,7 +2365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
@@ -2677,7 +2683,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
return btf_new_fd(attr);
@@ -2690,7 +2696,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
return btf_get_fd_by_id(attr->btf_id);
@@ -2759,7 +2765,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_TASK_FD_QUERY))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_tracing())
return -EPERM;
if (attr->task_fd_query.flags != 0)
@@ -2827,7 +2833,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ if (sysctl_unprivileged_bpf_disabled && !capable_bpf())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb50757e812..7e519711c689 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9234,7 +9234,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- is_priv = capable(CAP_SYS_ADMIN);
+ is_priv = capable_bpf();
/* grab the mutex to protect few globals used by verifier */
if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..cdf8d6c8a430 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
u32 *ids, prog_cnt, ids_len;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_tracing())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..aa74be21f5b6 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..b233ed8438f1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return NULL;
switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
+ if (capable_bpf_tracing())
+ return bpf_get_trace_printk_proto();
+ /* fall through */
default:
return NULL;
}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return false;
break;
}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return false;
break;
default:
--
2.20.0
^ permalink raw reply related
* [PATCH v4 bpf-next 4/4] selftests/bpf: use CAP_BPF and CAP_TRACING in tests
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Make all test_verifier test exercise CAP_BPF and CAP_TRACING
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++----
1 file changed, 38 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d27fd929abb9..0d5567962c4e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -807,10 +807,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
}
}
+struct libcap {
+ struct __user_cap_header_struct hdr;
+ struct __user_cap_data_struct data[2];
+};
+
static int set_admin(bool admin)
{
cap_t caps;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
+ /* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
+ * and CAP_TRACING to create stackmap
+ */
+ const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+ const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+ struct libcap *cap;
int ret = -1;
caps = cap_get_proc();
@@ -818,11 +828,26 @@ static int set_admin(bool admin)
perror("cap_get_proc");
return -1;
}
- if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+ cap = (struct libcap *)caps;
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+ perror("cap_set_flag clear admin");
+ goto out;
+ }
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
admin ? CAP_SET : CAP_CLEAR)) {
- perror("cap_set_flag");
+ perror("cap_set_flag set_or_clear net");
goto out;
}
+ /* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
+ * so update effective bits manually
+ */
+ if (admin) {
+ cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
+ cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
+ } else {
+ cap->data[1].effective &= ~(1 << (38 - 32));
+ cap->data[1].effective &= ~(1 << (39 - 32));
+ }
if (cap_set_proc(caps)) {
perror("cap_set_proc");
goto out;
@@ -1051,9 +1076,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
static bool is_admin(void)
{
+ cap_flag_value_t net_priv = CAP_CLEAR;
+ bool tracing_priv = false;
+ bool bpf_priv = false;
+ struct libcap *cap;
cap_t caps;
- cap_flag_value_t sysadmin = CAP_CLEAR;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
#ifdef CAP_IS_SUPPORTED
if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1066,11 +1093,14 @@ static bool is_admin(void)
perror("cap_get_proc");
return false;
}
- if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
- perror("cap_get_flag");
+ cap = (struct libcap *)caps;
+ bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+ tracing_priv = cap->data[1].effective & (1 << (39/* CAP_TRACING */ - 32));
+ if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+ perror("cap_get_flag NET");
if (cap_free(caps))
perror("cap_free");
- return (sysadmin == CAP_SET);
+ return bpf_priv && tracing_priv && net_priv == CAP_SET;
}
static void get_unpriv_disabled()
--
2.20.0
^ permalink raw reply related
* [PATCH v4 bpf-next 1/4] capability: introduce CAP_BPF and CAP_TRACING
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Split BPF and perf/tracing operations that are allowed under
CAP_SYS_ADMIN into corresponding CAP_BPF and CAP_TRACING.
For backward compatibility include them in CAP_SYS_ADMIN as well.
The end result provides simple safety model for applications that use BPF:
- for tracing program types
BPF_PROG_TYPE_{KPROBE, TRACEPOINT, PERF_EVENT, RAW_TRACEPOINT, etc}
use CAP_BPF and CAP_TRACING
- for networking program types
BPF_PROG_TYPE_{SCHED_CLS, XDP, CGROUP_SKB, SK_SKB, etc}
use CAP_BPF and CAP_NET_ADMIN
There are few exceptions from this simple rule:
- bpf_trace_printk() is allowed in networking programs, but it's using
ftrace mechanism, hence this helper needs additional CAP_TRACING.
- cpumap is used by XDP programs. Currently it's kept under CAP_SYS_ADMIN,
but could be relaxed to CAP_NET_ADMIN in the future.
- BPF_F_ZERO_SEED flag for hash/lru map is allowed under CAP_SYS_ADMIN only
to discourage production use.
- BPF HW offload is allowed under CAP_SYS_ADMIN.
- cg_sysctl, cg_device, lirc program types are neither networking nor tracing.
They can be loaded under CAP_BPF, but attach is allowed under CAP_NET_ADMIN.
This will be cleaned up in the future.
userid=nobody + (CAP_TRACING | CAP_NET_ADMIN) + CAP_BPF is safer than
typical setup with userid=root and sudo by existing bpf applications.
It's not secure, since these capabilities:
- allow bpf progs access arbitrary memory
- let tasks access any bpf map
- let tasks attach/detach any bpf prog
bpftool, bpftrace, bcc tools binaries should not be installed with
cap_bpf+cap_tracing, since unpriv users will be able to read kernel secrets.
CAP_BPF, CAP_NET_ADMIN, CAP_TRACING are roughly equal in terms of
damage they can make to the system.
Example:
CAP_NET_ADMIN can stop network traffic. CAP_BPF can write into map
and if that map is used by firewall-like bpf prog the network traffic
may stop.
CAP_BPF allows many bpf prog_load commands in parallel. The verifier
may consume large amount of memory and significantly slow down the system.
CAP_TRACING allows many kprobes that can slow down the system.
In the future more fine-grained bpf permissions may be added.
Existing unprivileged BPF operations are not affected.
In particular unprivileged users are allowed to load socket_filter and cg_skb
program types and to create array, hash, prog_array, map-in-map map types.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
include/linux/capability.h | 18 +++++++++++
include/uapi/linux/capability.h | 49 ++++++++++++++++++++++++++++-
security/selinux/include/classmap.h | 4 +--
3 files changed, 68 insertions(+), 3 deletions(-)
diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..13eb49c75797 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -247,6 +247,24 @@ static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
return true;
}
#endif /* CONFIG_MULTIUSER */
+
+static inline bool capable_bpf(void)
+{
+ return capable(CAP_SYS_ADMIN) || capable(CAP_BPF);
+}
+static inline bool capable_tracing(void)
+{
+ return capable(CAP_SYS_ADMIN) || capable(CAP_TRACING);
+}
+static inline bool capable_bpf_tracing(void)
+{
+ return capable(CAP_SYS_ADMIN) || (capable(CAP_BPF) && capable(CAP_TRACING));
+}
+static inline bool capable_bpf_net_admin(void)
+{
+ return (capable(CAP_SYS_ADMIN) || capable(CAP_BPF)) && capable(CAP_NET_ADMIN);
+}
+
extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode);
extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..fe01d8235e1e 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -274,6 +274,7 @@ struct vfs_ns_cap_data {
arbitrary SCSI commands */
/* Allow setting encryption key on loopback filesystem */
/* Allow setting zone reclaim policy */
+/* Allow everything under CAP_BPF and CAP_TRACING for backward compatibility */
#define CAP_SYS_ADMIN 21
@@ -366,8 +367,54 @@ struct vfs_ns_cap_data {
#define CAP_AUDIT_READ 37
+/*
+ * CAP_BPF allows the following BPF operations:
+ * - Loading all types of BPF programs
+ * - Creating all types of BPF maps except:
+ * - stackmap that needs CAP_TRACING
+ * - devmap that needs CAP_NET_ADMIN
+ * - cpumap that needs CAP_SYS_ADMIN
+ * - Advanced verifier features
+ * - Indirect variable access
+ * - Bounded loops
+ * - BPF to BPF function calls
+ * - Scalar precision tracking
+ * - Larger complexity limits
+ * - Dead code elimination
+ * - And potentially other features
+ * - Use of pointer-to-integer conversions in BPF programs
+ * - Bypassing of speculation attack hardening measures
+ * - Loading BPF Type Format (BTF) data
+ * - Iterate system wide loaded programs, maps, BTF objects
+ * - Retrieve xlated and JITed code of BPF programs
+ * - Access maps and programs via id
+ * - Use bpf_spin_lock() helper
+ *
+ * CAP_BPF and CAP_TRACING together allow the following:
+ * - bpf_probe_read to read arbitrary kernel memory
+ * - bpf_trace_printk to print data to ftrace ring buffer
+ * - Attach to raw_tracepoint
+ * - Query association between kprobe/tracepoint and bpf program
+ *
+ * CAP_BPF and CAP_NET_ADMIN together allow the following:
+ * - Attach to cgroup-bpf hooks and query
+ * - skb, xdp, flow_dissector test_run command
+ *
+ * CAP_NET_ADMIN allows:
+ * - Attach networking bpf programs to xdp, tc, lwt, flow dissector
+ */
+#define CAP_BPF 38
+
+/*
+ * CAP_TRACING allows:
+ * - Full use of perf_event_open(), similarly to the effect of
+ * kernel.perf_event_paranoid == -1
+ * - Creation of [ku][ret]probe
+ * - Attach tracing bpf programs to perf events
+ */
+#define CAP_TRACING 39
-#define CAP_LAST_CAP CAP_AUDIT_READ
+#define CAP_LAST_CAP CAP_TRACING
#define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e588a29..0b364e245163 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -26,9 +26,9 @@
"audit_control", "setfcap"
#define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \
- "wake_alarm", "block_suspend", "audit_read"
+ "wake_alarm", "block_suspend", "audit_read", "bpf", "tracing"
-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_TRACING
#error New capability defined, please update COMMON_CAP2_PERMS.
#endif
--
2.20.0
^ permalink raw reply related
* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Yonghong Song @ 2019-09-06 23:21 UTC (permalink / raw)
To: Al Viro, Carlos Neira
Cc: netdev@vger.kernel.org, ebiederm@xmission.com, brouer@redhat.com,
bpf@vger.kernel.org
In-Reply-To: <20190906160020.GX1131@ZenIV.linux.org.uk>
On 9/6/19 9:00 AM, Al Viro wrote:
> On Fri, Sep 06, 2019 at 04:46:47PM +0100, Al Viro wrote:
>
>>> Where do I begin?
>>> * getname_kernel() is there for purpose
>>> * so's kern_path(), damnit
>>
>> Oh, and filename_lookup() *CAN* sleep, obviously. So that
>> GFP_ATOMIC above is completely pointless.
>>
>>>> +
>>>> + inode = d_backing_inode(kp.dentry);
>>>> + pidns_info->dev = (u32)inode->i_rdev;
>
> In the original variant of patchset it used to be ->i_sb->s_dev,
> which is also bloody strange - you are not asking filename_lookup()
> to follow symlinks, so you'd get that of whatever filesystem
> /proc/self/ns resides on.
>
> ->i_rdev use makes no sense whatsoever - it's a symlink and
> neither it nor its target are device nodes; ->i_rdev will be
> left zero for both.
>
> What data are you really trying to get there?
Let me explain a little bit background here.
The ultimate goal is for bpf program to filter over
(pid_namespace, tgid/pid inside pid_namespace)
so bpf based tools can run inside the container.
Typically, pid namespace is achieved by looking at
/proc/self/ns/pid:
-bash-4.4$ lsns
NS TYPE NPROCS PID USER COMMAND
4026531835 cgroup 44 8261 yhs /usr/lib/systemd/systemd --user
4026531836 pid 44 8261 yhs /usr/lib/systemd/systemd --user
4026531837 user 44 8261 yhs /usr/lib/systemd/systemd --user
4026531838 uts 44 8261 yhs /usr/lib/systemd/systemd --user
4026531839 ipc 44 8261 yhs /usr/lib/systemd/systemd --user
4026531840 mnt 44 8261 yhs /usr/lib/systemd/systemd --user
4026532008 net 44 8261 yhs /usr/lib/systemd/systemd --user
-bash-4.4$ readlink /proc/self/ns/pid
pid:[4026531836]
-bash-4.4$ stat /proc/self/ns/pid
File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
Size: 0 Blocks: 0 IO Block: 1024 symbolic link
Device: 4h/4d Inode: 344795989 Links: 1
Access: (0777/lrwxrwxrwx) Uid: (128203/ yhs) Gid: ( 100/ users)
Context: user_u:base_r:base_t
Access: 2019-09-06 16:06:09.431616380 -0700
Modify: 2019-09-06 16:06:09.431616380 -0700
Change: 2019-09-06 16:06:09.431616380 -0700
Birth: -
-bash-4.4$
Based on a discussion with Eric Biederman back in 2019 Linux
Plumbers, Eric suggested that to uniquely identify a
namespace, device id (major/minor) number should also
be included. Although today's kernel implementation
has the same device for all namespace pseudo files,
but from uapi perspective, device id should be included.
That is the reason why we try to get device id which holds
pid namespace pseudo file.
Do you have a better suggestion on how to get
the device id for 'current' pid namespace? Or from design, we
really should not care about device id at all?
^ permalink raw reply
* Re: [PATCH v2] net: enable wireless core features with LEGACY_WEXT_ALLCONFIG
From: Greg KH @ 2019-09-06 23:30 UTC (permalink / raw)
To: Mark Salyzyn
Cc: linux-kernel, kernel-team, Johannes Berg, David S. Miller,
Marcel Holtmann, linux-wireless, netdev, stable
In-Reply-To: <20190906192403.195620-1-salyzyn@android.com>
On Fri, Sep 06, 2019 at 12:24:00PM -0700, Mark Salyzyn wrote:
> In embedded environments the requirements are to be able to pick and
> chose which features one requires built into the kernel. If an
> embedded environment wants to supports loading modules that have been
> kbuilt out of tree, there is a need to enable hidden configurations
> for legacy wireless core features to provide the API surface for
> them to load.
>
> Introduce CONFIG_LEGACY_WEXT_ALLCONFIG to select all legacy wireless
> extension core features by activating in turn all the associated
> hidden configuration options, without having to specifically select
> any wireless module(s).
>
> Signed-off-by: Mark Salyzyn <salyzyn@android.com>
> Cc: kernel-team@android.com
> Cc: Johannes Berg <johannes@sipsolutions.net>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Marcel Holtmann <marcel@holtmann.org>
> Cc: linux-wireless@vger.kernel.org
> Cc: netdev@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: stable@vger.kernel.org # 4.19
> ---
> v2: change name and documentation to CONFIG_LEGACY_WEXT_ALLCONFIG
> ---
> net/wireless/Kconfig | 14 ++++++++++++++
> 1 file changed, 14 insertions(+)
>
> diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
> index 67f8360dfcee..0d646cf28de5 100644
> --- a/net/wireless/Kconfig
> +++ b/net/wireless/Kconfig
> @@ -17,6 +17,20 @@ config WEXT_SPY
> config WEXT_PRIV
> bool
>
> +config LEGACY_WEXT_ALLCONFIG
> + bool "allconfig for legacy wireless extensions"
> + select WIRELESS_EXT
> + select WEXT_CORE
> + select WEXT_PROC
> + select WEXT_SPY
> + select WEXT_PRIV
> + help
> + Config option used to enable all the legacy wireless extensions to
> + the core functionality used by add-in modules.
> +
> + If you are not building a kernel to be used for a variety of
> + out-of-kernel built wireless modules, say N here.
> +
> config CFG80211
> tristate "cfg80211 - wireless configuration API"
> depends on RFKILL || !RFKILL
> --
> 2.23.0.187.g17f5b7556c-goog
>
How is this patch applicable to stable kernels???
^ permalink raw reply
* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Alexei Starovoitov @ 2019-09-06 23:31 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190904212212.13052-3-ivan.khoronzhuk@linaro.org>
On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
> No need to set --target for native build, at least for arm, the
> default target will be used anyway. In case of arm, for at least
> clang 5 - 10 it causes error like:
>
> clang: warning: unknown platform, assuming -mfloat-abi=soft
> LLVM ERROR: Unsupported calling convention
> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
>
> Only set to real triple helps: --target=arm-linux-gnueabihf
> or just drop the target key to use default one. Decision to just
> drop it and thus default target will be used (wich is native),
> looks better.
>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> ---
> samples/bpf/Makefile | 2 --
> 1 file changed, 2 deletions(-)
>
> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> index 61b7394b811e..a2953357927e 100644
> --- a/samples/bpf/Makefile
> +++ b/samples/bpf/Makefile
> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
> ifdef CROSS_COMPILE
> HOSTCC = $(CROSS_COMPILE)gcc
> CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
> -else
> -CLANG_ARCH_ARGS = -target $(ARCH)
> endif
I don't follow here.
Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
^ permalink raw reply
* Re: [PATCH bpf-next 8/8] samples: bpf: Makefile: base progs build on Makefile.progs
From: Alexei Starovoitov @ 2019-09-06 23:34 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190904212212.13052-9-ivan.khoronzhuk@linaro.org>
On Thu, Sep 05, 2019 at 12:22:12AM +0300, Ivan Khoronzhuk wrote:
> +
> +If need to use environment of target board, the SYSROOT also can be set,
> +pointing on FS of target board:
> +
> +make samples/bpf/ LLC=~/git/llvm/build/bin/llc \
> + CLANG=~/git/llvm/build/bin/clang \
> + SYSROOT=~/some_sdk/linux-devkit/sysroots/aarch64-linux-gnu
Patches 7 and 8 look quite heavy. I don't have a way to test them
which makes me a bit uneasy to accept them as-is.
Would be great if somebody could give Tested-by.
^ permalink raw reply
* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Ivan Khoronzhuk @ 2019-09-06 23:52 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190906233138.4d4fqdnlbikemhau@ast-mbp.dhcp.thefacebook.com>
On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
>On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
>> No need to set --target for native build, at least for arm, the
>> default target will be used anyway. In case of arm, for at least
>> clang 5 - 10 it causes error like:
>>
>> clang: warning: unknown platform, assuming -mfloat-abi=soft
>> LLVM ERROR: Unsupported calling convention
>> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
>> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
>>
>> Only set to real triple helps: --target=arm-linux-gnueabihf
>> or just drop the target key to use default one. Decision to just
>> drop it and thus default target will be used (wich is native),
>> looks better.
>>
>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> ---
>> samples/bpf/Makefile | 2 --
>> 1 file changed, 2 deletions(-)
>>
>> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
>> index 61b7394b811e..a2953357927e 100644
>> --- a/samples/bpf/Makefile
>> +++ b/samples/bpf/Makefile
>> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
>> ifdef CROSS_COMPILE
>> HOSTCC = $(CROSS_COMPILE)gcc
>> CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
>> -else
>> -CLANG_ARCH_ARGS = -target $(ARCH)
>> endif
>
>I don't follow here.
>Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
>
It looks like but that's not true.
Previous patch adds target only for cross compiling,
before the patch the target was used for both, cross compiling and w/o cc.
This patch removes target only for native build (it's not cross compiling).
By fact, it's two separate significant changes.
--
Regards,
Ivan Khoronzhuk
^ permalink raw reply
* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Alexei Starovoitov @ 2019-09-07 0:04 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: Alexei Starovoitov, Daniel Borkmann, Yonghong Song,
David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
John Fastabend, LKML, Network Development, bpf,
Clang-Built-Linux ML
In-Reply-To: <20190906235207.GA3053@khorivan>
On Fri, Sep 6, 2019 at 4:52 PM Ivan Khoronzhuk
<ivan.khoronzhuk@linaro.org> wrote:
>
> On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
> >On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
> >> No need to set --target for native build, at least for arm, the
> >> default target will be used anyway. In case of arm, for at least
> >> clang 5 - 10 it causes error like:
> >>
> >> clang: warning: unknown platform, assuming -mfloat-abi=soft
> >> LLVM ERROR: Unsupported calling convention
> >> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
> >> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
> >>
> >> Only set to real triple helps: --target=arm-linux-gnueabihf
> >> or just drop the target key to use default one. Decision to just
> >> drop it and thus default target will be used (wich is native),
> >> looks better.
> >>
> >> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
> >> ---
> >> samples/bpf/Makefile | 2 --
> >> 1 file changed, 2 deletions(-)
> >>
> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> >> index 61b7394b811e..a2953357927e 100644
> >> --- a/samples/bpf/Makefile
> >> +++ b/samples/bpf/Makefile
> >> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
> >> ifdef CROSS_COMPILE
> >> HOSTCC = $(CROSS_COMPILE)gcc
> >> CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
> >> -else
> >> -CLANG_ARCH_ARGS = -target $(ARCH)
> >> endif
> >
> >I don't follow here.
> >Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
> >
>
> It looks like but that's not true.
> Previous patch adds target only for cross compiling,
> before the patch the target was used for both, cross compiling and w/o cc.
>
> This patch removes target only for native build (it's not cross compiling).
>
> By fact, it's two separate significant changes.
How so?
before first patch CLANG_ARCH_ARGS is only used under CROSS_COMPILE.
After the first patch CLANG_ARCH_ARGS is now suddenly defined w/o CROSS_COMPILE
and second patch brings it to the state before first patch.
^ permalink raw reply
* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Al Viro @ 2019-09-07 0:10 UTC (permalink / raw)
To: Yonghong Song
Cc: Carlos Neira, netdev@vger.kernel.org, ebiederm@xmission.com,
brouer@redhat.com, bpf@vger.kernel.org
In-Reply-To: <c0e67fc7-be66-c4c6-6aad-316cbba18757@fb.com>
On Fri, Sep 06, 2019 at 11:21:14PM +0000, Yonghong Song wrote:
> -bash-4.4$ readlink /proc/self/ns/pid
> pid:[4026531836]
> -bash-4.4$ stat /proc/self/ns/pid
> File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
> Size: 0 Blocks: 0 IO Block: 1024 symbolic link
> Device: 4h/4d Inode: 344795989 Links: 1
> Access: (0777/lrwxrwxrwx) Uid: (128203/ yhs) Gid: ( 100/ users)
> Context: user_u:base_r:base_t
> Access: 2019-09-06 16:06:09.431616380 -0700
> Modify: 2019-09-06 16:06:09.431616380 -0700
> Change: 2019-09-06 16:06:09.431616380 -0700
> Birth: -
> -bash-4.4$
>
> Based on a discussion with Eric Biederman back in 2019 Linux
> Plumbers, Eric suggested that to uniquely identify a
> namespace, device id (major/minor) number should also
> be included. Although today's kernel implementation
> has the same device for all namespace pseudo files,
> but from uapi perspective, device id should be included.
>
> That is the reason why we try to get device id which holds
> pid namespace pseudo file.
>
> Do you have a better suggestion on how to get
> the device id for 'current' pid namespace? Or from design, we
> really should not care about device id at all?
What the hell is "device id for pid namespace"? This is the
first time I've heard about that mystery object, so it's
hard to tell where it could be found.
I can tell you what device numbers are involved in the areas
you seem to be looking in.
1) there's whatever device number that gets assigned to
(this) procfs instance. That, ironically, _is_ per-pidns, but
that of the procfs instance, not that of your process (and
those can be different). That's what you get in ->st_dev
when doing lstat() of anything in /proc (assuming that
procfs is mounted there, in the first place). NOTE:
that's lstat(2), not stat(2). stat(1) uses lstat(2),
unless given -L (in which case it's stat(2) time). The
difference:
root@kvm1:~# stat /proc/self/ns/pid
File: /proc/self/ns/pid -> pid:[4026531836]
Size: 0 Blocks: 0 IO Block: 1024 symbolic link
Device: 4h/4d Inode: 17396 Links: 1
Access: (0777/lrwxrwxrwx) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2019-09-06 19:43:11.871312319 -0400
Modify: 2019-09-06 19:43:11.871312319 -0400
Change: 2019-09-06 19:43:11.871312319 -0400
Birth: -
root@kvm1:~# stat -L /proc/self/ns/pid
File: /proc/self/ns/pid
Size: 0 Blocks: 0 IO Block: 4096 regular empty file
Device: 3h/3d Inode: 4026531836 Links: 1
Access: (0444/-r--r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2019-09-06 19:43:15.955313293 -0400
Modify: 2019-09-06 19:43:15.955313293 -0400
Change: 2019-09-06 19:43:15.955313293 -0400
Birth: -
The former is lstat, the latter - stat.
2) device number of the filesystem where the symlink target lives.
In this case, it's nsfs and there's only one instance on the entire
system. _That_ would be obtained by looking at st_dev in stat(2) on
/proc/self/ns/pid (0:3 above).
3) device number *OF* the symlink. That would be st_rdev in lstat(2).
There's none - it's a symlink, not a character or block device. It's
always zero and always will be zero.
4) the same for the target; st_rdev in stat(2) results and again,
there's no such beast - it's neither character nor block device.
Your code is looking at (3). Please, reread any textbook on Unix
in the section that would cover stat(2) and discussion of the
difference between st_dev and st_rdev.
I have no idea what Eric had been talking about - it's hard to
reconstruct by what you said so far. Making nsfs per-userns,
perhaps? But that makes no sense whatsoever, not that userns
ever had... Cheap shots aside, I really can't guess what that's
about. Sorry.
In any case, pathname resolution is *NOT* for the situations where
you can't block. Even if it's procfs (and from the same pidns as
the process) mounted there, there is no promise that the target
of /proc/self has already been looked up and not evicted from
memory since then. And in case of cache miss pathwalk will
have to call ->lookup(), which requires locking the directory
(rw_sem, shared). You can't do that in such context.
And that doesn't even go into the possibility that process has
something very different mounted on /proc.
Again, I don't know what it is that you want to get to, but
I would strongly recommend finding a way to get to that data
that would not involve going anywhere near pathname resolution.
How would you expect the userland to work with that value,
whatever it might be? If it's just a 32bit field that will
never be read, you might as well store there the same value
you store now (0, that is) in much cheaper and safer way ;-)
^ permalink raw reply
* Re: [PATCH bpf-next 2/8] samples: bpf: Makefile: remove target for native build
From: Ivan Khoronzhuk @ 2019-09-07 0:19 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Alexei Starovoitov, Daniel Borkmann, Yonghong Song,
David S. Miller, Jakub Kicinski, Jesper Dangaard Brouer,
John Fastabend, LKML, Network Development, bpf,
Clang-Built-Linux ML
In-Reply-To: <CAADnVQKOT8D9156p49AQ0q0z5Zks5te4Ofi6DrBfpnitmRBgmg@mail.gmail.com>
On Fri, Sep 06, 2019 at 05:04:08PM -0700, Alexei Starovoitov wrote:
>On Fri, Sep 6, 2019 at 4:52 PM Ivan Khoronzhuk
><ivan.khoronzhuk@linaro.org> wrote:
>>
>> On Fri, Sep 06, 2019 at 04:31:39PM -0700, Alexei Starovoitov wrote:
>> >On Thu, Sep 05, 2019 at 12:22:06AM +0300, Ivan Khoronzhuk wrote:
>> >> No need to set --target for native build, at least for arm, the
>> >> default target will be used anyway. In case of arm, for at least
>> >> clang 5 - 10 it causes error like:
>> >>
>> >> clang: warning: unknown platform, assuming -mfloat-abi=soft
>> >> LLVM ERROR: Unsupported calling convention
>> >> make[2]: *** [/home/root/snapshot/samples/bpf/Makefile:299:
>> >> /home/root/snapshot/samples/bpf/sockex1_kern.o] Error 1
>> >>
>> >> Only set to real triple helps: --target=arm-linux-gnueabihf
>> >> or just drop the target key to use default one. Decision to just
>> >> drop it and thus default target will be used (wich is native),
>> >> looks better.
>> >>
>> >> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> >> ---
>> >> samples/bpf/Makefile | 2 --
>> >> 1 file changed, 2 deletions(-)
>> >>
>> >> diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
>> >> index 61b7394b811e..a2953357927e 100644
>> >> --- a/samples/bpf/Makefile
>> >> +++ b/samples/bpf/Makefile
>> >> @@ -197,8 +197,6 @@ BTF_PAHOLE ?= pahole
>> >> ifdef CROSS_COMPILE
>> >> HOSTCC = $(CROSS_COMPILE)gcc
>> >> CLANG_ARCH_ARGS = --target=$(notdir $(CROSS_COMPILE:%-=%))
>> >> -else
>> >> -CLANG_ARCH_ARGS = -target $(ARCH)
>> >> endif
>> >
>> >I don't follow here.
>> >Didn't you introduce this bug in patch 1 and now fixing it in patch 2?
>> >
>>
>> It looks like but that's not true.
>> Previous patch adds target only for cross compiling,
>> before the patch the target was used for both, cross compiling and w/o cc.
>>
>> This patch removes target only for native build (it's not cross compiling).
>>
>> By fact, it's two separate significant changes.
>
>How so?
>before first patch CLANG_ARCH_ARGS is only used under CROSS_COMPILE.
>After the first patch CLANG_ARCH_ARGS is now suddenly defined w/o CROSS_COMPILE
>and second patch brings it to the state before first patch.
Oh sorry ), messed with my local exp with target bpf, after rebase, even forgot
that's mine. Will drop it, with removing "else" for previous patch.
--
Regards,
Ivan Khoronzhuk
^ permalink raw reply
* Re: [PATCH bpf-next 8/8] samples: bpf: Makefile: base progs build on Makefile.progs
From: Ivan Khoronzhuk @ 2019-09-07 1:24 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: ast, daniel, yhs, davem, jakub.kicinski, hawk, john.fastabend,
linux-kernel, netdev, bpf, clang-built-linux
In-Reply-To: <20190906233429.6ass5x5inaypvbpr@ast-mbp.dhcp.thefacebook.com>
On Fri, Sep 06, 2019 at 04:34:31PM -0700, Alexei Starovoitov wrote:
>On Thu, Sep 05, 2019 at 12:22:12AM +0300, Ivan Khoronzhuk wrote:
>> +
>> +If need to use environment of target board, the SYSROOT also can be set,
>> +pointing on FS of target board:
>> +
>> +make samples/bpf/ LLC=~/git/llvm/build/bin/llc \
>> + CLANG=~/git/llvm/build/bin/clang \
>> + SYSROOT=~/some_sdk/linux-devkit/sysroots/aarch64-linux-gnu
>
>Patches 7 and 8 look quite heavy. I don't have a way to test them
>which makes me a bit uneasy to accept them as-is.
>Would be great if somebody could give Tested-by.
>
I can try to split patch 8 in v2, but not significantly.
--
Regards,
Ivan Khoronzhuk
^ permalink raw reply
* Re: [PATCH] ethernet: micrel: Use DIV_ROUND_CLOSEST directly to make it readable
From: zhong jiang @ 2019-09-07 3:14 UTC (permalink / raw)
To: Andrew Lunn; +Cc: davem, kstewart, gregkh, netdev, linux-kernel
In-Reply-To: <20190906194050.GB2339@lunn.ch>
On 2019/9/7 3:40, Andrew Lunn wrote:
> On Thu, Sep 05, 2019 at 11:53:48PM +0800, zhong jiang wrote:
>> The kernel.h macro DIV_ROUND_CLOSEST performs the computation (x + d/2)/d
>> but is perhaps more readable.
> Hi Zhong
>
> Did you find this by hand, or did you use a tool. If a tool is used,
> it is normal to give some credit to the tool.
With the following help of Coccinelle.
-(((x) + ((__divisor) / 2)) / (__divisor))
+ DIV_ROUND_CLOSEST(x,__divisor)
Sometimes, I will add the information in the description. Sometimes, I desn't do that.
I will certainly add the description when I send an series of patches to modify the case.
Thanks,
zhong jiang
> Thanks
> Andrew
>
> .
>
^ permalink raw reply
* Re: [PATCH v3 bpf-next 2/3] bpf: implement CAP_BPF
From: kbuild test robot @ 2019-09-07 4:09 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: kbuild-all, davem, daniel, peterz, luto, netdev, bpf, kernel-team,
linux-api
In-Reply-To: <20190904184335.360074-2-ast@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 4029 bytes --]
Hi Alexei,
I love your patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/capability-introduce-CAP_BPF-and-CAP_TRACING/20190906-215814
base: https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: x86_64-randconfig-c003-201935 (attached as .config)
compiler: gcc-7 (Debian 7.4.0-11) 7.4.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
In file included from include/linux/export.h:45:0,
from include/linux/linkage.h:7,
from include/linux/kernel.h:8,
from include/linux/list.h:9,
from include/linux/timer.h:5,
from include/linux/workqueue.h:9,
from include/linux/bpf.h:9,
from kernel/bpf/syscall.c:4:
kernel/bpf/syscall.c: In function 'bpf_prog_test_run':
kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
if (!capable_bpf_net_admin)
^
include/linux/compiler.h:58:52: note: in definition of macro '__trace_if_var'
#define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
^~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
if (!capable_bpf_net_admin)
^~
kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
if (!capable_bpf_net_admin)
^
include/linux/compiler.h:58:61: note: in definition of macro '__trace_if_var'
#define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : __trace_if_value(cond))
^~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
if (!capable_bpf_net_admin)
^~
kernel/bpf/syscall.c:2087:6: warning: the address of 'capable_bpf_net_admin' will always evaluate as 'true' [-Waddress]
if (!capable_bpf_net_admin)
^
include/linux/compiler.h:69:3: note: in definition of macro '__trace_if_value'
(cond) ? \
^~~~
include/linux/compiler.h:56:28: note: in expansion of macro '__trace_if_var'
#define if(cond, ...) if ( __trace_if_var( !!(cond , ## __VA_ARGS__) ) )
^~~~~~~~~~~~~~
>> kernel/bpf/syscall.c:2087:2: note: in expansion of macro 'if'
if (!capable_bpf_net_admin)
^~
vim +/if +2087 kernel/bpf/syscall.c
2080
2081 static int bpf_prog_test_run(const union bpf_attr *attr,
2082 union bpf_attr __user *uattr)
2083 {
2084 struct bpf_prog *prog;
2085 int ret = -ENOTSUPP;
2086
> 2087 if (!capable_bpf_net_admin)
2088 /* test_run callback is available for networking progs only.
2089 * Add capable_bpf_tracing() above when tracing progs become runable.
2090 */
2091 return -EPERM;
2092 if (CHECK_ATTR(BPF_PROG_TEST_RUN))
2093 return -EINVAL;
2094
2095 if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
2096 (!attr->test.ctx_size_in && attr->test.ctx_in))
2097 return -EINVAL;
2098
2099 if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
2100 (!attr->test.ctx_size_out && attr->test.ctx_out))
2101 return -EINVAL;
2102
2103 prog = bpf_prog_get(attr->test.prog_fd);
2104 if (IS_ERR(prog))
2105 return PTR_ERR(prog);
2106
2107 if (prog->aux->ops->test_run)
2108 ret = prog->aux->ops->test_run(prog, attr, uattr);
2109
2110 bpf_prog_put(prog);
2111 return ret;
2112 }
2113
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29761 bytes --]
^ permalink raw reply
* Re: [net-next 02/11] devlink: add 'reset_dev_on_drv_probe' param
From: Jakub Kicinski @ 2019-09-07 4:17 UTC (permalink / raw)
To: Jiri Pirko
Cc: Dirk van der Merwe, Simon Horman, David Miller, netdev,
oss-drivers
In-Reply-To: <8066ba35-2f9b-c175-100f-e754b4ca65be@netronome.com>
On Fri, 6 Sep 2019 11:40:54 -0700, Dirk van der Merwe wrote:
> >> DEVLINK_PARAM_RESET_DEV_VALUE_UNKNOWN (0)
> >> + Unknown or invalid value.
> > Why do you need this? Do you have usecase for this value?
>
> I added this in to avoid having the entire netlink dump fail when there
> are invalid values read from hardware.
>
> This way, it can report an unknown or invalid value instead of failing
> the operation.
That's the first reason, the second is that we also want to report
the unknown value if it's not recognized by the driver. For u8/enum
parameters the value may possibly be set to a value older driver
doesn't understand, but users should still be able to set them to one
of the known ones.
We'd also like to add that to 'fw_load_policy'. WDYT?
^ permalink raw reply
* Re: [PATCH] net/ibmvnic: free reset work of removed device from queue
From: kbuild test robot @ 2019-09-07 4:24 UTC (permalink / raw)
To: Juliet Kim; +Cc: kbuild-all, netdev, julietk, linuxppc-dev
In-Reply-To: <20190905213001.19818-1-julietk@linux.vnet.ibm.com>
[-- Attachment #1: Type: text/plain, Size: 7041 bytes --]
Hi Juliet,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc7 next-20190904]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Juliet-Kim/net-ibmvnic-free-reset-work-of-removed-device-from-queue/20190906-195317
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=powerpc
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>
All error/warnings (new ones prefixed by >>):
drivers/net/ethernet/ibm/ibmvnic.c: In function '__ibmvnic_reset':
>> drivers/net/ethernet/ibm/ibmvnic.c:1986:3: warning: this 'if' clause does not guard... [-Wmisleading-indentation]
if (adapter->state == VNIC_REMOVING ||
^~
drivers/net/ethernet/ibm/ibmvnic.c:1989:4: note: ...this statement, but the latter is misleadingly indented as if it were guarded by the 'if'
rc = EBUSY;
^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2002:4: error: break statement not within loop or switch
break;
^~~~~
drivers/net/ethernet/ibm/ibmvnic.c: At top level:
>> drivers/net/ethernet/ibm/ibmvnic.c:2007:2: error: expected identifier or '(' before 'if'
if (adapter->wait_for_reset) {
^~
drivers/net/ethernet/ibm/ibmvnic.c:2013:2: error: expected identifier or '(' before 'if'
if (rc) {
^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2018:9: error: expected '=', ',', ';', 'asm' or '__attribute__' before '->' token
adapter->resetting = false;
^~
drivers/net/ethernet/ibm/ibmvnic.c:2019:2: error: expected identifier or '(' before 'if'
if (we_lock_rtnl)
^~
>> drivers/net/ethernet/ibm/ibmvnic.c:2021:1: error: expected identifier or '(' before '}' token
}
^
drivers/net/ethernet/ibm/ibmvnic.c:1953:13: warning: 'free_all_rwi' defined but not used [-Wunused-function]
static void free_all_rwi(struct ibmvnic_adapter *adapter)
^~~~~~~~~~~~
vim +2002 drivers/net/ethernet/ibm/ibmvnic.c
ed651a10875f13 Nathan Fontenot 2017-05-03 1963
ed651a10875f13 Nathan Fontenot 2017-05-03 1964 static void __ibmvnic_reset(struct work_struct *work)
ed651a10875f13 Nathan Fontenot 2017-05-03 1965 {
ed651a10875f13 Nathan Fontenot 2017-05-03 1966 struct ibmvnic_rwi *rwi;
ed651a10875f13 Nathan Fontenot 2017-05-03 1967 struct ibmvnic_adapter *adapter;
a5681e20b541a5 Juliet Kim 2018-11-19 1968 bool we_lock_rtnl = false;
ed651a10875f13 Nathan Fontenot 2017-05-03 1969 u32 reset_state;
c26eba03e4073b John Allen 2017-10-26 1970 int rc = 0;
ed651a10875f13 Nathan Fontenot 2017-05-03 1971
ed651a10875f13 Nathan Fontenot 2017-05-03 1972 adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
ed651a10875f13 Nathan Fontenot 2017-05-03 1973
a5681e20b541a5 Juliet Kim 2018-11-19 1974 /* netif_set_real_num_xx_queues needs to take rtnl lock here
a5681e20b541a5 Juliet Kim 2018-11-19 1975 * unless wait_for_reset is set, in which case the rtnl lock
a5681e20b541a5 Juliet Kim 2018-11-19 1976 * has already been taken before initializing the reset
a5681e20b541a5 Juliet Kim 2018-11-19 1977 */
a5681e20b541a5 Juliet Kim 2018-11-19 1978 if (!adapter->wait_for_reset) {
a5681e20b541a5 Juliet Kim 2018-11-19 1979 rtnl_lock();
a5681e20b541a5 Juliet Kim 2018-11-19 1980 we_lock_rtnl = true;
a5681e20b541a5 Juliet Kim 2018-11-19 1981 }
ed651a10875f13 Nathan Fontenot 2017-05-03 1982 reset_state = adapter->state;
ed651a10875f13 Nathan Fontenot 2017-05-03 1983
ed651a10875f13 Nathan Fontenot 2017-05-03 1984 rwi = get_next_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03 1985 while (rwi) {
36f1031c51a253 Thomas Falcon 2019-08-27 @1986 if (adapter->state == VNIC_REMOVING ||
36f1031c51a253 Thomas Falcon 2019-08-27 1987 adapter->state == VNIC_REMOVED)
42a863ed7971cb Juliet Kim 2019-09-05 1988 kfree(rwi);
42a863ed7971cb Juliet Kim 2019-09-05 @1989 rc = EBUSY;
42a863ed7971cb Juliet Kim 2019-09-05 1990 break;
42a863ed7971cb Juliet Kim 2019-09-05 1991 }
36f1031c51a253 Thomas Falcon 2019-08-27 1992
2770a7984db588 Thomas Falcon 2018-05-23 1993 if (adapter->force_reset_recovery) {
2770a7984db588 Thomas Falcon 2018-05-23 1994 adapter->force_reset_recovery = false;
2770a7984db588 Thomas Falcon 2018-05-23 1995 rc = do_hard_reset(adapter, rwi, reset_state);
2770a7984db588 Thomas Falcon 2018-05-23 1996 } else {
ed651a10875f13 Nathan Fontenot 2017-05-03 1997 rc = do_reset(adapter, rwi, reset_state);
2770a7984db588 Thomas Falcon 2018-05-23 1998 }
ed651a10875f13 Nathan Fontenot 2017-05-03 1999 kfree(rwi);
2770a7984db588 Thomas Falcon 2018-05-23 2000 if (rc && rc != IBMVNIC_INIT_FAILED &&
2770a7984db588 Thomas Falcon 2018-05-23 2001 !adapter->force_reset_recovery)
ed651a10875f13 Nathan Fontenot 2017-05-03 @2002 break;
ed651a10875f13 Nathan Fontenot 2017-05-03 2003
ed651a10875f13 Nathan Fontenot 2017-05-03 2004 rwi = get_next_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03 2005 }
ed651a10875f13 Nathan Fontenot 2017-05-03 2006
c26eba03e4073b John Allen 2017-10-26 @2007 if (adapter->wait_for_reset) {
c26eba03e4073b John Allen 2017-10-26 2008 adapter->wait_for_reset = false;
c26eba03e4073b John Allen 2017-10-26 2009 adapter->reset_done_rc = rc;
c26eba03e4073b John Allen 2017-10-26 2010 complete(&adapter->reset_done);
c26eba03e4073b John Allen 2017-10-26 2011 }
c26eba03e4073b John Allen 2017-10-26 2012
ed651a10875f13 Nathan Fontenot 2017-05-03 @2013 if (rc) {
d1cf33d93166f1 Nathan Fontenot 2017-08-08 2014 netdev_dbg(adapter->netdev, "Reset failed\n");
ed651a10875f13 Nathan Fontenot 2017-05-03 2015 free_all_rwi(adapter);
ed651a10875f13 Nathan Fontenot 2017-05-03 2016 }
42a863ed7971cb Juliet Kim 2019-09-05 2017
ed651a10875f13 Nathan Fontenot 2017-05-03 @2018 adapter->resetting = false;
a5681e20b541a5 Juliet Kim 2018-11-19 2019 if (we_lock_rtnl)
a5681e20b541a5 Juliet Kim 2018-11-19 2020 rtnl_unlock();
ed651a10875f13 Nathan Fontenot 2017-05-03 @2021 }
ed651a10875f13 Nathan Fontenot 2017-05-03 2022
:::::: The code at line 2002 was first introduced by commit
:::::: ed651a10875f13135a5f59c1bae4d51b377b3925 ibmvnic: Updated reset handling
:::::: TO: Nathan Fontenot <nfont@linux.vnet.ibm.com>
:::::: CC: David S. Miller <davem@davemloft.net>
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 62432 bytes --]
^ permalink raw reply
* Re: [PATCH net-next, 2/2] hv_netvsc: Sync offloading features to VF NIC
From: Jakub Kicinski @ 2019-09-07 4:25 UTC (permalink / raw)
To: Haiyang Zhang
Cc: sashal@kernel.org, linux-hyperv@vger.kernel.org,
netdev@vger.kernel.org, KY Srinivasan, Stephen Hemminger,
olaf@aepfle.de, vkuznets, davem@davemloft.net,
linux-kernel@vger.kernel.org, Mark Bloch
In-Reply-To: <DM6PR21MB13373166435FD2FC5543D349CABB0@DM6PR21MB1337.namprd21.prod.outlook.com>
On Thu, 5 Sep 2019 23:07:32 +0000, Haiyang Zhang wrote:
> > On Fri, 30 Aug 2019 03:45:38 +0000, Haiyang Zhang wrote:
> > > VF NIC may go down then come up during host servicing events. This
> > > causes the VF NIC offloading feature settings to roll back to the
> > > defaults. This patch can synchronize features from synthetic NIC to
> > > the VF NIC during ndo_set_features (ethtool -K), and
> > > netvsc_register_vf when VF comes back after host events.
> > >
> > > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > > Cc: Mark Bloch <markb@mellanox.com>
> >
> > If we want to make this change in behaviour we should change net_failover
> > at the same time.
>
> After checking the net_failover, I found it's for virtio based SRIOV, and very
> different from what we did for Hyper-V based SRIOV.
>
> We let the netvsc driver acts as both the synthetic (PV) driver and the transparent
> bonding master for the VF NIC. But net_failover acts as a master device on top
> of both virtio PV NIC, and VF NIC. And the net_failover doesn't implemented
> operations, like ndo_set_features.
> So the code change for our netvsc driver cannot be applied to net_failover driver.
>
> I will re-submit my two patches (fixing the extra tab in the 1st one as you pointed
> out). Thanks!
I think it stands to reason that two modules which implement the same
functionality behave the same.
^ permalink raw reply
* test
From: Rain River @ 2019-09-07 5:01 UTC (permalink / raw)
To: netdev
Please ignore it.
^ permalink raw reply
* [PATCH net-next 0/4] net/tls: small TX offload optimizations
From: Jakub Kicinski @ 2019-09-07 5:29 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski
Hi!
This set brings small TLS TX device optimizations. The biggest
gain comes from fixing a misuse of non temporal copy instructions.
On a synthetic workload modelled after customer's RFC application
I see 3-5% percent gain.
Jakub Kicinski (4):
net/tls: unref frags in order
net/tls: use RCU for the adder to the offload record list
net/tls: remove the record tail optimization
net/tls: align non temporal copy to cache lines
net/tls/tls_device.c | 121 ++++++++++++++++++++++++++++++-------------
1 file changed, 84 insertions(+), 37 deletions(-)
--
2.21.0
^ permalink raw reply
* [PATCH net-next 1/4] net/tls: unref frags in order
From: Jakub Kicinski @ 2019-09-07 5:29 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>
It's generally more cache friendly to walk arrays in order,
especially those which are likely not in cache.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 41c106e45f01..285c9f9e94e4 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -122,13 +122,10 @@ static struct net_device *get_netdev_for_sock(struct sock *sk)
static void destroy_record(struct tls_record_info *record)
{
- int nr_frags = record->num_frags;
- skb_frag_t *frag;
+ int i;
- while (nr_frags-- > 0) {
- frag = &record->frags[nr_frags];
- __skb_frag_unref(frag);
- }
+ for (i = 0; i < record->num_frags; i++)
+ __skb_frag_unref(&record->frags[i]);
kfree(record);
}
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 2/4] net/tls: use RCU for the adder to the offload record list
From: Jakub Kicinski @ 2019-09-07 5:29 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>
All modifications to TLS record list happen under the socket
lock. Since records form an ordered queue readers are only
concerned about elements being removed, additions can happen
concurrently.
Use RCU primitives to ensure the correct access types
(READ_ONCE/WRITE_ONCE).
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 285c9f9e94e4..b11355e00514 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -280,9 +280,7 @@ static int tls_push_record(struct sock *sk,
tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
record->end_seq = tp->write_seq + record->len;
- spin_lock_irq(&offload_ctx->lock);
- list_add_tail(&record->list, &offload_ctx->records_list);
- spin_unlock_irq(&offload_ctx->lock);
+ list_add_tail_rcu(&record->list, &offload_ctx->records_list);
offload_ctx->open_record = NULL;
if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
@@ -535,12 +533,16 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
/* if retransmit_hint is irrelevant start
* from the beggining of the list
*/
- info = list_first_entry(&context->records_list,
- struct tls_record_info, list);
+ info = list_first_entry_or_null(&context->records_list,
+ struct tls_record_info, list);
+ if (!info)
+ return NULL;
record_sn = context->unacked_record_sn;
}
- list_for_each_entry_from(info, &context->records_list, list) {
+ /* We just need the _rcu for the READ_ONCE() */
+ rcu_read_lock();
+ list_for_each_entry_from_rcu(info, &context->records_list, list) {
if (before(seq, info->end_seq)) {
if (!context->retransmit_hint ||
after(info->end_seq,
@@ -549,12 +551,15 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
context->retransmit_hint = info;
}
*p_record_sn = record_sn;
- return info;
+ goto exit_rcu_unlock;
}
record_sn++;
}
+ info = NULL;
- return NULL;
+exit_rcu_unlock:
+ rcu_read_unlock();
+ return info;
}
EXPORT_SYMBOL(tls_get_record);
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 3/4] net/tls: remove the record tail optimization
From: Jakub Kicinski @ 2019-09-07 5:29 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>
For TLS device offload the tag/message authentication code are
filled in by the device. The kernel merely reserves space for
them. Because device overwrites it, the contents of the tag make
do no matter. Current code tries to save space by reusing the
header as the tag. This, however, leads to an additional frag
being created and defeats buffer coalescing (which trickles
all the way down to the drivers).
Remove this optimization, and try to allocate the space for
the tag in the usual way, leave the memory uninitialized.
If memory allocation fails rewind the record pointer so that
we use the already copied user data as tag.
Note that the optimization was actually buggy, as the tag
for TLS 1.2 is 16 bytes, but header is just 13, so the reuse
may had looked past the end of the page..
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 67 +++++++++++++++++++++++++++++++-------------
1 file changed, 47 insertions(+), 20 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index b11355e00514..916c3c0a99f0 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -256,29 +256,13 @@ static int tls_push_record(struct sock *sk,
struct tls_context *ctx,
struct tls_offload_context_tx *offload_ctx,
struct tls_record_info *record,
- struct page_frag *pfrag,
- int flags,
- unsigned char record_type)
+ int flags)
{
struct tls_prot_info *prot = &ctx->prot_info;
struct tcp_sock *tp = tcp_sk(sk);
- struct page_frag dummy_tag_frag;
skb_frag_t *frag;
int i;
- /* fill prepend */
- frag = &record->frags[0];
- tls_fill_prepend(ctx,
- skb_frag_address(frag),
- record->len - prot->prepend_size,
- record_type,
- prot->version);
-
- /* HW doesn't care about the data in the tag, because it fills it. */
- dummy_tag_frag.page = skb_frag_page(frag);
- dummy_tag_frag.offset = 0;
-
- tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
record->end_seq = tp->write_seq + record->len;
list_add_tail_rcu(&record->list, &offload_ctx->records_list);
offload_ctx->open_record = NULL;
@@ -302,6 +286,38 @@ static int tls_push_record(struct sock *sk,
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
+static int tls_device_record_close(struct sock *sk,
+ struct tls_context *ctx,
+ struct tls_record_info *record,
+ struct page_frag *pfrag,
+ unsigned char record_type)
+{
+ struct tls_prot_info *prot = &ctx->prot_info;
+ int ret;
+
+ /* append tag
+ * device will fill in the tag, we just need to append a placeholder
+ * use socket memory to improve coalescing (re-using a single buffer
+ * increases frag count)
+ * if we can't allocate memory now, steal some back from data
+ */
+ if (likely(skb_page_frag_refill(prot->tag_size, pfrag,
+ sk->sk_allocation))) {
+ ret = 0;
+ tls_append_frag(record, pfrag, prot->tag_size);
+ } else {
+ ret = prot->tag_size;
+ if (record->len <= prot->overhead_size)
+ return -ENOMEM;
+ }
+
+ /* fill prepend */
+ tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
+ record->len - prot->overhead_size,
+ record_type, prot->version);
+ return ret;
+}
+
static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
@@ -452,13 +468,24 @@ static int tls_push_data(struct sock *sk,
if (done || record->len >= max_open_record_len ||
(record->num_frags >= MAX_SKB_FRAGS - 1)) {
+ rc = tls_device_record_close(sk, tls_ctx, record,
+ pfrag, record_type);
+ if (rc) {
+ if (rc > 0) {
+ size += rc;
+ } else {
+ size = orig_size;
+ destroy_record(record);
+ ctx->open_record = NULL;
+ break;
+ }
+ }
+
rc = tls_push_record(sk,
tls_ctx,
ctx,
record,
- pfrag,
- tls_push_record_flags,
- record_type);
+ tls_push_record_flags);
if (rc < 0)
break;
}
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 4/4] net/tls: align non temporal copy to cache lines
From: Jakub Kicinski @ 2019-09-07 5:30 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, Dirk van der Merwe
In-Reply-To: <20190907053000.23869-1-jakub.kicinski@netronome.com>
Unlike normal TCP code TLS has to touch the cache lines
it copies into to fill header info. On memory-heavy workloads
having non temporal stores and normal accesses targeting
the same cache line leads to significant overhead.
Measured 3% overhead running 3600 round robin connections
with additional memory heavy workload.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 33 ++++++++++++++++++++++++++++-----
1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 916c3c0a99f0..f959487c5cd1 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -372,6 +372,31 @@ static int tls_do_allocation(struct sock *sk,
return 0;
}
+static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t pre_copy, nocache;
+
+ pre_copy = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1);
+ if (pre_copy) {
+ pre_copy = min(pre_copy, bytes);
+ if (copy_from_iter(addr, pre_copy, i) != pre_copy)
+ return -EFAULT;
+ bytes -= pre_copy;
+ addr += pre_copy;
+ }
+
+ nocache = round_down(bytes, SMP_CACHE_BYTES);
+ if (copy_from_iter_nocache(addr, nocache, i) != nocache)
+ return -EFAULT;
+ bytes -= nocache;
+ addr += nocache;
+
+ if (bytes && copy_from_iter(addr, bytes, i) != bytes)
+ return -EFAULT;
+
+ return 0;
+}
+
static int tls_push_data(struct sock *sk,
struct iov_iter *msg_iter,
size_t size, int flags,
@@ -445,12 +470,10 @@ static int tls_push_data(struct sock *sk,
copy = min_t(size_t, size, (pfrag->size - pfrag->offset));
copy = min_t(size_t, copy, (max_open_record_len - record->len));
- if (copy_from_iter_nocache(page_address(pfrag->page) +
- pfrag->offset,
- copy, msg_iter) != copy) {
- rc = -EFAULT;
+ rc = tls_device_copy_data(page_address(pfrag->page) +
+ pfrag->offset, copy, msg_iter);
+ if (rc)
goto handle_error;
- }
tls_append_frag(record, pfrag, copy);
size -= copy;
--
2.21.0
^ permalink raw reply related
* Re: [PATCH bpf-next v10 2/4] bpf: new helper to obtain namespace data from current task New bpf helper bpf_get_current_pidns_info.
From: Yonghong Song @ 2019-09-07 6:34 UTC (permalink / raw)
To: Al Viro
Cc: Carlos Neira, netdev@vger.kernel.org, ebiederm@xmission.com,
brouer@redhat.com, bpf@vger.kernel.org
In-Reply-To: <20190907001056.GA1131@ZenIV.linux.org.uk>
On 9/6/19 5:10 PM, Al Viro wrote:
> On Fri, Sep 06, 2019 at 11:21:14PM +0000, Yonghong Song wrote:
>
>> -bash-4.4$ readlink /proc/self/ns/pid
>> pid:[4026531836]
>> -bash-4.4$ stat /proc/self/ns/pid
>> File: ‘/proc/self/ns/pid’ -> ‘pid:[4026531836]’
>> Size: 0 Blocks: 0 IO Block: 1024 symbolic link
>> Device: 4h/4d Inode: 344795989 Links: 1
>> Access: (0777/lrwxrwxrwx) Uid: (128203/ yhs) Gid: ( 100/ users)
>> Context: user_u:base_r:base_t
>> Access: 2019-09-06 16:06:09.431616380 -0700
>> Modify: 2019-09-06 16:06:09.431616380 -0700
>> Change: 2019-09-06 16:06:09.431616380 -0700
>> Birth: -
>> -bash-4.4$
>>
>> Based on a discussion with Eric Biederman back in 2019 Linux
>> Plumbers, Eric suggested that to uniquely identify a
>> namespace, device id (major/minor) number should also
>> be included. Although today's kernel implementation
>> has the same device for all namespace pseudo files,
>> but from uapi perspective, device id should be included.
>>
>> That is the reason why we try to get device id which holds
>> pid namespace pseudo file.
>>
>> Do you have a better suggestion on how to get
>> the device id for 'current' pid namespace? Or from design, we
>> really should not care about device id at all?
>
> What the hell is "device id for pid namespace"? This is the
> first time I've heard about that mystery object, so it's
> hard to tell where it could be found.
>
> I can tell you what device numbers are involved in the areas
> you seem to be looking in.
>
> 1) there's whatever device number that gets assigned to
> (this) procfs instance. That, ironically, _is_ per-pidns, but
> that of the procfs instance, not that of your process (and
> those can be different). That's what you get in ->st_dev
> when doing lstat() of anything in /proc (assuming that
> procfs is mounted there, in the first place). NOTE:
> that's lstat(2), not stat(2). stat(1) uses lstat(2),
> unless given -L (in which case it's stat(2) time). The
> difference:
>
> root@kvm1:~# stat /proc/self/ns/pid
> File: /proc/self/ns/pid -> pid:[4026531836]
> Size: 0 Blocks: 0 IO Block: 1024 symbolic link
> Device: 4h/4d Inode: 17396 Links: 1
> Access: (0777/lrwxrwxrwx) Uid: ( 0/ root) Gid: ( 0/ root)
> Access: 2019-09-06 19:43:11.871312319 -0400
> Modify: 2019-09-06 19:43:11.871312319 -0400
> Change: 2019-09-06 19:43:11.871312319 -0400
> Birth: -
> root@kvm1:~# stat -L /proc/self/ns/pid
> File: /proc/self/ns/pid
> Size: 0 Blocks: 0 IO Block: 4096 regular empty file
> Device: 3h/3d Inode: 4026531836 Links: 1
> Access: (0444/-r--r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
> Access: 2019-09-06 19:43:15.955313293 -0400
> Modify: 2019-09-06 19:43:15.955313293 -0400
> Change: 2019-09-06 19:43:15.955313293 -0400
> Birth: -
>
> The former is lstat, the latter - stat.
>
> 2) device number of the filesystem where the symlink target lives.
> In this case, it's nsfs and there's only one instance on the entire
> system. _That_ would be obtained by looking at st_dev in stat(2) on
> /proc/self/ns/pid (0:3 above).
>
> 3) device number *OF* the symlink. That would be st_rdev in lstat(2).
> There's none - it's a symlink, not a character or block device. It's
> always zero and always will be zero.
>
> 4) the same for the target; st_rdev in stat(2) results and again,
> there's no such beast - it's neither character nor block device.
>
> Your code is looking at (3). Please, reread any textbook on Unix
> in the section that would cover stat(2) and discussion of the
> difference between st_dev and st_rdev.
>
> I have no idea what Eric had been talking about - it's hard to
> reconstruct by what you said so far. Making nsfs per-userns,
> perhaps? But that makes no sense whatsoever, not that userns
> ever had... Cheap shots aside, I really can't guess what that's
> about. Sorry.
Thanks for the detailed information. The device number we want
is nsfs. Indeed, currently, there is only one instance
on the entire system. But not exactly sure what is the possibility
to have more than one nsfs device in the future. Maybe per-userns
or any other criteria?
>
> In any case, pathname resolution is *NOT* for the situations where
> you can't block. Even if it's procfs (and from the same pidns as
> the process) mounted there, there is no promise that the target
> of /proc/self has already been looked up and not evicted from
> memory since then. And in case of cache miss pathwalk will
> have to call ->lookup(), which requires locking the directory
> (rw_sem, shared). You can't do that in such context.
>
> And that doesn't even go into the possibility that process has
> something very different mounted on /proc.
>
> Again, I don't know what it is that you want to get to, but
> I would strongly recommend finding a way to get to that data
> that would not involve going anywhere near pathname resolution.
>
> How would you expect the userland to work with that value,
> whatever it might be? If it's just a 32bit field that will
> never be read, you might as well store there the same value
> you store now (0, that is) in much cheaper and safer way ;-)
Suppose inside pid namespace, user can pass the device number,
say n1, (`stat -L /proc/self/ns/pid`) to bpf program (through map
or JIT). At runtime, bpf program will try to get device number,
say n2, for the 'current' process. If n1 is not the same as
n2, that means they are not in the same namespace. 'current'
is in the same pid namespace as the user iff
n1 == n2 and also pidns id is the same for 'current' and
the one with `lsns -t pid`.
Are you aware of any way to get the pidns device number
for 'current' without going through the pathname
lookup?
^ permalink raw reply
* Re: [PATCH 0/7] libbpf: Fix cast away const qualifiers in btf.h
From: Jiri Olsa @ 2019-09-07 6:54 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Jiri Olsa, Alexei Starovoitov, Daniel Borkmann,
netdev@vger.kernel.org, bpf@vger.kernel.org, Yonghong Song,
Martin Lau
In-Reply-To: <62e760de-e746-c512-350a-c2188a2bb3ed@fb.com>
On Fri, Sep 06, 2019 at 09:09:17AM +0000, Andrii Nakryiko wrote:
> On 9/6/19 8:31 AM, Jiri Olsa wrote:
> > hi,
> > when including btf.h in bpftrace, I'm getting -Wcast-qual warnings like:
> >
> > bpf/btf.h: In function ‘btf_var_secinfo* btf_var_secinfos(const btf_type*)’:
> > bpf/btf.h:302:41: warning: cast from type ‘const btf_type*’ to type
> > ‘btf_var_secinfo*’ casts away qualifiers [-Wcast-qual]
> > 302 | return (struct btf_var_secinfo *)(t + 1);
> > | ^
> >
> > I changed the btf.h header to comply with -Wcast-qual checks
> > and used const cast away casting in libbpf objects, where it's
>
> Hey Jiri,
>
> We made all those helper funcs return non-const structs intentionally to
> improve their usability and avoid all those casts that you added back.
>
> Also, those helpers are now part of public API, so we can't just change
> them to const, as it can break existing users easily.
>
> If there is a need to run with -Wcast-qual, we should probably disable
> those checks where appropriate in libbpf code.
>
> So this will be a NACK from me, sorry.
ok, I'll disable disable it in bpftrace code then
thanks,
jirka
>
> > all related to deduplication code, so I believe loosing const
> > is fine there.
> >
> > thanks,
> > jirka
> >
> >
> > ---
> > Jiri Olsa (7):
> > libbpf: Use const cast for btf_int_* functions
> > libbpf: Return const btf_array from btf_array inline function
> > libbpf: Return const btf_enum from btf_enum inline function
> > libbpf: Return const btf_member from btf_members inline function
> > libbpf: Return const btf_param from btf_params inline function
> > libbpf: Return const btf_var from btf_var inline function
> > libbpf: Return const struct btf_var_secinfo from btf_var_secinfos inline function
> >
> > tools/lib/bpf/btf.c | 21 +++++++++++----------
> > tools/lib/bpf/btf.h | 30 +++++++++++++++---------------
> > tools/lib/bpf/libbpf.c | 2 +-
> > 3 files changed, 27 insertions(+), 26 deletions(-)
> >
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox