* [PATCH v4 bpf-next 4/4] selftests/bpf: use CAP_BPF and CAP_TRACING in tests
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Make all test_verifier tests exercise CAP_BPF and CAP_TRACING.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++++----
1 file changed, 38 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d27fd929abb9..0d5567962c4e 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -807,10 +807,20 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
}
}
+struct libcap {
+ struct __user_cap_header_struct hdr;
+ struct __user_cap_data_struct data[2];
+};
+
static int set_admin(bool admin)
{
cap_t caps;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
+ /* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs,
+ * and CAP_TRACING to create stackmap
+ */
+ const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+ const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+ struct libcap *cap;
int ret = -1;
caps = cap_get_proc();
@@ -818,11 +828,26 @@ static int set_admin(bool admin)
perror("cap_get_proc");
return -1;
}
- if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+ cap = (struct libcap *)caps;
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+ perror("cap_set_flag clear admin");
+ goto out;
+ }
+ if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
admin ? CAP_SET : CAP_CLEAR)) {
- perror("cap_set_flag");
+ perror("cap_set_flag set_or_clear net");
goto out;
}
+ /* libcap is likely old and simply ignores CAP_BPF and CAP_TRACING,
+ * so update effective bits manually
+ */
+ if (admin) {
+ cap->data[1].effective |= 1 << (38 /* CAP_BPF */ - 32);
+ cap->data[1].effective |= 1 << (39 /* CAP_TRACING */ - 32);
+ } else {
+ cap->data[1].effective &= ~(1 << (38 - 32));
+ cap->data[1].effective &= ~(1 << (39 - 32));
+ }
if (cap_set_proc(caps)) {
perror("cap_set_proc");
goto out;
@@ -1051,9 +1076,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
static bool is_admin(void)
{
+ cap_flag_value_t net_priv = CAP_CLEAR;
+ bool tracing_priv = false;
+ bool bpf_priv = false;
+ struct libcap *cap;
cap_t caps;
- cap_flag_value_t sysadmin = CAP_CLEAR;
- const cap_value_t cap_val = CAP_SYS_ADMIN;
#ifdef CAP_IS_SUPPORTED
if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1066,11 +1093,14 @@ static bool is_admin(void)
perror("cap_get_proc");
return false;
}
- if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
- perror("cap_get_flag");
+ cap = (struct libcap *)caps;
+ bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+ tracing_priv = cap->data[1].effective & (1 << (39/* CAP_TRACING */ - 32));
+ if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+ perror("cap_get_flag NET");
if (cap_free(caps))
perror("cap_free");
- return (sysadmin == CAP_SET);
+ return bpf_priv && tracing_priv && net_priv == CAP_SET;
}
static void get_unpriv_disabled()
--
2.20.0
^ permalink raw reply related
* [PATCH v4 bpf-next 2/4] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Implement permissions as stated in uapi/linux/capability.h
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
kernel/bpf/arraymap.c | 2 +-
kernel/bpf/cgroup.c | 2 +-
kernel/bpf/core.c | 4 ++--
kernel/bpf/hashtab.c | 4 ++--
kernel/bpf/lpm_trie.c | 2 +-
kernel/bpf/queue_stack_maps.c | 2 +-
kernel/bpf/reuseport_array.c | 2 +-
kernel/bpf/stackmap.c | 2 +-
kernel/bpf/syscall.c | 32 +++++++++++++++++++-------------
kernel/bpf/verifier.c | 2 +-
kernel/trace/bpf_trace.c | 2 +-
net/core/bpf_sk_storage.c | 2 +-
net/core/filter.c | 10 ++++++----
13 files changed, 38 insertions(+), 30 deletions(-)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..149f868a02dc 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int ret, numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
+ bool unpriv = !capable_bpf();
u64 cost, array_size, mask64;
struct bpf_map_memory mem;
struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..9c659ba5c146 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
+ if (capable_bpf_tracing())
return bpf_get_trace_printk_proto();
/* fall through */
default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 66088a9e9b9e..6643099bc64b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return;
spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..0fae5c45f425 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !capable_bpf())
/* LRU implementation is much complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..11da3be8a4e5 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
u64 cost = sizeof(*trie), cost_per_node;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..d83afac32863 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
/* Called from syscall */
static int queue_stack_map_alloc_check(union bpf_attr *attr)
{
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..b268fe4b2972 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
struct bpf_map_memory mem;
u64 array_size;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..477063c63b27 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
u64 cost, n_buckets;
int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return ERR_PTR(-EPERM);
if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 82eabd4e38ad..cd2d1b21f0f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
err = -EBUSY;
goto err_put;
}
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
err = -EPERM;
goto err_put;
}
@@ -1635,7 +1635,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return -EPERM;
/* copy eBPF program license from user space */
@@ -1648,11 +1648,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
is_gpl = license_is_gpl_compatible(license);
if (attr->insn_cnt == 0 ||
- attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+ attr->insn_cnt > (capable_bpf() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
- !capable(CAP_SYS_ADMIN))
+ !capable_bpf())
return -EPERM;
bpf_prog_load_fixup_attach_type(attr);
@@ -1809,6 +1809,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
char tp_name[128];
int tp_fd, err;
+ if (!capable_bpf_tracing())
+ return -EPERM;
+
if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
sizeof(tp_name) - 1) < 0)
return -EFAULT;
@@ -2087,7 +2090,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
struct bpf_prog *prog;
int ret = -ENOTSUPP;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_net_admin())
+ /* test_run callback is available for networking progs only.
+ * Add capable_bpf_tracing() above when tracing progs become runable.
+ */
return -EPERM;
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
@@ -2124,7 +2130,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
next_id++;
@@ -2150,7 +2156,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
spin_lock_bh(&prog_idr_lock);
@@ -2184,7 +2190,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
attr->open_flags & ~BPF_OBJ_FLAG_MASK)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2359,7 +2365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.run_time_ns = stats.nsecs;
info.run_cnt = stats.cnt;
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!capable_bpf()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
info.nr_jited_ksyms = 0;
@@ -2677,7 +2683,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
return btf_new_fd(attr);
@@ -2690,7 +2696,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
return btf_get_fd_by_id(attr->btf_id);
@@ -2759,7 +2765,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_TASK_FD_QUERY))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_tracing())
return -EPERM;
if (attr->task_fd_query.flags != 0)
@@ -2827,7 +2833,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
- if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ if (sysctl_unprivileged_bpf_disabled && !capable_bpf())
return -EPERM;
err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb50757e812..7e519711c689 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9234,7 +9234,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- is_priv = capable(CAP_SYS_ADMIN);
+ is_priv = capable_bpf();
/* grab the mutex to protect few globals used by verifier */
if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..cdf8d6c8a430 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
u32 *ids, prog_cnt, ids_len;
int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf_tracing())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..aa74be21f5b6 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
!attr->btf_key_type_id || !attr->btf_value_type_id)
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return -EPERM;
if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..b233ed8438f1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
break;
}
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return NULL;
switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_spin_unlock:
return &bpf_spin_unlock_proto;
case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
+ if (capable_bpf_tracing())
+ return bpf_get_trace_printk_proto();
+ /* fall through */
default:
return NULL;
}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
return false;
case bpf_ctx_range(struct __sk_buff, data):
case bpf_ctx_range(struct __sk_buff, data_end):
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return false;
break;
}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
break;
case bpf_ctx_range(struct __sk_buff, tstamp):
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_bpf())
return false;
break;
default:
--
2.20.0
^ permalink raw reply related
* [PATCH v4 bpf-next 3/4] perf: implement CAP_TRACING
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190906231053.1276792-1-ast@kernel.org>
Implement permissions as stated in uapi/linux/capability.h
and update Documentation.
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
Documentation/admin-guide/perf-security.rst | 4 ++--
Documentation/admin-guide/sysctl/kernel.rst | 10 ++++------
arch/powerpc/perf/core-book3s.c | 4 ++--
arch/x86/events/intel/bts.c | 2 +-
arch/x86/events/intel/core.c | 2 +-
arch/x86/events/intel/p4.c | 2 +-
kernel/events/core.c | 14 +++++++-------
kernel/events/hw_breakpoint.c | 2 +-
kernel/trace/trace_event_perf.c | 4 ++--
9 files changed, 21 insertions(+), 23 deletions(-)
diff --git a/Documentation/admin-guide/perf-security.rst b/Documentation/admin-guide/perf-security.rst
index 72effa7c23b9..c84152d1dfd4 100644
--- a/Documentation/admin-guide/perf-security.rst
+++ b/Documentation/admin-guide/perf-security.rst
@@ -66,8 +66,8 @@ into distinct units, known as capabilities [6]_ , which can be
independently enabled and disabled on per-thread basis for processes and
files of unprivileged users.
-Unprivileged processes with enabled CAP_SYS_ADMIN capability are treated
-as privileged processes with respect to perf_events performance
+Unprivileged processes with enabled CAP_SYS_ADMIN or CAP_TRACING capability
+are treated as privileged processes with respect to perf_events performance
monitoring and bypass *scope* permissions checks in the kernel.
Unprivileged processes using perf_events system call API is also subject
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 032c7cd3cede..595bf2b1363f 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -720,20 +720,18 @@ allowed to execute.
====================
Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN). The default value is 2.
+users (without CAP_SYS_ADMIN and without CAP_TRACING). The default value is 2.
=== ==================================================================
-1 Allow use of (almost) all events by all users
Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
->=0 Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+>=0 Disallow ftrace function tracepoint and raw tracepoint
- Disallow raw tracepoint access by users without CAP_SYS_ADMIN
+>=1 Disallow CPU event access
->=1 Disallow CPU event access by users without CAP_SYS_ADMIN
-
->=2 Disallow kernel profiling by users without CAP_SYS_ADMIN
+>=2 Disallow kernel profiling
=== ==================================================================
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index ca92e01d0bd1..a204a3c6c68b 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -204,7 +204,7 @@ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp)
if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
*addrp = mfspr(SPRN_SDAR);
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) &&
+ if (perf_paranoid_kernel() && !capable_tracing() &&
is_kernel_addr(mfspr(SPRN_SDAR)))
*addrp = 0;
}
@@ -472,7 +472,7 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw)
* exporting it to userspace (avoid exposure of regions
* where we could have speculative execution)
*/
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) &&
+ if (perf_paranoid_kernel() && !capable_tracing() &&
is_kernel_addr(addr))
continue;
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..bd713b2dd7c2 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -550,7 +550,7 @@ static int bts_event_init(struct perf_event *event)
* users to profile the kernel.
*/
if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
- !capable(CAP_SYS_ADMIN))
+ !capable_tracing())
return -EACCES;
if (x86_add_exclusive(x86_lbr_exclusive_bts))
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e4c2cb65ea50..a7f8c18bd82b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3307,7 +3307,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (x86_pmu.version < 3)
return -EINVAL;
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu() && !capable_tracing())
return -EACCES;
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..f379a358c9cb 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,7 +776,7 @@ static int p4_validate_raw_event(struct perf_event *event)
* the user needs special permissions to be able to use it
*/
if (p4_ht_active() && p4_event_bind_map[v].shared) {
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu() && !capable_tracing())
return -EACCES;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..eaba102e5d91 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4134,7 +4134,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu() && !capable_tracing())
return ERR_PTR(-EACCES);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8741,7 +8741,7 @@ static int perf_kprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_kprobe.type)
return -ENOENT;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return -EACCES;
/*
@@ -8801,7 +8801,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return -EACCES;
/*
@@ -10588,7 +10588,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
/* privileged levels capture (kernel, hv): check permissions */
if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ && perf_paranoid_kernel() && !capable_tracing())
return -EACCES;
}
@@ -10807,12 +10807,12 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_kernel() && !capable_tracing())
return -EACCES;
}
if (attr.namespaces) {
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return -EACCES;
}
@@ -10826,7 +10826,7 @@ SYSCALL_DEFINE5(perf_event_open,
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ perf_paranoid_kernel() && !capable_tracing())
return -EACCES;
/*
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c5cd852fe86b..8bc4d7d8c913 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -404,7 +404,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,
* Don't let unprivileged users set a breakpoint in the trap
* path to avoid trap recursion attacks.
*/
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable_tracing())
return -EPERM;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0892e38ed6fb..6861307f14d6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -46,7 +46,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_tracepoint_raw() && !capable_tracing())
return -EPERM;
if (!is_sampling_event(p_event))
@@ -82,7 +82,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_tracepoint_raw() && !capable_tracing())
return -EPERM;
return 0;
--
2.20.0
^ permalink raw reply related
* [PATCH v4 bpf-next 0/4] CAP_BPF and CAP_TRACING
From: Alexei Starovoitov @ 2019-09-06 23:10 UTC (permalink / raw)
To: davem; +Cc: daniel, peterz, luto, netdev, bpf, kernel-team, linux-api
v3->v4:
- rebase and typo fixes
- split selftests into separate patch
- update perf* docs with CAP_TRACING
- add a note to commit log that existing unpriv bpf behavior is not changing
v2->v3:
- dropped ftrace and kallsyms from CAP_TRACING description.
In the future these mechanisms can start using it too.
- added CAP_SYS_ADMIN backward compatibility.
Alexei Starovoitov (4):
capability: introduce CAP_BPF and CAP_TRACING
bpf: implement CAP_BPF
perf: implement CAP_TRACING
selftests/bpf: use CAP_BPF and CAP_TRACING in tests
Documentation/admin-guide/perf-security.rst | 4 +-
Documentation/admin-guide/sysctl/kernel.rst | 10 ++---
arch/powerpc/perf/core-book3s.c | 4 +-
arch/x86/events/intel/bts.c | 2 +-
arch/x86/events/intel/core.c | 2 +-
arch/x86/events/intel/p4.c | 2 +-
include/linux/capability.h | 18 ++++++++
include/uapi/linux/capability.h | 49 ++++++++++++++++++++-
kernel/bpf/arraymap.c | 2 +-
kernel/bpf/cgroup.c | 2 +-
kernel/bpf/core.c | 4 +-
kernel/bpf/hashtab.c | 4 +-
kernel/bpf/lpm_trie.c | 2 +-
kernel/bpf/queue_stack_maps.c | 2 +-
kernel/bpf/reuseport_array.c | 2 +-
kernel/bpf/stackmap.c | 2 +-
kernel/bpf/syscall.c | 32 ++++++++------
kernel/bpf/verifier.c | 2 +-
kernel/events/core.c | 14 +++---
kernel/events/hw_breakpoint.c | 2 +-
kernel/trace/bpf_trace.c | 2 +-
kernel/trace/trace_event_perf.c | 4 +-
net/core/bpf_sk_storage.c | 2 +-
net/core/filter.c | 10 +++--
security/selinux/include/classmap.h | 4 +-
tools/testing/selftests/bpf/test_verifier.c | 46 +++++++++++++++----
26 files changed, 165 insertions(+), 64 deletions(-)
--
2.20.0
^ permalink raw reply
* Re: general protection fault in dev_map_hash_update_elem
From: Toke Høiland-Jørgensen @ 2019-09-06 23:04 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Alexei Starovoitov
Cc: syzbot, bpf, Daniel Borkmann, Jesper Dangaard Brouer, LKML,
Network Development, syzkaller-bugs
In-Reply-To: <20190906145408.05406b0f@carbon>
Jesper Dangaard Brouer <jbrouer@redhat.com> writes:
> On Thu, 5 Sep 2019 14:44:37 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
>> On Thu, Sep 5, 2019 at 1:08 PM syzbot
>> <syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com> wrote:
>> >
>> > Hello,
>> >
>> > syzbot found the following crash on:
>> >
>> > HEAD commit: 6d028043 Add linux-next specific files for 20190830
>> > git tree: linux-next
>> > console output: https://syzkaller.appspot.com/x/log.txt?x=135c1a92600000
>> > kernel config: https://syzkaller.appspot.com/x/.config?x=82a6bec43ab0cb69
>> > dashboard link: https://syzkaller.appspot.com/bug?extid=4e7a85b1432052e8d6f8
>> > compiler: gcc (GCC) 9.0.0 20181231 (experimental)
>> > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=109124e1600000
>> >
>> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
>> > Reported-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com
>> >
>> > kasan: CONFIG_KASAN_INLINE enabled
>> > kasan: GPF could be caused by NULL-ptr deref or user memory access
>> > general protection fault: 0000 [#1] PREEMPT SMP KASAN
>> > CPU: 1 PID: 10235 Comm: syz-executor.0 Not tainted 5.3.0-rc6-next-20190830
>> > #75
>> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
>> > Google 01/01/2011
>> > RIP: 0010:__write_once_size include/linux/compiler.h:203 [inline]
>> > RIP: 0010:__hlist_del include/linux/list.h:795 [inline]
>> > RIP: 0010:hlist_del_rcu include/linux/rculist.h:475 [inline]
>> > RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
>> > RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
>> > Code: 48 89 f1 48 89 75 c8 48 c1 e9 03 80 3c 11 00 0f 85 d3 02 00 00 48 b9
>> > 00 00 00 00 00 fc ff df 48 8b 53 10 48 89 d6 48 c1 ee 03 <80> 3c 0e 00 0f
>> > 85 97 02 00 00 48 85 c0 48 89 02 74 38 48 89 55 b8
>> > RSP: 0018:ffff88808d607c30 EFLAGS: 00010046
>> > RAX: 0000000000000000 RBX: ffff8880a7f14580 RCX: dffffc0000000000
>> > RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a7f14588
>> > RBP: ffff88808d607c78 R08: 0000000000000004 R09: ffffed1011ac0f73
>> > R10: ffffed1011ac0f72 R11: 0000000000000003 R12: ffff88809f4e9400
>> > R13: ffff88809b06ba00 R14: 0000000000000000 R15: ffff88809f4e9528
>> > FS: 00007f3a3d50c700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
>> > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> > CR2: 00007feb3fcd0000 CR3: 00000000986b9000 CR4: 00000000001406e0
>> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>> > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>> > Call Trace:
>> > map_update_elem+0xc82/0x10b0 kernel/bpf/syscall.c:966
>> > __do_sys_bpf+0x8b5/0x3350 kernel/bpf/syscall.c:2854
>> > __se_sys_bpf kernel/bpf/syscall.c:2825 [inline]
>> > __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:2825
>> > do_syscall_64+0xfa/0x760 arch/x86/entry/common.c:290
>> > entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> > RIP: 0033:0x459879
>> > Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
>> > 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
>> > ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
>> > RSP: 002b:00007f3a3d50bc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
>> > RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000459879
>> > RDX: 0000000000000020 RSI: 0000000020000040 RDI: 0000000000000002
>> > RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
>> > R10: 0000000000000000 R11: 0000000000000246 R12: 00007f3a3d50c6d4
>> > R13: 00000000004bfc86 R14: 00000000004d1960 R15: 00000000ffffffff
>> > Modules linked in:
>> > ---[ end trace 083223e21dbd0ae5 ]---
>> > RIP: 0010:__write_once_size include/linux/compiler.h:203 [inline]
>> > RIP: 0010:__hlist_del include/linux/list.h:795 [inline]
>> > RIP: 0010:hlist_del_rcu include/linux/rculist.h:475 [inline]
>> > RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
>> > RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
>>
>> Toke,
>> please take a look.
>> Thanks!
>
> Hi Toke,
>
> I think the problem is that you read:
> old_dev = __dev_map_hash_lookup_elem(map, idx);
>
> Before holding the lock dtab->index_lock...
>
> I'm not sure this is the correct fix, but I think below change should
> solve the issue (not even compile tested):
>
> [bpf-next]$ git diff
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 9af048a932b5..c41854a68e9e 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -664,6 +664,9 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
>
> spin_lock_irqsave(&dtab->index_lock, flags);
>
> + /* Re-read old_dev while holding lock*/
> + old_dev = __dev_map_hash_lookup_elem(map, idx);
> +
> if (old_dev) {
> hlist_del_rcu(&old_dev->index_hlist);
> } else {
I think you're right that it's a race between reading the old_dev ptr
and the removal, leading to attempts to remove the same element twice.
Your patch would be one way to fix it, another would be to check the
pointer for list poison before removing it. Let me run both approaches
by the bot to make sure it actually fixes the bug; I'll submit a proper
fix after that.
-Toke
^ permalink raw reply
* Re: ndo_xdp_xmit - on which queue to transmit the packet (if core_id >= total_xdp_queues ) ?
From: Toke Høiland-Jørgensen @ 2019-09-06 22:56 UTC (permalink / raw)
To: Manish Chopra, netdev@vger.kernel.org
In-Reply-To: <DM6PR18MB3388D5F49B3A0A3522A40184ABBA0@DM6PR18MB3388.namprd18.prod.outlook.com>
Manish Chopra <manishc@marvell.com> writes:
> Hello,
>
> I am working on XDP_REDIRECT implementation and got a query. Some of
> the ethernet drivers decide the xdp queue index on which xdp packet
> should be redirected based on smp_processor_id() in their
> ndo_xdp_xmit() handler, if smp_processor_id() >= total_num_xdp_queues,
> they decide to drop the packets and return error from the handler.
Congratulations, you've hit upon one of the major usability issues with
XDP_REDIRECT! ;)
> I am hitting the same condition where using 8 XDP queues, I get CPU id
> 8 to redirect the XDP packet and I am not sure if it should be dropped
> or can be transmitted on a queue (= smp_processor_id() %
> total_num_xdp_queues) safely ?.
I would expect you would at least need some kind of locking to do this
safely, but I guess it depends on how your driver is structured...
> freescale/dpaa2 seems to be handling this case by sending the packet
> on the queue (= smp_processor_id() % total_num_xdp_queues) but unsure
> what should be the expected behavior.
As you've noted, this varies somewhat between drivers, and there's
really no "expected behaviour" today. Drivers basically do what they
think makes sense for their hardware.
We're trying to fix this, and make the behaviour configurable; if you
happen to be at LPC, please come discuss it with us at this session:
https://linuxplumbersconf.org/event/4/contributions/462/
-Toke
^ permalink raw reply
* [RFC PATCH bpf-next v2 0/2] bpf: adding map batch processing support
From: Yonghong Song @ 2019-09-06 22:54 UTC (permalink / raw)
To: ast, daniel, netdev, bpf
Cc: kernel-team, Jakub Kicinski, Brian Vazquez, Stanislav Fomichev
Previous discussion at:
https://lore.kernel.org/bpf/7ba9b492-8a08-a1d0-9c6e-03be4b8e5e07@fb.com/T/#t
The previous approach tried to use existing per-map operations like
bpf_map_{get_next_key, lookup_elem, update_elem, delete_elem}
to implement a batching process.
It has a serious drawback when the prev_key used by bpf_map_get_next_key()
is not in the hash table. In that case, as the hash table has no idea where
the `prev_key` was placed in the bucket before deletion, it currently
returns the first key. As a result, batch processing may see
duplicated elements or, in the worst case, if the hash table has heavy
update/delete traffic, the batch processing may never finish.
This RFC patch set implements bucket based batching for hashtab.
That is, for lookup/delete, either the whole bucket is processed
or none of the elements in the bucket are processed. Forward progress
is also guaranteed as long as the user provides a large enough buffer.
This RFC also serves as a base for discussion at upcoming
LPC2019 BPF Microconference.
Changelogs:
v1 -> RFC v2:
. To address the bpf_map_get_next_key() issue where
if a key is not available the first key will be returned,
implement per-map batch operations for hashtab/lru_hashtab,
using bucket lock, as suggested by Alexei.
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Brian Vazquez <brianvv@google.com>
Cc: Stanislav Fomichev <sdf@google.com>
Yonghong Song (2):
bpf: adding map batch processing support
tools/bpf: test bpf_map_lookup_and_delete_batch()
include/linux/bpf.h | 9 +
include/uapi/linux/bpf.h | 22 ++
kernel/bpf/hashtab.c | 324 ++++++++++++++++++
kernel/bpf/syscall.c | 68 ++++
tools/include/uapi/linux/bpf.h | 22 ++
tools/lib/bpf/bpf.c | 59 ++++
tools/lib/bpf/bpf.h | 13 +
tools/lib/bpf/libbpf.map | 4 +
.../map_tests/map_lookup_and_delete_batch.c | 155 +++++++++
9 files changed, 676 insertions(+)
create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
--
2.17.1
^ permalink raw reply
* [RFC PATCH bpf-next v2 1/2] bpf: adding map batch processing support
From: Yonghong Song @ 2019-09-06 22:54 UTC (permalink / raw)
To: ast, daniel, netdev, bpf
Cc: kernel-team, Jakub Kicinski, Brian Vazquez, Stanislav Fomichev
In-Reply-To: <20190906225434.3635421-1-yhs@fb.com>
Brian Vazquez has proposed the BPF_MAP_DUMP command to look up more than one
map entry per syscall.
https://lore.kernel.org/bpf/CABCgpaU3xxX6CMMxD+1knApivtc2jLBHysDXw-0E9bQEL0qC3A@mail.gmail.com/T/#t
During discussion, we found more use cases can be supported in a similar
map operation batching framework. For example, batched map lookup and delete,
which can be really helpful for bcc.
https://github.com/iovisor/bcc/blob/master/tools/tcptop.py#L233-L243
https://github.com/iovisor/bcc/blob/master/tools/slabratetop.py#L129-L138
Also, in bcc, we have API to delete all entries in a map.
https://github.com/iovisor/bcc/blob/master/src/cc/api/BPFTable.h#L257-L264
For map update, batched operations are also useful, as sometimes applications need
to populate initial maps with more than one entry. For example, the below
example is from kernel/samples/bpf/xdp_redirect_cpu_user.c:
https://github.com/torvalds/linux/blob/master/samples/bpf/xdp_redirect_cpu_user.c#L543-L550
This patch addresses all the above use cases. To make uapi stable, it also
covers other potential use cases. Four bpf syscall subcommands are introduced:
BPF_MAP_LOOKUP_BATCH
BPF_MAP_LOOKUP_AND_DELETE_BATCH
BPF_MAP_UPDATE_BATCH
BPF_MAP_DELETE_BATCH
The UAPI attribute structure looks like:
struct { /* struct used by BPF_MAP_*_BATCH commands */
__u64 batch; /* input/output:
* input: start batch,
* 0 to start from beginning.
* output: next start batch,
* 0 to end batching.
*/
__aligned_u64 keys;
__aligned_u64 values;
__u32 count; /* input/output:
* input: # of elements keys/values.
* output: # of filled elements.
*/
__u32 map_fd;
__u64 elem_flags;
__u64 flags;
} batch;
An opaque value 'batch' is used for user/kernel space communication
for where in the map to start the operation for lookup/lookup_and_delete/delete.
input 'batch' = 0: to start the operation from the beginning of the map.
output 'batch': if not 0, the next input for batch operation.
For lookup/lookup_and_delete:
operation: lookup/lookup_and_delete starting from a particular 'batch'.
return:
'batch' 'count' return code meaning
0 0 0 Done. Nothing left
0 0 -ENOSPC no space to handle batch 0
> 0 0 -ENOSPC no space to handle 'batch'
> 0 > 0 0 stopped right before 'batch'
Note that:
(1). Even if return code is 0 and return 'count' > 0, the return 'count' may
not be equal to input 'count'. This happens when there is not enough space
to handle a batch.
(2). If the return code is an error and not -EFAULT,
'batch' indicates the batch has issues and 'count' indicates the number
of elements successfully processed.
For delete:
operation: deletion starting from a particular 'batch'.
return: 0 means everything is deleted from 'batch'.
an error code means some deletions did not happen.
For update:
operation: update 'count' number of elements in 'keys'/'values'.
return: 0 means successful updates for all elements.
error code, if not -EFAULT, 'count' is the number of successful updates.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/bpf.h | 9 ++
include/uapi/linux/bpf.h | 22 +++
kernel/bpf/hashtab.c | 324 +++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 68 ++++++++
4 files changed, 423 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d22338606..3c1302e8e2d4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,15 @@ struct bpf_map_ops {
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+ int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+ int (*map_lookup_and_delete_batch)(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+ int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+ int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5d2fb183ee2d..9d4f76073dd9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
+ BPF_MAP_LOOKUP_BATCH,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+ BPF_MAP_UPDATE_BATCH,
+ BPF_MAP_DELETE_BATCH,
};
enum bpf_map_type {
@@ -396,6 +400,24 @@ union bpf_attr {
__u64 flags;
};
+ struct { /* struct used by BPF_MAP_*_BATCH commands */
+ __u64 batch; /* input/output:
+ * input: start batch,
+ * 0 to start from beginning.
+ * output: next start batch,
+ * 0 to end batching.
+ */
+ __aligned_u64 keys;
+ __aligned_u64 values;
+ __u32 count; /* input/output:
+ * input: # of elements keys/values.
+ * output: # of filled elements.
+ */
+ __u32 map_fd;
+ __u64 elem_flags;
+ __u64 flags;
+ } batch;
+
struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type; /* one of enum bpf_prog_type */
__u32 insn_cnt;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..ee7b90200f4d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1232,6 +1232,322 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static int
+__htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete, bool is_lru_map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
+ void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ void __user *ukeys, *uvalues;
+ struct hlist_nulls_node *n;
+ u32 batch, max_count;
+ unsigned long flags;
+ struct htab_elem *l;
+ struct bucket *b;
+ int ret = 0;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & ~BPF_F_LOCK) ||
+ ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ batch = (u32)attr->batch.batch;
+ if (batch >= htab->n_buckets)
+ return -EINVAL;
+
+ /* We cannot do copy_from_user or copy_to_user inside
+ * the rcu_read_lock. Allocate enough space here.
+ */
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ keys = kvmalloc(key_size * max_count, GFP_USER | __GFP_NOWARN);
+ values = kvmalloc(value_size * max_count, GFP_USER | __GFP_NOWARN);
+ if (!keys || !values) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dst_key = keys;
+ dst_val = values;
+ total = 0;
+
+ preempt_disable();
+ this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+
+again:
+ b = &htab->buckets[batch];
+ head = &b->head;
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ bucket_cnt = 0;
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ bucket_cnt++;
+
+ if (bucket_cnt > (max_count - total)) {
+ if (total == 0)
+ ret = -ENOSPC;
+ goto after_loop;
+ }
+
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+ memcpy(dst_key, l->key, key_size);
+
+ value = l->key + roundup_key_size;
+ if (elem_map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, dst_val, value, true);
+ else
+ copy_map_value(map, dst_val, value);
+ check_and_init_map_lock(map, dst_val);
+
+ dst_key += key_size;
+ dst_val += value_size;
+ total++;
+ }
+
+ if (do_delete) {
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (is_lru_map)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ else
+ free_htab_elem(htab, l);
+ }
+ }
+
+ batch++;
+ if (batch >= htab->n_buckets) {
+ batch = 0;
+ goto after_loop;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ goto again;
+
+after_loop:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ rcu_read_unlock();
+ this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ /* copy data back to user */
+ ukeys = u64_to_user_ptr(attr->batch.keys);
+ uvalues = u64_to_user_ptr(attr->batch.values);
+ if (put_user(batch, &uattr->batch.batch) ||
+ copy_to_user(ukeys, keys, total * key_size) ||
+ copy_to_user(uvalues, values, total * value_size) ||
+ put_user(total, &uattr->batch.count))
+ ret = -EFAULT;
+
+out:
+ kvfree(keys);
+ kvfree(values);
+ return ret;
+}
+
+static int
+__htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr, bool is_lru_map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 count, max_count, key_size, roundup_key_size, value_size;
+ u64 elem_map_flags, map_flags;
+ void __user *ukey, *uvalue;
+ void *key, *value;
+ int ret = 0;
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ elem_map_flags = attr->batch.elem_flags;
+ if ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))
+ return -EINVAL;
+
+ map_flags = attr->batch.flags;
+ if (map_flags)
+ return -EINVAL;
+
+ key_size = htab->map.key_size;
+ roundup_key_size = round_up(htab->map.key_size, 8);
+ value_size = htab->map.value_size;
+ key = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
+ if (!key || !value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ukey = u64_to_user_ptr(attr->batch.keys);
+ uvalue = u64_to_user_ptr(attr->batch.values);
+ for (count = 0; count < max_count; count++) {
+ if (copy_from_user(key, ukey + count * key_size, key_size) ||
+ copy_from_user(value, uvalue + count * value_size, value_size)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+ if (is_lru_map)
+ ret = htab_lru_map_update_elem(map, key, value, elem_map_flags);
+ else
+ ret = htab_map_update_elem(map, key, value, elem_map_flags);
+ rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ if (ret) {
+ if (put_user(count, &uattr->batch.count))
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+out:
+ kfree(key);
+ kfree(value);
+ return ret;
+}
+
+static int
+__htab_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool is_lru_map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u64 elem_map_flags, map_flags;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
+ u32 batch, max_count;
+ unsigned long flags;
+ struct htab_elem *l;
+ struct bucket *b;
+
+ elem_map_flags = attr->batch.elem_flags;
+ map_flags = attr->batch.flags;
+ if (elem_map_flags || map_flags)
+ return -EINVAL;
+
+ max_count = attr->batch.count;
+ batch = (u32)attr->batch.batch;
+ if (max_count || batch >= htab->n_buckets)
+ return -EINVAL;
+
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ rcu_read_lock();
+
+again:
+ b = &htab->buckets[batch];
+ head = &b->head;
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (is_lru_map)
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ else
+ free_htab_elem(htab, l);
+ }
+
+ batch++;
+ if (batch >= htab->n_buckets)
+ goto out;
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ goto again;
+
+out:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+
+ return 0;
+}
+
+static int
+htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ false);
+}
+
+static int
+htab_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ false);
+}
+
+static int
+htab_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_update_batch(map, attr, uattr, false);
+}
+
+static int
+htab_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_delete_batch(map, attr, uattr, false);
+}
+
+static int
+htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
+ true);
+}
+
+static int
+htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
+ true);
+}
+
+static int
+htab_lru_map_update_batch(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_update_batch(map, attr, uattr, true);
+}
+
+static int
+htab_lru_map_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __htab_map_delete_batch(map, attr, uattr, true);
+}
+
const struct bpf_map_ops htab_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
@@ -1242,6 +1558,10 @@ const struct bpf_map_ops htab_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_lookup_batch = htab_map_lookup_batch,
+ .map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
+ .map_update_batch = htab_map_update_batch,
+ .map_delete_batch = htab_map_delete_batch,
};
const struct bpf_map_ops htab_lru_map_ops = {
@@ -1255,6 +1575,10 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
+ .map_lookup_batch = htab_lru_map_lookup_batch,
+ .map_lookup_and_delete_batch = htab_lru_map_lookup_and_delete_batch,
+ .map_update_batch = htab_lru_map_update_batch,
+ .map_delete_batch = htab_lru_map_delete_batch,
};
/* Called from eBPF program */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ca60eafa6922..e83bdf7efbd8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2816,6 +2816,62 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(map, attr, uattr); \
+ } while(0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ struct bpf_map *map;
+ int err, ufd;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ ufd = attr->batch.map_fd;
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd != BPF_MAP_LOOKUP_BATCH &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH) {
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
+ } else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) {
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ } else if (cmd == BPF_MAP_UPDATE_BATCH) {
+ BPF_DO_BATCH(map->ops->map_update_batch);
+ } else {
+ BPF_DO_BATCH(map->ops->map_delete_batch);
+ }
+
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -2913,6 +2969,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
+ case BPF_MAP_UPDATE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_UPDATE_BATCH);
+ break;
+ case BPF_MAP_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
--
2.17.1
^ permalink raw reply related
* [RFC PATCH bpf-next v2 2/2] tools/bpf: test bpf_map_lookup_and_delete_batch()
From: Yonghong Song @ 2019-09-06 22:54 UTC (permalink / raw)
To: ast, daniel, netdev, bpf
Cc: kernel-team, Jakub Kicinski, Brian Vazquez, Stanislav Fomichev
In-Reply-To: <20190906225434.3635421-1-yhs@fb.com>
Added four libbpf API functions to support map batch operations:
. int bpf_map_delete_batch( ... )
. int bpf_map_lookup_batch( ... )
. int bpf_map_lookup_and_delete_batch( ... )
. int bpf_map_update_batch( ... )
Tested bpf_map_lookup_and_delete_batch() and bpf_map_update_batch()
functionality.
$ ./test_maps
...
test_map_lookup_and_delete_batch:PASS
...
Note that I clumped the uapi header sync patch, libbpf patch
and tests patch together considering this is an RFC patch.
Will do proper formatting once it is out of RFC stage.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/include/uapi/linux/bpf.h | 22 +++
tools/lib/bpf/bpf.c | 59 +++++++
tools/lib/bpf/bpf.h | 13 ++
tools/lib/bpf/libbpf.map | 4 +
.../map_tests/map_lookup_and_delete_batch.c | 155 ++++++++++++++++++
5 files changed, 253 insertions(+)
create mode 100644 tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5d2fb183ee2d..9d4f76073dd9 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -107,6 +107,10 @@ enum bpf_cmd {
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
+ BPF_MAP_LOOKUP_BATCH,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+ BPF_MAP_UPDATE_BATCH,
+ BPF_MAP_DELETE_BATCH,
};
enum bpf_map_type {
@@ -396,6 +400,24 @@ union bpf_attr {
__u64 flags;
};
+ struct { /* struct used by BPF_MAP_*_BATCH commands */
+ __u64 batch; /* input/output:
+ * input: start batch,
+ * 0 to start from beginning.
+ * output: next start batch,
+ * 0 to end batching.
+ */
+ __aligned_u64 keys;
+ __aligned_u64 values;
+ __u32 count; /* input/output:
+ * input: # of elements keys/values.
+ * output: # of filled elements.
+ */
+ __u32 map_fd;
+ __u64 elem_flags;
+ __u64 flags;
+ } batch;
+
struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type; /* one of enum bpf_prog_type */
__u32 insn_cnt;
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index cbb933532981..367bdcb3c62b 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -438,6 +438,65 @@ int bpf_map_freeze(int fd)
return sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr));
}
+static int bpf_map_batch_common(int cmd, int fd, __u64 *batch,
+ void *keys, void *values,
+ __u32 *count, __u64 elem_flags,
+ __u64 flags)
+{
+ union bpf_attr attr = {};
+ int ret;
+
+ attr.batch.map_fd = fd;
+ if (batch)
+ attr.batch.batch = *batch;
+ attr.batch.keys = ptr_to_u64(keys);
+ attr.batch.values = ptr_to_u64(values);
+ if (count)
+ attr.batch.count = *count;
+ attr.batch.elem_flags = elem_flags;
+ attr.batch.flags = flags;
+
+ ret = sys_bpf(cmd, &attr, sizeof(attr));
+ if (batch)
+ *batch = attr.batch.batch;
+ if (count)
+ *count = attr.batch.count;
+
+ return ret;
+}
+
+int bpf_map_delete_batch(int fd, __u64 *batch, __u32 *count, __u64 elem_flags,
+ __u64 flags)
+{
+ return bpf_map_batch_common(BPF_MAP_DELETE_BATCH, fd, batch,
+ NULL, NULL, count, elem_flags, flags);
+}
+
+int bpf_map_lookup_batch(int fd, __u64 *batch, void *keys, void *values,
+ __u32 *count, __u64 elem_flags, __u64 flags)
+{
+ return bpf_map_batch_common(BPF_MAP_LOOKUP_BATCH, fd, batch,
+ keys, values, count, elem_flags, flags);
+}
+
+int bpf_map_lookup_and_delete_batch(int fd, __u64 *batch,
+ void *keys, void *values,
+ __u32 *count, __u64 elem_flags,
+ __u64 flags)
+{
+ return bpf_map_batch_common(BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+ fd, batch, keys, values,
+ count, elem_flags, flags);
+}
+
+int bpf_map_update_batch(int fd, void *keys, void *values, __u32 *count,
+ __u64 elem_flags, __u64 flags)
+{
+ return bpf_map_batch_common(BPF_MAP_UPDATE_BATCH,
+ fd, NULL, keys, values,
+ count, elem_flags, flags);
+}
+
int bpf_obj_pin(int fd, const char *pathname)
{
union bpf_attr attr;
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 0db01334740f..37211840f345 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -120,6 +120,19 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key,
LIBBPF_API int bpf_map_delete_elem(int fd, const void *key);
LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key);
LIBBPF_API int bpf_map_freeze(int fd);
+LIBBPF_API int bpf_map_delete_batch(int fd, __u64 *batch, __u32 *count,
+ __u64 elem_flags, __u64 flags);
+LIBBPF_API int bpf_map_lookup_batch(int fd, __u64 *batch, void *keys,
+ void *values, __u32 *count,
+ __u64 elem_flags, __u64 flags);
+LIBBPF_API int bpf_map_lookup_and_delete_batch(int fd, __u64 *batch,
+ void *keys, void *values,
+ __u32 *count, __u64 elem_flags,
+ __u64 flags);
+LIBBPF_API int bpf_map_update_batch(int fd, void *keys, void *values,
+ __u32 *count, __u64 elem_flags,
+ __u64 flags);
+
LIBBPF_API int bpf_obj_pin(int fd, const char *pathname);
LIBBPF_API int bpf_obj_get(const char *pathname);
LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index d04c7cb623ed..739bd9f76e50 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -189,4 +189,8 @@ LIBBPF_0.0.4 {
LIBBPF_0.0.5 {
global:
bpf_btf_get_next_id;
+ bpf_map_delete_batch;
+ bpf_map_lookup_and_delete_batch;
+ bpf_map_lookup_batch;
+ bpf_map_update_batch;
} LIBBPF_0.0.4;
diff --git a/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
new file mode 100644
index 000000000000..dd906b1de595
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/map_lookup_and_delete_batch.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+ int *values)
+{
+ int i, err;
+
+ for (i = 0; i < max_entries; i++) {
+ keys[i] = i + 1;
+ values[i] = i + 2;
+ }
+
+ err = bpf_map_update_batch(map_fd, keys, values, &max_entries, 0, 0);
+ CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+static void map_batch_verify(int *visited, __u32 max_entries,
+ int *keys, int *values)
+{
+ int i;
+
+ memset(visited, 0, max_entries * sizeof(*visited));
+ for (i = 0; i < max_entries; i++) {
+ CHECK(keys[i] + 1 != values[i], "key/value checking",
+ "error: i %d key %d value %d\n", i, keys[i], values[i]);
+ visited[i] = 1;
+ }
+ for (i = 0; i < max_entries; i++) {
+ CHECK(visited[i] != 1, "visited checking",
+ "error: keys array at index %d missing\n", i);
+ }
+}
+
+void test_map_lookup_and_delete_batch(void)
+{
+ struct bpf_create_map_attr xattr = {
+ .name = "hash_map",
+ .map_type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ };
+ int map_fd, *keys, *values, *visited, key;
+ __u32 count, total, total_success;
+ const __u32 max_entries = 10;
+ int err, i, step;
+ bool nospace_err;
+ __u64 batch = 0;
+
+ xattr.max_entries = max_entries;
+ map_fd = bpf_create_map_xattr(&xattr);
+ CHECK(map_fd == -1,
+ "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+ keys = malloc(max_entries * sizeof(int));
+ values = malloc(max_entries * sizeof(int));
+ visited = malloc(max_entries * sizeof(int));
+ CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", strerror(errno));
+
+ /* test 1: lookup/delete an empty hash table, success */
+ count = max_entries;
+ err = bpf_map_lookup_and_delete_batch(map_fd, &batch, keys, values,
+ &count, 0, 0);
+ CHECK(err, "empty map", "error: %s\n", strerror(errno));
+ CHECK(batch || count, "empty map", "batch = %lld, count = %u\n", batch, count);
+
+ /* populate elements to the map */
+ map_batch_update(map_fd, max_entries, keys, values);
+
+ /* test 2: lookup/delete with count = 0, success */
+ batch = 0;
+ count = 0;
+ err = bpf_map_lookup_and_delete_batch(map_fd, &batch, keys, values,
+ &count, 0, 0);
+ CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+ /* test 3: lookup/delete with count = max_entries, success */
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * sizeof(*values));
+ count = max_entries;
+ batch = 0;
+ err = bpf_map_lookup_and_delete_batch(map_fd, &batch, keys,
+ values, &count, 0, 0);
+ CHECK(err, "count = max_entries", "error: %s\n", strerror(errno));
+ CHECK(count != max_entries || batch != 0, "count = max_entries",
+ "count = %u, max_entries = %u, batch = %lld\n",
+ count, max_entries, batch);
+ map_batch_verify(visited, max_entries, keys, values);
+
+ /* bpf_map_get_next_key() should return -ENOENT for an empty map. */
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+ /* test 4: lookup/delete in a loop with various steps. */
+ total_success = 0;
+ for (step = 1; step < max_entries; step++) {
+ map_batch_update(map_fd, max_entries, keys, values);
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * sizeof(*values));
+ batch = 0;
+ total = 0;
+ i = 0;
+ /* iteratively lookup/delete elements with 'step' elements each */
+ count = step;
+ nospace_err = false;
+ while (true) {
+ err = bpf_map_lookup_and_delete_batch(map_fd, &batch,
+ keys + total,
+ values + total,
+ &count, 0, 0);
+ /* It is possible that we are failing due to buffer size
+ * not big enough. In such cases, let us just exit and
+ * go with large steps. Note that a buffer size with
+ * max_entries should always work.
+ */
+ if (err && errno == ENOSPC) {
+ nospace_err = true;
+ break;
+ }
+
+ CHECK(err, "lookup/delete with steps", "error: %s\n",
+ strerror(errno));
+
+ total += count;
+ if (batch == 0)
+ break;
+
+ i++;
+ }
+
+ if (nospace_err == true)
+ continue;
+
+ CHECK(total != max_entries, "lookup/delete with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+
+ map_batch_verify(visited, max_entries, keys, values);
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+ total_success++;
+ }
+
+ CHECK(total_success == 0, "check total_success", "unexpected failure\n");
+
+ printf("%s:PASS\n", __func__);
+}
--
2.17.1
^ permalink raw reply related
* Re: [PATCH bpf-next v2 1/6] selftests/bpf: test_progs: add test__join_cgroup helper
From: Stanislav Fomichev @ 2019-09-06 22:51 UTC (permalink / raw)
To: Andrii Nakryiko
Cc: Stanislav Fomichev, Networking, bpf, David S. Miller,
Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <CAEf4Bzb=0gJv148r+RARMOYHikvvrzXJ-o5jQ7F_WtSzhRF38w@mail.gmail.com>
On 09/06, Andrii Nakryiko wrote:
> On Thu, Sep 5, 2019 at 7:40 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > test__join_cgroup() combines the following operations that usually
> > go hand in hand and returns cgroup fd:
> >
> > * setup cgroup environment (make sure cgroupfs is mounted)
> > * mkdir cgroup
> > * join cgroup
> >
> > It also marks a test as a "cgroup cleanup needed" and removes cgroup
> > state after the test is done.
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
>
> First of all, thanks a lot for all these improvements to test_progs
> and converting existing tests to test_progs tests, it's great to see
> this consolidation!
>
> [...]
>
> > @@ -17,6 +18,7 @@ struct prog_test_def {
> > int error_cnt;
> > int skip_cnt;
> > bool tested;
> > + bool need_cgroup_cleanup;
> >
> > const char *subtest_name;
> > int subtest_num;
> > @@ -122,6 +124,39 @@ void test__fail(void)
> > env.test->error_cnt++;
> > }
> >
> > +int test__join_cgroup(const char *path)
>
> This doesn't seem to be testing-specific functionality, tbh. It's
> certainly useful helper, but I don't think it warrants test__ prefix.
I didn't like the mess we used to have:
if (setup_cgroup_environment())
goto cleanup_obj;
cgroup_fd = create_and_get_cgroup(CG_PATH);
if (cgroup_fd < 0)
goto cleanup_cgroup_env;
if (join_cgroup(CG_PATH))
goto cleanup_cgroup;
... do the test
cleanup_cgroup_environment();
All I really want to do in several tests is to create a temporary cgroup
and join it (I don't even really care about the name most of the time).
We can rename and move this test__join_cgroup into cgroup_helpers.h if
you prefer, I don't really mind. I just want to avoid repeating those
10 lines over and over in each test that just wants to run in a cgroup.
> As for test->need_cgroup_cleanup field, this approach won't scale if
> we need other types of custom/optional clean up after test ends.
> Generic test framework code will need to know about every possible
> custom setup to be able to cleanup/undo it.
>
> I wonder if generalizing it to be able to add custom clean up code
> (some test frameworks have "teardown" overrides for this) would be
> cleaner and more maintainable solution.
>
> Something like:
>
> typedef void (* test_teardown_fn)(struct test *test, void *ctx);
>
> /* somewhere at the beginning of test: */
> test__schedule_teardown(test_teardown_fn cb, void *ctx);
>
> [...]
>
> > +
> > + if (test->need_cgroup_cleanup)
> > + cleanup_cgroup_environment();
>
> Then in generic framework we'll just process a list of callbacks and
> call each one with stored ctx per each callback (in case we need some
> custom data to be stored, of course).
>
> Thoughts?
Idk, I don't see the need to be too generic since we control both the
tests and the framework. So putting something like test__join_cgroup
and doing automatic cleanup looks fine to me if this is shared between
several tests. If, at some point, it becomes unmanageable, we can
think about refactoring; but until then, I'd not bother tbh.
^ permalink raw reply
* Re: [PATCH bpf-next v2 1/6] selftests/bpf: test_progs: add test__join_cgroup helper
From: Andrii Nakryiko @ 2019-09-06 22:29 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: Networking, bpf, David S. Miller, Alexei Starovoitov,
Daniel Borkmann
In-Reply-To: <20190905152709.111193-2-sdf@google.com>
On Thu, Sep 5, 2019 at 7:40 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> test__join_cgroup() combines the following operations that usually
> go hand in hand and returns cgroup fd:
>
> * setup cgroup environment (make sure cgroupfs is mounted)
> * mkdir cgroup
> * join cgroup
>
> It also marks a test as a "cgroup cleanup needed" and removes cgroup
> state after the test is done.
>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
First of all, thanks a lot for all these improvements to test_progs
and converting existing tests to test_progs tests, it's great to see
this consolidation!
[...]
> @@ -17,6 +18,7 @@ struct prog_test_def {
> int error_cnt;
> int skip_cnt;
> bool tested;
> + bool need_cgroup_cleanup;
>
> const char *subtest_name;
> int subtest_num;
> @@ -122,6 +124,39 @@ void test__fail(void)
> env.test->error_cnt++;
> }
>
> +int test__join_cgroup(const char *path)
This doesn't seem to be testing-specific functionality, tbh. It's
certainly a useful helper, but I don't think it warrants the test__ prefix.
As for test->need_cgroup_cleanup field, this approach won't scale if
we need other types of custom/optional clean up after test ends.
Generic test framework code will need to know about every possible
custom setup to be able to cleanup/undo it.
I wonder if generalizing it to be able to add custom clean up code
(some test frameworks have "teardown" overrides for this) would be
a cleaner and more maintainable solution.
Something like:
typedef void (* test_teardown_fn)(struct test *test, void *ctx);
/* somewhere at the beginning of test: */
test__schedule_teardown(test_teardown_fn cb, void *ctx);
[...]
> +
> + if (test->need_cgroup_cleanup)
> + cleanup_cgroup_environment();
Then in generic framework we'll just process a list of callbacks and
call each one with stored ctx per each callback (in case we need some
custom data to be stored, of course).
Thoughts?
[...]
^ permalink raw reply
* pull-request: bpf 2019-09-06
From: Alexei Starovoitov @ 2019-09-06 22:20 UTC (permalink / raw)
To: davem; +Cc: daniel, netdev, bpf, kernel-team
Hi David,
The following pull-request contains BPF updates for your *net* tree.
The main changes are:
1) verifier precision tracking fix, from Alexei.
Please consider pulling these changes from:
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
Thanks a lot!
----------------------------------------------------------------
The following changes since commit 44580a0118d3ede95fec4dce32df5f75f73cd663:
net: sock_map, fix missing ulp check in sock hash case (2019-09-05 11:56:19 +0200)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
for you to fetch changes up to 2339cd6cd0b5401fa3fe886bf1c0cb8822041957:
bpf: fix precision tracking of stack slots (2019-09-05 14:06:58 +0200)
----------------------------------------------------------------
Alexei Starovoitov (1):
bpf: fix precision tracking of stack slots
kernel/bpf/verifier.c | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
^ permalink raw reply
* Re: [PATCH net-next 3/3] net: dsa: microchip: remove NET_DSA_TAG_KSZ_COMMON
From: Marek Vasut @ 2019-09-06 21:42 UTC (permalink / raw)
To: George McCollister, netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, linux-kernel
In-Reply-To: <20190906213054.48908-4-george.mccollister@gmail.com>
On 9/6/19 11:30 PM, George McCollister wrote:
> Remove the superfluous NET_DSA_TAG_KSZ_COMMON and just use the existing
> NET_DSA_TAG_KSZ. Update the description to mention the three switch
> families it supports. No functional change.
>
> Signed-off-by: George McCollister <george.mccollister@gmail.com>
Reviewed-by: Marek Vasut <marex@denx.de>
--
Best regards,
Marek Vasut
^ permalink raw reply
* Re: [PATCH net-next 1/3] net: dsa: microchip: add KSZ9477 I2C driver
From: Marek Vasut @ 2019-09-06 21:39 UTC (permalink / raw)
To: George McCollister, netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, linux-kernel
In-Reply-To: <20190906213054.48908-2-george.mccollister@gmail.com>
On 9/6/19 11:30 PM, George McCollister wrote:
[...]
> --- /dev/null
> +++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Microchip KSZ9477 series register access through I2C
> + *
> + * Copyright (C) 2018-2019 Microchip Technology Inc.
Doesn't the copyright need an update?
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/regmap.h>
> +#include <linux/i2c.h>
Please keep the headers sorted.
> +#include "ksz_common.h"
> +
> +KSZ_REGMAP_TABLE(ksz9477, not_used, 16, 0, 0);
> +
The rest looks good.
[...]
--
Best regards,
Marek Vasut
^ permalink raw reply
* Re: [PATCH net-next 2/3] net: dsa: microchip: add ksz9567 to ksz9477 driver
From: Marek Vasut @ 2019-09-06 21:41 UTC (permalink / raw)
To: George McCollister, netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, linux-kernel
In-Reply-To: <20190906213054.48908-3-george.mccollister@gmail.com>
On 9/6/19 11:30 PM, George McCollister wrote:
> Add support for the KSZ9567 7-Port Gigabit Ethernet Switch to the
> ksz9477 driver. The KSZ9567 supports both SPI and I2C. Oddly the
> ksz9567 is already in the device tree binding documentation.
>
> Signed-off-by: George McCollister <george.mccollister@gmail.com>
> ---
> drivers/net/dsa/microchip/ksz9477.c | 9 +++++++++
> drivers/net/dsa/microchip/ksz9477_i2c.c | 1 +
> drivers/net/dsa/microchip/ksz9477_spi.c | 1 +
> 3 files changed, 11 insertions(+)
>
> diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
> index 187be42de5f1..50ffc63d6231 100644
> --- a/drivers/net/dsa/microchip/ksz9477.c
> +++ b/drivers/net/dsa/microchip/ksz9477.c
> @@ -1529,6 +1529,15 @@ static const struct ksz_chip_data ksz9477_switch_chips[] = {
> .cpu_ports = 0x07, /* can be configured as cpu port */
> .port_cnt = 3, /* total port count */
> },
> + {
> + .chip_id = 0x00956700,
> + .dev_name = "KSZ9567",
> + .num_vlans = 4096,
> + .num_alus = 4096,
> + .num_statics = 16,
> + .cpu_ports = 0x7F, /* can be configured as cpu port */
> + .port_cnt = 7, /* total physical port count */
I might be wrong, and this is just an idea for future improvement, but
is .cpu_ports = GENMASK(.port_cnt - 1, 0) always?
> + },
> };
>
> static int ksz9477_switch_init(struct ksz_device *dev)
> diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c
> index 85fd0fb43941..c1548a43b60d 100644
> --- a/drivers/net/dsa/microchip/ksz9477_i2c.c
> +++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
> @@ -77,6 +77,7 @@ MODULE_DEVICE_TABLE(i2c, ksz9477_i2c_id);
> static const struct of_device_id ksz9477_dt_ids[] = {
> { .compatible = "microchip,ksz9477" },
> { .compatible = "microchip,ksz9897" },
> + { .compatible = "microchip,ksz9567" },
> {},
> };
> MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
> diff --git a/drivers/net/dsa/microchip/ksz9477_spi.c b/drivers/net/dsa/microchip/ksz9477_spi.c
> index 2e402e4d866f..f4198d6f72be 100644
> --- a/drivers/net/dsa/microchip/ksz9477_spi.c
> +++ b/drivers/net/dsa/microchip/ksz9477_spi.c
> @@ -81,6 +81,7 @@ static const struct of_device_id ksz9477_dt_ids[] = {
> { .compatible = "microchip,ksz9893" },
> { .compatible = "microchip,ksz9563" },
> { .compatible = "microchip,ksz8563" },
> + { .compatible = "microchip,ksz9567" },
> {},
> };
> MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
>
Reviewed-by: Marek Vasut <marex@denx.de>
--
Best regards,
Marek Vasut
^ permalink raw reply
* [PATCH net-next 0/3] add ksz9567 with I2C support to ksz9477 driver
From: George McCollister @ 2019-09-06 21:30 UTC (permalink / raw)
To: netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, Marek Vasut, linux-kernel, George McCollister
Resurrect KSZ9477 I2C driver support patch originally sent to the list
by Tristram Ha and resolve outstanding issues. It now works as similarly to
the ksz9477 SPI driver as possible, using the same regmap macros.
Add support for ksz9567 to the ksz9477 driver (tested on a board with
ksz9567 connected via I2C).
Remove NET_DSA_TAG_KSZ_COMMON since it's not needed.
George McCollister (2):
net: dsa: microchip: add ksz9567 to ksz9477 driver
net: dsa: microchip: remove NET_DSA_TAG_KSZ_COMMON
Tristram Ha (1):
net: dsa: microchip: add KSZ9477 I2C driver
drivers/net/dsa/microchip/Kconfig | 7 +++
drivers/net/dsa/microchip/Makefile | 1 +
drivers/net/dsa/microchip/ksz9477.c | 9 +++
drivers/net/dsa/microchip/ksz9477_i2c.c | 101 ++++++++++++++++++++++++++++++++
drivers/net/dsa/microchip/ksz9477_spi.c | 1 +
drivers/net/dsa/microchip/ksz_common.h | 2 +
net/dsa/Kconfig | 9 +--
net/dsa/Makefile | 2 +-
8 files changed, 124 insertions(+), 8 deletions(-)
create mode 100644 drivers/net/dsa/microchip/ksz9477_i2c.c
--
2.11.0
^ permalink raw reply
* [PATCH net-next 3/3] net: dsa: microchip: remove NET_DSA_TAG_KSZ_COMMON
From: George McCollister @ 2019-09-06 21:30 UTC (permalink / raw)
To: netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, Marek Vasut, linux-kernel, George McCollister
In-Reply-To: <20190906213054.48908-1-george.mccollister@gmail.com>
Remove the superfluous NET_DSA_TAG_KSZ_COMMON and just use the existing
NET_DSA_TAG_KSZ. Update the description to mention the three switch
families it supports. No functional change.
Signed-off-by: George McCollister <george.mccollister@gmail.com>
---
net/dsa/Kconfig | 9 ++-------
net/dsa/Makefile | 2 +-
2 files changed, 3 insertions(+), 8 deletions(-)
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 2f69d4b53d46..29e2bd5cc5af 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -73,16 +73,11 @@ config NET_DSA_TAG_MTK
Say Y or M if you want to enable support for tagging frames for
Mediatek switches.
-config NET_DSA_TAG_KSZ_COMMON
- tristate
- default n
-
config NET_DSA_TAG_KSZ
- tristate "Tag driver for Microchip 9893 family of switches"
- select NET_DSA_TAG_KSZ_COMMON
+ tristate "Tag driver for Microchip 8795/9477/9893 families of switches"
help
Say Y if you want to enable support for tagging frames for the
- Microchip 9893 family of switches.
+ Microchip 8795/9477/9893 families of switches.
config NET_DSA_TAG_QCA
tristate "Tag driver for Qualcomm Atheros QCA8K switches"
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index c342f54715ba..2c6d286f0511 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o
obj-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
obj-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
-obj-$(CONFIG_NET_DSA_TAG_KSZ_COMMON) += tag_ksz.o
+obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
--
2.11.0
^ permalink raw reply related
* [PATCH net-next 1/3] net: dsa: microchip: add KSZ9477 I2C driver
From: George McCollister @ 2019-09-06 21:30 UTC (permalink / raw)
To: netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, Marek Vasut, linux-kernel, George McCollister
In-Reply-To: <20190906213054.48908-1-george.mccollister@gmail.com>
From: Tristram Ha <Tristram.Ha@microchip.com>
Add KSZ9477 I2C driver support. The code ksz9477.c and ksz_common.c are
used together to generate the I2C driver.
Signed-off-by: Tristram Ha <Tristram.Ha@microchip.com>
[george.mccollister@gmail.com: bring up to date, use ksz_common regmap macros]
Signed-off-by: George McCollister <george.mccollister@gmail.com>
---
drivers/net/dsa/microchip/Kconfig | 7 +++
drivers/net/dsa/microchip/Makefile | 1 +
drivers/net/dsa/microchip/ksz9477_i2c.c | 100 ++++++++++++++++++++++++++++++++
drivers/net/dsa/microchip/ksz_common.h | 2 +
4 files changed, 110 insertions(+)
create mode 100644 drivers/net/dsa/microchip/ksz9477_i2c.c
diff --git a/drivers/net/dsa/microchip/Kconfig b/drivers/net/dsa/microchip/Kconfig
index e1c23d1e91e6..1d7870c6df3c 100644
--- a/drivers/net/dsa/microchip/Kconfig
+++ b/drivers/net/dsa/microchip/Kconfig
@@ -9,6 +9,13 @@ menuconfig NET_DSA_MICROCHIP_KSZ9477
help
This driver adds support for Microchip KSZ9477 switch chips.
+config NET_DSA_MICROCHIP_KSZ9477_I2C
+ tristate "KSZ9477 series I2C connected switch driver"
+ depends on NET_DSA_MICROCHIP_KSZ9477 && I2C
+ select REGMAP_I2C
+ help
+ Select to enable support for registering switches configured through I2C.
+
config NET_DSA_MICROCHIP_KSZ9477_SPI
tristate "KSZ9477 series SPI connected switch driver"
depends on NET_DSA_MICROCHIP_KSZ9477 && SPI
diff --git a/drivers/net/dsa/microchip/Makefile b/drivers/net/dsa/microchip/Makefile
index e3d799b95d7d..929caa81e782 100644
--- a/drivers/net/dsa/microchip/Makefile
+++ b/drivers/net/dsa/microchip/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON) += ksz_common.o
obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ9477) += ksz9477.o
+obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C) += ksz9477_i2c.o
obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI) += ksz9477_spi.o
obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ8795) += ksz8795.o
obj-$(CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI) += ksz8795_spi.o
diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c
new file mode 100644
index 000000000000..85fd0fb43941
--- /dev/null
+++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Microchip KSZ9477 series register access through I2C
+ *
+ * Copyright (C) 2018-2019 Microchip Technology Inc.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/i2c.h>
+
+#include "ksz_common.h"
+
+KSZ_REGMAP_TABLE(ksz9477, not_used, 16, 0, 0);
+
+static int ksz9477_i2c_probe(struct i2c_client *i2c,
+ const struct i2c_device_id *i2c_id)
+{
+ struct ksz_device *dev;
+ int i, ret;
+
+ dev = ksz_switch_alloc(&i2c->dev, i2c);
+ if (!dev)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(ksz9477_regmap_config); i++) {
+ dev->regmap[i] = devm_regmap_init_i2c(i2c,
+ &ksz9477_regmap_config[i]);
+ if (IS_ERR(dev->regmap[i])) {
+ ret = PTR_ERR(dev->regmap[i]);
+ dev_err(&i2c->dev,
+ "Failed to initialize regmap%i: %d\n",
+ ksz9477_regmap_config[i].val_bits, ret);
+ return ret;
+ }
+ }
+
+ if (i2c->dev.platform_data)
+ dev->pdata = i2c->dev.platform_data;
+
+ ret = ksz9477_switch_register(dev);
+
+ /* Main DSA driver may not be started yet. */
+ if (ret)
+ return ret;
+
+ i2c_set_clientdata(i2c, dev);
+
+ return 0;
+}
+
+static int ksz9477_i2c_remove(struct i2c_client *i2c)
+{
+ struct ksz_device *dev = i2c_get_clientdata(i2c);
+
+ ksz_switch_remove(dev);
+
+ return 0;
+}
+
+static void ksz9477_i2c_shutdown(struct i2c_client *i2c)
+{
+ struct ksz_device *dev = i2c_get_clientdata(i2c);
+
+ if (dev && dev->dev_ops->shutdown)
+ dev->dev_ops->shutdown(dev);
+}
+
+static const struct i2c_device_id ksz9477_i2c_id[] = {
+ { "ksz9477-switch", 0 },
+ {},
+};
+
+MODULE_DEVICE_TABLE(i2c, ksz9477_i2c_id);
+
+static const struct of_device_id ksz9477_dt_ids[] = {
+ { .compatible = "microchip,ksz9477" },
+ { .compatible = "microchip,ksz9897" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
+
+static struct i2c_driver ksz9477_i2c_driver = {
+ .driver = {
+ .name = "ksz9477-switch",
+ .owner = THIS_MODULE,
+ .of_match_table = of_match_ptr(ksz9477_dt_ids),
+ },
+ .probe = ksz9477_i2c_probe,
+ .remove = ksz9477_i2c_remove,
+ .shutdown = ksz9477_i2c_shutdown,
+ .id_table = ksz9477_i2c_id,
+};
+
+module_i2c_driver(ksz9477_i2c_driver);
+
+MODULE_AUTHOR("Tristram Ha <Tristram.Ha@microchip.com>");
+MODULE_DESCRIPTION("Microchip KSZ9477 Series Switch I2C access Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index 13d027baaa8b..a24d8e61fbe7 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -294,6 +294,8 @@ static inline void ksz_pwrite32(struct ksz_device *dev, int port, int offset,
#define KSZ_SPI_OP_RD 3
#define KSZ_SPI_OP_WR 2
+#define swabnot_used(x) 0
+
#define KSZ_SPI_OP_FLAG_MASK(opcode, swp, regbits, regpad) \
swab##swp((opcode) << ((regbits) + (regpad)))
--
2.11.0
^ permalink raw reply related
* [PATCH net-next 2/3] net: dsa: microchip: add ksz9567 to ksz9477 driver
From: George McCollister @ 2019-09-06 21:30 UTC (permalink / raw)
To: netdev
Cc: Woojung Huh, Andrew Lunn, Florian Fainelli, Tristram Ha,
David S. Miller, Marek Vasut, linux-kernel, George McCollister
In-Reply-To: <20190906213054.48908-1-george.mccollister@gmail.com>
Add support for the KSZ9567 7-Port Gigabit Ethernet Switch to the
ksz9477 driver. The KSZ9567 supports both SPI and I2C. Oddly the
ksz9567 is already in the device tree binding documentation.
Signed-off-by: George McCollister <george.mccollister@gmail.com>
---
drivers/net/dsa/microchip/ksz9477.c | 9 +++++++++
drivers/net/dsa/microchip/ksz9477_i2c.c | 1 +
drivers/net/dsa/microchip/ksz9477_spi.c | 1 +
3 files changed, 11 insertions(+)
diff --git a/drivers/net/dsa/microchip/ksz9477.c b/drivers/net/dsa/microchip/ksz9477.c
index 187be42de5f1..50ffc63d6231 100644
--- a/drivers/net/dsa/microchip/ksz9477.c
+++ b/drivers/net/dsa/microchip/ksz9477.c
@@ -1529,6 +1529,15 @@ static const struct ksz_chip_data ksz9477_switch_chips[] = {
.cpu_ports = 0x07, /* can be configured as cpu port */
.port_cnt = 3, /* total port count */
},
+ {
+ .chip_id = 0x00956700,
+ .dev_name = "KSZ9567",
+ .num_vlans = 4096,
+ .num_alus = 4096,
+ .num_statics = 16,
+ .cpu_ports = 0x7F, /* can be configured as cpu port */
+ .port_cnt = 7, /* total physical port count */
+ },
};
static int ksz9477_switch_init(struct ksz_device *dev)
diff --git a/drivers/net/dsa/microchip/ksz9477_i2c.c b/drivers/net/dsa/microchip/ksz9477_i2c.c
index 85fd0fb43941..c1548a43b60d 100644
--- a/drivers/net/dsa/microchip/ksz9477_i2c.c
+++ b/drivers/net/dsa/microchip/ksz9477_i2c.c
@@ -77,6 +77,7 @@ MODULE_DEVICE_TABLE(i2c, ksz9477_i2c_id);
static const struct of_device_id ksz9477_dt_ids[] = {
{ .compatible = "microchip,ksz9477" },
{ .compatible = "microchip,ksz9897" },
+ { .compatible = "microchip,ksz9567" },
{},
};
MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
diff --git a/drivers/net/dsa/microchip/ksz9477_spi.c b/drivers/net/dsa/microchip/ksz9477_spi.c
index 2e402e4d866f..f4198d6f72be 100644
--- a/drivers/net/dsa/microchip/ksz9477_spi.c
+++ b/drivers/net/dsa/microchip/ksz9477_spi.c
@@ -81,6 +81,7 @@ static const struct of_device_id ksz9477_dt_ids[] = {
{ .compatible = "microchip,ksz9893" },
{ .compatible = "microchip,ksz9563" },
{ .compatible = "microchip,ksz8563" },
+ { .compatible = "microchip,ksz9567" },
{},
};
MODULE_DEVICE_TABLE(of, ksz9477_dt_ids);
--
2.11.0
^ permalink raw reply related
* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-06 21:17 UTC (permalink / raw)
To: Sergey Senozhatsky
Cc: Steven Rostedt, Petr Mladek, Sergey Senozhatsky, Michal Hocko,
Eric Dumazet, davem, netdev, linux-mm, linux-kernel
In-Reply-To: <20190906043224.GA18163@jagdpanzerIV>
On Fri, 2019-09-06 at 13:32 +0900, Sergey Senozhatsky wrote:
> On (09/05/19 12:03), Qian Cai wrote:
> > > ---
> > > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> > > index cd51aa7d08a9..89cb47882254 100644
> > > --- a/kernel/printk/printk.c
> > > +++ b/kernel/printk/printk.c
> > > @@ -2027,8 +2027,11 @@ asmlinkage int vprintk_emit(int facility, int level,
> > > pending_output = (curr_log_seq != log_next_seq);
> > > logbuf_unlock_irqrestore(flags);
> > >
> > > + if (!pending_output)
> > > + return printed_len;
> > > +
> > > /* If called from the scheduler, we can not call up(). */
> > > - if (!in_sched && pending_output) {
> > > + if (!in_sched) {
> > > /*
> > > * Disable preemption to avoid being preempted while holding
> > > * console_sem which would prevent anyone from printing to
> > > @@ -2043,10 +2046,11 @@ asmlinkage int vprintk_emit(int facility, int level,
> > > if (console_trylock_spinning())
> > > console_unlock();
> > > preempt_enable();
> > > - }
> > >
> > > - if (pending_output)
> > > + wake_up_interruptible(&log_wait);
> > > + } else {
> > > wake_up_klogd();
> > > + }
> > > return printed_len;
> > > }
> > > EXPORT_SYMBOL(vprintk_emit);
> > > ---
>
> Qian Cai, any chance you can test that patch?
So far so good, but it is hard to tell if this really nails the issue down. I'll
leave it running over the weekend and report back if it occurs again.
^ permalink raw reply
* Re: [PATCH v2 net] net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list
From: Alexander Duyck @ 2019-09-06 20:51 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Shmulik Ladkani, Daniel Borkmann, Eric Dumazet, netdev, eyal,
Shmulik Ladkani
In-Reply-To: <CAF=yD-LX-XemD8QpU-=Hn5bdX8jPP6nWS1YgpDxcrBu7sdBxRg@mail.gmail.com>
On Fri, Sep 6, 2019 at 1:15 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Fri, Sep 6, 2019 at 5:23 AM Shmulik Ladkani <shmulik@metanetworks.com> wrote:
> >
> > Historically, support for frag_list packets entering skb_segment() was
> > limited to frag_list members terminating on exact same gso_size
> > boundaries. This is verified with a BUG_ON since commit 89319d3801d1
> > ("net: Add frag_list support to skb_segment"), quote:
> >
> > As such we require all frag_list members terminate on exact MSS
> > boundaries. This is checked using BUG_ON.
> > As there should only be one producer in the kernel of such packets,
> > namely GRO, this requirement should not be difficult to maintain.
> >
> > However, since commit 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper"),
> > the "exact MSS boundaries" assumption no longer holds:
> > An eBPF program using bpf_skb_change_proto() DOES modify 'gso_size', but
> > leaves the frag_list members as originally merged by GRO with the
> > original 'gso_size'. Examples of such programs are bpf-based NAT46 or
> > NAT64.
> >
> > This leads to a kernel BUG_ON for flows involving:
> > - GRO generating a frag_list skb
> > - bpf program performing bpf_skb_change_proto() or bpf_skb_adjust_room()
> > - skb_segment() of the skb
> >
> > See example BUG_ON reports in [0].
> >
> > In commit 13acc94eff12 ("net: permit skb_segment on head_frag frag_list skb"),
> > skb_segment() was modified to support the "gso_size mangling" case of
> > a frag_list GRO'ed skb, but *only* for frag_list members having
> > head_frag==true (having a page-fragment head).
> >
> > Alas, GRO packets having frag_list members with a linear kmalloced head
> > (head_frag==false) still hit the BUG_ON.
> >
> > This commit adds support to skb_segment() for a 'head_skb' packet having
> > a frag_list whose members are *non* head_frag, with gso_size mangled, by
> > disabling SG and thus falling-back to copying the data from the given
> > 'head_skb' into the generated segmented skbs - as suggested by Willem de
> > Bruijn [1].
> >
> > Since this approach involves the penalty of skb_copy_and_csum_bits()
> > when building the segments, care was taken in order to enable this
> > solution only when required:
> > - untrusted gso_size, by testing SKB_GSO_DODGY is set
> > (SKB_GSO_DODGY is set by any gso_size mangling functions in
> > net/core/filter.c)
> > - the frag_list is non empty, its item is a non head_frag, *and* the
> > headlen of the given 'head_skb' does not match the gso_size.
> >
> > [0]
> > https://lore.kernel.org/netdev/20190826170724.25ff616f@pixies/
> > https://lore.kernel.org/netdev/9265b93f-253d-6b8c-f2b8-4b54eff1835c@fb.com/
> >
> > [1]
> > https://lore.kernel.org/netdev/CA+FuTSfVsgNDi7c=GUU8nMg2hWxF2SjCNLXetHeVPdnxAW5K-w@mail.gmail.com/
> >
> > Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
> > Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> > Cc: Daniel Borkmann <daniel@iogearbox.net>
> > Cc: Eric Dumazet <eric.dumazet@gmail.com>
> > Cc: Alexander Duyck <alexander.duyck@gmail.com>
> > Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
>
> Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
^ permalink raw reply
* ndo_xdp_xmit - on which queue to transmit the packet (if core_id >= total_xdp_queues ) ?
From: Manish Chopra @ 2019-09-06 20:49 UTC (permalink / raw)
To: netdev@vger.kernel.org
Hello,
I am working on XDP_REDIRECT implementation and got a query. Some of the ethernet drivers decide the xdp queue index on which xdp packet should be redirected based
on smp_processor_id() in their ndo_xdp_xmit() handler, if smp_processor_id() >= total_num_xdp_queues, they decide to drop the packets and return error from the handler.
I am hitting the same condition: when using 8 XDP queues, I get CPU id 8 for redirecting the XDP packet, and I am not sure whether it should be dropped or can safely be transmitted on a
queue (= smp_processor_id() % total_num_xdp_queues).
freescale/dpaa2 seems to handle this case by sending the packet on the queue (= smp_processor_id() % total_num_xdp_queues), but I am unsure what the expected behavior should be.
Regards,
Manish Chopra.
^ permalink raw reply
* Re: [PATCH 1/2] net: phy: dp83867: Add documentation for SGMII mode type
From: Vitaly Gaiduk @ 2019-09-06 20:45 UTC (permalink / raw)
To: Andrew Lunn
Cc: davem, robh+dt, f.fainelli, Mark Rutland, Trent Piepho, netdev,
devicetree, linux-kernel
In-Reply-To: <20190906192919.GA2339@lunn.ch>
Hi, Andrew.
I'm not familiar with generic PHY HW architectures, but I suppose that it is
proprietary to TI.
I've never seen such a feature, so I moved it into the TI dts field.
Vitaly.
06.09.2019 22:29, Andrew Lunn wrote:
> On Thu, Sep 05, 2019 at 07:26:00PM +0300, Vitaly Gaiduk wrote:
>> Add documentation of ti,sgmii-type which can be used to select
>> SGMII mode type (4 or 6-wire).
> Hi Vitaly
>
> Is 4 vs 6-wire a generic SGMII property? Or is it proprietary to TI?
>
> I did a quick search and i could not find any other PHYs supporting
> it.
>
> Andrew
^ permalink raw reply
* Re: [PATCH v2 net] net: gso: Fix skb_segment splat when splitting gso_size mangled skb having linear-headed frag_list
From: Willem de Bruijn @ 2019-09-06 20:15 UTC (permalink / raw)
To: Shmulik Ladkani
Cc: Alexander Duyck, Daniel Borkmann, Eric Dumazet, netdev, eyal,
Shmulik Ladkani
In-Reply-To: <20190906092350.13929-1-shmulik.ladkani@gmail.com>
On Fri, Sep 6, 2019 at 5:23 AM Shmulik Ladkani <shmulik@metanetworks.com> wrote:
>
> Historically, support for frag_list packets entering skb_segment() was
> limited to frag_list members terminating on exact same gso_size
> boundaries. This is verified with a BUG_ON since commit 89319d3801d1
> ("net: Add frag_list support to skb_segment"), quote:
>
> As such we require all frag_list members terminate on exact MSS
> boundaries. This is checked using BUG_ON.
> As there should only be one producer in the kernel of such packets,
> namely GRO, this requirement should not be difficult to maintain.
>
> However, since commit 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper"),
> the "exact MSS boundaries" assumption no longer holds:
> An eBPF program using bpf_skb_change_proto() DOES modify 'gso_size', but
> leaves the frag_list members as originally merged by GRO with the
> original 'gso_size'. Examples of such programs are bpf-based NAT46 or
> NAT64.
>
> This leads to a kernel BUG_ON for flows involving:
> - GRO generating a frag_list skb
> - bpf program performing bpf_skb_change_proto() or bpf_skb_adjust_room()
> - skb_segment() of the skb
>
> See example BUG_ON reports in [0].
>
> In commit 13acc94eff12 ("net: permit skb_segment on head_frag frag_list skb"),
> skb_segment() was modified to support the "gso_size mangling" case of
> a frag_list GRO'ed skb, but *only* for frag_list members having
> head_frag==true (having a page-fragment head).
>
> Alas, GRO packets having frag_list members with a linear kmalloced head
> (head_frag==false) still hit the BUG_ON.
>
> This commit adds support to skb_segment() for a 'head_skb' packet having
> a frag_list whose members are *non* head_frag, with gso_size mangled, by
> disabling SG and thus falling-back to copying the data from the given
> 'head_skb' into the generated segmented skbs - as suggested by Willem de
> Bruijn [1].
>
> Since this approach involves the penalty of skb_copy_and_csum_bits()
> when building the segments, care was taken in order to enable this
> solution only when required:
> - untrusted gso_size, by testing SKB_GSO_DODGY is set
> (SKB_GSO_DODGY is set by any gso_size mangling functions in
> net/core/filter.c)
> - the frag_list is non empty, its item is a non head_frag, *and* the
> headlen of the given 'head_skb' does not match the gso_size.
>
> [0]
> https://lore.kernel.org/netdev/20190826170724.25ff616f@pixies/
> https://lore.kernel.org/netdev/9265b93f-253d-6b8c-f2b8-4b54eff1835c@fb.com/
>
> [1]
> https://lore.kernel.org/netdev/CA+FuTSfVsgNDi7c=GUU8nMg2hWxF2SjCNLXetHeVPdnxAW5K-w@mail.gmail.com/
>
> Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
> Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Alexander Duyck <alexander.duyck@gmail.com>
> Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
^ permalink raw reply
* Re: [PATCH net-next 5/5] enetc: Use DT protocol information to set up the ports
From: Andrew Lunn @ 2019-09-06 20:06 UTC (permalink / raw)
To: Claudiu Manoil; +Cc: David S . Miller, alexandru.marginean, netdev
In-Reply-To: <1567779344-30965-6-git-send-email-claudiu.manoil@nxp.com>
> +static void enetc_configure_port_mac(struct enetc_hw *hw,
> + phy_interface_t phy_mode)
> {
> enetc_port_wr(hw, ENETC_PM0_MAXFRM,
> ENETC_SET_MAXFRM(ENETC_RX_MAXFRM_SIZE));
> @@ -523,9 +524,11 @@ static void enetc_configure_port_mac(struct enetc_hw *hw)
> ENETC_PM0_CMD_TXP | ENETC_PM0_PROMISC |
> ENETC_PM0_TX_EN | ENETC_PM0_RX_EN);
> /* set auto-speed for RGMII */
> - if (enetc_port_rd(hw, ENETC_PM0_IF_MODE) & ENETC_PMO_IFM_RG)
> + if (enetc_port_rd(hw, ENETC_PM0_IF_MODE) & ENETC_PMO_IFM_RG ||
> + phy_mode == PHY_INTERFACE_MODE_RGMII)
> enetc_port_wr(hw, ENETC_PM0_IF_MODE, ENETC_PM0_IFM_RGAUTO);
What about PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID
and PHY_INTERFACE_MODE_RGMII_TXID?
Andrew
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox