From: Brian Vazquez <brianvv@google.com>
To: Brian Vazquez <brianvv.kernel@gmail.com>,
Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
"David S . Miller" <davem@davemloft.net>
Cc: Yonghong Song <yhs@fb.com>, Stanislav Fomichev <sdf@google.com>,
Petar Penkov <ppenkov@google.com>,
Willem de Bruijn <willemb@google.com>,
linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
bpf@vger.kernel.org, Brian Vazquez <brianvv@google.com>
Subject: [PATCH bpf-next 2/9] bpf: add generic support for lookup and lookup_and_delete batch ops
Date: Mon, 18 Nov 2019 17:43:50 -0800 [thread overview]
Message-ID: <20191119014357.98465-3-brianvv@google.com> (raw)
In-Reply-To: <20191119014357.98465-1-brianvv@google.com>
This commit introduces generic support for the bpf_map_lookup_batch and
bpf_map_lookup_and_delete_batch ops. This implementation can be used by
almost all the bpf maps since its core implementation is relying on the
existing map_get_next_key, map_lookup_elem and map_delete_elem
functions. The bpf syscall subcommands introduced are:
BPF_MAP_LOOKUP_BATCH
BPF_MAP_LOOKUP_AND_DELETE_BATCH
The UAPI attribute is:
struct { /* struct used by BPF_MAP_*_BATCH commands */
__aligned_u64 in_batch; /* start batch,
* NULL to start from beginning
*/
__aligned_u64 out_batch; /* output: next start batch */
__aligned_u64 keys;
__aligned_u64 values;
__u32 count; /* input/output:
* input: # of key/value
* elements
* output: # of filled elements
*/
__u32 map_fd;
__u64 elem_flags;
__u64 flags;
} batch;
in_batch/out_batch are opaque values use to communicate between
user/kernel space, in_batch/out_batch must be of key_size length.
To start iterating from the beginning in_batch must be null,
count is the # of key/value elements to retrieve. Note that the 'keys'
buffer must be a buffer of key_size * count size and the 'values' buffer
must be value_size * count, where value_size must be aligned to 8 bytes
by userspace if it's dealing with percpu maps. 'count' will contain the
number of keys/values successfully retrieved. Note that 'count' is an
input/output variable and it can contain a lower value after a call.
If there's no more entries to retrieve, ENOENT will be return. If an
error is returned count should be 0, only if error is ENOENT, count might
be > 0 in case it copied some values but there were no more entries to
retrieve.
Suggested-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Brian Vazquez <brianvv@google.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/bpf.h | 11 +++
include/uapi/linux/bpf.h | 19 +++++
kernel/bpf/syscall.c | 176 +++++++++++++++++++++++++++++++++++++++
3 files changed, 206 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b81cde47314e..767a823dbac74 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -41,6 +41,11 @@ struct bpf_map_ops {
int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
void (*map_release_uref)(struct bpf_map *map);
void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+ int (*map_lookup_batch)(struct bpf_map *map, const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+ int (*map_lookup_and_delete_batch)(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
/* funcs callable from userspace and from eBPF programs */
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
@@ -797,6 +802,12 @@ void bpf_map_charge_move(struct bpf_map_memory *dst,
void *bpf_map_area_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
+int generic_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
extern int sysctl_unprivileged_bpf_disabled;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4842a134b202a..e60b7b7cda61a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -107,6 +107,8 @@ enum bpf_cmd {
BPF_MAP_LOOKUP_AND_DELETE_ELEM,
BPF_MAP_FREEZE,
BPF_BTF_GET_NEXT_ID,
+ BPF_MAP_LOOKUP_BATCH,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH,
};
enum bpf_map_type {
@@ -400,6 +402,23 @@ union bpf_attr {
__u64 flags;
};
+ struct { /* struct used by BPF_MAP_*_BATCH commands */
+ __aligned_u64 in_batch; /* start batch,
+ * NULL to start from beginning
+ */
+ __aligned_u64 out_batch; /* output: next start batch */
+ __aligned_u64 keys;
+ __aligned_u64 values;
+ __u32 count; /* input/output:
+ * input: # of key/value
+ * elements
+ * output: # of filled elements
+ */
+ __u32 map_fd;
+ __u64 elem_flags;
+ __u64 flags;
+ } batch;
+
struct { /* anonymous struct used by BPF_PROG_LOAD command */
__u32 prog_type; /* one of enum bpf_prog_type */
__u32 insn_cnt;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cc714c9d5b4cc..d0d3d0e0eaca4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1127,6 +1127,124 @@ static int map_get_next_key(union bpf_attr *attr)
return err;
}
+static int __generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ bool do_delete)
+{
+ void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+ void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
+ void __user *values = u64_to_user_ptr(attr->batch.values);
+ void __user *keys = u64_to_user_ptr(attr->batch.keys);
+ void *buf, *prev_key, *key, *value;
+ u32 value_size, cp, max_count;
+ bool first_key = false;
+ int err, retry = 3;
+
+ if (attr->batch.elem_flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
+ if ((attr->batch.elem_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_QUEUE ||
+ map->map_type == BPF_MAP_TYPE_STACK) {
+ err = -ENOTSUPP;
+ goto err_put;
+ }
+
+ value_size = bpf_map_value_size(map);
+
+ max_count = attr->batch.count;
+ if (!max_count)
+ return 0;
+
+ err = -ENOMEM;
+ buf = kmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN);
+ if (!buf)
+ goto err_put;
+
+ err = -EFAULT;
+ first_key = false;
+ if (ubatch && copy_from_user(buf, ubatch, map->key_size))
+ goto free_buf;
+ key = buf;
+ value = key + map->key_size;
+ if (!ubatch) {
+ prev_key = NULL;
+ first_key = true;
+ }
+
+
+ for (cp = 0; cp < max_count; cp++) {
+ if (cp || first_key) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ if (err)
+ break;
+ }
+ err = bpf_map_copy_value(map, key, value,
+ attr->batch.elem_flags, do_delete);
+
+ if (err == -ENOENT) {
+ if (retry) {
+ retry--;
+ continue;
+ }
+ err = -EINTR;
+ break;
+ }
+
+ if (err)
+ goto free_buf;
+
+ if (copy_to_user(keys + cp * map->key_size, key,
+ map->key_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+ if (copy_to_user(values + cp * value_size, value, value_size)) {
+ err = -EFAULT;
+ goto free_buf;
+ }
+
+ prev_key = key;
+ retry = 3;
+ }
+ if (!err) {
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, prev_key, key);
+ rcu_read_unlock();
+ }
+
+ if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) ||
+ (copy_to_user(uobatch, key, map->key_size))))
+ err = -EFAULT;
+
+free_buf:
+ kfree(buf);
+err_put:
+ return err;
+}
+
+int generic_map_lookup_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __generic_map_lookup_batch(map, attr, uattr, false);
+}
+
+int generic_map_lookup_and_delete_batch(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ return __generic_map_lookup_batch(map, attr, uattr, true);
+}
+
#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
static int map_lookup_and_delete_elem(union bpf_attr *attr)
@@ -2956,6 +3074,57 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
return err;
}
+#define BPF_MAP_BATCH_LAST_FIELD batch.flags
+
+#define BPF_DO_BATCH(fn) \
+ do { \
+ if (!fn) { \
+ err = -ENOTSUPP; \
+ goto err_put; \
+ } \
+ err = fn(map, attr, uattr); \
+ } while (0)
+
+static int bpf_map_do_batch(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ int cmd)
+{
+ struct bpf_map *map;
+ int err, ufd;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_BATCH))
+ return -EINVAL;
+
+ ufd = attr->batch.map_fd;
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if ((cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd != BPF_MAP_LOOKUP_BATCH &&
+ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ if (cmd == BPF_MAP_LOOKUP_BATCH)
+ BPF_DO_BATCH(map->ops->map_lookup_batch);
+ else
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+
+err_put:
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -3053,6 +3222,13 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
err = map_lookup_and_delete_elem(&attr);
break;
+ case BPF_MAP_LOOKUP_BATCH:
+ err = bpf_map_do_batch(&attr, uattr, BPF_MAP_LOOKUP_BATCH);
+ break;
+ case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
+ err = bpf_map_do_batch(&attr, uattr,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH);
+ break;
default:
err = -EINVAL;
break;
--
2.24.0.432.g9d3f5f5b63-goog
next prev parent reply other threads:[~2019-11-19 1:44 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-11-19 1:43 [PATCH bpf-next 0/9] add bpf batch ops to process more than 1 elem Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 1/9] bpf: add bpf_map_{value_size,update_value,map_copy_value} functions Brian Vazquez
2019-11-19 1:43 ` Brian Vazquez [this message]
2019-11-19 1:43 ` [PATCH bpf-next 3/9] bpf: add generic support for update and delete batch ops Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 4/9] bpf: add lookup and updated batch ops to arraymap Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 5/9] bpf: add batch ops to all htab bpf map Brian Vazquez
2019-11-19 4:35 ` Alexei Starovoitov
2019-11-19 5:26 ` Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 6/9] tools/bpf: sync uapi header bpf.h Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 7/9] libbpf: add libbpf support to batch ops Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 8/9] selftests/bpf: add batch ops testing for hmap and hmap_percpu Brian Vazquez
2019-11-19 1:43 ` [PATCH bpf-next 9/9] selftests/bpf: add batch ops testing to array bpf map Brian Vazquez
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20191119014357.98465-3-brianvv@google.com \
--to=brianvv@google.com \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=brianvv.kernel@gmail.com \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=ppenkov@google.com \
--cc=sdf@google.com \
--cc=willemb@google.com \
--cc=yhs@fb.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.