All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alexei Starovoitov <ast@plumgrid.com>
To: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andy Lutomirski <luto@amacapital.net>,
	Steven Rostedt <rostedt@goodmis.org>,
	Daniel Borkmann <dborkman@redhat.com>,
	Chema Gonzalez <chema@google.com>,
	Eric Dumazet <edumazet@google.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Brendan Gregg <brendan.d.gregg@gmail.com>,
	Namhyung Kim <namhyung@kernel.org>,
	"H. Peter Anvin" <hpa@zytor.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Kees Cook <keescook@chromium.org>,
	linux-api@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH RFC v7 net-next 05/28] bpf: add lookup/update/delete/iterate methods to BPF maps
Date: Tue, 26 Aug 2014 19:29:19 -0700	[thread overview]
Message-ID: <1409106582-10095-6-git-send-email-ast@plumgrid.com> (raw)
In-Reply-To: <1409106582-10095-1-git-send-email-ast@plumgrid.com>

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error

- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error

- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/linux/bpf.h      |    8 ++
 include/uapi/linux/bpf.h |   42 +++++++++
 kernel/bpf/syscall.c     |  219 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 269 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48014a71f0fe..2887f3f9da59 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -9,6 +9,7 @@
 
 #include <uapi/linux/bpf.h>
 #include <linux/workqueue.h>
+#include <linux/file.h>
 
 struct bpf_map;
 
@@ -17,6 +18,12 @@ struct bpf_map_ops {
 	/* funcs callable from userspace (via syscall) */
 	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
 	void (*map_free)(struct bpf_map *);
+	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+
+	/* funcs callable from userspace and from eBPF programs */
+	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+	int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+	int (*map_delete_elem)(struct bpf_map *map, void *key);
 };
 
 struct bpf_map {
@@ -37,5 +44,6 @@ struct bpf_map_type_list {
 
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 void bpf_map_put(struct bpf_map *map);
+struct bpf_map *bpf_map_get(struct fd f);
 
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 51dc51c898c6..1602de6423b5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -326,6 +326,35 @@ enum bpf_cmd {
 	 * map is deleted when fd is closed
 	 */
 	BPF_MAP_CREATE,
+
+	/* lookup key in a given map
+	 * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value
+	 * returns zero and stores found elem into value
+	 * or negative error
+	 */
+	BPF_MAP_LOOKUP_ELEM,
+
+	/* create or update key/value pair in a given map
+	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value
+	 * returns zero or negative error
+	 */
+	BPF_MAP_UPDATE_ELEM,
+
+	/* find and delete elem by key in a given map
+	 * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key
+	 * returns zero or negative error
+	 */
+	BPF_MAP_DELETE_ELEM,
+
+	/* lookup key in a given map and return next key
+	 * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->next_key
+	 * returns zero and stores next key or negative error
+	 */
+	BPF_MAP_GET_NEXT_KEY,
 };
 
 enum bpf_map_type {
@@ -340,6 +369,19 @@ union bpf_attr {
 		__u32	max_entries;	/* max number of entries in a map */
 #define BPF_MAP_CREATE_LAST_FIELD max_entries
 	};
+
+	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+		int map_fd;
+		void __user *key;
+		union {
+			void __user *value;
+			void __user *next_key;
+		};
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+	};
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 989c17c35a62..b863976741d4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
+#include <linux/file.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -109,6 +110,212 @@ free_map:
 	return err;
 }
 
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+	struct bpf_map *map;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &bpf_map_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	map = f.file->private_data;
+
+	return map;
+}
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *uvalue = attr->value;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ESRCH;
+	rcu_read_lock();
+	value = map->ops->map_lookup_elem(map, key);
+	if (!value)
+		goto err_unlock;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto err_unlock;
+
+	err = 0;
+
+err_unlock:
+	rcu_read_unlock();
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+static int map_update_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *uvalue = attr->value;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER);
+	if (!value)
+		goto free_key;
+
+	err = -EFAULT;
+	if (copy_from_user(value, uvalue, map->value_size) != 0)
+		goto free_value;
+
+	/* eBPF program that use maps are running under rcu_read_lock(),
+	 * therefore all map accessors rely on this fact, so do the same here
+	 */
+	rcu_read_lock();
+	err = map->ops->map_update_elem(map, key, value);
+	rcu_read_unlock();
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+	void __user *ukey = attr->key;
+	void __user *unext_key = attr->next_key;
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *next_key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	next_key = kmalloc(map->key_size, GFP_USER);
+	if (!next_key)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+	if (err)
+		goto free_next_key;
+
+	err = -EFAULT;
+	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+		goto free_next_key;
+
+	err = 0;
+
+free_next_key:
+	kfree(next_key);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr *attr;
@@ -140,6 +347,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_CREATE:
 		err = map_create(attr);
 		break;
+	case BPF_MAP_LOOKUP_ELEM:
+		err = map_lookup_elem(attr);
+		break;
+	case BPF_MAP_UPDATE_ELEM:
+		err = map_update_elem(attr);
+		break;
+	case BPF_MAP_DELETE_ELEM:
+		err = map_delete_elem(attr);
+		break;
+	case BPF_MAP_GET_NEXT_KEY:
+		err = map_get_next_key(attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
1.7.9.5

  parent reply	other threads:[~2014-08-27  2:29 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-27  2:29 [PATCH RFC v7 net-next 00/28] BPF syscall Alexei Starovoitov
2014-08-27  2:29 ` Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 01/28] net: filter: add "load 64-bit immediate" eBPF instruction Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 02/28] net: filter: split filter.h and expose eBPF to user space Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 03/28] bpf: introduce syscall(BPF, ...) and BPF maps Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 04/28] bpf: enable bpf syscall on x64 and i386 Alexei Starovoitov
2014-08-27  2:29 ` Alexei Starovoitov [this message]
2014-08-27  2:29 ` [PATCH RFC v7 net-next 06/28] bpf: add hashtable type of BPF maps Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 07/28] bpf: expand BPF syscall with program load/unload Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 08/28] bpf: handle pseudo BPF_CALL insn Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 09/28] bpf: verifier (add docs) Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 10/28] bpf: verifier (add ability to receive verification log) Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 11/28] bpf: handle pseudo BPF_LD_IMM64 insn Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 12/28] bpf: verifier (add branch/goto checks) Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 13/28] bpf: verifier (add verifier core) Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 14/28] bpf: verifier (add state prunning optimization) Alexei Starovoitov
     [not found] ` <1409106582-10095-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
2014-08-27  2:29   ` [PATCH RFC v7 net-next 15/28] bpf: allow eBPF programs to use maps Alexei Starovoitov
2014-08-27  2:29     ` Alexei Starovoitov
2014-08-27  2:29   ` [PATCH RFC v7 net-next 16/28] bpf: split eBPF out of NET Alexei Starovoitov
2014-08-27  2:29     ` Alexei Starovoitov
2014-08-27  2:29   ` [PATCH RFC v7 net-next 18/28] tracing: allow eBPF programs call printk() Alexei Starovoitov
2014-08-27  2:29     ` Alexei Starovoitov
2014-08-27  2:29   ` [PATCH RFC v7 net-next 24/28] bpf: llvm backend Alexei Starovoitov
2014-08-27  2:29     ` Alexei Starovoitov
2014-08-27  2:29   ` [PATCH RFC v7 net-next 25/28] samples: bpf: elf file loader Alexei Starovoitov
2014-08-27  2:29     ` Alexei Starovoitov
2014-08-27  6:11   ` [PATCH RFC v7 net-next 00/28] BPF syscall David Miller
2014-08-27  6:11     ` David Miller
     [not found]     ` <20140826.231155.421325307812864648.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2014-08-27 18:24       ` Steven Stewart-Gallus
2014-08-27 18:24         ` Steven Stewart-Gallus
2014-08-27  2:29 ` [PATCH RFC v7 net-next 17/28] tracing: allow eBPF programs to be attached to events Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 19/28] tracing: allow eBPF programs to be attached to kprobe/kretprobe Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 20/28] tracing: allow eBPF programs to call ktime_get_ns() and get_current() Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 21/28] samples: bpf: add mini eBPF library to manipulate maps and programs Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 22/28] samples: bpf: example of tracing filters with eBPF Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 23/28] bpf: verifier test Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 26/28] samples: bpf: eBPF example in C Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 27/28] samples: bpf: counting " Alexei Starovoitov
2014-08-27  2:29 ` [PATCH RFC v7 net-next 28/28] samples: bpf: IO latency analysis (iosnoop/heatmap) Alexei Starovoitov
2014-08-27  3:56 ` [PATCH RFC v7 net-next 00/28] BPF syscall Andy Lutomirski
2014-08-27  4:35   ` Alexei Starovoitov
2014-08-27  4:49     ` Andy Lutomirski
2014-08-27  4:57       ` Alexei Starovoitov
     [not found]         ` <CAMEtUuw1n1HzAeyKFj9=nGq7RKZq7TADS-6M_BkHbTsWJ_Gm-Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-08-27 18:26           ` Andy Lutomirski
2014-08-27 18:26             ` Andy Lutomirski
     [not found]             ` <CALCETrXAfZJTsF2nPFw55rHkfbNXKQuF8Frnq3e1wHEoGxLM4w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2014-08-27 19:18               ` Stephen Hemminger
2014-08-27 19:18                 ` Stephen Hemminger
2014-08-27 19:35                 ` Daniel Borkmann
2014-08-27 19:35                   ` Daniel Borkmann
2014-08-27 19:37                 ` Alexei Starovoitov
2014-08-27 19:37                   ` Alexei Starovoitov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1409106582-10095-6-git-send-email-ast@plumgrid.com \
    --to=ast@plumgrid.com \
    --cc=a.p.zijlstra@chello.nl \
    --cc=akpm@linux-foundation.org \
    --cc=brendan.d.gregg@gmail.com \
    --cc=chema@google.com \
    --cc=davem@davemloft.net \
    --cc=dborkman@redhat.com \
    --cc=edumazet@google.com \
    --cc=hpa@zytor.com \
    --cc=keescook@chromium.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.