Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* [PATCH v2 net-next 2/7] bpf: add hashtable type of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

add new map type BPF_MAP_TYPE_HASH and its implementation

- maps are created/destroyed by userspace. Both userspace and eBPF programs
  can lookup/update/delete elements from the map

- eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism
  for concurrent updates

- key/value are opaque range of bytes (aligned to 8 bytes)

- user space provides 3 configuration attributes via BPF syscall:
  key_size, value_size, max_entries

- map takes care of allocating/freeing key/value pairs

- map_update_elem() must fail to insert new element when max_entries
  limit is reached to make sure that eBPF programs cannot exhaust memory

- map_update_elem() replaces elements in an atomic way

- optimized for speed of lookup() which can be called multiple times from
  eBPF program which itself is triggered by high volume of events
  . in the future JIT compiler may recognize lookup() call and optimize it
    further, since key_size is constant for life of eBPF program

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/hashtab.c     |  362 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/hashtab.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3e9e1b77f29d..03a01fd609aa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@ enum bpf_cmd {
 
 enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..2c0ec7f9da78 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..d234a012f046
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,362 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct hlist_head *buckets;
+	spinlock_t lock;
+	u32 count;	/* number of elements in this hashtable */
+	u32 n_buckets;	/* number of hash buckets */
+	u32 elem_size;	/* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct hlist_node hash_node;
+	struct rcu_head rcu;
+	u32 hash;
+	char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int err, i;
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	htab->map.key_size = attr->key_size;
+	htab->map.value_size = attr->value_size;
+	htab->map.max_entries = attr->max_entries;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	err = -EINVAL;
+	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+	    htab->map.value_size == 0)
+		goto free_htab;
+
+	/* hash table size must be power of 2 */
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+	err = -E2BIG;
+	if (htab->map.key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+				      GFP_USER | __GFP_NOWARN);
+
+	if (!htab->buckets) {
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		if (!htab->buckets)
+			goto free_htab;
+	}
+
+	for (i = 0; i < htab->n_buckets; i++)
+		INIT_HLIST_HEAD(&htab->buckets[i]);
+
+	spin_lock_init(&htab->lock);
+	htab->count = 0;
+
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8) +
+			  htab->map.value_size;
+	return &htab->map;
+
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+					 void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 hash, key_size;
+
+	/* Must be called with rcu_read_lock. */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+
+	return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l, *next_l;
+	u32 hash, key_size;
+	int i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	/* lookup the key */
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (!l) {
+		i = 0;
+		goto find_first_elem;
+	}
+
+	/* key was found, get next key in the same bucket */
+	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+				  struct htab_elem, hash_node);
+
+	if (next_l) {
+		/* if next elem in this hash list is non-zero, just return it */
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		head = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					  struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* itereated over all buckets and all elements */
+	return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 key_size;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+	if (!l_new)
+		return -ENOMEM;
+
+	key_size = map->key_size;
+
+	memcpy(l_new->key, key, key_size);
+	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+	l_new->hash = htab_map_hash(l_new->key, key_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, l_new->hash);
+
+	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	/* add new element to the head of the list, so that concurrent
+	 * search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		hlist_del_rcu(&l_old->hash_node);
+		kfree_rcu(l_old, rcu);
+	} else {
+		htab->count++;
+	}
+	spin_unlock_irqrestore(&htab->lock, flags);
+
+	return 0;
+err:
+	spin_unlock_irqrestore(&htab->lock, flags);
+	kfree(l_new);
+	return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		htab->count--;
+		kfree_rcu(l, rcu);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&htab->lock, flags);
+	return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			htab->count--;
+			kfree(l);
+		}
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	/* some of kfree_rcu() callbacks for elements of this map may not have
+	 * executed. It's ok. Proceed to free residual elements and map itself
+	 */
+	delete_all_elements(htab);
+	kvfree(htab->buckets);
+	kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &htab_ops,
+	.type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_htab_map);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 3/7] bpf: add array type of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

add new map type BPF_MAP_TYPE_ARRAY and its implementation

- optimized for fastest possible lookup()
  . in the future verifier/JIT may recognize lookup() with constant key
    and optimize it into constant pointer. Can optimize non-constant
    key into direct pointer arithmetic as well, since pointers and
    value_size are constant for the life of the eBPF program.
    In other words array_map_lookup_elem() may be 'inlined' by verifier/JIT
    while preserving concurrent access to this map from user space

- two main use cases for array type:
  . 'global' eBPF variables: array of 1 element with key=0 and value is a
    collection of 'global' variables which programs can use to keep the state
    between events
  . aggregation of tracing events into fixed set of buckets

- all array elements pre-allocated and zero initialized at init time

- key as an index in array and can only be 4 byte

- map_delete_elem() returns EINVAL, since elements cannot be deleted

- map_update_elem() replaces elements in an non-atomic way
  (for atomic updates hashtable type should be used instead)

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

Note, from eBPF program and from user space, all map types are accessed
through the same API.

Example of using array type for 'global' variables from eBPF program:
struct globals {
    u64 lat_ave;
    u64 lat_sum;
    u64 missed;
    u64 max_lat;
    int num_samples;
};

struct bpf_map_def SEC("maps") global_map = {
    .type = BPF_MAP_TYPE_ARRAY,
    .key_size = sizeof(int),
    .value_size = sizeof(struct globals),
    .max_entries = 1,
};

int bpf_prog(struct bpf_context *ctx)
{
    ...
    int ind = 0;
    struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
    if (!g)
            return 0;
    if (g->lat_ave == 0) {
            g->num_samples++;
            g->lat_sum += delta;
            if (g->num_samples >= 100) {
                    g->lat_ave = g->lat_sum / g->num_samples;
    ...

The future verifier/JIT optimization will replace bpf_map_lookup_elem()
call inside eBPF program with const pointer to element value of key=0,
so that eBPF program will have no penalty whatsoever to access such
'global' variables.
At the same time user space can access this 'globals' via common map API.

Full example of both kernel and user side follows in later patches.

The array map is like C array of structures. Nothing protects concurrent access.
It's used in the cases where accuracy is not needed or when there is no
concurrent access. To compute a histogram of events in tracing the array
of integers is used. Every integer is a counter. Program increments it
(may be without using xadd) and user space periodically reads it back.
map_update_elem() is called by userspace once to initialize it if zero-init
is not enough. Programs do lookup() and modify the values.
For array type update() method is used rarely, delete() is never used and
get_next() is needed for completeness to browse maps through common map API.

 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/arraymap.c    |  151 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/arraymap.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 03a01fd609aa..0d662fe75df5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_cmd {
 enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
+	BPF_MAP_TYPE_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 2c0ec7f9da78..72ec98ba2d42 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..f4f6965f86cb
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array *array;
+	u32 elem_size;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size == 0)
+		return ERR_PTR(-EINVAL);
+
+	elem_size = round_up(attr->value_size, 8);
+
+	/* allocate all map elements and zero-initialize them */
+	array = kzalloc(sizeof(*array) + attr->max_entries * elem_size,
+			GFP_USER | __GFP_NOWARN);
+	if (!array) {
+		array = vzalloc(array->map.max_entries * array->elem_size);
+		if (!array)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* copy mandatory map attributes */
+	array->map.key_size = attr->key_size;
+	array->map.value_size = attr->value_size;
+	array->map.max_entries = attr->max_entries;
+
+	array->elem_size = elem_size;
+
+	return &array->map;
+
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return NULL;
+
+	return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+	
+	if (map_flags == BPF_NOEXIST)
+		/* all elemenets already exist */
+		return -EEXIST;
+
+	memcpy(array->value + array->elem_size * index, value, array->elem_size);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding programs to complete
+	 * and free the array
+	 */
+	synchronize_rcu();
+
+	kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &array_ops,
+	.type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_array_map);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 4/7] bpf: fix BPF_MAP_LOOKUP_ELEM command return code
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

fix errno of BPF_MAP_LOOKUP_ELEM command as bpf manpage
described it in commit b4fc1a460f30("Merge branch 'bpf-next'"):
-----
BPF_MAP_LOOKUP_ELEM
    int bpf_lookup_elem(int fd, void *key, void *value)
    {
        union bpf_attr attr = {
            .map_fd = fd,
            .key = ptr_to_u64(key),
            .value = ptr_to_u64(value),
        };

        return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
    }
    bpf() syscall looks up an element with given key in  a  map  fd.
    If  element  is found it returns zero and stores element's value
    into value.  If element is not found  it  returns  -1  and  sets
    errno to ENOENT.

and further down in manpage:

   ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM,  indicates  that
          element with given key was not found.
-----

In general all BPF commands return ENOENT when map element is not found
(including BPF_MAP_GET_NEXT_KEY and BPF_MAP_UPDATE_ELEM with
 flags == BPF_MAP_UPDATE_ONLY)

Subsequent patch adds a testsuite to check return values for all of
these combinations.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

I don't think this patch is needed for 'net', since 'net' has syscall shell
only. Actual map types and their implementations are being introduced by
this set of patches.

 kernel/bpf/syscall.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0d03bf317a2..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
-	err = -ESRCH;
+	err = -ENOENT;
 	rcu_read_lock();
 	value = map->ops->map_lookup_elem(map, key);
 	if (!value)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 5/7] bpf: add a testsuite for eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

. check error conditions and sanity of hash and array map APIs
. check large maps (that kernel gracefully switches to vmalloc from kmalloc)
. check multi-process parallel access and stress test

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
Eventually it can be moved tools/testing/selftests/bpf/, but for now keep
it in samples/bpf/, since that's where all subsequent samples are coming to.

 samples/bpf/Makefile    |    3 +-
 samples/bpf/libbpf.c    |    3 +-
 samples/bpf/libbpf.h    |    2 +-
 samples/bpf/test_maps.c |  291 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 296 insertions(+), 3 deletions(-)
 create mode 100644 samples/bpf/test_maps.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 634391797856..0718d9ce4619 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -2,9 +2,10 @@
 obj- := dummy.o
 
 # List of programs to build
-hostprogs-y := test_verifier
+hostprogs-y := test_verifier test_maps
 
 test_verifier-objs := test_verifier.o libbpf.o
+test_maps-objs := test_maps.o libbpf.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index ff6504420738..17bb520eb57f 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -27,12 +27,13 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-int bpf_update_elem(int fd, void *key, void *value)
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
 {
 	union bpf_attr attr = {
 		.map_fd = fd,
 		.key = ptr_to_u64(key),
 		.value = ptr_to_u64(value),
+		.flags = flags,
 	};
 
 	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 8a31babeca5d..f8678e5f48bf 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -6,7 +6,7 @@ struct bpf_insn;
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries);
-int bpf_update_elem(int fd, void *key, void *value);
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
 int bpf_lookup_elem(int fd, void *key, void *value);
 int bpf_delete_elem(int fd, void *key);
 int bpf_get_next_key(int fd, void *key, void *next_key);
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
new file mode 100644
index 000000000000..e286b42307f3
--- /dev/null
+++ b/samples/bpf/test_maps.c
@@ -0,0 +1,291 @@
+/*
+ * Testsuite for eBPF maps
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include "libbpf.h"
+
+/* sanity tests for map API */
+static void test_hashmap_sanity(int i, void *data)
+{
+	long long key, next_key, value;
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create hashmap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+	value = 0;
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       /* key=1 already exists */
+	       errno == EEXIST);
+
+	assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 && errno == EINVAL);
+
+	/* check that key=1 can be found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+	key = 2;
+	/* check that key=2 is not found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	/* insert key=2 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+
+	/* key=1 and key=2 were inserted, check that key=0 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* check that key = 0 doesn't exist */
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+	       next_key == 2);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+	       next_key == 1);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+	       errno == ENOENT);
+
+	/* delete both elements */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	key = 2;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	key = 0;
+	/* check that map is empty */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 &&
+	       errno == ENOENT);
+	close(map_fd);
+}
+
+static void test_arraymap_sanity(int i, void *data)
+{
+	int key, next_key, map_fd;
+	long long value;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create arraymap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+	value = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == EEXIST);
+
+	/* check that key=1 can be found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+	key = 0;
+	/* check that key=0 is also found and zero initialized */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+
+
+	/* key=0 and key=1 were inserted, check that key=2 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 2;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* check that key = 2 doesn't exist */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+	       next_key == 0);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+	       next_key == 1);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+	       errno == ENOENT);
+
+	/* delete shouldn't succeed */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == EINVAL);
+
+	close(map_fd);
+}
+
+#define MAP_SIZE (32 * 1024)
+static void test_map_large(void)
+{
+	struct bigkey {
+		int a;
+		char b[116];
+		long long c;
+	} key;
+	int map_fd, i, value;
+
+	/* allocate 4Mbyte of memory */
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+				MAP_SIZE);
+	if (map_fd < 0) {
+		printf("failed to create large map '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	for (i = 0; i < MAP_SIZE; i++) {
+		key = (struct bigkey) {.c = i};
+		value = i;
+		assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+	}
+	key.c = -1;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* iterate through all elements */
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+	key.c = 0;
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+	key.a = 1;
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	close(map_fd);
+}
+
+/* fork N children and wait for them to complete */
+static void run_parallel(int tasks, void (*fn)(int i, void *data), void *data)
+{
+	pid_t pid[tasks];
+	int i;
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			fn(i, data);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		assert(status == 0);
+	}
+}
+
+static void test_map_stress(void)
+{
+	run_parallel(100, test_hashmap_sanity, NULL);
+	run_parallel(100, test_arraymap_sanity, NULL);
+}
+
+#define TASKS 1024
+#define DO_UPDATE 1
+#define DO_DELETE 0
+static void do_work(int fn, void *data)
+{
+	int map_fd = ((int *)data)[0];
+	int do_update = ((int *)data)[1];
+	int i;
+	int key, value;
+
+	for (i = fn; i < MAP_SIZE; i += TASKS) {
+		key = value = i;
+		if (do_update)
+			assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+		else
+			assert(bpf_delete_elem(map_fd, &key) == 0);
+	}
+}
+
+static void test_map_parallel(void)
+{
+	int i, map_fd, key = 0, value = 0;
+	int data[2];
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+				MAP_SIZE);
+	if (map_fd < 0) {
+		printf("failed to create map for parallel test '%s'\n",
+		       strerror(errno));
+		exit(1);
+	}
+
+	data[0] = map_fd;
+	data[1] = DO_UPDATE;
+	/* use the same map_fd in children to add elements to this map
+	 * child_0 adds key=0, key=1024, key=2048, ...
+	 * child_1 adds key=1, key=1025, key=2049, ...
+	 * child_1023 adds key=1023, ...
+	 */
+	run_parallel(TASKS, do_work, data);
+
+	/* check that key=0 is already there */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == EEXIST);
+
+	/* check that all elements were inserted */
+	key = -1;
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+	/* another check for all elements */
+	for (i = 0; i < MAP_SIZE; i++) {
+		key = MAP_SIZE - i - 1;
+		assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+		       value == key);
+	}
+
+	/* now let's delete all elemenets in parallel */
+	data[1] = DO_DELETE;
+	run_parallel(TASKS, do_work, data);
+
+	/* nothing should be left */
+	key = -1;
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+}
+
+int main(void)
+{
+	test_hashmap_sanity(0, NULL);
+	test_arraymap_sanity(0, NULL);
+	test_map_large();
+	test_map_parallel();
+	test_map_stress();
+	printf("test_maps: OK\n");
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 6/7] bpf: allow eBPF programs to use maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
map accessors to eBPF programs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
Note, these helpers are exposed as '.gpl_only = false', so non-GPL eBPF programs
can use them. That was requested by AndyL and DavidL before.

 include/linux/bpf.h      |    5 +++
 include/uapi/linux/bpf.h |    3 ++
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/helpers.c     |   89 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/helpers.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 51e9242e4803..75e94eaa228b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
+/* verifier prototypes for helper functions called from eBPF programs */
+extern struct bpf_func_proto bpf_map_lookup_elem_proto;
+extern struct bpf_func_proto bpf_map_update_elem_proto;
+extern struct bpf_func_proto bpf_map_delete_elem_proto;
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d662fe75df5..4a3d0f84f178 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -158,6 +158,9 @@ union bpf_attr {
  */
 enum bpf_func_id {
 	BPF_FUNC_unspec,
+	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
+	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
+	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 72ec98ba2d42..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* verifier checked that R1 contains a valid pointer to bpf_map
+	 * and R2 points to a program stack and map->key_size bytes were
+	 * initialized
+	 */
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	value = map->ops->map_lookup_elem(map, key);
+
+	/* lookup() returns either pointer to element value or NULL
+	 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+	 */
+	return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+	.func = bpf_map_lookup_elem,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value = (void *) (unsigned long) r3;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+	.func = bpf_map_update_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.arg3_type = ARG_PTR_TO_MAP_VALUE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+	.func = bpf_map_delete_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 7/7] bpf: remove test map scaffolding and user proper types
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

proper types and function helpers are ready. Use them in verifier testsuite.
Remove temporary stubs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 kernel/bpf/test_stub.c      |   56 +++++++------------------------------------
 samples/bpf/test_verifier.c |   14 +++++------
 2 files changed, 16 insertions(+), 54 deletions(-)

diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
 	u64 arg2;
 };
 
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
-	[BPF_FUNC_unspec] = {
-		.func = test_func,
-		.gpl_only = true,
-		.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
-		.arg1_type = ARG_CONST_MAP_PTR,
-		.arg2_type = ARG_PTR_TO_MAP_KEY,
-	},
-};
-
 static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
 {
-	if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
 		return NULL;
-	return &test_funcs[func_id];
+	}
 }
 
 static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
 	.type = BPF_PROG_TYPE_UNSPEC,
 };
 
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_map *map;
-
-	map = kzalloc(sizeof(*map), GFP_USER);
-	if (!map)
-		return ERR_PTR(-ENOMEM);
-
-	map->key_size = attr->key_size;
-	map->value_size = attr->value_size;
-	map->max_entries = attr->max_entries;
-	return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
-	kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
-	.map_alloc = test_map_alloc,
-	.map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
-	.ops = &test_map_ops,
-	.type = BPF_MAP_TYPE_UNSPEC,
-};
-
 static int __init register_test_ops(void)
 {
-	bpf_register_map_type(&tl_map);
 	bpf_register_prog_type(&tl_prog);
 	return 0;
 }
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 63402742345e..b96175e90363 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -261,7 +261,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_EXIT_INSN(),
 		},
 		.fixup = {2},
@@ -417,7 +417,7 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "fd 0 is not pointing to valid bpf_map",
@@ -430,7 +430,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
 			BPF_EXIT_INSN(),
 		},
@@ -445,7 +445,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
 			BPF_EXIT_INSN(),
@@ -461,7 +461,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
 			BPF_EXIT_INSN(),
@@ -548,7 +548,7 @@ static struct bpf_test tests[] = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
 			BPF_EXIT_INSN(),
 		},
 		.fixup = {24},
@@ -659,7 +659,7 @@ static int create_map(void)
 	long long key, value = 0;
 	int map_fd;
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024);
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
 	if (map_fd < 0) {
 		printf("failed to create map '%s'\n", strerror(errno));
 	}
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH v17 0/7] MADV_FREE support
From: Andrew Morton @ 2014-11-14  1:52 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Michael Kerrisk,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Hugh Dickins, Johannes Weiner,
	Rik van Riel, KOSAKI Motohiro, Mel Gorman, Jason Evans,
	zhangyanfei-BthXqXjhjHXQFUHtdCDX3A, Kirill A. Shutemov
In-Reply-To: <20141113225809.GA8997@bbox>

On Fri, 14 Nov 2014 07:58:09 +0900 Minchan Kim <minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:

> It seems I have waited your review for a long time.
> What should I do to take your time slot?

I'm being terrible, sorry.

I'll merge the patches into -mm next week so at least they get some
external testing while I get my ass into gear.

^ permalink raw reply

* Re: kdbus: add selftests
From: Michael Ellerman @ 2014-11-14  3:42 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	john.stultz-QSEj5FYQhm4dnm+yROfE0A, arnd-r2nGTMty4D4,
	tj-DgEjT+Ai2ygdnm+yROfE0A, marcel-kz+m5ild9QBg9hUCZPvPmw,
	desrt-0xnayjDhYQY, hadess-0MeiytkfxGOsTnJN9+BGXg,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A,
	simon.mcvittie-ZGY8ohtN/8pPYcu2f3hruQ,
	daniel-cYrQPVfZoowdnm+yROfE0A,
	alban.crequy-ZGY8ohtN/8pPYcu2f3hruQ,
	javier.martinez-ZGY8ohtN/8pPYcu2f3hruQ, teg-B22kvLQNl6c
In-Reply-To: <1414620056-6675-13-git-send-email-gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>

On Wed, 2014-10-29 at 15:00 -0700, Greg Kroah-Hartman wrote:
> From: Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
> 
> This patch adds a quite extensive test suite for kdbus that checks
> the most important code pathes in the driver. The idea is to extend
> the test suite over time.
> 
> Also, this code can serve as an example implementation to show how to
> use the kernel API from userspace.

Great to see selftests included.

I needed this to get them building:

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index b70237e8bc37..b1438a02e49f 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -210,6 +210,7 @@ header-y += ixjuser.h
 header-y += jffs2.h
 header-y += joystick.h
 header-y += kd.h
+header-y += kdbus.h
 header-y += kdev_t.h
 header-y += kernel-page-flags.h
 header-y += kernel.h
diff --git a/tools/testing/selftests/kdbus/Makefile b/tools/testing/selftests/kdbus/Makefile
index 0f6a745202af..96766c12a6e3 100644
--- a/tools/testing/selftests/kdbus/Makefile
+++ b/tools/testing/selftests/kdbus/Makefile
@@ -2,7 +2,7 @@ CFLAGS += -I../../../../usr/include/
 CFLAGS += -I../../../../include/uapi/
 CFLAGS += -std=gnu99
 CFLAGS += -DKBUILD_MODNAME=\"kdbus\" -D_GNU_SOURCE
-LDFLAGS = -pthread -lcap
+LDLIBS = -pthread -lcap

 OBJS= \
        kdbus-enum.o            \
@@ -37,7 +37,7 @@ all: kdbus-test
        gcc $(CFLAGS) -c $< -o $@

 kdbus-test: $(OBJS)
-       gcc $(CFLAGS) $(LDFLAGS) $^ -o $@
+       gcc $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@

 run_tests:
        ./kdbus-test

And with that it's all happy on ppc64le:

Testing bus make functions (bus-make) .................................. OK
Testing the HELLO command (hello) ...................................... OK
Testing the BYEBYE command (byebye) .................................... OK
Testing a chat pattern (chat) .......................................... OK
Testing a simple dameon (daemon) ....................................... OK
Testing file descriptor passing (fd-passing) ........................... OK
Testing custom endpoint (endpoint) ..................................... OK
Testing monitor functionality (monitor) ................................ OK
Testing basic name registry functions (name-basics) .................... OK
Testing name registry conflict details (name-conflict) ................. OK
Testing queuing of names (name-queue) .................................. OK
Testing basic message handling (message-basic) ......................... OK
Testing handling of messages with priority (message-prio) .............. OK
Testing timeout (timeout) .............................................. OK
Testing synchronous replies vs. BYEBYE (sync-byebye) ................... OK
Testing synchronous replies (sync-reply) ............................... OK
Testing freeing of memory (message-free) ............................... OK
Testing retrieving connection information (connection-info) ............ OK
Testing updating connection information (connection-update) ............ OK
Testing verifying pools are never writable (writable-pool) ............. OK
Testing policy (policy) ................................................ OK
Testing unprivileged bus access (policy-priv) .......................... OK
Testing policy in user namespaces (policy-ns) .......................... OK
Testing metadata in user namespaces (metadata-ns) ...................... OK
Testing adding of matches by id (match-id-add) ......................... OK
Testing removing of matches by id (match-id-remove) .................... OK
Testing adding of matches by name (match-name-add) ..................... OK
Testing removing of matches by name (match-name-remove) ................ OK
Testing matching for name changes (match-name-change) .................. OK
Testing matching with bloom filters (match-bloom) ...................... OK
Testing activator connections (activator) .............................. OK
Testing creating a domain (domain-make) ................................ OK
Testing benchmark (benchmark) .......................................... OK
Testing race multiple byebyes (race-byebye) ............................ OK
Testing race byebye vs match removal (race-byebye-match) ............... OK

SUMMARY: 35 tests passed, 0 skipped, 0 failed

cheers

^ permalink raw reply related

* Re: [PATCH net-next] net: introduce SO_INCOMING_CPU
From: Michael Kerrisk @ 2014-11-14  8:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Ying Cai, Willem de Bruijn, Neal Cardwell,
	Linux API
In-Reply-To: <1415393472.13896.119.camel-XN9IlZ5yJG9HTL0Zs8A6p/gx64E7kk8eUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>

Hi Eric,

Since this is an API change ( Documentation/SubmitChecklist),
linux-api@ should be CCed.

Thanks,

Michael



On Fri, Nov 7, 2014 at 9:51 PM, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> From: Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
>
> Alternative to RPS/RFS is to use hardware support for multi queue.
>
> Then split a set of million of sockets into worker threads, each
> one using epoll() to manage events on its own socket pool.
>
> Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
> know after accept() or connect() on which queue/cpu a socket is managed.
>
> We normally use one cpu per RX queue (IRQ smp_affinity being properly
> set), so remembering on socket structure which cpu delivered last packet
> is enough to solve the problem.
>
> After accept(), connect(), or even file descriptor passing around
> processes, applications can use :
>
>  int cpu;
>  socklen_t len = sizeof(cpu);
>
>  getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
>
> And use this information to put the socket into the right silo
> for optimal performance, as all networking stack should run
> on the appropriate cpu, without need to send IPI (RPS/RFS).
>
> Signed-off-by: Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
>  arch/alpha/include/uapi/asm/socket.h   |    2 ++
>  arch/avr32/include/uapi/asm/socket.h   |    2 ++
>  arch/cris/include/uapi/asm/socket.h    |    2 ++
>  arch/frv/include/uapi/asm/socket.h     |    2 ++
>  arch/ia64/include/uapi/asm/socket.h    |    2 ++
>  arch/m32r/include/uapi/asm/socket.h    |    2 ++
>  arch/mips/include/uapi/asm/socket.h    |    2 ++
>  arch/mn10300/include/uapi/asm/socket.h |    2 ++
>  arch/parisc/include/uapi/asm/socket.h  |    2 ++
>  arch/powerpc/include/uapi/asm/socket.h |    2 ++
>  arch/s390/include/uapi/asm/socket.h    |    2 ++
>  arch/sparc/include/uapi/asm/socket.h   |    2 ++
>  arch/xtensa/include/uapi/asm/socket.h  |    2 ++
>  include/net/sock.h                     |   12 ++++++++++++
>  include/uapi/asm-generic/socket.h      |    2 ++
>  net/core/sock.c                        |    5 +++++
>  net/ipv4/tcp_ipv4.c                    |    1 +
>  net/ipv4/udp.c                         |    1 +
>  net/ipv6/tcp_ipv6.c                    |    1 +
>  net/ipv6/udp.c                         |    1 +
>  net/sctp/ulpqueue.c                    |    5 +++--
>  21 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index 3de1394bcab821984674e89a3ee022cc6dd5f0f2..e2fe0700b3b442bffc1f606b1b8b0bb7759aa157 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
> index 6e6cd159924b1855aa5f1811ad4e4c60b403c431..92121b0f5b989a61c008e0be24030725bab88e36 100644
> --- a/arch/avr32/include/uapi/asm/socket.h
> +++ b/arch/avr32/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI__ASM_AVR32_SOCKET_H */
> diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
> index ed94e5ed0a238c2750e677ccb806a6bc0a94041a..60f60f5b9b35bd219d7a9834fe5394e8ac5fdbab 100644
> --- a/arch/cris/include/uapi/asm/socket.h
> +++ b/arch/cris/include/uapi/asm/socket.h
> @@ -82,6 +82,8 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
>
> diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
> index ca2c6e6f31c6817780d31a246652adcc9847e373..2c6890209ea60c149bf097c2a1b369519cb8c301 100644
> --- a/arch/frv/include/uapi/asm/socket.h
> +++ b/arch/frv/include/uapi/asm/socket.h
> @@ -80,5 +80,7 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
> diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
> index a1b49bac7951929127ed08db549218c2c16ccf89..09a93fb566f6c6c6fe29c10c95b931881843d1cd 100644
> --- a/arch/ia64/include/uapi/asm/socket.h
> +++ b/arch/ia64/include/uapi/asm/socket.h
> @@ -89,4 +89,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_IA64_SOCKET_H */
> diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
> index 6c9a24b3aefa3a4f3048c17a7fa06d97b585ec14..e8589819c2743c6e112b15a245fc3ebd146e6313 100644
> --- a/arch/m32r/include/uapi/asm/socket.h
> +++ b/arch/m32r/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_M32R_SOCKET_H */
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index a14baa218c76f14de988ef106bdac5dadc48aceb..2e9ee8c55a103a0337d9f80f71fe9ef28be1154b 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -98,4 +98,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
> index 6aa3ce1854aa9523d46bc28851eddabd59edeb37..f3492e8c9f7009c33e07168df916f7337bef3929 100644
> --- a/arch/mn10300/include/uapi/asm/socket.h
> +++ b/arch/mn10300/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index fe35ceacf0e72cad69a43d9b1ce7b8f5ec3da98a..7984a1cab3da980f1f810827967b4b67616eb89b 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -79,4 +79,6 @@
>
>  #define SO_BPF_EXTENSIONS      0x4029
>
> +#define SO_INCOMING_CPU                0x402A
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
> index a9c3e2e18c054a1e952fe33599401de57c6a6544..3474e4ef166df4a573773916b325d0fa9f3b45d0 100644
> --- a/arch/powerpc/include/uapi/asm/socket.h
> +++ b/arch/powerpc/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_POWERPC_SOCKET_H */
> diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
> index e031332096d7c7b23b5953680289e8f3bcc3b378..8457636c33e1b67a9b7804daa05627839035a8fb 100644
> --- a/arch/s390/include/uapi/asm/socket.h
> +++ b/arch/s390/include/uapi/asm/socket.h
> @@ -86,4 +86,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 54d9608681b6947ae25dab008f808841d96125c0..4a8003a9416348006cfa85d5bcdf7553c8d23958 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -76,6 +76,8 @@
>
>  #define SO_BPF_EXTENSIONS      0x0032
>
> +#define SO_INCOMING_CPU                0x0033
> +
>  /* Security levels - as per NRL IPv6 - don't actually do anything */
>  #define SO_SECURITY_AUTHENTICATION             0x5001
>  #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
> diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
> index 39acec0cf0b1d500c1c40f9b523ef3a9a142c2f1..c46f6a696849c6f7f8a34b2cc522b48e04b17380 100644
> --- a/arch/xtensa/include/uapi/asm/socket.h
> +++ b/arch/xtensa/include/uapi/asm/socket.h
> @@ -91,4 +91,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _XTENSA_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 6767d75ecb17693eb59a99b8218da4319854ccc0..7789b59c0c400eb99f65d1f0e03cd9773664cf93 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -273,6 +273,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> +  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_protinfo: private area, net family specific, when not using slab
> @@ -350,6 +351,12 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> +       u16                     sk_incoming_cpu;
> +       /* 16bit hole
> +        * Warned : sk_incoming_cpu can be set from softirq,
> +        * Do not use this hole without fully understanding possible issues.
> +        */
> +
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>         unsigned int            sk_napi_id;
> @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>         return sk->sk_backlog_rcv(sk, skb);
>  }
>
> +static inline void sk_incoming_cpu_update(struct sock *sk)
> +{
> +       sk->sk_incoming_cpu = raw_smp_processor_id();
> +}
> +
>  static inline void sock_rps_record_flow_hash(__u32 hash)
>  {
>  #ifdef CONFIG_RPS
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index ea0796bdcf88404ef0f127eb6e64ba00c16ea856..f541ccefd4acbeb4ad757be9dbf4b67f204bf21d 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -82,4 +82,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ac56dd06c306f3712e57ce8e4724c79565589499..0725cf0cb685787b2122606437da53299fb24621 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sk->sk_max_pacing_rate;
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               v.val = sk->sk_incoming_cpu;
> +               break;
> +
>         default:
>                 return -ENOPROTOOPT;
>         }
> @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>
>                 newsk->sk_err      = 0;
>                 newsk->sk_priority = 0;
> +               newsk->sk_incoming_cpu = raw_smp_processor_id();
>                 /*
>                  * Before updating sk_refcnt, we must commit prior changes to memory
>                  * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 9c7d7621466b1241f404a5ca11de809dcff2d02a..3893f51972f28271a6d27a763c05495c5c2554f7 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1662,6 +1662,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index df19027f44f3d6fbe13dec78d3b085968dbf2329..f52b6081158e87caa5df32e8e5d27dbf314a01b1 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (inet_sk(sk)->inet_daddr) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index ace29b60813cf8a1d7182ad2262cbcbd21810fa7..ac40d23204b5e55da5172c80dafd1d4854b370d5 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1455,6 +1455,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 9b6809232b178c16d699ce3d152196b8c4cb096b..0125ca3daf47a4a3333e7462a11550d3e2f96875 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
> index d49dc2ed30adb97a809eb37902b9956c366a2862..ce469d648ffbe166f9ae1c5650f481256f31a7f8 100644
> --- a/net/sctp/ulpqueue.c
> +++ b/net/sctp/ulpqueue.c
> @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
>         if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
>                 goto out_free;
>
> -       if (!sctp_ulpevent_is_notification(event))
> +       if (!sctp_ulpevent_is_notification(event)) {
>                 sk_mark_napi_id(sk, skb);
> -
> +               sk_incoming_cpu_update(sk);
> +       }
>         /* Check if the user wishes to receive this event.  */
>         if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
>                 goto out_free;
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* Re: [PATCH v2 net-next 0/2] net: SO_INCOMING_CPU support
From: Michael Kerrisk @ 2014-11-14  8:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, netdev, Neal Cardwell, Willem de Bruijn,
	Ying Cai, Linux API
In-Reply-To: <1415714068-21028-1-git-send-email-edumazet@google.com>

[CC += linux-api@]

On Tue, Nov 11, 2014 at 2:54 PM, Eric Dumazet <edumazet@google.com> wrote:
> SO_INCOMING_CPU socket option (read by getsockopt()) provides
> an alternative to RPS/RFS for high performance servers using
> multi queues NIC.
>
> TCP should use sk_mark_napi_id() for established sockets only.
>
> Eric Dumazet (2):
>   tcp: move sk_mark_napi_id() at the right place
>   net: introduce SO_INCOMING_CPU
>
>  arch/alpha/include/uapi/asm/socket.h   |  2 ++
>  arch/avr32/include/uapi/asm/socket.h   |  2 ++
>  arch/cris/include/uapi/asm/socket.h    |  2 ++
>  arch/frv/include/uapi/asm/socket.h     |  2 ++
>  arch/ia64/include/uapi/asm/socket.h    |  2 ++
>  arch/m32r/include/uapi/asm/socket.h    |  2 ++
>  arch/mips/include/uapi/asm/socket.h    |  2 ++
>  arch/mn10300/include/uapi/asm/socket.h |  2 ++
>  arch/parisc/include/uapi/asm/socket.h  |  2 ++
>  arch/powerpc/include/uapi/asm/socket.h |  2 ++
>  arch/s390/include/uapi/asm/socket.h    |  2 ++
>  arch/sparc/include/uapi/asm/socket.h   |  2 ++
>  arch/xtensa/include/uapi/asm/socket.h  |  2 ++
>  include/net/sock.h                     | 12 ++++++++++++
>  include/uapi/asm-generic/socket.h      |  2 ++
>  net/core/sock.c                        |  5 +++++
>  net/ipv4/tcp_ipv4.c                    |  4 +++-
>  net/ipv4/udp.c                         |  1 +
>  net/ipv6/tcp_ipv6.c                    |  4 +++-
>  net/ipv6/udp.c                         |  1 +
>  net/sctp/ulpqueue.c                    |  5 +++--
>  21 files changed, 56 insertions(+), 4 deletions(-)
>
> --
> 2.1.0.rc2.206.gedb03e5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* Re: [PATCH v2 net-next 2/2] net: introduce SO_INCOMING_CPU
From: Michael Kerrisk @ 2014-11-14  8:07 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, netdev, Neal Cardwell, Willem de Bruijn,
	Ying Cai, Linux API
In-Reply-To: <1415714068-21028-3-git-send-email-edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

[CC += linux-api@]

On Tue, Nov 11, 2014 at 2:54 PM, Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> Alternative to RPS/RFS is to use hardware support for multiple
> queues.
>
> Then split a set of million of sockets into worker threads, each
> one using epoll() to manage events on its own socket pool.
>
> Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
> know after accept() or connect() on which queue/cpu a socket is managed.
>
> We normally use one cpu per RX queue (IRQ smp_affinity being properly
> set), so remembering on socket structure which cpu delivered last packet
> is enough to solve the problem.
>
> After accept(), connect(), or even file descriptor passing around
> processes, applications can use :
>
>  int cpu;
>  socklen_t len = sizeof(cpu);
>
>  getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
>
> And use this information to put the socket into the right silo
> for optimal performance, as all networking stack should run
> on the appropriate cpu, without need to send IPI (RPS/RFS).
>
> Signed-off-by: Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
>  arch/alpha/include/uapi/asm/socket.h   |  2 ++
>  arch/avr32/include/uapi/asm/socket.h   |  2 ++
>  arch/cris/include/uapi/asm/socket.h    |  2 ++
>  arch/frv/include/uapi/asm/socket.h     |  2 ++
>  arch/ia64/include/uapi/asm/socket.h    |  2 ++
>  arch/m32r/include/uapi/asm/socket.h    |  2 ++
>  arch/mips/include/uapi/asm/socket.h    |  2 ++
>  arch/mn10300/include/uapi/asm/socket.h |  2 ++
>  arch/parisc/include/uapi/asm/socket.h  |  2 ++
>  arch/powerpc/include/uapi/asm/socket.h |  2 ++
>  arch/s390/include/uapi/asm/socket.h    |  2 ++
>  arch/sparc/include/uapi/asm/socket.h   |  2 ++
>  arch/xtensa/include/uapi/asm/socket.h  |  2 ++
>  include/net/sock.h                     | 12 ++++++++++++
>  include/uapi/asm-generic/socket.h      |  2 ++
>  net/core/sock.c                        |  5 +++++
>  net/ipv4/tcp_ipv4.c                    |  1 +
>  net/ipv4/udp.c                         |  1 +
>  net/ipv6/tcp_ipv6.c                    |  1 +
>  net/ipv6/udp.c                         |  1 +
>  net/sctp/ulpqueue.c                    |  5 +++--
>  21 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index 3de1394bcab8..e2fe0700b3b4 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
> index 6e6cd159924b..92121b0f5b98 100644
> --- a/arch/avr32/include/uapi/asm/socket.h
> +++ b/arch/avr32/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI__ASM_AVR32_SOCKET_H */
> diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
> index ed94e5ed0a23..60f60f5b9b35 100644
> --- a/arch/cris/include/uapi/asm/socket.h
> +++ b/arch/cris/include/uapi/asm/socket.h
> @@ -82,6 +82,8 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
>
> diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
> index ca2c6e6f31c6..2c6890209ea6 100644
> --- a/arch/frv/include/uapi/asm/socket.h
> +++ b/arch/frv/include/uapi/asm/socket.h
> @@ -80,5 +80,7 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
> diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
> index a1b49bac7951..09a93fb566f6 100644
> --- a/arch/ia64/include/uapi/asm/socket.h
> +++ b/arch/ia64/include/uapi/asm/socket.h
> @@ -89,4 +89,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_IA64_SOCKET_H */
> diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
> index 6c9a24b3aefa..e8589819c274 100644
> --- a/arch/m32r/include/uapi/asm/socket.h
> +++ b/arch/m32r/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_M32R_SOCKET_H */
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index a14baa218c76..2e9ee8c55a10 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -98,4 +98,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
> index 6aa3ce1854aa..f3492e8c9f70 100644
> --- a/arch/mn10300/include/uapi/asm/socket.h
> +++ b/arch/mn10300/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index fe35ceacf0e7..7984a1cab3da 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -79,4 +79,6 @@
>
>  #define SO_BPF_EXTENSIONS      0x4029
>
> +#define SO_INCOMING_CPU                0x402A
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
> index a9c3e2e18c05..3474e4ef166d 100644
> --- a/arch/powerpc/include/uapi/asm/socket.h
> +++ b/arch/powerpc/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_POWERPC_SOCKET_H */
> diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
> index e031332096d7..8457636c33e1 100644
> --- a/arch/s390/include/uapi/asm/socket.h
> +++ b/arch/s390/include/uapi/asm/socket.h
> @@ -86,4 +86,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 54d9608681b6..4a8003a94163 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -76,6 +76,8 @@
>
>  #define SO_BPF_EXTENSIONS      0x0032
>
> +#define SO_INCOMING_CPU                0x0033
> +
>  /* Security levels - as per NRL IPv6 - don't actually do anything */
>  #define SO_SECURITY_AUTHENTICATION             0x5001
>  #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
> diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
> index 39acec0cf0b1..c46f6a696849 100644
> --- a/arch/xtensa/include/uapi/asm/socket.h
> +++ b/arch/xtensa/include/uapi/asm/socket.h
> @@ -91,4 +91,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _XTENSA_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7db3db112baa..ff2c3f11fb8f 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -273,6 +273,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> +  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_protinfo: private area, net family specific, when not using slab
> @@ -350,6 +351,12 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> +       u16                     sk_incoming_cpu;
> +       /* 16bit hole
> +        * Warned : sk_incoming_cpu can be set from softirq,
> +        * Do not use this hole without fully understanding possible issues.
> +        */
> +
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>         unsigned int            sk_napi_id;
> @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>         return sk->sk_backlog_rcv(sk, skb);
>  }
>
> +static inline void sk_incoming_cpu_update(struct sock *sk)
> +{
> +       sk->sk_incoming_cpu = raw_smp_processor_id();
> +}
> +
>  static inline void sock_rps_record_flow_hash(__u32 hash)
>  {
>  #ifdef CONFIG_RPS
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index ea0796bdcf88..f541ccefd4ac 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -82,4 +82,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 15e0c67b1069..14998b161035 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sk->sk_max_pacing_rate;
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               v.val = sk->sk_incoming_cpu;
> +               break;
> +
>         default:
>                 return -ENOPROTOOPT;
>         }
> @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>
>                 newsk->sk_err      = 0;
>                 newsk->sk_priority = 0;
> +               newsk->sk_incoming_cpu = raw_smp_processor_id();
>                 /*
>                  * Before updating sk_refcnt, we must commit prior changes to memory
>                  * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 8893598a4124..2c6a955fd5c3 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1663,6 +1663,7 @@ process:
>         if (sk_filter(sk, skb))
>                 goto discard_and_relse;
>
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index cd0db5471bb5..52235ca1f352 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (inet_sk(sk)->inet_daddr) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index fd8e50b380e7..1985b4933a6b 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1456,6 +1456,7 @@ process:
>         if (sk_filter(sk, skb))
>                 goto discard_and_relse;
>
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index f6ba535b6feb..2c7790c9ac65 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
> index d49dc2ed30ad..ce469d648ffb 100644
> --- a/net/sctp/ulpqueue.c
> +++ b/net/sctp/ulpqueue.c
> @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
>         if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
>                 goto out_free;
>
> -       if (!sctp_ulpevent_is_notification(event))
> +       if (!sctp_ulpevent_is_notification(event)) {
>                 sk_mark_napi_id(sk, skb);
> -
> +               sk_incoming_cpu_update(sk);
> +       }
>         /* Check if the user wishes to receive this event.  */
>         if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
>                 goto out_free;
> --
> 2.1.0.rc2.206.gedb03e5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

* RE: [PATCH] ARM: cacheflush: disallow pending signals during cacheflush
From: Chanho Min @ 2014-11-14  8:40 UTC (permalink / raw)
  To: 'Will Deacon'
  Cc: 'Jon Medhurst', 'HyoJun Im',
	'Russell King', 'Jongsung Kim', peter.maydell,
	'Taras Kondratiuk', linux-kernel, mtk.manpages,
	'Gunho Lee', 'Olof Johansson', linux-api,
	linux-man, linux-arm-kernel
In-Reply-To: <20141113112633.GE13350@arm.com>

> -----Original Message-----
> From: Will Deacon [mailto:will.deacon@arm.com]

> Whilst I don't think this is the correct solution, I agree that there's
> a potential issue here. We could change the restart return value to
> -ERESTARTNOINTR instead, but I can imagine something like a periodic
> SIGALRM which could prevent a large cacheflush from ever completing.
> Do we actually care about making forward progress in such a scenario?
It's not complete solution. But, I don't think this is incorrect solution
as well. Potential issue could be more serious than improvement of signal
responsiveness.

> 
> It is interesting to note that this change has been in mainline since
> May last year without any reported issues. That could be down to a number
> of reasons:
> 
>   (1) People are using old kernels on ARM
> 
>   (2) Code doesn't check the return value from the cacheflush system call,
>       because it historically always returned 0
> 
>   (3) People are getting lucky with timing, as this is likely difficult
>       to hit
> 
> Related to (2) is that a `man cacheflush' invocation returns something
> about the MIPs system call, that doesn't match what we do for ARM. The
> (relatively recent) history of the system call on ARM is:
> 
>   < v3.5 [*]
> 
>     - Always returns 0
>     - Restricts virtual address range to a single VMA
>     - Page-aligns the region limits (over flushing for smaller ranges)
>     - Terminates on the first fault
>     - Flags are ignored but must "ALWAYS be passed as ZERO"
> 
>   v3.5 - v3.12
>     - Returns -EINVAL if flags is set or if end < start
>     - Returns -EINVAL if we couldn't find a vma
>     - Terminates on the first fault and returns -EFAULT
> 
>   v3.12 - HEAD
> 
>     - No longer page-aligns region
>     - Removes VMA checking as this had a deadlock bug with mmap_sem
>       and we could handle faults by this point anyway
>     - Returns -EINVAL if !access_ok for the range
>     - Splits the range into PAGE_SIZE chunks, checking for reschedule
>       and pending signals to avoid DoSing the system (the hardware can
>       only clean by cacheline). This is where the -ERESTART_RESTARTBLOCK
>       behaviour came in, potentially returning -EINTR to userspace.
> 
> This leaves me with the following questions:
> 
>   - Has this change been shown to break anything in practice?
In practice, node.js (Currently, It doesn't check -EINTR of cacheflush)
crashes occasionally and non-reproducibly at some point some while after
the cacheflush call. At that time, strace tells cacheflush returns -EINTR.

>   - Can we change the internal return value to -ERESTARTNOINTR?
In worst case, I can imagine that periodic signal interrupts cacheflush
and it repeats restart of syscall from start of address with unlucky timing.

>   - What do we do about kernels that *do* return -EINTR? (>=3.12?)
>   - Can we get a manpage put together to describe this mess?
> 
> Cheers,
> 
> Will
> 
> [*] rmk may have some more ancient history kicking around, if you like!
> 
> > diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
> > index abd2fc0..275e086 100644
> > --- a/arch/arm/kernel/traps.c
> > +++ b/arch/arm/kernel/traps.c
> > @@ -521,25 +521,6 @@ __do_cache_op(unsigned long start, unsigned long end)
> >  	do {
> >  		unsigned long chunk = min(PAGE_SIZE, end - start);
> >
> > -		if (signal_pending(current)) {
> > -			struct thread_info *ti = current_thread_info();
> > -
> > -			ti->restart_block = (struct restart_block) {
> > -				.fn	= do_cache_op_restart,
> > -			};
> > -
> > -			ti->arm_restart_block = (struct arm_restart_block) {
> > -				{
> > -					.cache = {
> > -						.start	= start,
> > -						.end	= end,
> > -					},
> > -				},
> > -			};
> > -
> > -			return -ERESTART_RESTARTBLOCK;
> > -		}
> > -
> >  		ret = flush_cache_user_range(start, start + chunk);
> >  		if (ret)
> >  			return ret;
> > --
> > 1.7.9.5
> >
> >

^ permalink raw reply

* Re: kdbus: add selftests
From: Daniel Mack @ 2014-11-14  8:56 UTC (permalink / raw)
  To: Michael Ellerman, Greg Kroah-Hartman
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	john.stultz-QSEj5FYQhm4dnm+yROfE0A, arnd-r2nGTMty4D4,
	tj-DgEjT+Ai2ygdnm+yROfE0A, marcel-kz+m5ild9QBg9hUCZPvPmw,
	desrt-0xnayjDhYQY, hadess-0MeiytkfxGOsTnJN9+BGXg,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, tixxdz-Umm1ozX2/EEdnm+yROfE0A,
	simon.mcvittie-ZGY8ohtN/8pPYcu2f3hruQ,
	alban.crequy-ZGY8ohtN/8pPYcu2f3hruQ,
	javier.martinez-ZGY8ohtN/8pPYcu2f3hruQ, teg-B22kvLQNl6c
In-Reply-To: <1415936530.26378.2.camel@concordia>

On 11/14/2014 04:42 AM, Michael Ellerman wrote:
> On Wed, 2014-10-29 at 15:00 -0700, Greg Kroah-Hartman wrote:
>> From: Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
>>
>> This patch adds a quite extensive test suite for kdbus that checks
>> the most important code pathes in the driver. The idea is to extend
>> the test suite over time.
>>
>> Also, this code can serve as an example implementation to show how to
>> use the kernel API from userspace.
> 
> Great to see selftests included.
> 
> I needed this to get them building:

Thanks a lot for testing! I've added your hunks to the patch set now.


Daniel

^ permalink raw reply

* Re: [PATCH v2 1/1] truncate: generate fanotify and inotify events
From: Jan Kara @ 2014-11-14  9:01 UTC (permalink / raw)
  To: Heinrich Schuchardt
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54650D23.9040701-Mmb7MZpHnFY@public.gmane.org>

  Hello Heinrich,

On Thu 13-11-14 20:57:23, Heinrich Schuchardt wrote:
> could you, please, give me some feedback for
> https://lkml.org/lkml/2014/10/6/380
  Oh, I completely forgot about that one...

> Currently truncate() only generates an IN_MODIFY event.
> 
> Do you agree that it also should additionally create
> FAN_OPEN_PERM, FAN_OPEN, IN_OPEN, FAN_MODIFY, FAN_CLOSE_WRITE,
> IN_CLOSE_WRITE as well?
>
> IN_CLOSE_WRITE is used by editors to warn the user about updated files.
> 
> FAN_OPEN_PERM, FAN_MODIFY, and FAN_CLOSE_WRITE would be relevant for
> a malware scanner.
> 
> FAN_OPEN_PERM would be needed if the fanotify interface were used to
> build a hierarchical storage managers.
  I agree about FAN_MODIFY, that is a clear bug. OPEN / CLOSE events are
generated on open / close. Noone really guarantees you that to modify a
file you have to open it. So I agree they would make life simpler for
userspace but at this point I don't think we can change the user interface
in such way :(.

I can imagine that especially for content checkers the fact that file can
be truncated without FAN_OPEN_PERM is rather unpleasant (OTOH file can be
unlinked & replaced in a directory also without FAN_OPEN_PERM so it doesn't
seem like a completely new problem area either).

The cleanest way I see how we could deal with the situation is to add a new
type of event. Maybe something like IN_TRUNCATE, FAN_TRUNCATE,
FAN_TRUNCATE_PERM - and userspace application aware of the problem could
use these events instead of watching for IN_MODIFY / FAN_MODIFY which is
generated rather frequently. But this definitely needs more thought -
we have to carefully define when the events get generated so that they are
useful - is e.g. punching holes and similar fallocate tricks elligible as
well? What about ftruncate() where you have open file descriptor?

								Honza

-- 
Jan Kara <jack-AlSwsSmVLrQ@public.gmane.org>
SUSE Labs, CR

^ permalink raw reply

* [PATCH] [media] Add RGB444_1X12 and RGB565_1X16 media bus formats
From: Boris Brezillon @ 2014-11-14 10:36 UTC (permalink / raw)
  To: Mauro Carvalho Chehab, Hans Verkuil, Laurent Pinchart,
	linux-media-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-doc-u79uwXL29TY76Z2rM5mHXA, Boris Brezillon

Add RGB444_1X12 and RGB565_1X16 format definitions and update the
documentation.

Signed-off-by: Boris Brezillon <boris.brezillon-wi1+55ScJUtKEb57/3fJTNBPR1lH4CV8@public.gmane.org>
Acked-by: Mauro Carvalho Chehab <mchehab-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>
---
 Documentation/DocBook/media/v4l/subdev-formats.xml | 40 ++++++++++++++++++++++
 include/uapi/linux/media-bus-format.h              |  4 ++-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml
index 18730b9..8c396db 100644
--- a/Documentation/DocBook/media/v4l/subdev-formats.xml
+++ b/Documentation/DocBook/media/v4l/subdev-formats.xml
@@ -563,6 +563,46 @@
 	      <entry>b<subscript>1</subscript></entry>
 	      <entry>b<subscript>0</subscript></entry>
 	    </row>
+	    <row id="MEDIA-BUS-FMT-RGB444-1X12">
+	      <entry>MEDIA_BUS_FMT_RGB444_1X12</entry>
+	      <entry>0x100d</entry>
+	      <entry></entry>
+	      &dash-ent-20;
+	      <entry>r<subscript>3</subscript></entry>
+	      <entry>r<subscript>2</subscript></entry>
+	      <entry>r<subscript>1</subscript></entry>
+	      <entry>r<subscript>0</subscript></entry>
+	      <entry>g<subscript>3</subscript></entry>
+	      <entry>g<subscript>2</subscript></entry>
+	      <entry>g<subscript>1</subscript></entry>
+	      <entry>g<subscript>0</subscript></entry>
+	      <entry>b<subscript>3</subscript></entry>
+	      <entry>b<subscript>2</subscript></entry>
+	      <entry>b<subscript>1</subscript></entry>
+	      <entry>b<subscript>0</subscript></entry>
+	    </row>
+	    <row id="MEDIA-BUS-FMT-RGB565-1X16">
+	      <entry>MEDIA_BUS_FMT_RGB565_1X16</entry>
+	      <entry>0x100d</entry>
+	      <entry></entry>
+	      &dash-ent-16;
+	      <entry>r<subscript>4</subscript></entry>
+	      <entry>r<subscript>3</subscript></entry>
+	      <entry>r<subscript>2</subscript></entry>
+	      <entry>r<subscript>1</subscript></entry>
+	      <entry>r<subscript>0</subscript></entry>
+	      <entry>g<subscript>5</subscript></entry>
+	      <entry>g<subscript>4</subscript></entry>
+	      <entry>g<subscript>3</subscript></entry>
+	      <entry>g<subscript>2</subscript></entry>
+	      <entry>g<subscript>1</subscript></entry>
+	      <entry>g<subscript>0</subscript></entry>
+	      <entry>b<subscript>4</subscript></entry>
+	      <entry>b<subscript>3</subscript></entry>
+	      <entry>b<subscript>2</subscript></entry>
+	      <entry>b<subscript>1</subscript></entry>
+	      <entry>b<subscript>0</subscript></entry>
+	    </row>
 	  </tbody>
 	</tgroup>
       </table>
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
index 23b4090..cc7b79e 100644
--- a/include/uapi/linux/media-bus-format.h
+++ b/include/uapi/linux/media-bus-format.h
@@ -33,7 +33,7 @@
 
 #define MEDIA_BUS_FMT_FIXED			0x0001
 
-/* RGB - next is	0x100e */
+/* RGB - next is	0x1010 */
 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE	0x1001
 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE	0x1002
 #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE	0x1003
@@ -47,6 +47,8 @@
 #define MEDIA_BUS_FMT_RGB888_2X12_BE		0x100b
 #define MEDIA_BUS_FMT_RGB888_2X12_LE		0x100c
 #define MEDIA_BUS_FMT_ARGB8888_1X32		0x100d
+#define MEDIA_BUS_FMT_RGB444_1X12		0x100e
+#define MEDIA_BUS_FMT_RGB565_1X16		0x100f
 
 /* YUV (including grey) - next is	0x2024 */
 #define MEDIA_BUS_FMT_Y8_1X8			0x2001
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v2 net-next 1/7] bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
From: Hannes Frederic Sowa @ 2014-11-14 12:11 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S. Miller, Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Eric Dumazet, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1415929010-9361-2-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
> either update existing map element or create a new one.
> Initially the plan was to add a new command to handle the case of
> 'create new element if it didn't exist', but 'flags' style looks
> cleaner and overall diff is much smaller (more code reused), so add 'flags'
> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
>  #define BPF_ANY	0 /* create new element or update existing */
>  #define BPF_NOEXIST	1 /* create new element if it didn't exist */
>  #define BPF_EXIST	2 /* update existing element */

Would a cmpxchg-alike function be handy here?

Bye,
Hannes

^ permalink raw reply

* Re: [PATCH] [media] Add RGB444_1X12 and RGB565_1X16 media bus formats
From: Sakari Ailus @ 2014-11-14 13:58 UTC (permalink / raw)
  To: Boris Brezillon
  Cc: Mauro Carvalho Chehab, Hans Verkuil, Laurent Pinchart,
	linux-media-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-doc-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1415961360-14898-1-git-send-email-boris.brezillon-wi1+55ScJUtKEb57/3fJTNBPR1lH4CV8@public.gmane.org>

Hi Boris,

On Fri, Nov 14, 2014 at 11:36:00AM +0100, Boris Brezillon wrote:
> Add RGB444_1X12 and RGB565_1X16 format definitions and update the
> documentation.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon-wi1+55ScJUtKEb57/3fJTNBPR1lH4CV8@public.gmane.org>
> Acked-by: Mauro Carvalho Chehab <mchehab-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>
> ---
>  Documentation/DocBook/media/v4l/subdev-formats.xml | 40 ++++++++++++++++++++++
>  include/uapi/linux/media-bus-format.h              |  4 ++-
>  2 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml
> index 18730b9..8c396db 100644
> --- a/Documentation/DocBook/media/v4l/subdev-formats.xml
> +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml
> @@ -563,6 +563,46 @@
>  	      <entry>b<subscript>1</subscript></entry>
>  	      <entry>b<subscript>0</subscript></entry>
>  	    </row>
> +	    <row id="MEDIA-BUS-FMT-RGB444-1X12">
> +	      <entry>MEDIA_BUS_FMT_RGB444_1X12</entry>
> +	      <entry>0x100d</entry>
> +	      <entry></entry>
> +	      &dash-ent-20;
> +	      <entry>r<subscript>3</subscript></entry>
> +	      <entry>r<subscript>2</subscript></entry>
> +	      <entry>r<subscript>1</subscript></entry>
> +	      <entry>r<subscript>0</subscript></entry>
> +	      <entry>g<subscript>3</subscript></entry>
> +	      <entry>g<subscript>2</subscript></entry>
> +	      <entry>g<subscript>1</subscript></entry>
> +	      <entry>g<subscript>0</subscript></entry>
> +	      <entry>b<subscript>3</subscript></entry>
> +	      <entry>b<subscript>2</subscript></entry>
> +	      <entry>b<subscript>1</subscript></entry>
> +	      <entry>b<subscript>0</subscript></entry>
> +	    </row>
> +	    <row id="MEDIA-BUS-FMT-RGB565-1X16">
> +	      <entry>MEDIA_BUS_FMT_RGB565_1X16</entry>
> +	      <entry>0x100d</entry>
> +	      <entry></entry>
> +	      &dash-ent-16;
> +	      <entry>r<subscript>4</subscript></entry>
> +	      <entry>r<subscript>3</subscript></entry>
> +	      <entry>r<subscript>2</subscript></entry>
> +	      <entry>r<subscript>1</subscript></entry>
> +	      <entry>r<subscript>0</subscript></entry>
> +	      <entry>g<subscript>5</subscript></entry>
> +	      <entry>g<subscript>4</subscript></entry>
> +	      <entry>g<subscript>3</subscript></entry>
> +	      <entry>g<subscript>2</subscript></entry>
> +	      <entry>g<subscript>1</subscript></entry>
> +	      <entry>g<subscript>0</subscript></entry>
> +	      <entry>b<subscript>4</subscript></entry>
> +	      <entry>b<subscript>3</subscript></entry>
> +	      <entry>b<subscript>2</subscript></entry>
> +	      <entry>b<subscript>1</subscript></entry>
> +	      <entry>b<subscript>0</subscript></entry>
> +	    </row>
>  	  </tbody>
>  	</tgroup>
>        </table>
> diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
> index 23b4090..cc7b79e 100644
> --- a/include/uapi/linux/media-bus-format.h
> +++ b/include/uapi/linux/media-bus-format.h
> @@ -33,7 +33,7 @@
>  
>  #define MEDIA_BUS_FMT_FIXED			0x0001
>  
> -/* RGB - next is	0x100e */
> +/* RGB - next is	0x1010 */
>  #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE	0x1001
>  #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE	0x1002
>  #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE	0x1003
> @@ -47,6 +47,8 @@
>  #define MEDIA_BUS_FMT_RGB888_2X12_BE		0x100b
>  #define MEDIA_BUS_FMT_RGB888_2X12_LE		0x100c
>  #define MEDIA_BUS_FMT_ARGB8888_1X32		0x100d
> +#define MEDIA_BUS_FMT_RGB444_1X12		0x100e
> +#define MEDIA_BUS_FMT_RGB565_1X16		0x100f

I'd arrange these according to BPP and bits per sample, both in the header
and documentation.

>  /* YUV (including grey) - next is	0x2024 */
>  #define MEDIA_BUS_FMT_Y8_1X8			0x2001

-- 
Regards,

Sakari Ailus
e-mail: sakari.ailus-X3B1VOXEql0@public.gmane.org	XMPP: sailus-PCDdDYkjdNMDXYZnReoRVg@public.gmane.org

^ permalink raw reply

* [RFC] Possible new execveat(2) Linux syscall
From: David Drysdale @ 2014-11-14 14:54 UTC (permalink / raw)
  To: libc-alpha-9JcytcrH/bA+uJoB2kUjGw
  Cc: Andrew Morton, Christoph Hellwig, Rich Felker, Linux API,
	Andy Lutomirski

Hi,

Over at the LKML[1] we've been discussing a possible new syscall, execveat(2),
and it would be good to hear a glibc perspective about it (and whether there
are any interface changes that would make it easier to use from userspace).

The syscall prototype is:
  int execveat(int fd, const char *pathname,
                      char *const argv[],  char *const envp[],
                      int flags); /* AT_EMPTY_PATH, AT_SYMLINK_NOFOLLOW */
and it works similarly to execve(2) except:
 - the executable to run is identified by the combination of fd+pathname, like
   other *at(2) syscalls
 - there's an extra flags field to control behaviour.
(I've attached a text version of the suggested man page below)

One particular benefit of this is that it allows an fexecve(3) implementation
that doesn't rely on /proc being accessible, which is useful for sandboxed
applications.  (However, that does only work for non-interpreted programs:
the name passed to a script interpreter is of the form "/dev/fd/<fd>/<path>"
or "/dev/fd/<fd>", so the executed interpreter will normally still need /proc
access to load the script file).

How does this sound from a glibc perspective?

Thanks,
David

[1] https://lkml.org/lkml/2014/11/7/512, with earlier discussions at
https://lkml.org/lkml/2014/11/6/469, https://lkml.org/lkml/2014/10/22/275
and https://lkml.org/lkml/2014/10/17/428

----

EXECVEAT(2)              Linux Programmer's Manual             EXECVEAT(2)

NAME
       execveat - execute program relative to a directory file descriptor

SYNOPSIS
       #include <unistd.h>

       int execveat(int fd, const char *pathname,
                    char *const argv[],  char *const envp[],
                    int flags);

DESCRIPTION
       The  execveat()  system call executes the program pointed to by the
       combination of fd and pathname.  The execveat() system  call  oper‐
       ates  in  exactly the same way as execve(2), except for the differ‐
       ences described in this manual page.

       If the pathname given in pathname is relative, then  it  is  inter‐
       preted relative to the directory referred to by the file descriptor
       fd (rather than relative to the current working  directory  of  the
       calling process, as is done by execve(2) for a relative pathname).

       If  pathname is relative and fd is the special value AT_FDCWD, then
       pathname is interpreted relative to the current  working  directory
       of the calling process (like execve(2)).

       If pathname is absolute, then fd is ignored.

       If pathname is an empty string and the AT_EMPTY_PATH flag is speci‐
       fied, then the file descriptor fd specifies the  file  to  be  exe‐
       cuted.

       flags can either be 0, or include the following flags:

       AT_EMPTY_PATH
              If pathname is an empty string, operate on the file referred
              to by fd (which may have been  obtained  using  the  open(2)
              O_PATH flag).

       AT_SYMLINK_NOFOLLOW
              If  the  file  identified by fd and a non-NULL pathname is a
              symbolic link, then the call fails with the error EINVAL.

RETURN VALUE
       On success, execveat() does not return. On error  -1  is  returned,
       and errno is set appropriately.

ERRORS
       The  same  errors  that  occur  for  execve(2)  can  also occur for
       execveat().   The  following  additional  errors  can   occur   for
       execveat():

       EBADF  fd is not a valid file descriptor.

       ENOENT The  program  identified by fd and pathname requires the use
              of an interpreter program (such as a  script  starting  with
              "#!")  but  the  file  descriptor  fd  was  opened  with the
              O_CLOEXEC flag and so the program file  is  inaccessible  to
              the launched interpreter.

       EINVAL Invalid flag specified in flags.

       ENOTDIR
              pathname  is  relative and fd is a file descriptor referring
              to a file other than a directory.

VERSIONS
       execveat() was added to Linux in kernel 3.???.

NOTES
       In addition to the reasons explained in openat(2),  the  execveat()
       system call is also needed to allow fexecve(3) to be implemented on
       systems that do not have the /proc filesystem mounted.

SEE ALSO
       execve(2), fexecve(3)

Linux                           2014-04-02                     EXECVEAT(2)

^ permalink raw reply

* Re: [PATCH] [media] Add RGB444_1X12 and RGB565_1X16 media bus formats
From: Boris Brezillon @ 2014-11-14 15:04 UTC (permalink / raw)
  To: Sakari Ailus
  Cc: Mauro Carvalho Chehab, Hans Verkuil, Laurent Pinchart,
	linux-media, linux-api, linux-kernel, linux-doc
In-Reply-To: <20141114135831.GC8907@valkosipuli.retiisi.org.uk>

Hi Sakari,

On Fri, 14 Nov 2014 15:58:31 +0200
Sakari Ailus <sakari.ailus@iki.fi> wrote:

> Hi Boris,
> 
> On Fri, Nov 14, 2014 at 11:36:00AM +0100, Boris Brezillon wrote:
> > Add RGB444_1X12 and RGB565_1X16 format definitions and update the
> > documentation.
> > 
> > Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
> > Acked-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
> > ---
> >  Documentation/DocBook/media/v4l/subdev-formats.xml | 40 ++++++++++++++++++++++
> >  include/uapi/linux/media-bus-format.h              |  4 ++-
> >  2 files changed, 43 insertions(+), 1 deletion(-)
> > 
> > diff --git a/Documentation/DocBook/media/v4l/subdev-formats.xml b/Documentation/DocBook/media/v4l/subdev-formats.xml
> > index 18730b9..8c396db 100644
> > --- a/Documentation/DocBook/media/v4l/subdev-formats.xml
> > +++ b/Documentation/DocBook/media/v4l/subdev-formats.xml
> > @@ -563,6 +563,46 @@
> >  	      <entry>b<subscript>1</subscript></entry>
> >  	      <entry>b<subscript>0</subscript></entry>
> >  	    </row>
> > +	    <row id="MEDIA-BUS-FMT-RGB444-1X12">
> > +	      <entry>MEDIA_BUS_FMT_RGB444_1X12</entry>
> > +	      <entry>0x100d</entry>
> > +	      <entry></entry>
> > +	      &dash-ent-20;
> > +	      <entry>r<subscript>3</subscript></entry>
> > +	      <entry>r<subscript>2</subscript></entry>
> > +	      <entry>r<subscript>1</subscript></entry>
> > +	      <entry>r<subscript>0</subscript></entry>
> > +	      <entry>g<subscript>3</subscript></entry>
> > +	      <entry>g<subscript>2</subscript></entry>
> > +	      <entry>g<subscript>1</subscript></entry>
> > +	      <entry>g<subscript>0</subscript></entry>
> > +	      <entry>b<subscript>3</subscript></entry>
> > +	      <entry>b<subscript>2</subscript></entry>
> > +	      <entry>b<subscript>1</subscript></entry>
> > +	      <entry>b<subscript>0</subscript></entry>
> > +	    </row>
> > +	    <row id="MEDIA-BUS-FMT-RGB565-1X16">
> > +	      <entry>MEDIA_BUS_FMT_RGB565_1X16</entry>
> > +	      <entry>0x100d</entry>
> > +	      <entry></entry>
> > +	      &dash-ent-16;
> > +	      <entry>r<subscript>4</subscript></entry>
> > +	      <entry>r<subscript>3</subscript></entry>
> > +	      <entry>r<subscript>2</subscript></entry>
> > +	      <entry>r<subscript>1</subscript></entry>
> > +	      <entry>r<subscript>0</subscript></entry>
> > +	      <entry>g<subscript>5</subscript></entry>
> > +	      <entry>g<subscript>4</subscript></entry>
> > +	      <entry>g<subscript>3</subscript></entry>
> > +	      <entry>g<subscript>2</subscript></entry>
> > +	      <entry>g<subscript>1</subscript></entry>
> > +	      <entry>g<subscript>0</subscript></entry>
> > +	      <entry>b<subscript>4</subscript></entry>
> > +	      <entry>b<subscript>3</subscript></entry>
> > +	      <entry>b<subscript>2</subscript></entry>
> > +	      <entry>b<subscript>1</subscript></entry>
> > +	      <entry>b<subscript>0</subscript></entry>
> > +	    </row>
> >  	  </tbody>
> >  	</tgroup>
> >        </table>
> > diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
> > index 23b4090..cc7b79e 100644
> > --- a/include/uapi/linux/media-bus-format.h
> > +++ b/include/uapi/linux/media-bus-format.h
> > @@ -33,7 +33,7 @@
> >  
> >  #define MEDIA_BUS_FMT_FIXED			0x0001
> >  
> > -/* RGB - next is	0x100e */
> > +/* RGB - next is	0x1010 */
> >  #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE	0x1001
> >  #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE	0x1002
> >  #define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE	0x1003
> > @@ -47,6 +47,8 @@
> >  #define MEDIA_BUS_FMT_RGB888_2X12_BE		0x100b
> >  #define MEDIA_BUS_FMT_RGB888_2X12_LE		0x100c
> >  #define MEDIA_BUS_FMT_ARGB8888_1X32		0x100d
> > +#define MEDIA_BUS_FMT_RGB444_1X12		0x100e
> > +#define MEDIA_BUS_FMT_RGB565_1X16		0x100f
> 
> I'd arrange these according to BPP and bits per sample, both in the header
> and documentation.

I cannot keep both macro values and BPP/bits per sample in incrementing
order. Are you sure you prefer to order macros in BPP/bits per sample
order ?

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply

* Re: [PATCH v2 net-next 1/7] bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
From: Alexei Starovoitov @ 2014-11-14 15:33 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: David S. Miller, Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Eric Dumazet, Linux API, Network Development, LKML
In-Reply-To: <1415967071.15154.9.camel@localhost>

On Fri, Nov 14, 2014 at 4:11 AM, Hannes Frederic Sowa
<hannes-tFNcAqjVMyqKXQKiL6tip0B+6BGkLq7r@public.gmane.org> wrote:
> On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
>> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
>> either update existing map element or create a new one.
>> Initially the plan was to add a new command to handle the case of
>> 'create new element if it didn't exist', but 'flags' style looks
>> cleaner and overall diff is much smaller (more code reused), so add 'flags'
>> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
>>  #define BPF_ANY      0 /* create new element or update existing */
>>  #define BPF_NOEXIST  1 /* create new element if it didn't exist */
>>  #define BPF_EXIST    2 /* update existing element */
>
> Would a cmpxchg-alike function be handy here?

you mean cmpxchg command in addition to
update() command ?
May be... it will have an extra 'value' argument
(key, old_value, new_value)
I don't have a use case for it yet though.

^ permalink raw reply

* Re: [PATCH v2 net-next 1/7] bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
From: Hannes Frederic Sowa @ 2014-11-14 16:06 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S. Miller, Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Eric Dumazet, Linux API, Network Development, LKML
In-Reply-To: <CAMEtUux2vuN3iNaAsxybyFwzfMRVe1+BOW-JvnvYYh+=-vpELw@mail.gmail.com>

On Fr, 2014-11-14 at 07:33 -0800, Alexei Starovoitov wrote:
> On Fri, Nov 14, 2014 at 4:11 AM, Hannes Frederic Sowa
> <hannes@stressinduktion.org> wrote:
> > On Do, 2014-11-13 at 17:36 -0800, Alexei Starovoitov wrote:
> >> the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
> >> either update existing map element or create a new one.
> >> Initially the plan was to add a new command to handle the case of
> >> 'create new element if it didn't exist', but 'flags' style looks
> >> cleaner and overall diff is much smaller (more code reused), so add 'flags'
> >> attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
> >>  #define BPF_ANY      0 /* create new element or update existing */
> >>  #define BPF_NOEXIST  1 /* create new element if it didn't exist */
> >>  #define BPF_EXIST    2 /* update existing element */
> >
> > Would a cmpxchg-alike function be handy here?
> 
> you mean cmpxchg command in addition to
> update() command ?
> May be... it will have an extra 'value' argument
> (key, old_value, new_value)
> I don't have a use case for it yet though.

I don't neither. ;)

I just wanted to bring this up before user space api might get public
and the additional argument might make problems.

Bye,
Hannes

^ permalink raw reply

* [PATCHv8 0/4] syscalls,x86,sparc: Add execveat() system call
From: David Drysdale @ 2014-11-14 16:22 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, David Miller
  Cc: Oleg Nesterov, Michael Kerrisk, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	sparclinux-u79uwXL29TY76Z2rM5mHXA, David Drysdale

Sparc folks: the kbuild robots complained about sparc missing this
syscall, so I've speculatively included an extra patch in this
version for that.

However, I've not run it (only cross-compiled it), so it may be best
to skip the [3/4] patch and let the experts deal with other
architectures later -- I'm not sure what's the best practice for adding
a new syscall.

.........

This patch set adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).

The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc
filesystem, at least for executables (rather than scripts).  The
current glibc version of fexecve(3) is implemented via /proc, which
causes problems in sandboxed or otherwise restricted environments.

Given the desire for a /proc-free fexecve() implementation, HPA
suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2)
syscall would be an appropriate generalization.

Also, having a new syscall means that it can take a flags argument
without back-compatibility concerns.  The current implementation just
defines the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other
flags could be added in future -- for example, flags for new namespaces
(as suggested at https://lkml.org/lkml/2006/7/11/474).

Related history:
 - https://lkml.org/lkml/2006/12/27/123 is an example of someone
   realizing that fexecve() is likely to fail in a chroot environment.
 - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
   documenting the /proc requirement of fexecve(3) in its manpage, to
   "prevent other people from wasting their time".
 - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
   problem where a process that did setuid() could not fexecve()
   because it no longer had access to /proc/self/fd; this has since
   been fixed.


Changes since v7:
 - Speculatively wire up sparc version of syscall (untested)
 - Fix leak of pathbuf in mainline arm [Oleg Nesterov]
 - Add rcu_dereference_raw() on fdt access [sparse kbuild robot]
 - Realigned comment [Andrew Morton]
 - Merged up to v3.18-rc4

Changes since v6:
 - Remove special case for O_PATH file descriptors [Andy Lutomirski]
 - Use kasprintf rather than error-prone arithmetic [Kees Cook]
 - Add test for long name [Kees Cook]
 - Add test for non-executable O_PATH fd [Andy Lutomirski]

Changes since v5:
 - Set new flag in bprm->interp_flags for O_CLOEXEC fds, so that binfmts
   that invoke an interpreter fail the exec (as they will not be able
   to access the invoked file). [Andy Lutomirski]
 - Don't truncate long paths. [Andy Lutomirski]
 - Commonize code to open the executed file. [Eric W. Biederman]
 - Mark O_PATH file descriptors so they cannot be fexecve()ed.
 - Make self-test more helpful, and add additional cases:
     - file offset non-zero
     - binary file without execute bit
     - O_CLOEXEC fds

Changes since v4, suggested by Eric W. Biederman:
 - Use empty filename with AT_EMPTY_PATH flag rather than NULL
   pathname to request fexecve-like behaviour.
 - Build pathname as "/dev/fd/<fd>/<filename>" (or "/dev/fd/<fd>")
   rather than using d_path().
 - Patch against v3.17 (bfe01a5ba249)

Changes since Meredydd's v3 patch:
 - Added a selftest.
 - Added a man page.
 - Left open_exec() signature untouched to reduce patch impact
   elsewhere (as suggested by Al Viro).
 - Filled in bprm->filename with d_path() into a buffer, to avoid use
   of potentially-ephemeral dentry->d_name.
 - Patch against v3.14 (455c6fdbd21916).


David Drysdale (3):
  syscalls,x86: implement execveat() system call
  syscalls,x86: add selftest for execveat(2)
  sparc: Hook up execveat system call.

 arch/sparc/include/uapi/asm/unistd.h    |   3 +-
 arch/sparc/kernel/systbls_32.S          |   1 +
 arch/sparc/kernel/systbls_64.S          |   2 +
 arch/x86/ia32/audit.c                   |   1 +
 arch/x86/ia32/ia32entry.S               |   1 +
 arch/x86/kernel/audit_64.c              |   1 +
 arch/x86/kernel/entry_64.S              |  28 +++
 arch/x86/syscalls/syscall_32.tbl        |   1 +
 arch/x86/syscalls/syscall_64.tbl        |   2 +
 arch/x86/um/sys_call_table_64.c         |   1 +
 fs/binfmt_em86.c                        |   4 +
 fs/binfmt_misc.c                        |   4 +
 fs/binfmt_script.c                      |  10 +
 fs/exec.c                               | 113 +++++++--
 fs/namei.c                              |   2 +-
 include/linux/binfmts.h                 |   4 +
 include/linux/compat.h                  |   3 +
 include/linux/fs.h                      |   1 +
 include/linux/sched.h                   |   4 +
 include/linux/syscalls.h                |   5 +
 include/uapi/asm-generic/unistd.h       |   4 +-
 kernel/sys_ni.c                         |   3 +
 lib/audit.c                             |   3 +
 tools/testing/selftests/Makefile        |   1 +
 tools/testing/selftests/exec/.gitignore |   9 +
 tools/testing/selftests/exec/Makefile   |  25 ++
 tools/testing/selftests/exec/execveat.c | 397 ++++++++++++++++++++++++++++++++
 27 files changed, 617 insertions(+), 16 deletions(-)
 create mode 100644 tools/testing/selftests/exec/.gitignore
 create mode 100644 tools/testing/selftests/exec/Makefile
 create mode 100644 tools/testing/selftests/exec/execveat.c

--
2.1.0.rc2.206.gedb03e5

^ permalink raw reply

* [PATCHv8 1/4] syscalls,x86: implement execveat() system call
From: David Drysdale @ 2014-11-14 16:23 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, David Miller
  Cc: Oleg Nesterov, Michael Kerrisk, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	sparclinux-u79uwXL29TY76Z2rM5mHXA, David Drysdale
In-Reply-To: <1415982183-20525-1-git-send-email-drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Add a new execveat(2) system call. execveat() is to execve() as
openat() is to open(): it takes a file descriptor that refers to a
directory, and resolves the filename relative to that.

In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in
other UNIXen, but in Linux glibc it depends on opening
"/proc/self/fd/<fd>" (and so relies on /proc being mounted).

The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found.  This does however mean that
execution of a script in a /proc-less environment won't work; also,
script execution via an O_CLOEXEC file descriptor fails (as the file
will not be accessible after exec).

Only x86-64, i386 and x32 ABIs are supported in this patch.

Based on patches by Meredydd Luff <meredydd-zPN50pYk8eUaUu29zAJCuw@public.gmane.org>

Signed-off-by: David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 arch/x86/ia32/audit.c             |   1 +
 arch/x86/ia32/ia32entry.S         |   1 +
 arch/x86/kernel/audit_64.c        |   1 +
 arch/x86/kernel/entry_64.S        |  28 ++++++++++
 arch/x86/syscalls/syscall_32.tbl  |   1 +
 arch/x86/syscalls/syscall_64.tbl  |   2 +
 arch/x86/um/sys_call_table_64.c   |   1 +
 fs/binfmt_em86.c                  |   4 ++
 fs/binfmt_misc.c                  |   4 ++
 fs/binfmt_script.c                |  10 ++++
 fs/exec.c                         | 113 +++++++++++++++++++++++++++++++++-----
 fs/namei.c                        |   2 +-
 include/linux/binfmts.h           |   4 ++
 include/linux/compat.h            |   3 +
 include/linux/fs.h                |   1 +
 include/linux/sched.h             |   4 ++
 include/linux/syscalls.h          |   5 ++
 include/uapi/asm-generic/unistd.h |   4 +-
 kernel/sys_ni.c                   |   3 +
 lib/audit.c                       |   3 +
 20 files changed, 180 insertions(+), 15 deletions(-)

diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c
index 5d7b381da692..2eccc8932ae6 100644
--- a/arch/x86/ia32/audit.c
+++ b/arch/x86/ia32/audit.c
@@ -35,6 +35,7 @@ int ia32_classify_syscall(unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 1;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffe71228fc10..82e8a1d44658 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -480,6 +480,7 @@ GLOBAL(\label)
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
 	PTREGSCALL stub32_execve, compat_sys_execve
+	PTREGSCALL stub32_execveat, compat_sys_execveat
 	PTREGSCALL stub32_fork, sys_fork
 	PTREGSCALL stub32_vfork, sys_vfork

diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 06d3e5a14d9d..f3672508b249 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_openat:
 		return 3;
 	case __NR_execve:
+	case __NR_execveat:
 		return 5;
 	default:
 		return 0;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df088bb03fb3..40d893c60fcc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -652,6 +652,20 @@ ENTRY(stub_execve)
 	CFI_ENDPROC
 END(stub_execve)

+ENTRY(stub_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_execveat)
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -697,6 +711,20 @@ ENTRY(stub_x32_execve)
 	CFI_ENDPROC
 END(stub_x32_execve)

+ENTRY(stub_x32_execveat)
+	CFI_STARTPROC
+	addq $8, %rsp
+	PARTIAL_FRAME 0
+	SAVE_REST
+	FIXUP_TOP_OF_STACK %r11
+	call compat_sys_execveat
+	RESTORE_TOP_OF_STACK %r11
+	movq %rax,RAX(%rsp)
+	RESTORE_REST
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(stub_x32_execveat)
+
 #endif

 /*
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 9fe1b5d002f0..b3560ece1c9f 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -364,3 +364,4 @@
 355	i386	getrandom		sys_getrandom
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
+358	i386	execveat		sys_execveat			stub32_execveat
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 281150b539a2..8d656fbb57aa 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -328,6 +328,7 @@
 319	common	memfd_create		sys_memfd_create
 320	common	kexec_file_load		sys_kexec_file_load
 321	common	bpf			sys_bpf
+322	64	execveat		stub_execveat

 #
 # x32-specific system call numbers start at 512 to avoid cache impact
@@ -366,3 +367,4 @@
 542	x32	getsockopt		compat_sys_getsockopt
 543	x32	io_setup		compat_sys_io_setup
 544	x32	io_submit		compat_sys_io_submit
+545	x32	execveat		stub_x32_execveat
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723070ca..20c3649d0691 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -31,6 +31,7 @@
 #define stub_fork sys_fork
 #define stub_vfork sys_vfork
 #define stub_execve sys_execve
+#define stub_execveat sys_execveat
 #define stub_rt_sigreturn sys_rt_sigreturn

 #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index f37b08cea1f7..490538536cb4 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm)
 			return -ENOEXEC;
 	}

+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd8beb9657a2..85acb8c83a9a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -142,6 +142,10 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	if (!fmt)
 		goto _ret;

+	/* Need to be able to load the file after exec */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) {
 		retval = remove_arg_zero(bprm);
 		if (retval)
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 5027a3e14922..afdf4e3cafc2 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm)

 	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
 		return -ENOEXEC;
+
+	/*
+	 * If the script filename will be inaccessible after exec, typically
+	 * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give
+	 * up now (on the assumption that the interpreter will want to load
+	 * this file).
+	 */
+	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
+		return -ENOENT;
+
 	/*
 	 * This section does the #! interpretation.
 	 * Sorta complicated, but hopefully it will work.  -TYT
diff --git a/fs/exec.c b/fs/exec.c
index 7302b75a9820..6ce5cc47a201 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -747,18 +747,25 @@ EXPORT_SYMBOL(setup_arg_pages);

 #endif /* CONFIG_MMU */

-static struct file *do_open_exec(struct filename *name)
+static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
 	struct file *file;
 	int err;
-	static const struct open_flags open_exec_flags = {
+	struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC | MAY_OPEN,
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
 	};

-	file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
+	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return ERR_PTR(-EINVAL);
+	if (flags & AT_SYMLINK_NOFOLLOW)
+		open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
+	if (flags & AT_EMPTY_PATH)
+		open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
+
+	file = do_filp_open(fd, name, &open_exec_flags);
 	if (IS_ERR(file))
 		goto out;

@@ -769,12 +776,13 @@ static struct file *do_open_exec(struct filename *name)
 	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;

-	fsnotify_open(file);
-
 	err = deny_write_access(file);
 	if (err)
 		goto exit;

+	if (name->name[0] != '\0')
+		fsnotify_open(file);
+
 out:
 	return file;

@@ -786,7 +794,7 @@ exit:
 struct file *open_exec(const char *name)
 {
 	struct filename tmp = { .name = name };
-	return do_open_exec(&tmp);
+	return do_open_execat(AT_FDCWD, &tmp, 0);
 }
 EXPORT_SYMBOL(open_exec);

@@ -1427,10 +1435,12 @@ static int exec_binprm(struct linux_binprm *bprm)
 /*
  * sys_execve() executes a new program.
  */
-static int do_execve_common(struct filename *filename,
-				struct user_arg_ptr argv,
-				struct user_arg_ptr envp)
+static int do_execveat_common(int fd, struct filename *filename,
+			      struct user_arg_ptr argv,
+			      struct user_arg_ptr envp,
+			      int flags)
 {
+	char *pathbuf = NULL;
 	struct linux_binprm *bprm;
 	struct file *file;
 	struct files_struct *displaced;
@@ -1471,7 +1481,7 @@ static int do_execve_common(struct filename *filename,
 	check_unsafe_exec(bprm);
 	current->in_execve = 1;

-	file = do_open_exec(filename);
+	file = do_open_execat(fd, filename, flags);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_unmark;
@@ -1479,7 +1489,28 @@ static int do_execve_common(struct filename *filename,
 	sched_exec();

 	bprm->file = file;
-	bprm->filename = bprm->interp = filename->name;
+	if (fd == AT_FDCWD || filename->name[0] == '/') {
+		bprm->filename = filename->name;
+	} else {
+		if (filename->name[0] == '\0')
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd);
+		else
+			pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s",
+					    fd, filename->name);
+		if (!pathbuf) {
+			retval = -ENOMEM;
+			goto out_unmark;
+		}
+		/*
+		 * Record that a name derived from an O_CLOEXEC fd will be
+		 * inaccessible after exec. Relies on having exclusive access to
+		 * current->files (due to unshare_files above).
+		 */
+		if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
+			bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+		bprm->filename = pathbuf;
+	}
+	bprm->interp = bprm->filename;

 	retval = bprm_mm_init(bprm);
 	if (retval)
@@ -1520,6 +1551,7 @@ static int do_execve_common(struct filename *filename,
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
+	kfree(pathbuf);
 	putname(filename);
 	if (displaced)
 		put_files_struct(displaced);
@@ -1537,6 +1569,7 @@ out_unmark:

 out_free:
 	free_bprm(bprm);
+	kfree(pathbuf);

 out_files:
 	if (displaced)
@@ -1552,7 +1585,18 @@ int do_execve(struct filename *filename,
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+int do_execveat(int fd, struct filename *filename,
+		const char __user *const __user *__argv,
+		const char __user *const __user *__envp,
+		int flags)
+{
+	struct user_arg_ptr argv = { .ptr.native = __argv };
+	struct user_arg_ptr envp = { .ptr.native = __envp };
+
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }

 #ifdef CONFIG_COMPAT
@@ -1568,7 +1612,23 @@ static int compat_do_execve(struct filename *filename,
 		.is_compat = true,
 		.ptr.compat = __envp,
 	};
-	return do_execve_common(filename, argv, envp);
+	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
+}
+
+static int compat_do_execveat(int fd, struct filename *filename,
+			      const compat_uptr_t __user *__argv,
+			      const compat_uptr_t __user *__envp,
+			      int flags)
+{
+	struct user_arg_ptr argv = {
+		.is_compat = true,
+		.ptr.compat = __argv,
+	};
+	struct user_arg_ptr envp = {
+		.is_compat = true,
+		.ptr.compat = __envp,
+	};
+	return do_execveat_common(fd, filename, argv, envp, flags);
 }
 #endif

@@ -1608,6 +1668,20 @@ SYSCALL_DEFINE3(execve,
 {
 	return do_execve(getname(filename), argv, envp);
 }
+
+SYSCALL_DEFINE5(execveat,
+		int, fd, const char __user *, filename,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp,
+		int, flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return do_execveat(fd,
+			   getname_flags(filename, lookup_flags, NULL),
+			   argv, envp, flags);
+}
+
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
 	const compat_uptr_t __user *, argv,
@@ -1615,4 +1689,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
 {
 	return compat_do_execve(getname(filename), argv, envp);
 }
+
+COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
+		       const char __user *, filename,
+		       const compat_uptr_t __user *, argv,
+		       const compat_uptr_t __user *, envp,
+		       int,  flags)
+{
+	int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
+
+	return compat_do_execveat(fd,
+				  getname_flags(filename, lookup_flags, NULL),
+				  argv, envp, flags);
+}
 #endif
diff --git a/fs/namei.c b/fs/namei.c
index db5fe86319e6..ca814165d84c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -130,7 +130,7 @@ void final_putname(struct filename *name)

 #define EMBEDDED_NAME_MAX	(PATH_MAX - sizeof(struct filename))

-static struct filename *
+struct filename *
 getname_flags(const char __user *filename, int flags, int *empty)
 {
 	struct filename *result, *err;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 61f29e5ea840..576e4639ca60 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -53,6 +53,10 @@ struct linux_binprm {
 #define BINPRM_FLAGS_EXECFD_BIT 1
 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT)

+/* filename of the binary will be inaccessible after exec */
+#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2
+#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT)
+
 /* Function parameter for binfmt->coredump */
 struct coredump_params {
 	const siginfo_t *siginfo;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index e6494261eaff..7450ca2ac1fc 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int);

 asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv,
 		     const compat_uptr_t __user *envp);
+asmlinkage long compat_sys_execveat(int dfd, const char __user *filename,
+		     const compat_uptr_t __user *argv,
+		     const compat_uptr_t __user *envp, int flags);

 asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 		compat_ulong_t __user *outp, compat_ulong_t __user *exp,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e8a63c..133b60b1d4d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2072,6 +2072,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);

+extern struct filename *getname_flags(const char __user *, int, int *);
 extern struct filename *getname(const char __user *);
 extern struct filename *getname_kernel(const char *);

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5e344bbe63ec..344163d09efb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2441,6 +2441,10 @@ extern void do_group_exit(int);
 extern int do_execve(struct filename *,
 		     const char __user * const __user *,
 		     const char __user * const __user *);
+extern int do_execveat(int, struct filename *,
+		       const char __user * const __user *,
+		       const char __user * const __user *,
+		       int);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index bda9b81357cc..1ff5a4d09693 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 asmlinkage long sys_getrandom(char __user *buf, size_t count,
 			      unsigned int flags);
 asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+
+asmlinkage long sys_execveat(int dfd, const char __user *filename,
+			const char __user *const __user *argv,
+			const char __user *const __user *envp, int flags);
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 22749c134117..e016bd9b1a04 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom)
 __SYSCALL(__NR_memfd_create, sys_memfd_create)
 #define __NR_bpf 280
 __SYSCALL(__NR_bpf, sys_bpf)
+#define __NR_execveat 281
+__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)

 #undef __NR_syscalls
-#define __NR_syscalls 281
+#define __NR_syscalls 282

 /*
  * All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 02aa4185b17e..832fba6e2eb1 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -224,3 +224,6 @@ cond_syscall(sys_seccomp);

 /* access BPF programs and maps */
 cond_syscall(sys_bpf);
+
+/* execveat */
+cond_syscall(sys_execveat);
diff --git a/lib/audit.c b/lib/audit.c
index 1d726a22565b..b8fb5ee81e26 100644
--- a/lib/audit.c
+++ b/lib/audit.c
@@ -54,6 +54,9 @@ int audit_classify_syscall(int abi, unsigned syscall)
 	case __NR_socketcall:
 		return 4;
 #endif
+#ifdef __NR_execveat
+	case __NR_execveat:
+#endif
 	case __NR_execve:
 		return 5;
 	default:
--
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCHv8 2/4] syscalls,x86: add selftest for execveat(2)
From: David Drysdale @ 2014-11-14 16:23 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, David Miller
  Cc: Oleg Nesterov, Michael Kerrisk, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	sparclinux-u79uwXL29TY76Z2rM5mHXA, David Drysdale
In-Reply-To: <1415982183-20525-1-git-send-email-drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Signed-off-by: David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 tools/testing/selftests/Makefile        |   1 +
 tools/testing/selftests/exec/.gitignore |   9 +
 tools/testing/selftests/exec/Makefile   |  25 ++
 tools/testing/selftests/exec/execveat.c | 397 ++++++++++++++++++++++++++++++++
 4 files changed, 432 insertions(+)
 create mode 100644 tools/testing/selftests/exec/.gitignore
 create mode 100644 tools/testing/selftests/exec/Makefile
 create mode 100644 tools/testing/selftests/exec/execveat.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 45f145c6f843..c14893b501a9 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@ TARGETS += user
 TARGETS += sysctl
 TARGETS += firmware
 TARGETS += ftrace
+TARGETS += exec

 TARGETS_HOTPLUG = cpu-hotplug
 TARGETS_HOTPLUG += memory-hotplug
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
new file mode 100644
index 000000000000..64073e050c6a
--- /dev/null
+++ b/tools/testing/selftests/exec/.gitignore
@@ -0,0 +1,9 @@
+subdir*
+script*
+execveat
+execveat.symlink
+execveat.moved
+execveat.path.ephemeral
+execveat.ephemeral
+execveat.denatured
+xxxxxxxx*
\ No newline at end of file
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
new file mode 100644
index 000000000000..66dfc2ce1788
--- /dev/null
+++ b/tools/testing/selftests/exec/Makefile
@@ -0,0 +1,25 @@
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall
+BINARIES = execveat
+DEPS = execveat.symlink execveat.denatured script subdir
+all: $(BINARIES) $(DEPS)
+
+subdir:
+	mkdir -p $@
+script:
+	echo '#!/bin/sh' > $@
+	echo 'exit $$*' >> $@
+	chmod +x $@
+execveat.symlink: execveat
+	ln -s -f $< $@
+execveat.denatured: execveat
+	cp $< $@
+	chmod -x $@
+%: %.c
+	$(CC) $(CFLAGS) -o $@ $^
+
+run_tests: all
+	./execveat
+
+clean:
+	rm -rf $(BINARIES) $(DEPS) subdir.moved execveat.moved xxxxx*
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c
new file mode 100644
index 000000000000..33a5c06d95ca
--- /dev/null
+++ b/tools/testing/selftests/exec/execveat.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2014 Google, Inc.
+ *
+ * Licensed under the terms of the GNU GPL License version 2
+ *
+ * Selftests for execveat(2).
+ */
+
+#define _GNU_SOURCE  /* to get O_PATH, AT_EMPTY_PATH */
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static char longpath[2 * PATH_MAX] = "";
+static char *envp[] = { "IN_TEST=yes", NULL, NULL };
+static char *argv[] = { "execveat", "99", NULL };
+
+static int execveat_(int fd, const char *path, char **argv, char **envp,
+		     int flags)
+{
+#ifdef __NR_execveat
+	return syscall(__NR_execveat, fd, path, argv, envp, flags);
+#else
+	errno = -ENOSYS;
+	return -1;
+#endif
+}
+
+#define check_execveat_fail(fd, path, flags, errno)	\
+	_check_execveat_fail(fd, path, flags, errno, #errno)
+static int _check_execveat_fail(int fd, const char *path, int flags,
+				int expected_errno, const char *errno_str)
+{
+	int rc;
+
+	errno = 0;
+	printf("Check failure of execveat(%d, '%s', %d) with %s... ",
+		fd, path?:"(null)", flags, errno_str);
+	rc = execveat_(fd, path, argv, envp, flags);
+
+	if (rc > 0) {
+		printf("[FAIL] (unexpected success from execveat(2))\n");
+		return 1;
+	}
+	if (errno != expected_errno) {
+		printf("[FAIL] (expected errno %d (%s) not %d (%s)\n",
+			expected_errno, strerror(expected_errno),
+			errno, strerror(errno));
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+static int check_execveat_invoked_rc(int fd, const char *path, int flags,
+				     int expected_rc)
+{
+	int status;
+	int rc;
+	pid_t child;
+	int pathlen = path ? strlen(path) : 0;
+
+	if (pathlen > 40)
+		printf("Check success of execveat(%d, '%.20s...%s', %d)... ",
+			fd, path, (path + pathlen - 20), flags);
+	else
+		printf("Check success of execveat(%d, '%s', %d)... ",
+			fd, path?:"(null)", flags);
+	child = fork();
+	if (child < 0) {
+		printf("[FAIL] (fork() failed)\n");
+		return 1;
+	}
+	if (child == 0) {
+		/* Child: do execveat(). */
+		rc = execveat_(fd, path, argv, envp, flags);
+		printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n",
+			rc, errno, strerror(errno));
+		exit(1);  /* should not reach here */
+	}
+	/* Parent: wait for & check child's exit status. */
+	rc = waitpid(child, &status, 0);
+	if (rc != child) {
+		printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc);
+		return 1;
+	}
+	if (!WIFEXITED(status)) {
+		printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n",
+			child, status);
+		return 1;
+	}
+	if (WEXITSTATUS(status) != expected_rc) {
+		printf("[FAIL] (child %d exited with %d not %d)\n",
+			child, WEXITSTATUS(status), expected_rc);
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+static int check_execveat(int fd, const char *path, int flags)
+{
+	return check_execveat_invoked_rc(fd, path, flags, 99);
+}
+
+static char *concat(const char *left, const char *right)
+{
+	char *result = malloc(strlen(left) + strlen(right) + 1);
+
+	strcpy(result, left);
+	strcat(result, right);
+	return result;
+}
+
+static int open_or_die(const char *filename, int flags)
+{
+	int fd = open(filename, flags);
+
+	if (fd < 0) {
+		printf("Failed to open '%s'; "
+			"check prerequisites are available\n", filename);
+		exit(1);
+	}
+	return fd;
+}
+
+static void exe_cp(const char *src, const char *dest)
+{
+	int in_fd = open_or_die(src, O_RDONLY);
+	int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755);
+	struct stat info;
+
+	fstat(in_fd, &info);
+	sendfile(out_fd, in_fd, NULL, info.st_size);
+	close(in_fd);
+	close(out_fd);
+}
+
+#define XX_DIR_LEN 200
+static int check_execveat_pathmax(int dot_dfd, const char *src, int is_script)
+{
+	int fail = 0;
+	int ii, count, len;
+	char longname[XX_DIR_LEN + 1];
+	int fd;
+
+	if (*longpath == '\0') {
+		/* Create a filename close to PATH_MAX in length */
+		memset(longname, 'x', XX_DIR_LEN - 1);
+		longname[XX_DIR_LEN - 1] = '/';
+		longname[XX_DIR_LEN] = '\0';
+		count = (PATH_MAX - 3) / XX_DIR_LEN;
+		for (ii = 0; ii < count; ii++) {
+			strcat(longpath, longname);
+			mkdir(longpath, 0755);
+		}
+		len = (PATH_MAX - 3) - (count * XX_DIR_LEN);
+		if (len <= 0)
+			len = 1;
+		memset(longname, 'y', len);
+		longname[len] = '\0';
+		strcat(longpath, longname);
+	}
+	exe_cp(src, longpath);
+
+	/*
+	 * Execute as a pre-opened file descriptor, which works whether this is
+	 * a script or not (because the interpreter sees a filename like
+	 * "/dev/fd/20").
+	 */
+	fd = open(longpath, O_RDONLY);
+	if (fd > 0) {
+		printf("Invoke copy of '%s' via filename of length %lu:\n",
+			src, strlen(longpath));
+		fail += check_execveat(fd, "", AT_EMPTY_PATH);
+	} else {
+		printf("Failed to open length %lu filename, errno=%d (%s)\n",
+			strlen(longpath), errno, strerror(errno));
+		fail++;
+	}
+
+	/*
+	 * Execute as a long pathname relative to ".".  If this is a script,
+	 * the interpreter will launch but fail to open the script because its
+	 * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX.
+	 */
+	if (is_script)
+		fail += check_execveat_invoked_rc(dot_dfd, longpath, 0, 127);
+	else
+		fail += check_execveat(dot_dfd, longpath, 0);
+
+	return fail;
+}
+
+static int run_tests(void)
+{
+	int fail = 0;
+	char *fullname = realpath("execveat", NULL);
+	char *fullname_script = realpath("script", NULL);
+	char *fullname_symlink = concat(fullname, ".symlink");
+	int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY);
+	int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral",
+					       O_DIRECTORY|O_RDONLY);
+	int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY);
+	int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH);
+	int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+	int fd = open_or_die("execveat", O_RDONLY);
+	int fd_path = open_or_die("execveat", O_RDONLY|O_PATH);
+	int fd_symlink = open_or_die("execveat.symlink", O_RDONLY);
+	int fd_denatured = open_or_die("execveat.denatured", O_RDONLY);
+	int fd_denatured_path = open_or_die("execveat.denatured",
+					    O_RDONLY|O_PATH);
+	int fd_script = open_or_die("script", O_RDONLY);
+	int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY);
+	int fd_ephemeral_path = open_or_die("execveat.path.ephemeral",
+					    O_RDONLY|O_PATH);
+	int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY);
+	int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC);
+	int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC);
+
+	/* Change file position to confirm it doesn't affect anything */
+	lseek(fd, 10, SEEK_SET);
+
+	/* Normal executable file: */
+	/*   dfd + path */
+	fail += check_execveat(subdir_dfd, "../execveat", 0);
+	fail += check_execveat(dot_dfd, "execveat", 0);
+	fail += check_execveat(dot_dfd_path, "execveat", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname, 0);
+	/*   absolute path with nonsense dfd */
+	fail += check_execveat(99, fullname, 0);
+	/*   fd + no path */
+	fail += check_execveat(fd, "", AT_EMPTY_PATH);
+	/*   O_CLOEXEC fd + no path */
+	fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH);
+	/*   O_PATH fd */
+	fail += check_execveat(fd_path, "", AT_EMPTY_PATH);
+
+	/* Mess with executable file that's already open: */
+	/*   fd + no path to a file that's been renamed */
+	rename("execveat.ephemeral", "execveat.moved");
+	fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+	/*   fd + no path to a file that's been deleted */
+	unlink("execveat.moved"); /* remove the file now fd open */
+	fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+
+	/* Mess with executable file that's already open with O_PATH */
+	/*   fd + no path to a file that's been deleted */
+	unlink("execveat.path.ephemeral");
+	fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH);
+
+	/* Invalid argument failures */
+	fail += check_execveat_fail(fd, "", 0, ENOENT);
+	fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT);
+
+	/* Symlink to executable file: */
+	/*   dfd + path */
+	fail += check_execveat(dot_dfd, "execveat.symlink", 0);
+	fail += check_execveat(dot_dfd_path, "execveat.symlink", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname_symlink, 0);
+	/*   fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */
+	fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH);
+	fail += check_execveat(fd_symlink, "",
+			       AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+
+	/* Symlink fails when AT_SYMLINK_NOFOLLOW set: */
+	/*   dfd + path */
+	fail += check_execveat_fail(dot_dfd, "execveat.symlink",
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+	fail += check_execveat_fail(dot_dfd_path, "execveat.symlink",
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+	/*   absolute path */
+	fail += check_execveat_fail(AT_FDCWD, fullname_symlink,
+				    AT_SYMLINK_NOFOLLOW, ELOOP);
+
+	/* Shell script wrapping executable file: */
+	/*   dfd + path */
+	fail += check_execveat(subdir_dfd, "../script", 0);
+	fail += check_execveat(dot_dfd, "script", 0);
+	fail += check_execveat(dot_dfd_path, "script", 0);
+	/*   absolute path */
+	fail += check_execveat(AT_FDCWD, fullname_script, 0);
+	/*   fd + no path */
+	fail += check_execveat(fd_script, "", AT_EMPTY_PATH);
+	fail += check_execveat(fd_script, "",
+			       AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+	/*   O_CLOEXEC fd fails for a script (as script file inaccessible) */
+	fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH,
+				    ENOENT);
+	fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT);
+
+	/* Mess with script file that's already open: */
+	/*   fd + no path to a file that's been renamed */
+	rename("script.ephemeral", "script.moved");
+	fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+	/*   fd + no path to a file that's been deleted */
+	unlink("script.moved"); /* remove the file while fd open */
+	fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+
+	/* Rename a subdirectory in the path: */
+	rename("subdir.ephemeral", "subdir.moved");
+	fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+	fail += check_execveat(subdir_dfd_ephemeral, "script", 0);
+	/* Remove the subdir and its contents */
+	unlink("subdir.moved/script");
+	unlink("subdir.moved");
+	/* Shell loads via deleted subdir OK because name starts with .. */
+	fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+	fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT);
+
+	/* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */
+	fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL);
+	/* Invalid path => ENOENT */
+	fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT);
+	fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT);
+	fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT);
+	/* Attempt to execute directory => EACCES */
+	fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES);
+	/* Attempt to execute non-executable => EACCES */
+	fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES);
+	fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES);
+	fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH,
+				    EACCES);
+	/* Attempt to execute nonsense FD => EBADF */
+	fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF);
+	fail += check_execveat_fail(99, "execveat", 0, EBADF);
+	/* Attempt to execute relative to non-directory => ENOTDIR */
+	fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR);
+
+	fail += check_execveat_pathmax(dot_dfd, "execveat", 0);
+	fail += check_execveat_pathmax(dot_dfd, "script", 1);
+	return fail;
+}
+
+static void prerequisites(void)
+{
+	int fd;
+	const char *script = "#!/bin/sh\nexit $*\n";
+
+	/* Create ephemeral copies of files */
+	exe_cp("execveat", "execveat.ephemeral");
+	exe_cp("execveat", "execveat.path.ephemeral");
+	exe_cp("script", "script.ephemeral");
+	mkdir("subdir.ephemeral", 0755);
+
+	fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755);
+	write(fd, script, strlen(script));
+	close(fd);
+}
+
+int main(int argc, char **argv)
+{
+	int ii;
+	int rc;
+	const char *verbose = getenv("VERBOSE");
+
+	if (argc >= 2) {
+		/* If we are invoked with an argument, don't run tests. */
+		const char *in_test = getenv("IN_TEST");
+
+		if (verbose) {
+			printf("  invoked with:");
+			for (ii = 0; ii < argc; ii++)
+				printf(" [%d]='%s'", ii, argv[ii]);
+			printf("\n");
+		}
+
+		/* Check expected environment transferred. */
+		if (!in_test || strcmp(in_test, "yes") != 0) {
+			printf("[FAIL] (no IN_TEST=yes in env)\n");
+			return 1;
+		}
+
+		/* Use the final argument as an exit code. */
+		rc = atoi(argv[argc - 1]);
+		fflush(stdout);
+	} else {
+		prerequisites();
+		if (verbose)
+			envp[1] = "VERBOSE=1";
+		rc = run_tests();
+		if (rc > 0)
+			printf("%d tests failed\n", rc);
+	}
+	return rc;
+}
--
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCHv8 3/4] sparc: Hook up execveat system call.
From: David Drysdale @ 2014-11-14 16:23 UTC (permalink / raw)
  To: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Andrew Morton, David Miller
  Cc: Oleg Nesterov, Michael Kerrisk, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arch-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	sparclinux-u79uwXL29TY76Z2rM5mHXA, David Drysdale
In-Reply-To: <1415982183-20525-1-git-send-email-drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Signed-off-by: David Drysdale <drysdale-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
 arch/sparc/include/uapi/asm/unistd.h | 3 ++-
 arch/sparc/kernel/systbls_32.S       | 1 +
 arch/sparc/kernel/systbls_64.S       | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h
index 46d83842eddc..6f35f4df17f2 100644
--- a/arch/sparc/include/uapi/asm/unistd.h
+++ b/arch/sparc/include/uapi/asm/unistd.h
@@ -415,8 +415,9 @@
 #define __NR_getrandom		347
 #define __NR_memfd_create	348
 #define __NR_bpf		349
+#define __NR_execveat		350

-#define NR_syscalls		350
+#define NR_syscalls		351

 /* Bitmask values returned from kern_features system call.  */
 #define KERN_FEATURE_MIXED_MODE_STACK	0x00000001
diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S
index ad0cdf497b78..e31a9056a303 100644
--- a/arch/sparc/kernel/systbls_32.S
+++ b/arch/sparc/kernel/systbls_32.S
@@ -87,3 +87,4 @@ sys_call_table:
 /*335*/	.long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 /*345*/	.long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.long sys_execveat
diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S
index 580cde9370c9..15069cb35dac 100644
--- a/arch/sparc/kernel/systbls_64.S
+++ b/arch/sparc/kernel/systbls_64.S
@@ -88,6 +88,7 @@ sys_call_table32:
 	.word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 	.word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.word sys_execveat

 #endif /* CONFIG_COMPAT */

@@ -167,3 +168,4 @@ sys_call_table:
 	.word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
 /*340*/	.word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
 	.word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
+/*350*/	.word sys_execveat
--
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox