Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v2 net-next 2/7] bpf: add hashtable type of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

add new map type BPF_MAP_TYPE_HASH and its implementation

- maps are created/destroyed by userspace. Both userspace and eBPF programs
  can lookup/update/delete elements from the map

- eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism
  for concurrent updates

- key/value are opaque range of bytes (aligned to 8 bytes)

- user space provides 3 configuration attributes via BPF syscall:
  key_size, value_size, max_entries

- map takes care of allocating/freeing key/value pairs

- map_update_elem() must fail to insert new element when max_entries
  limit is reached to make sure that eBPF programs cannot exhaust memory

- map_update_elem() replaces elements in an atomic way

- optimized for speed of lookup() which can be called multiple times from
  eBPF program which itself is triggered by high volume of events
  . in the future JIT compiler may recognize lookup() call and optimize it
    further, since key_size is constant for life of eBPF program

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/hashtab.c     |  362 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/hashtab.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3e9e1b77f29d..03a01fd609aa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@ enum bpf_cmd {
 
 enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..2c0ec7f9da78 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..d234a012f046
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,362 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct hlist_head *buckets;
+	spinlock_t lock;
+	u32 count;	/* number of elements in this hashtable */
+	u32 n_buckets;	/* number of hash buckets */
+	u32 elem_size;	/* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct hlist_node hash_node;
+	struct rcu_head rcu;
+	u32 hash;
+	char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int err, i;
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	htab->map.key_size = attr->key_size;
+	htab->map.value_size = attr->value_size;
+	htab->map.max_entries = attr->max_entries;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	err = -EINVAL;
+	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+	    htab->map.value_size == 0)
+		goto free_htab;
+
+	/* hash table size must be power of 2 */
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+	err = -E2BIG;
+	if (htab->map.key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+				      GFP_USER | __GFP_NOWARN);
+
+	if (!htab->buckets) {
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		if (!htab->buckets)
+			goto free_htab;
+	}
+
+	for (i = 0; i < htab->n_buckets; i++)
+		INIT_HLIST_HEAD(&htab->buckets[i]);
+
+	spin_lock_init(&htab->lock);
+	htab->count = 0;
+
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8) +
+			  htab->map.value_size;
+	return &htab->map;
+
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+					 void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 hash, key_size;
+
+	/* Must be called with rcu_read_lock. */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+
+	return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l, *next_l;
+	u32 hash, key_size;
+	int i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	/* lookup the key */
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (!l) {
+		i = 0;
+		goto find_first_elem;
+	}
+
+	/* key was found, get next key in the same bucket */
+	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+				  struct htab_elem, hash_node);
+
+	if (next_l) {
+		/* if next elem in this hash list is non-zero, just return it */
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		head = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					  struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 key_size;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+	if (!l_new)
+		return -ENOMEM;
+
+	key_size = map->key_size;
+
+	memcpy(l_new->key, key, key_size);
+	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+	l_new->hash = htab_map_hash(l_new->key, key_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, l_new->hash);
+
+	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	/* add new element to the head of the list, so that concurrent
+	 * search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		hlist_del_rcu(&l_old->hash_node);
+		kfree_rcu(l_old, rcu);
+	} else {
+		htab->count++;
+	}
+	spin_unlock_irqrestore(&htab->lock, flags);
+
+	return 0;
+err:
+	spin_unlock_irqrestore(&htab->lock, flags);
+	kfree(l_new);
+	return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		htab->count--;
+		kfree_rcu(l, rcu);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&htab->lock, flags);
+	return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			htab->count--;
+			kfree(l);
+		}
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	/* some of kfree_rcu() callbacks for elements of this map may not have
+	 * executed. It's ok. Proceed to free residual elements and map itself
+	 */
+	delete_all_elements(htab);
+	kvfree(htab->buckets);
+	kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &htab_ops,
+	.type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_htab_map);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 1/7] bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

the current meaning of BPF_MAP_UPDATE_ELEM syscall command is:
either update existing map element or create a new one.
Initially the plan was to add a new command to handle the case of
'create new element if it didn't exist', but 'flags' style looks
cleaner and overall diff is much smaller (more code reused), so add 'flags'
attribute to BPF_MAP_UPDATE_ELEM command with the following meaning:
 #define BPF_ANY	0 /* create new element or update existing */
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */

bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST
if element already exists.

bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT
if element doesn't exist.

Userspace will call it as:
int bpf_update_elem(int fd, void *key, void *value, __u64 flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = ptr_to_u64(key),
        .value = ptr_to_u64(value),
        .flags = flags,
    };

    return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

First two bits of 'flags' are used to encode style of bpf_update_elem() command.
Bits 2-63 are reserved for future use.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

patch 5 of this set includes tests of bpf_update_elem() with these flags

 include/linux/bpf.h      |    2 +-
 include/uapi/linux/bpf.h |    8 +++++++-
 kernel/bpf/syscall.c     |    4 ++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3cf91754a957..51e9242e4803 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,7 +22,7 @@ struct bpf_map_ops {
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value);
+	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
 	int (*map_delete_elem)(struct bpf_map *map, void *key);
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d18316f9e9c4..3e9e1b77f29d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,7 +82,7 @@ enum bpf_cmd {
 
 	/* create or update key/value pair in a given map
 	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
-	 * Using attr->map_fd, attr->key, attr->value
+	 * Using attr->map_fd, attr->key, attr->value, attr->flags
 	 * returns zero or negative error
 	 */
 	BPF_MAP_UPDATE_ELEM,
@@ -117,6 +117,11 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 };
 
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY		0 /* create new element or update existing */
+#define BPF_NOEXIST	1 /* create new element if it didn't exist */
+#define BPF_EXIST	2 /* update existing element */
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -132,6 +137,7 @@ union bpf_attr {
 			__aligned_u64 value;
 			__aligned_u64 next_key;
 		};
+		__u64		flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_LOAD command */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..c0d03bf317a2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -190,7 +190,7 @@ err_put:
 	return err;
 }
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
 {
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
 	 * therefore all map accessors rely on this fact, so do the same here
 	 */
 	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value);
+	err = map->ops->map_update_elem(map, key, value, attr->flags);
 	rcu_read_unlock();
 
 free_value:
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 0/7] implementation of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel

Hi All,

v1->v2:
renamed flags for MAP_UPDATE_ELEM command to be more concise,
clarified commit logs and improved comments in patches 1,3,7
per discussions with Daniel
 
Old v1 cover:

this set of patches adds implementation of HASH and ARRAY types of eBPF maps
which were described in manpage in commit b4fc1a460f30("Merge branch 'bpf-next'")

The difference vs previous version of these patches from August:
- added 'flags' attribute to BPF_MAP_UPDATE_ELEM
- in HASH type implementation removed per-map kmem_cache.
  I was doing kmem_cache_create() for every map to enable selective slub
  debugging to check for overflows and leaks. Now it's not needed, so just
  use normal kmalloc() for map elements.
- added ARRAY type which was mentioned in manpage, but wasn't public yet
- added map testsuite and removed temporary bits from test_stubs

Note, eBPF programs cannot be attached to events yet.
It will come in the next set.

Alexei Starovoitov (7):
  bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command
  bpf: add hashtable type of eBPF maps
  bpf: add array type of eBPF maps
  bpf: fix BPF_MAP_LOOKUP_ELEM command return code
  bpf: add a testsuite for eBPF maps
  bpf: allow eBPF programs to use maps
  bpf: remove test map scaffolding and user proper types

 include/linux/bpf.h         |    7 +-
 include/uapi/linux/bpf.h    |   13 +-
 kernel/bpf/Makefile         |    2 +-
 kernel/bpf/arraymap.c       |  151 ++++++++++++++++++
 kernel/bpf/hashtab.c        |  362 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c        |   89 +++++++++++
 kernel/bpf/syscall.c        |    6 +-
 kernel/bpf/test_stub.c      |   56 ++-----
 samples/bpf/Makefile        |    3 +-
 samples/bpf/libbpf.c        |    3 +-
 samples/bpf/libbpf.h        |    2 +-
 samples/bpf/test_maps.c     |  291 ++++++++++++++++++++++++++++++++++
 samples/bpf/test_verifier.c |   14 +-
 13 files changed, 936 insertions(+), 63 deletions(-)
 create mode 100644 kernel/bpf/arraymap.c
 create mode 100644 kernel/bpf/hashtab.c
 create mode 100644 kernel/bpf/helpers.c
 create mode 100644 samples/bpf/test_maps.c

-- 
1.7.9.5

^ permalink raw reply

* Re: [PATCH v17 0/7] MADV_FREE support
From: Minchan Kim @ 2014-11-13 22:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, Michael Kerrisk,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Hugh Dickins, Johannes Weiner,
	Rik van Riel, KOSAKI Motohiro, Mel Gorman, Jason Evans,
	zhangyanfei-BthXqXjhjHXQFUHtdCDX3A, Kirill A. Shutemov
In-Reply-To: <1413799924-17946-1-git-send-email-minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

Hello Andrew,

It seems I have been waiting for your review for a long time.
What should I do to get a slot of your time?

On Mon, Oct 20, 2014 at 07:11:57PM +0900, Minchan Kim wrote:
> This patch enables the MADV_FREE hint for the madvise syscall, which has
> been supported by other OSes. [PATCH 1] includes the details.
> 
> [1] supports MADV_FREE for !THP pages, so if the VM encounters a
> THP page in syscall context, it splits the THP page.
> [2-6] prepare for calling the madvise syscall without THP splitting.
> [7] enables THP page support for MADV_FREE.
> 
> * from v16
>  * Rebased on mmotm-2014-10-15-16-57
> 
> * from v15
>  * Add more Acked-by - Rik van Riel
>  * Rebased on mmotm-08-29-15-15
> 
> * from v14
>  * Add more Acked-by from arch people (sparc, arm64 and arm)
>  * Drop s390 since pmd_dirty/clean was merged
> 
> * from v13
>  * Add more Acked-by from arch people (arm, arm64 and ppc)
>  * Rebased on mmotm 2014-08-13-14-29
> 
> * from v12
>  * Fix - skip to mark free pte on try_to_free_swap failed page - Kirill
>  * Add more Acked-by from arch maintainers and Kirill
> 
> * From v11
>  * Fix arm build - Steve
>  * Separate patch for arm and arm64 - Steve
>  * Remove unnecessary check - Kirill
>  * Skip non-vm_normal page - Kirill
>  * Add Acked-by - Zhang
>  * Sparc64 build fix
>  * Pagetable walker THP handling fix
> 
> * From v10
>  * Add Acked-by from arch stuff(x86, s390)
>  * Pagewalker based pagetable working - Kirill
>  * Fix try_to_unmap_one broken with hwpoison - Kirill
>  * Use VM_BUG_ON_PAGE in madvise_free_pmd - Kirill
>  * Fix pgtable-3level.h for arm - Steve
> 
> * From v9
>  * Add Acked-by - Rik
>  * Add THP page support - Kirill
> 
> * From v8
>  * Rebased-on v3.16-rc2-mmotm-2014-06-25-16-44
> 
> * From v7
>  * Rebased-on next-20140613
> 
> * From v6
>  * Remove page from swapcache at syscall time
>  * Move utility functions from memory.c to madvise.c - Johannes
>  * Rename utility functions - Johannes
>  * Remove unnecessary checks from vmscan.c - Johannes
>  * Rebased-on v3.15-rc5-mmotm-2014-05-16-16-56
>  * Drop Reviewed-by because there were some changes since then.
> 
> * From v5
>  * Fix PPC problem which don't flush TLB - Rik
>  * Remove unnecessary lazyfree_range stub function - Rik
>  * Rebased on v3.15-rc5
> 
> * From v4
>  * Add Reviewed-by: Zhang Yanfei
>  * Rebase on v3.15-rc1-mmotm-2014-04-15-16-14
> 
> * From v3
>  * Add "how to work part" in description - Zhang
>  * Add page_discardable utility function - Zhang
>  * Clean up
> 
> * From v2
>  * Remove forceful dirty marking of swap-readed page - Johannes
>  * Remove deactivation logic of lazyfreed page
>  * Rebased on 3.14
>  * Remove RFC tag
> 
> * From v1
>  * Use custom page table walker for madvise_free - Johannes
>  * Remove PG_lazypage flag - Johannes
>  * Do madvise_dontneed instead of madvise_free in a swapless system
> 
> 
> 
> Minchan Kim (7):
>   mm: support madvise(MADV_FREE)
>   x86: add pmd_[dirty|mkclean] for THP
>   sparc: add pmd_[dirty|mkclean] for THP
>   powerpc: add pmd_[dirty|mkclean] for THP
>   arm: add pmd_mkclean for THP
>   arm64: add pmd_[dirty|mkclean] for THP
>   mm: Don't split THP page when syscall is called
> 
>  arch/arm/include/asm/pgtable-3level.h    |   1 +
>  arch/arm64/include/asm/pgtable.h         |   2 +
>  arch/powerpc/include/asm/pgtable-ppc64.h |   2 +
>  arch/sparc/include/asm/pgtable_64.h      |  16 ++++
>  arch/x86/include/asm/pgtable.h           |  10 ++
>  include/linux/huge_mm.h                  |   4 +
>  include/linux/rmap.h                     |   9 +-
>  include/linux/vm_event_item.h            |   1 +
>  include/uapi/asm-generic/mman-common.h   |   1 +
>  mm/huge_memory.c                         |  35 +++++++
>  mm/madvise.c                             | 159 +++++++++++++++++++++++++++++++
>  mm/rmap.c                                |  46 ++++++++-
>  mm/vmscan.c                              |  64 +++++++++----
>  mm/vmstat.c                              |   1 +
>  14 files changed, 331 insertions(+), 20 deletions(-)
> 
> -- 
> 2.0.0
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo-Bw31MaZKKs0EbZ0PF+XxCw@public.gmane.org  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org"> email-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org </a>

-- 
Kind regards,
Minchan Kim

^ permalink raw reply

* [PATCH 03/56] fs: Support compiling out splice-family syscalls
From: Pieter Smith @ 2014-11-13 21:22 UTC (permalink / raw)
  To: pieter-qeJ+1H9vRZbz+pZb47iToQ
  Cc: Josh Triplett, Alexander Viro, Andrew Morton, Matt Turner,
	Michal Hocko, Fabian Frederick, Paul Gortmaker, Tejun Heo,
	Paul E. McKenney, Luis R. Rodriguez, Peter Foley,
	Eric W. Biederman, Oleg Nesterov, H. Peter Anvin, Andy Lutomirski,
	Vivek Goyal, David Herrmann, Kees Cook, Mailing List, open list,
	open list:ABI/API
In-Reply-To: <1415913813-362-1-git-send-email-pieter-qeJ+1H9vRZbz+pZb47iToQ@public.gmane.org>

Many embedded systems will not need the splice-family syscalls (splice,
vmsplice, tee and sendfile). Omitting them saves space.  This adds a new EXPERT
config option CONFIG_SYSCALL_SPLICE (default y) to support compiling them out.

This patch removes almost all callers of .splice_read() and .splice_write()
in the file_operations struct. This paves the way to eventually compile out the
.splice_read and .splice_write members of the file_operations struct as well as
the remaining splice-related infrastructure.

add/remove: 0/16 grow/shrink: 2/5 up/down: 114/-3693 (-3579)
function                                     old     new   delta
splice_direct_to_actor                       348     416     +68
splice_to_pipe                               371     417     +46
splice_from_pipe_next                        107     106      -1
fdput                                         11       -     -11
signal_pending                                39      26     -13
fdget                                         56      42     -14
user_page_pipe_buf_ops                        20       -     -20
user_page_pipe_buf_steal                      25       -     -25
file_end_write                                58      29     -29
file_start_write                              68      34     -34
pipe_to_user                                  43       -     -43
wakeup_pipe_readers                           54       -     -54
do_splice_to                                  87       -     -87
ipipe_prep.part                               92       -     -92
opipe_prep.part                              119       -    -119
sys_sendfile                                 122       -    -122
sys_sendfile64                               126       -    -126
sys_vmsplice                                 137       -    -137
vmsplice_to_user                             205       -    -205
sys_tee                                      491       -    -491
do_sendfile                                  492       -    -492
vmsplice_to_pipe                             558       -    -558
sys_splice                                  1020       -   -1020

Signed-off-by: Pieter Smith <pieter-qeJ+1H9vRZbz+pZb47iToQ@public.gmane.org>
---
 fs/splice.c     |  2 ++
 init/Kconfig    | 10 ++++++++++
 kernel/sys_ni.c |  8 ++++++++
 3 files changed, 20 insertions(+)

diff --git a/fs/splice.c b/fs/splice.c
index 44b201b..7c4c695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1316,6 +1316,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	return ret;
 }
 
+#ifdef CONFIG_SYSCALL_SPLICE
 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       struct pipe_inode_info *opipe,
 			       size_t len, unsigned int flags);
@@ -2200,4 +2201,5 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 #endif
+#endif
 
diff --git a/init/Kconfig b/init/Kconfig
index 782a65b..25ee289 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1547,6 +1547,16 @@ config ADVISE_SYSCALLS
 	  applications use these syscalls, you can disable this option to save
 	  space.
 
+config SYSCALL_SPLICE
+	bool "Enable splice/vmsplice/tee/sendfile syscalls" if EXPERT
+	default y
+	help
+	  This option enables the splice, vmsplice, tee and sendfile syscalls. These
+	  are used by applications to: move data between buffers and arbitrary file
+	  descriptors; "copy" data between buffers; or copy data from userspace into
+	  buffers. If building an embedded system where no applications use these
+	  syscalls, you can disable this option to save space.
+
 config PCI_QUIRKS
 	default y
 	bool "Enable PCI quirk workarounds" if EXPERT
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d4709d4..2913337 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -159,6 +159,14 @@ cond_syscall(sys_uselib);
 cond_syscall(sys_fadvise64);
 cond_syscall(sys_fadvise64_64);
 cond_syscall(sys_madvise);
+cond_syscall(sys_vmsplice);
+cond_syscall(sys_splice);
+cond_syscall(sys_tee);
+cond_syscall(sys_sendfile);
+cond_syscall(sys_sendfile64);
+cond_syscall(compat_sys_vmsplice);
+cond_syscall(compat_sys_sendfile);
+cond_syscall(compat_sys_sendfile64);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCHv7 0/3] syscalls,x86: Add execveat() system call
From: Andrew Morton @ 2014-11-13 20:13 UTC (permalink / raw)
  To: David Drysdale
  Cc: Christoph Hellwig, Rich Felker, Eric W. Biederman,
	Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel@vger.kernel.org, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, X86 ML, linux-arch,
	Linux API
In-Reply-To: <CAHse=S_uQGmzz+umKDfJPOaT2+TAdzbtjZeVZaj1i4NBVWy35g@mail.gmail.com>

On Thu, 13 Nov 2014 14:42:58 +0000 David Drysdale <drysdale@google.com> wrote:

> On Wed, Nov 12, 2014 at 9:50 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
> > On Fri,  7 Nov 2014 17:01:01 +0000 David Drysdale <drysdale@google.com> wrote:
> >
> >> This patch set adds execveat(2) for x86, and is derived from Meredydd
> >> Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
> >>
> >> The primary aim of adding an execveat syscall is to allow an
> >> implementation of fexecve(3) that does not rely on the /proc
> >> filesystem, at least for executables (rather than scripts).  The
> >> current glibc version of fexecve(3) is implemented via /proc, which
> >> causes problems in sandboxed or otherwise restricted environments.
> >
> > Have the relevant glibc people seen/reviewed/liked this?
> 
> I think it's been mentioned in passing but not explicitly discussed over there
> (https://sourceware.org/ml/libc-alpha/2014-10/msg00497.html,
> https://sourceware.org/ml/libc-alpha/2014-10/msg00509.html)
> and a couple of the participants in that thread (Christoph Hellwig, Rich Felker)
> were also cc:ed here.
> 
> It sounded like execveat might be useful for another feature (O_EXEC) but
> I'm not sure whether that amounts to the relevant glibc folk liking this...

OK.  Could you please try to hunt down the appropriate people and give
them a poke?  We'd be in a mess if we merged this then glibc didn't use
it, or glibc developers required/suggested any interface modifications.

^ permalink raw reply

* Re: [RFC PATCH 0/1] arm64: Fix /proc/cpuinfo
From: Catalin Marinas @ 2014-11-13 17:48 UTC (permalink / raw)
  To: Will Deacon
  Cc: Mark Rutland, linux-arm-kernel@lists.infradead.org,
	ghackmann@google.com, ijc@hellion.org.uk, Serban Constantinescu,
	cross-distro@lists.linaro.org, linux-api@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20141106170548.GF19702@e104818-lin.cambridge.arm.com>

On Thu, Nov 06, 2014 at 05:05:48PM +0000, Catalin Marinas wrote:
> On Thu, Nov 06, 2014 at 04:54:31PM +0000, Will Deacon wrote:
> > On Thu, Nov 06, 2014 at 04:43:12PM +0000, Catalin Marinas wrote:
> > > On Fri, Oct 24, 2014 at 02:56:39PM +0100, Mark Rutland wrote:
> > > > [d] Print different hwcaps dependent on the personality.
> > > > 
> > > >     This would allow for 32-bit and 64-bit applications to function
> > > >     correctly, but for some 32-bit applications the personality would
> > > >     need to be set explicitly by the user.
> > > 
> > > Which makes this option actually in line with the uname -m behaviour. My
> > > vote goes for [d] with option [b] as a close alternative.
> > > 
> > > > [1] arm, v3.17, Versatile Express A15x2 A7x3 coretile
> > > > Features        : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae evtstrm 
> > > [...]
> > > > [2] arm64, v3.17, Juno platform
> > > > Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32 
> > > 
> > > As an exercise, I'm trying to see what option [b] would look like when
> > > CONFIG_COMPAT is enabled:
> > > 
> > > Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32 half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae
> > > 
> > > The duplicate strings would only be listed once (evtstrm, aes, pmull,
> > > sha1, sha2, crc32). New AArch64 features that we may expect to be
> > > optional on AArch32 could be prefixed with "a64". If they are missing
> > > entirely from AArch32, (like asimd), no need for the prefix.
> > > 
> > > The advantage is that we don't need to check the personality but we have
> > > to assume that scripts would not search for substrings (sane people
> > > shouldn't do this anyway as the Features string can always be extended).
> > 
> > And a big disadvantage is that I can imagine AArch64 applications checking
> > for "neon" instead of "asimd", which will break if they're run under kernels
> > without COMPAT support enabled.
[...]
> > So I'm inclined to stick with Mark's patch as it is.
> 
> If we don't hear otherwise, I propose sometime next week we queue Mark's
> patch for -next.

As we haven't heard otherwise:

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply

* Re: [PATCH] ARM: cacheflush: disallow pending signals during cacheflush
From: Peter Maydell @ 2014-11-13 17:39 UTC (permalink / raw)
  To: Will Deacon
  Cc: Chanho Min, Russell King, Jon Medhurst, Taras Kondratiuk,
	Olof Johansson,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Gunho Lee,
	HyoJun Im, Jongsung Kim,
	linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <20141113112633.GE13350-5wv7dgnIgG8@public.gmane.org>

On 13 November 2014 11:26, Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org> wrote:
> Whilst I don't think this is the correct solution, I agree that there's
> a potential issue here. We could change the restart return value to
> -ERESTARTNOINTR instead, but I can imagine something like a periodic
> SIGALRM which could prevent a large cacheflush from ever completing.
> Do we actually care about making forward progress in such a scenario?
>
> It is interesting to note that this change has been in mainline since
> May last year without any reported issues. That could be down to a number
> of reasons:
>
>   (1) People are using old kernels on ARM
>
>   (2) Code doesn't check the return value from the cacheflush system call,
>       because it historically always returned 0

...and the documentation comment in the source code didn't say
anything about the syscall having a return value; it only
described the input parameters. I would actually be surprised
if any userspace caller of this syscall checked its return value
(the libgcc cacheflush function used by gcc's clear_cache builtin
doesn't, to pick one popularly used example).

>   (3) People are getting lucky with timing, as this is likely difficult
>       to hit

    (4) The resulting misbehaviour ("my JIT crashes occasionally and
        non-reproducibly at some point possibly some while after the
        cacheflush call") will be extremely hard to track back
        to this kernel change

> This leaves me with the following questions:
>
>   - Has this change been shown to break anything in practice?
>   - Can we change the internal return value to -ERESTARTNOINTR?
>   - What do we do about kernels that *do* return -EINTR? (>=3.12?)

My suggestion would be "treat this as a bugfix, put it into
stable kernels in the usual way (and assume distros will pick
it up if appropriate)".

>   - Can we get a manpage put together to describe this mess?

That would be nice :-)

-- PMM

^ permalink raw reply

* Re: [PATCHv7 0/3] syscalls,x86: Add execveat() system call
From: David Drysdale @ 2014-11-13 15:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel@vger.kernel.org, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, Rich Felker,
	Christoph Hellwig, X86 ML, linux-arch, Linux API, Michael Kerrisk
In-Reply-To: <20141112140801.ae8029c7c769d1b606a76f7c@linux-foundation.org>

On Wed, Nov 12, 2014 at 10:08 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Fri,  7 Nov 2014 17:01:01 +0000 David Drysdale <drysdale@google.com> wrote:
>
>> This patch set adds execveat(2) for x86
>
> I grabbed these.  If someone else was planning to do so, feel free to
> shout at me.
>
> I haven't been following the discussion closely so some reviewed-by's
> and tested-by's would be nice.

Yes please -- Andy, Eric, Al?

> Thanks for writing a manpage.  mtk.manpages@gmail.com should have been
> cc'ed.  He is now ;)

Thanks (for some reason I thought cc'ing linux-api took care of Michael
seeing it).

> Your syscall number was taken by sys_bpf.  I renumbered it
> appropriately.

Looks like the robots have also found a few things that need fixing:
 1) Sparse warning on "close_on_exec(fd, current->files->fdt)": I guess
    that needs a rcu_dereference_raw() or some such around arg2.
 2) Syscall not implemented warning on sparc build: I could attempt to wire
    up the sparc syscall (although I can only cross-compile it, not run it),
    or is it best to leave it and ask sparclinux / David Miller nicely to take
    care of it?  Or have I just missed some bit of syscall machinery?
 3) Non-canonical comment format just before the close_on_exec() call.
(Well, that last wasn't found by a robot -- thanks Andrew!)

When I generate a new version, would it be easier for you if I also merged up
against v3.18-rc4?  That would at least take care of the syscall renumbering.

^ permalink raw reply

* Re: [PATCHv7 0/3] syscalls,x86: Add execveat() system call
From: David Drysdale @ 2014-11-13 14:42 UTC (permalink / raw)
  To: Andrew Morton, Christoph Hellwig, Rich Felker
  Cc: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel@vger.kernel.org, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Kees Cook, Arnd Bergmann, X86 ML, linux-arch,
	Linux API
In-Reply-To: <20141112135009.5a887d200be262d94ba50495@linux-foundation.org>

On Wed, Nov 12, 2014 at 9:50 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Fri,  7 Nov 2014 17:01:01 +0000 David Drysdale <drysdale@google.com> wrote:
>
>> This patch set adds execveat(2) for x86, and is derived from Meredydd
>> Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
>>
>> The primary aim of adding an execveat syscall is to allow an
>> implementation of fexecve(3) that does not rely on the /proc
>> filesystem, at least for executables (rather than scripts).  The
>> current glibc version of fexecve(3) is implemented via /proc, which
>> causes problems in sandboxed or otherwise restricted environments.
>
> Have the relevant glibc people seen/reviewed/liked this?

I think it's been mentioned in passing but not explicitly discussed over there
(https://sourceware.org/ml/libc-alpha/2014-10/msg00497.html,
https://sourceware.org/ml/libc-alpha/2014-10/msg00509.html)
and a couple of the participants in that thread (Christoph Hellwig, Rich Felker)
were also cc:ed here.

It sounded like execveat might be useful for another feature (O_EXEC) but
I'm not sure whether that amounts to the relevant glibc folk liking this...

^ permalink raw reply

* Re: [PATCH] ARM: cacheflush: disallow pending signals during cacheflush
From: Will Deacon @ 2014-11-13 11:26 UTC (permalink / raw)
  To: Chanho Min
  Cc: Russell King, Jon Medhurst, Taras Kondratiuk, Olof Johansson,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Gunho Lee,
	HyoJun Im, Jongsung Kim,
	peter.maydell-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org,
	linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w
In-Reply-To: <1415863793-6219-1-git-send-email-chanho.min-Hm3cg6mZ9cc@public.gmane.org>

Hello,

[adding linux-api, linux-man]

On Thu, Nov 13, 2014 at 07:29:53AM +0000, Chanho Min wrote:
> Since commit 28256d612726 ("ARM: cacheflush: split user cache-flushing
> into interruptible chunks"), cacheflush can be interrupted by signal.
> 
> But, cacheflush doesn't resume from where we left off if process has
> user-defined signal handlers. It returns -EINTR then cacheflush
> should be re-invoked from the start of address until cache-flushing
> of whole address ranges is completed (restart_syscall isn't available
> in userspace). It may cause regression. So I suggest to disallow
> pending signals during cacheflush.
> 
> This partially reverts commit 28256d612726a28a8b9d3c49f2b74198c4423d6a.

Whilst I don't think this is the correct solution, I agree that there's
a potential issue here. We could change the restart return value to
-ERESTARTNOINTR instead, but I can imagine something like a periodic
SIGALRM which could prevent a large cacheflush from ever completing.
Do we actually care about making forward progress in such a scenario?

It is interesting to note that this change has been in mainline since
May last year without any reported issues. That could be down to a number
of reasons:

  (1) People are using old kernels on ARM

  (2) Code doesn't check the return value from the cacheflush system call,
      because it historically always returned 0

  (3) People are getting lucky with timing, as this is likely difficult
      to hit

Related to (2) is that a `man cacheflush' invocation returns something
about the MIPS system call, that doesn't match what we do for ARM. The
(relatively recent) history of the system call on ARM is:

  < v3.5 [*]

    - Always returns 0
    - Restricts virtual address range to a single VMA
    - Page-aligns the region limits (over flushing for smaller ranges)
    - Terminates on the first fault
    - Flags are ignored but must "ALWAYS be passed as ZERO"

  v3.5 - v3.12
    - Returns -EINVAL if flags is set or if end < start
    - Returns -EINVAL if we couldn't find a vma
    - Terminates on the first fault and returns -EFAULT

  v3.12 - HEAD

    - No longer page-aligns region
    - Removes VMA checking as this had a deadlock bug with mmap_sem
      and we could handle faults by this point anyway
    - Returns -EINVAL if !access_ok for the range
    - Splits the range into PAGE_SIZE chunks, checking for reschedule
      and pending signals to avoid DoSing the system (the hardware can
      only clean by cacheline). This is where the -ERESTART_RESTARTBLOCK
      behaviour came in, potentially returning -EINTR to userspace.

This leaves me with the following questions:

  - Has this change been shown to break anything in practice?
  - Can we change the internal return value to -ERESTARTNOINTR?
  - What do we do about kernels that *do* return -EINTR? (>=3.12?)
  - Can we get a manpage put together to describe this mess?

Cheers,

Will

[*] rmk may have some more ancient history kicking around, if you like!

> diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
> index abd2fc0..275e086 100644
> --- a/arch/arm/kernel/traps.c
> +++ b/arch/arm/kernel/traps.c
> @@ -521,25 +521,6 @@ __do_cache_op(unsigned long start, unsigned long end)
>  	do {
>  		unsigned long chunk = min(PAGE_SIZE, end - start);
>  
> -		if (signal_pending(current)) {
> -			struct thread_info *ti = current_thread_info();
> -
> -			ti->restart_block = (struct restart_block) {
> -				.fn	= do_cache_op_restart,
> -			};
> -
> -			ti->arm_restart_block = (struct arm_restart_block) {
> -				{
> -					.cache = {
> -						.start	= start,
> -						.end	= end,
> -					},
> -				},
> -			};
> -
> -			return -ERESTART_RESTARTBLOCK;
> -		}
> -
>  		ret = flush_cache_user_range(start, start + chunk);
>  		if (ret)
>  			return ret;
> -- 
> 1.7.9.5
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCHv7 0/3] syscalls,x86: Add execveat() system call
From: Andrew Morton @ 2014-11-12 22:08 UTC (permalink / raw)
  To: David Drysdale
  Cc: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Kees Cook, Arnd Bergmann, Rich Felker, Christoph Hellwig, x86,
	linux-arch, linux-api, Michael Kerrisk
In-Reply-To: <1415379664-31555-1-git-send-email-drysdale@google.com>

On Fri,  7 Nov 2014 17:01:01 +0000 David Drysdale <drysdale@google.com> wrote:

> This patch set adds execveat(2) for x86

I grabbed these.  If someone else was planning to do so, feel free to
shout at me.

I haven't been following the discussion closely so some reviewed-by's
and tested-by's would be nice.

Thanks for writing a manpage.  mtk.manpages@gmail.com should have been
cc'ed.  He is now ;)

Your syscall number was taken by sys_bpf.  I renumbered it
appropriately.

^ permalink raw reply

* Re: [PATCHv7 0/3] syscalls,x86: Add execveat() system call
From: Andrew Morton @ 2014-11-12 21:50 UTC (permalink / raw)
  To: David Drysdale
  Cc: Eric W. Biederman, Andy Lutomirski, Alexander Viro, Meredydd Luff,
	linux-kernel, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Kees Cook, Arnd Bergmann, Rich Felker, Christoph Hellwig, x86,
	linux-arch, linux-api
In-Reply-To: <1415379664-31555-1-git-send-email-drysdale@google.com>

On Fri,  7 Nov 2014 17:01:01 +0000 David Drysdale <drysdale@google.com> wrote:

> This patch set adds execveat(2) for x86, and is derived from Meredydd
> Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
> 
> The primary aim of adding an execveat syscall is to allow an
> implementation of fexecve(3) that does not rely on the /proc
> filesystem, at least for executables (rather than scripts).  The
> current glibc version of fexecve(3) is implemented via /proc, which
> causes problems in sandboxed or otherwise restricted environments.

Have the relevant glibc people seen/reviewed/liked this?  

^ permalink raw reply

* Re: [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Daniel Borkmann @ 2014-11-12 17:51 UTC (permalink / raw)
  To: Stephan Mueller
  Cc: Herbert Xu, ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA, LKML
In-Reply-To: <26219337.yCCTlAE9Ns@tauon>

On 11/12/2014 06:46 PM, Stephan Mueller wrote:
...
> * I unconditionally use the memset after memcpy as you indicated. Once
> the cryptodev tree contains the memzero_explicit call, I will start
> picking up that function.

Herbert merged it actually in this morning, so it's already part of
the cryptodev tree by now.

> Essentially, I thought of the line you suggested.

Ok, thanks.

^ permalink raw reply

* Re: [PATCHv2 7/7] cgroup: mount cgroupns-root when inside non-init cgroupns
From: Aditya Kali @ 2014-11-12 17:48 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Andy Lutomirski, Linux API, Linux Containers, Serge Hallyn,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Ingo Molnar,
	Eric W. Biederman, Tejun Heo, cgroups-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20141104155052.GA7027-7LNsyQBKDXoIagZqoN9o3w@public.gmane.org>

I agree with what Andy and Serge have to say. The ability to mount
cgroupfs inside userns also seems consistent with other kernel
interfaces like sysfs, procfs, etc.

Though it would be great if we can at least merge the rest of the
patches first while we address the mounting part.

Thanks for your feedback.

On Tue, Nov 4, 2014 at 7:50 AM, Serge E. Hallyn <serge-A9i7LUbDfNHQT0dZR+AlfA@public.gmane.org> wrote:
>
> Quoting Andy Lutomirski (luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org):
> > On Tue, Nov 4, 2014 at 5:46 AM, Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> > > Hello, Aditya.
> > >
> > > On Mon, Nov 03, 2014 at 02:43:47PM -0800, Aditya Kali wrote:
> > >> I agree that this is effectively bind-mounting, but doing this in kernel
> > >> makes it really convenient for the userspace. The process that sets up the
> > >> container doesn't need to care whether it should bind-mount cgroupfs inside
> > >> the container or not. The tasks inside the container can mount cgroupfs on
> > >> as-needed basis. The root container manager can simply unshare cgroupns and
> > >> forget about the internal setup. I think this is useful just for the reason
> > >> that it makes life much simpler for userspace.
> > >
> > > If it's okay to require userland to just do bind mounting, I'd be far
> > > happier with that.  cgroup mount code is already overcomplicated
> > > because of the dynamic matching of supers to mounts when it could just
> > > have told userland to use bind mounting.  Doesn't the host side have
> > > to set up some of the filesystem layouts anyway?  Does it really
> > > matter that we require the host to set up cgroup hierarchy too?
> > >
> >
> > Sort of, but only sort of.
> >
> > You can create a container by unsharing namespaces, mounting
> > everything, and then calling pivot_root.  But this is unpleasant
> > because of the strange way that pid namespaces work -- you generally
> > have to fork first, so this gets tedious.  And it doesn't integrate
> > well with things like fstab or other container-side configuration
> > mechanisms.
> >
> > It's nicer if you can unshare namespaces, mount the bare minimum,
> > pivot_root, and let the contained software do as much setup as
> > possible.
>
> Also, the bind-mount requires the container manager to know where
> the guest distro will want the cgroups mounted.
>
> -serge
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linuxfoundation.org/mailman/listinfo/containers




-- 
Aditya

^ permalink raw reply

* Re: [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Stephan Mueller @ 2014-11-12 17:46 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Herbert Xu, ABI/API, linux-crypto, LKML
In-Reply-To: <5463978F.7020508@redhat.com>

Am Mittwoch, 12. November 2014, 18:23:27 schrieb Daniel Borkmann:

Hi Daniel,

>On 11/12/2014 05:54 PM, Stephan Mueller wrote:
>> Am Mittwoch, 12. November 2014, 17:15:52 schrieb Daniel Borkmann:
>>> On 11/12/2014 08:05 AM, Stephan Mueller wrote:
>>>> This patch adds the random number generator support for AF_ALG.
>>>> 
>>>> A random number generator's purpose is to generate data without
>>>> requiring the caller to provide any data. Therefore, the AF_ALG
>>>> interface handler for RNGs only implements a callback handler for
>>>> recvmsg.
>>> 
>>> ...
>>> 
>>>> +static int rng_recvmsg(struct kiocb *unused, struct socket *sock,
>>>> +		       struct msghdr *msg, size_t len, int flags)
>>>> +{
>>>> +	struct sock *sk = sock->sk;
>>>> +	struct alg_sock *ask = alg_sk(sk);
>>>> +	struct rng_ctx *ctx = ask->private;
>>>> +	int err = -EFAULT;
>>>> +
>>>> +	if (0 == len)
>>> 
>>> if (len == 0)
>>> 
>>> 	...
>>> 
>>> [And also other places.]
>>> 
>>> We don't use Yoda condition style in the kernel.
>> 
>> Well, there is a very good reason for using the approach I have: we
>> all have done the error of forgetting the second = sign.
>> 
>> In my case, the compiler will complain and we fix the error right
>> away.
>> 
>> In your case, nobody is complaining but we introduced a nasty,
>> potentially hard to debug error. Thus, I very much like to keep my
>> version just to be on the safe side.
>> 
>> Note, there was even a backdoor I have seen where the missing 2nd
>> equal sign introduced a privilege escalation.
>> 
>> Therefore, my standard coding practice is to have a fixed value on
>> the left side and the variable on the right side of any comparison.
>
>I understand, but then please add this proposal first into ...
>
>   Documentation/CodingStyle
>
>The problem is that while the rest of the kernel does not follow
>this coding style, it's also much harder to read and/or program
>this way for people not being used to. So the danger of bugs
>slipping in this way is at least equally high. Besides that, this
>argument would also only account for '==' checks.

Ok, I can change that throughout the code.
>
>>>> +		return 0;
>>>> +	if (MAXSIZE < len)
>>>> +		len = MAXSIZE;
>>>> +
>>>> +	lock_sock(sk);
>>>> +	len = crypto_rng_get_bytes(ctx->drng, ctx->result, len);
>>>> +	if (0 > len)
>>>> +		goto unlock;
>>>> +
>>>> +	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
>>>> +	memset(ctx->result, 0, err);
>>>> +
>>> 
>>> This looks buggy.
>>> 
>>> If copy_to_user() fails from within memcpy_toiovec(), we call
>>> memset()
>>> with a negative return value which is interpreted as size_t and thus
>>> causes a buffer overflow writing beyond ctx->result, no?
>>> 
>>> If it succeeds, we call memset(ctx->result, 0, 0) .....
>> 
>> Right, good catch, I have to add a catch for negative error here.
>
>Hm? Don't you rather mean to say to unconditionally do something like
>...
>
>   memzero_explicit(ctx->result, len);

Sorry, I was not clear:

* I need to catch a failing memcpy, but not return an error.

* I unconditionally use the memset after memcpy as you indicated. Once 
the cryptodev tree contains the memzero_explicit call, I will start 
picking up that function.

Essentially, I thought of the line you suggested.

Ciao
Stephan

^ permalink raw reply

* Re: [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Daniel Borkmann @ 2014-11-12 17:23 UTC (permalink / raw)
  To: Stephan Mueller; +Cc: Herbert Xu, ABI/API, linux-crypto, LKML
In-Reply-To: <9137675.ZTbqvCU5Bi@tachyon.chronox.de>

On 11/12/2014 05:54 PM, Stephan Mueller wrote:
> Am Mittwoch, 12. November 2014, 17:15:52 schrieb Daniel Borkmann:
>> On 11/12/2014 08:05 AM, Stephan Mueller wrote:
>>> This patch adds the random number generator support for AF_ALG.
>>>
>>> A random number generator's purpose is to generate data without
>>> requiring the caller to provide any data. Therefore, the AF_ALG
>>> interface handler for RNGs only implements a callback handler for
>>> recvmsg.
>>
>> ...
>>
>>> +static int rng_recvmsg(struct kiocb *unused, struct socket *sock,
>>> +		       struct msghdr *msg, size_t len, int flags)
>>> +{
>>> +	struct sock *sk = sock->sk;
>>> +	struct alg_sock *ask = alg_sk(sk);
>>> +	struct rng_ctx *ctx = ask->private;
>>> +	int err = -EFAULT;
>>> +
>>> +	if (0 == len)
>>
>> if (len == 0)
>> 	...
>>
>> [And also other places.]
>>
>> We don't use Yoda condition style in the kernel.
>
> Well, there is a very good reason for using the approach I have: we all have
> done the error of forgetting the second = sign.
>
> In my case, the compiler will complain and we fix the error right away.
>
> In your case, nobody is complaining but we introduced a nasty, potentially
> hard to debug error. Thus, I very much like to keep my version just to be on
> the safe side.
>
> Note, there was even a backdoor I have seen where the missing 2nd equal sign
> introduced a privilege escalation.
>
> Therefore, my standard coding practice is to have a fixed value on the left
> side and the variable on the right side of any comparison.

I understand, but then please add this proposal first into ...

   Documentation/CodingStyle

The problem is that while the rest of the kernel does not follow
this coding style, it's also much harder to read and/or program
this way for people not used to it. So the danger of bugs
slipping in this way is at least equally high. Besides that, this
argument would also only account for '==' checks.

>>> +		return 0;
>>> +	if (MAXSIZE < len)
>>> +		len = MAXSIZE;
>>> +
>>> +	lock_sock(sk);
>>> +	len = crypto_rng_get_bytes(ctx->drng, ctx->result, len);
>>> +	if (0 > len)
>>> +		goto unlock;
>>> +
>>> +	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
>>> +	memset(ctx->result, 0, err);
>>> +
>>
>> This looks buggy.
>>
>> If copy_to_user() fails from within memcpy_toiovec(), we call memset()
>> with a negative return value which is interpreted as size_t and thus
>> causes a buffer overflow writing beyond ctx->result, no?
>>
>> If it succeeds, we call memset(ctx->result, 0, 0) .....
>
> Right, good catch, I have to add a catch for negative error here.

Hm? Don't you rather mean to say to unconditionally do something like ...

   memzero_explicit(ctx->result, len);

...
>>> +	memset(ctx->result, 0, MAXSIZE);
>>
>> memset(ctx->result, 0, sizeof(ctx->result));
>
> Ok, if this is desired, fine with me.

Yes, please.

^ permalink raw reply

* Re: [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Stephan Mueller @ 2014-11-12 16:54 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Herbert Xu, ABI/API, linux-crypto-u79uwXL29TY76Z2rM5mHXA, LKML
In-Reply-To: <546387B8.9050601-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Am Mittwoch, 12. November 2014, 17:15:52 schrieb Daniel Borkmann:

Hi Daniel,

thanks for the comments.

> On 11/12/2014 08:05 AM, Stephan Mueller wrote:
> > This patch adds the random number generator support for AF_ALG.
> > 
> > A random number generator's purpose is to generate data without
> > requiring the caller to provide any data. Therefore, the AF_ALG
> > interface handler for RNGs only implements a callback handler for
> > recvmsg.
> 
> ...
> 
> > +static int rng_recvmsg(struct kiocb *unused, struct socket *sock,
> > +		       struct msghdr *msg, size_t len, int flags)
> > +{
> > +	struct sock *sk = sock->sk;
> > +	struct alg_sock *ask = alg_sk(sk);
> > +	struct rng_ctx *ctx = ask->private;
> > +	int err = -EFAULT;
> > +
> > +	if (0 == len)
> 
> if (len == 0)
> 	...
> 
> [And also other places.]
> 
> We don't use Yoda condition style in the kernel.

Well, there is a very good reason for using the approach I have: we all have
made the error of forgetting the second = sign.

In my case, the compiler will complain and we fix the error right away.

In your case, nobody is complaining but we introduced a nasty, potentially 
hard to debug error. Thus, I very much like to keep my version just to be on 
the safe side.

Note, there was even a backdoor I have seen where the missing 2nd equal sign 
introduced a privilege escalation.

Therefore, my standard coding practice is to have a fixed value on the left 
side and the variable on the right side of any comparison.
> 
> > +		return 0;
> > +	if (MAXSIZE < len)
> > +		len = MAXSIZE;
> > +
> > +	lock_sock(sk);
> > +	len = crypto_rng_get_bytes(ctx->drng, ctx->result, len);
> > +	if (0 > len)
> > +		goto unlock;
> > +
> > +	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
> > +	memset(ctx->result, 0, err);
> > +
> 
> This looks buggy.
> 
> If copy_to_user() fails from within memcpy_toiovec(), we call memset()
> with a negative return value which is interpreted as size_t and thus
> causes a buffer overflow writing beyond ctx->result, no?
> 
> If it succeeds, we call memset(ctx->result, 0, 0) .....

Right, good catch, I have to add a catch for negative error here.

> 
> > +unlock:
> > +	release_sock(sk);
> > +
> > +	return err ? err : len;
> > +}
> > +
> > +static struct proto_ops algif_rng_ops = {
> > +	.family		=	PF_ALG,
> > +
> > +	.connect	=	sock_no_connect,
> > +	.socketpair	=	sock_no_socketpair,
> > +	.getname	=	sock_no_getname,
> > +	.ioctl		=	sock_no_ioctl,
> > +	.listen		=	sock_no_listen,
> > +	.shutdown	=	sock_no_shutdown,
> > +	.getsockopt	=	sock_no_getsockopt,
> > +	.mmap		=	sock_no_mmap,
> > +	.bind		=	sock_no_bind,
> > +	.accept		=	sock_no_accept,
> > +	.setsockopt	=	sock_no_setsockopt,
> > +	.poll		=	sock_no_poll,
> > +	.sendmsg	=	sock_no_sendmsg,
> > +	.sendpage	=	sock_no_sendpage,
> > +
> > +	.release	=	af_alg_release,
> > +	.recvmsg	=	rng_recvmsg,
> > +};
> > +
> > +static void *rng_bind(const char *name, u32 type, u32 mask)
> > +{
> > +	return crypto_alloc_rng(name, type, mask);
> > +}
> > +
> > +static void rng_release(void *private)
> > +{
> > +	crypto_free_rng(private);
> > +}
> > +
> > +static void rng_sock_destruct(struct sock *sk)
> > +{
> > +	struct alg_sock *ask = alg_sk(sk);
> > +	struct rng_ctx *ctx = ask->private;
> > +
> > +	memset(ctx->result, 0, MAXSIZE);
> 
> memset(ctx->result, 0, sizeof(ctx->result));

Ok, if this is desired, fine with me.
> 
> > +	sock_kfree_s(sk, ctx, ctx->len);
> > +	af_alg_release_parent(sk);
> > +}
> > +
> > +static int rng_accept_parent(void *private, struct sock *sk)
> > +{
> > +	struct rng_ctx *ctx;
> > +	struct alg_sock *ask = alg_sk(sk);
> > +	unsigned int len = sizeof(*ctx);
> > +	int seedsize = crypto_rng_seedsize(private);
> > +	int ret = -ENOMEM;
> > +
> > +	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
> > +	if (!ctx)
> > +		return -ENOMEM;
> > +	memset(ctx->result, 0, MAXSIZE);
> 
> Ditto...

Will do.

> 
> > +	ctx->len = len;
> > +
> > +	if (seedsize) {
> > +		u8 *buf = kmalloc(seedsize, GFP_KERNEL);
> > +		if (!buf)
> > +			goto err;
> > +		get_random_bytes(buf, seedsize);
> > +		ret = crypto_rng_reset(private, buf, len);
> > +		kzfree(buf);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


-- 
Ciao
Stephan

^ permalink raw reply

* Re: [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Daniel Borkmann @ 2014-11-12 16:15 UTC (permalink / raw)
  To: Stephan Mueller; +Cc: Herbert Xu, ABI/API, linux-crypto, LKML
In-Reply-To: <1914037.Wy7EiDNG7B@tachyon.chronox.de>

On 11/12/2014 08:05 AM, Stephan Mueller wrote:
> This patch adds the random number generator support for AF_ALG.
>
> A random number generator's purpose is to generate data without
> requiring the caller to provide any data. Therefore, the AF_ALG
> interface handler for RNGs only implements a callback handler for
> recvmsg.
...
> +static int rng_recvmsg(struct kiocb *unused, struct socket *sock,
> +		       struct msghdr *msg, size_t len, int flags)
> +{
> +	struct sock *sk = sock->sk;
> +	struct alg_sock *ask = alg_sk(sk);
> +	struct rng_ctx *ctx = ask->private;
> +	int err = -EFAULT;
> +
> +	if (0 == len)

if (len == 0)
	...

[And also other places.]

We don't use Yoda condition style in the kernel.

> +		return 0;
> +	if (MAXSIZE < len)
> +		len = MAXSIZE;
> +
> +	lock_sock(sk);
> +	len = crypto_rng_get_bytes(ctx->drng, ctx->result, len);
> +	if (0 > len)
> +		goto unlock;
> +
> +	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
> +	memset(ctx->result, 0, err);
> +

This looks buggy.

If copy_to_user() fails from within memcpy_toiovec(), we call memset()
with a negative return value which is interpreted as size_t and thus
causes a buffer overflow writing beyond ctx->result, no?

If it succeeds, we call memset(ctx->result, 0, 0) .....

> +unlock:
> +	release_sock(sk);
> +
> +	return err ? err : len;
> +}
> +
> +static struct proto_ops algif_rng_ops = {
> +	.family		=	PF_ALG,
> +
> +	.connect	=	sock_no_connect,
> +	.socketpair	=	sock_no_socketpair,
> +	.getname	=	sock_no_getname,
> +	.ioctl		=	sock_no_ioctl,
> +	.listen		=	sock_no_listen,
> +	.shutdown	=	sock_no_shutdown,
> +	.getsockopt	=	sock_no_getsockopt,
> +	.mmap		=	sock_no_mmap,
> +	.bind		=	sock_no_bind,
> +	.accept		=	sock_no_accept,
> +	.setsockopt	=	sock_no_setsockopt,
> +	.poll		=	sock_no_poll,
> +	.sendmsg	=	sock_no_sendmsg,
> +	.sendpage	=	sock_no_sendpage,
> +
> +	.release	=	af_alg_release,
> +	.recvmsg	=	rng_recvmsg,
> +};
> +
> +static void *rng_bind(const char *name, u32 type, u32 mask)
> +{
> +	return crypto_alloc_rng(name, type, mask);
> +}
> +
> +static void rng_release(void *private)
> +{
> +	crypto_free_rng(private);
> +}
> +
> +static void rng_sock_destruct(struct sock *sk)
> +{
> +	struct alg_sock *ask = alg_sk(sk);
> +	struct rng_ctx *ctx = ask->private;
> +
> +	memset(ctx->result, 0, MAXSIZE);

memset(ctx->result, 0, sizeof(ctx->result));

> +	sock_kfree_s(sk, ctx, ctx->len);
> +	af_alg_release_parent(sk);
> +}
> +
> +static int rng_accept_parent(void *private, struct sock *sk)
> +{
> +	struct rng_ctx *ctx;
> +	struct alg_sock *ask = alg_sk(sk);
> +	unsigned int len = sizeof(*ctx);
> +	int seedsize = crypto_rng_seedsize(private);
> +	int ret = -ENOMEM;
> +
> +	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +	memset(ctx->result, 0, MAXSIZE);

Ditto...

> +	ctx->len = len;
> +
> +	if (seedsize) {
> +		u8 *buf = kmalloc(seedsize, GFP_KERNEL);
> +		if (!buf)
> +			goto err;
> +		get_random_bytes(buf, seedsize);
> +		ret = crypto_rng_reset(private, buf, len);
> +		kzfree(buf);

^ permalink raw reply

* Re: [PATCH v6 2/7] vfs: Define new syscalls preadv2,pwritev2
From: mohanty bhagaban @ 2014-11-12 13:18 UTC (permalink / raw)
  To: russel.david100
  Cc: linux-kernel, Christoph Hellwig, linux-fsdevel, linux-aio,
	Mel Gorman, Volker Lendecke, Tejun Heo, Jeff Moyer,
	Theodore Ts'o, Al Viro, linux-api, Michael Kerrisk,
	linux-arch, linux-mm
In-Reply-To: <0a8539257086c2a3f7615d35ef621c7f81df52cf.1415636409.git.milosz@adfin.com>

[-- Attachment #1: Type: text/plain, Size: 19994 bytes --]

Russel,

Will this new flag affect any I/O vector or the buffer cache?

+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *,
vec,
+               unsigned long, vlen, unsigned long, pos_l, unsigned long,
pos_h,
+               int flags)
+{
+       loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+       if (pos == -1)
+               return do_readv(fd, vec, vlen, flags);
+
+       return do_preadv(fd, vec, vlen, pos, flags);
+}
+

Bhagaban





On Mon, Nov 10, 2014 at 10:10 PM, Milosz Tanski <milosz@adfin.com> wrote:

> New syscalls that take an flag argument. This change does not add any
> specific
> flags.
>
> Signed-off-by: Milosz Tanski <milosz@adfin.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/read_write.c                   | 172
> ++++++++++++++++++++++++++++++--------
>  include/linux/compat.h            |   6 ++
>  include/linux/syscalls.h          |   6 ++
>  include/uapi/asm-generic/unistd.h |   6 +-
>  mm/filemap.c                      |   5 +-
>  5 files changed, 156 insertions(+), 39 deletions(-)
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 94b2d34..b1b4bc8 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -866,6 +866,8 @@ ssize_t vfs_readv(struct file *file, const struct
> iovec __user *vec,
>                 return -EBADF;
>         if (!(file->f_mode & FMODE_CAN_READ))
>                 return -EINVAL;
> +       if (flags & ~0)
> +               return -EINVAL;
>
>         return do_readv_writev(READ, file, vec, vlen, pos, flags);
>  }
> @@ -879,21 +881,23 @@ ssize_t vfs_writev(struct file *file, const struct
> iovec __user *vec,
>                 return -EBADF;
>         if (!(file->f_mode & FMODE_CAN_WRITE))
>                 return -EINVAL;
> +       if (flags & ~0)
> +               return -EINVAL;
>
>         return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
>  }
>
>  EXPORT_SYMBOL(vfs_writev);
>
> -SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *,
> vec,
> -               unsigned long, vlen)
> +static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
> +                       unsigned long vlen, int flags)
>  {
>         struct fd f = fdget_pos(fd);
>         ssize_t ret = -EBADF;
>
>         if (f.file) {
>                 loff_t pos = file_pos_read(f.file);
> -               ret = vfs_readv(f.file, vec, vlen, &pos, 0);
> +               ret = vfs_readv(f.file, vec, vlen, &pos, flags);
>                 if (ret >= 0)
>                         file_pos_write(f.file, pos);
>                 fdput_pos(f);
> @@ -905,15 +909,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const
> struct iovec __user *, vec,
>         return ret;
>  }
>
> -SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *,
> vec,
> -               unsigned long, vlen)
> +static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
> +                        unsigned long vlen, int flags)
>  {
>         struct fd f = fdget_pos(fd);
>         ssize_t ret = -EBADF;
>
>         if (f.file) {
>                 loff_t pos = file_pos_read(f.file);
> -               ret = vfs_writev(f.file, vec, vlen, &pos, 0);
> +               ret = vfs_writev(f.file, vec, vlen, &pos, flags);
>                 if (ret >= 0)
>                         file_pos_write(f.file, pos);
>                 fdput_pos(f);
> @@ -931,10 +935,9 @@ static inline loff_t pos_from_hilo(unsigned long
> high, unsigned long low)
>         return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
>  }
>
> -SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *,
> vec,
> -               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h)
> +static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
> +                        unsigned long vlen, loff_t pos, int flags)
>  {
> -       loff_t pos = pos_from_hilo(pos_h, pos_l);
>         struct fd f;
>         ssize_t ret = -EBADF;
>
> @@ -945,7 +948,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const
> struct iovec __user *, vec,
>         if (f.file) {
>                 ret = -ESPIPE;
>                 if (f.file->f_mode & FMODE_PREAD)
> -                       ret = vfs_readv(f.file, vec, vlen, &pos, 0);
> +                       ret = vfs_readv(f.file, vec, vlen, &pos, flags);
>                 fdput(f);
>         }
>
> @@ -955,10 +958,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const
> struct iovec __user *, vec,
>         return ret;
>  }
>
> -SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *,
> vec,
> -               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h)
> +static ssize_t do_pwritev(unsigned long fd, const struct iovec __user
> *vec,
> +                         unsigned long vlen, loff_t pos, int flags)
>  {
> -       loff_t pos = pos_from_hilo(pos_h, pos_l);
>         struct fd f;
>         ssize_t ret = -EBADF;
>
> @@ -969,7 +971,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const
> struct iovec __user *, vec,
>         if (f.file) {
>                 ret = -ESPIPE;
>                 if (f.file->f_mode & FMODE_PWRITE)
> -                       ret = vfs_writev(f.file, vec, vlen, &pos, 0);
> +                       ret = vfs_writev(f.file, vec, vlen, &pos, flags);
>                 fdput(f);
>         }
>
> @@ -979,11 +981,63 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const
> struct iovec __user *, vec,
>         return ret;
>  }
>
> +SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen)
> +{
> +       return do_readv(fd, vec, vlen, 0);
> +}
> +
> +SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen)
> +{
> +       return do_writev(fd, vec, vlen, 0);
> +}
> +
> +SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h)
> +{
> +       loff_t pos = pos_from_hilo(pos_h, pos_l);
> +
> +       return do_preadv(fd, vec, vlen, pos, 0);
> +}
> +
> +SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h,
> +               int, flags)
> +{
> +       loff_t pos = pos_from_hilo(pos_h, pos_l);
> +
> +       if (pos == -1)
> +               return do_readv(fd, vec, vlen, flags);
> +
> +       return do_preadv(fd, vec, vlen, pos, flags);
> +}
> +
> +SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h)
> +{
> +       loff_t pos = pos_from_hilo(pos_h, pos_l);
> +
> +       return do_pwritev(fd, vec, vlen, pos, 0);
> +}
> +
> +SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *,
> vec,
> +               unsigned long, vlen, unsigned long, pos_l, unsigned long,
> pos_h,
> +               int, flags)
> +{
> +       loff_t pos = pos_from_hilo(pos_h, pos_l);
> +
> +       if (pos == -1)
> +               return do_writev(fd, vec, vlen, flags);
> +
> +       return do_pwritev(fd, vec, vlen, pos, flags);
> +}
> +
>  #ifdef CONFIG_COMPAT
>
>  static ssize_t compat_do_readv_writev(int type, struct file *file,
>                                const struct compat_iovec __user *uvector,
> -                              unsigned long nr_segs, loff_t *pos)
> +                              unsigned long nr_segs, loff_t *pos, int
> flags)
>  {
>         compat_ssize_t tot_len;
>         struct iovec iovstack[UIO_FASTIOV];
> @@ -1017,7 +1071,7 @@ static ssize_t compat_do_readv_writev(int type,
> struct file *file,
>
>         if (iter_fn)
>                 ret = do_iter_readv_writev(file, type, iov, nr_segs,
> tot_len,
> -                                               pos, iter_fn, 0);
> +                                               pos, iter_fn, flags);
>         else if (fnv)
>                 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
>                                                 pos, fnv);
> @@ -1041,7 +1095,7 @@ out:
>
>  static size_t compat_readv(struct file *file,
>                            const struct compat_iovec __user *vec,
> -                          unsigned long vlen, loff_t *pos)
> +                          unsigned long vlen, loff_t *pos, int flags)
>  {
>         ssize_t ret = -EBADF;
>
> @@ -1051,8 +1105,10 @@ static size_t compat_readv(struct file *file,
>         ret = -EINVAL;
>         if (!(file->f_mode & FMODE_CAN_READ))
>                 goto out;
> +       if (flags & ~0)
> +               goto out;
>
> -       ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
> +       ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
>
>  out:
>         if (ret > 0)
> @@ -1061,9 +1117,9 @@ out:
>         return ret;
>  }
>
> -COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
> -               const struct compat_iovec __user *,vec,
> -               compat_ulong_t, vlen)
> +static size_t __compat_sys_readv(compat_ulong_t fd,
> +                                const struct compat_iovec __user *vec,
> +                                compat_ulong_t vlen, int flags)
>  {
>         struct fd f = fdget_pos(fd);
>         ssize_t ret;
> @@ -1072,16 +1128,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
>         if (!f.file)
>                 return -EBADF;
>         pos = f.file->f_pos;
> -       ret = compat_readv(f.file, vec, vlen, &pos);
> +       ret = compat_readv(f.file, vec, vlen, &pos, flags);
>         if (ret >= 0)
>                 f.file->f_pos = pos;
>         fdput_pos(f);
>         return ret;
> +
> +}
> +
> +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
> +               const struct compat_iovec __user *,vec,
> +               compat_ulong_t, vlen)
> +{
> +       return __compat_sys_readv(fd, vec, vlen, 0);
>  }
>
>  static long __compat_sys_preadv64(unsigned long fd,
>                                   const struct compat_iovec __user *vec,
> -                                 unsigned long vlen, loff_t pos)
> +                                 unsigned long vlen, loff_t pos, int
> flags)
>  {
>         struct fd f;
>         ssize_t ret;
> @@ -1093,7 +1157,7 @@ static long __compat_sys_preadv64(unsigned long fd,
>                 return -EBADF;
>         ret = -ESPIPE;
>         if (f.file->f_mode & FMODE_PREAD)
> -               ret = compat_readv(f.file, vec, vlen, &pos);
> +               ret = compat_readv(f.file, vec, vlen, &pos, flags);
>         fdput(f);
>         return ret;
>  }
> @@ -1103,7 +1167,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
>                 const struct compat_iovec __user *,vec,
>                 unsigned long, vlen, loff_t, pos)
>  {
> -       return __compat_sys_preadv64(fd, vec, vlen, pos);
> +       return __compat_sys_preadv64(fd, vec, vlen, pos, 0);
>  }
>  #endif
>
> @@ -1113,12 +1177,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
>  {
>         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
>
> -       return __compat_sys_preadv64(fd, vec, vlen, pos);
> +       return __compat_sys_preadv64(fd, vec, vlen, pos, 0);
> +}
> +
> +COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
> +               const struct compat_iovec __user *,vec,
> +               compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
> +               int, flags)
> +{
> +       loff_t pos = ((loff_t)pos_high << 32) | pos_low;
> +
> +       if (pos == -1)
> +               return __compat_sys_readv(fd, vec, vlen, flags);
> +
> +       return __compat_sys_preadv64(fd, vec, vlen, pos, flags);
>  }
>
>  static size_t compat_writev(struct file *file,
>                             const struct compat_iovec __user *vec,
> -                           unsigned long vlen, loff_t *pos)
> +                           unsigned long vlen, loff_t *pos, int flags)
>  {
>         ssize_t ret = -EBADF;
>
> @@ -1128,8 +1205,10 @@ static size_t compat_writev(struct file *file,
>         ret = -EINVAL;
>         if (!(file->f_mode & FMODE_CAN_WRITE))
>                 goto out;
> +       if (flags & ~0)
> +               goto out;
>
> -       ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
> +       ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
>
>  out:
>         if (ret > 0)
> @@ -1138,9 +1217,9 @@ out:
>         return ret;
>  }
>
> -COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
> -               const struct compat_iovec __user *, vec,
> -               compat_ulong_t, vlen)
> +static size_t __compat_sys_writev(compat_ulong_t fd,
> +                                 const struct compat_iovec __user* vec,
> +                                 compat_ulong_t vlen, int flags)
>  {
>         struct fd f = fdget_pos(fd);
>         ssize_t ret;
> @@ -1149,28 +1228,36 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
>         if (!f.file)
>                 return -EBADF;
>         pos = f.file->f_pos;
> -       ret = compat_writev(f.file, vec, vlen, &pos);
> +       ret = compat_writev(f.file, vec, vlen, &pos, flags);
>         if (ret >= 0)
>                 f.file->f_pos = pos;
>         fdput_pos(f);
>         return ret;
>  }
>
> +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
> +               const struct compat_iovec __user *, vec,
> +               compat_ulong_t, vlen)
> +{
> +       return __compat_sys_writev(fd, vec, vlen, 0);
> +}
> +
>  static long __compat_sys_pwritev64(unsigned long fd,
>                                    const struct compat_iovec __user *vec,
> -                                  unsigned long vlen, loff_t pos)
> +                                  unsigned long vlen, loff_t pos, int
> flags)
>  {
>         struct fd f;
>         ssize_t ret;
>
>         if (pos < 0)
>                 return -EINVAL;
> +
>         f = fdget(fd);
>         if (!f.file)
>                 return -EBADF;
>         ret = -ESPIPE;
>         if (f.file->f_mode & FMODE_PWRITE)
> -               ret = compat_writev(f.file, vec, vlen, &pos);
> +               ret = compat_writev(f.file, vec, vlen, &pos, flags);
>         fdput(f);
>         return ret;
>  }
> @@ -1180,7 +1267,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
>                 const struct compat_iovec __user *,vec,
>                 unsigned long, vlen, loff_t, pos)
>  {
> -       return __compat_sys_pwritev64(fd, vec, vlen, pos);
> +       return __compat_sys_pwritev64(fd, vec, vlen, pos, 0);
>  }
>  #endif
>
> @@ -1190,8 +1277,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
>  {
>         loff_t pos = ((loff_t)pos_high << 32) | pos_low;
>
> -       return __compat_sys_pwritev64(fd, vec, vlen, pos);
> +       return __compat_sys_pwritev64(fd, vec, vlen, pos, 0);
> +}
> +
> +COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
> +               const struct compat_iovec __user *,vec,
> +               compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int,
> flags)
> +{
> +       loff_t pos = ((loff_t)pos_high << 32) | pos_low;
> +
> +       if (pos == -1)
> +               return __compat_sys_writev(fd, vec, vlen, flags);
> +
> +       return __compat_sys_pwritev64(fd, vec, vlen, pos, flags);
>  }
> +
>  #endif
>
>  static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
> diff --git a/include/linux/compat.h b/include/linux/compat.h
> index e649426..63a94e2 100644
> --- a/include/linux/compat.h
> +++ b/include/linux/compat.h
> @@ -340,6 +340,12 @@ asmlinkage ssize_t compat_sys_preadv(compat_ulong_t
> fd,
>  asmlinkage ssize_t compat_sys_pwritev(compat_ulong_t fd,
>                 const struct compat_iovec __user *vec,
>                 compat_ulong_t vlen, u32 pos_low, u32 pos_high);
> +asmlinkage ssize_t compat_sys_preadv2(compat_ulong_t fd,
> +               const struct compat_iovec __user *vec,
> +               compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
> +asmlinkage ssize_t compat_sys_pwritev2(compat_ulong_t fd,
> +               const struct compat_iovec __user *vec,
> +               compat_ulong_t vlen, u32 pos_low, u32 pos_high, int flags);
>
>  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
>  asmlinkage long compat_sys_preadv64(unsigned long fd,
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index bda9b81..cedc22e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -571,8 +571,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const
> char __user *buf,
>                              size_t count, loff_t pos);
>  asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user
> *vec,
>                            unsigned long vlen, unsigned long pos_l,
> unsigned long pos_h);
> +asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user
> *vec,
> +                           unsigned long vlen, unsigned long pos_l,
> unsigned long pos_h,
> +                           int flags);
>  asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user
> *vec,
>                             unsigned long vlen, unsigned long pos_l,
> unsigned long pos_h);
> +asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user
> *vec,
> +                           unsigned long vlen, unsigned long pos_l,
> unsigned long pos_h,
> +                           int flags);
>  asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
>  asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
>  asmlinkage long sys_chdir(const char __user *filename);
> diff --git a/include/uapi/asm-generic/unistd.h
> b/include/uapi/asm-generic/unistd.h
> index 22749c1..9406018 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -213,6 +213,10 @@ __SC_COMP(__NR_pwrite64, sys_pwrite64,
> compat_sys_pwrite64)
>  __SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv)
>  #define __NR_pwritev 70
>  __SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev)
> +#define __NR_preadv2 281
> +__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
> +#define __NR_pwritev2 282
> +__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
>
>  /* fs/sendfile.c */
>  #define __NR3264_sendfile 71
> @@ -709,7 +713,7 @@ __SYSCALL(__NR_memfd_create, sys_memfd_create)
>  __SYSCALL(__NR_bpf, sys_bpf)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 281
> +#define __NR_syscalls 283
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 14b4642..530c263 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1457,6 +1457,7 @@ static void shrink_readahead_size_eio(struct file
> *filp,
>   * @ppos:      current file position
>   * @iter:      data destination
>   * @written:   already copied
> + * @flags:     optional flags
>   *
>   * This is a generic file read routine, and uses the
>   * mapping->a_ops->readpage() function for the actual low-level stuff.
> @@ -1465,7 +1466,7 @@ static void shrink_readahead_size_eio(struct file
> *filp,
>   * of the logic when it comes to error handling etc.
>   */
>  static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
> -               struct iov_iter *iter, ssize_t written)
> +               struct iov_iter *iter, ssize_t written, int flags)
>  {
>         struct address_space *mapping = filp->f_mapping;
>         struct inode *inode = mapping->host;
> @@ -1735,7 +1736,7 @@ generic_file_read_iter(struct kiocb *iocb, struct
> iov_iter *iter)
>                 }
>         }
>
> -       retval = do_generic_file_read(file, ppos, iter, retval);
> +       retval = do_generic_file_read(file, ppos, iter, retval,
> iocb->ki_rwflags);
>  out:
>         return retval;
>  }
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

[-- Attachment #2: Type: text/html, Size: 25572 bytes --]

^ permalink raw reply

* [PATCH 8/8] crypto: AF_ALG: enable RNG interface compilation
From: Stephan Mueller @ 2014-11-12  7:06 UTC (permalink / raw)
  To: Herbert Xu; +Cc: ABI/API, linux-crypto, LKML
In-Reply-To: <4738444.A2vZX1nNCo@tachyon.chronox.de>

Enable compilation of the RNG AF_ALG support and provide a Kconfig
option to compile the RNG AF_ALG support.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/Kconfig  | 9 +++++++++
 crypto/Makefile | 1 +
 2 files changed, 10 insertions(+)

diff --git a/crypto/Kconfig b/crypto/Kconfig
index 87bbc9c..e127323 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1505,6 +1505,15 @@ config CRYPTO_USER_API_SKCIPHER
 	  This option enables the user-spaces interface for symmetric
 	  key cipher algorithms.
 
+config CRYPTO_USER_API_RNG
+	tristate "User-space interface for random number generator algorithms"
+	depends on NET
+	select CRYPTO_RNG
+	select CRYPTO_USER_API
+	help
+	  This option enables the user-spaces interface for random
+	  number generator algorithms.
+
 config CRYPTO_HASH_INFO
 	bool
 
diff --git a/crypto/Makefile b/crypto/Makefile
index 1445b91..ba19465 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o
 obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o
 obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o
 obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o
+obj-$(CONFIG_CRYPTO_USER_API_RNG) += algif_rng.o
 
 #
 # generic algorithms and the async_tx api
-- 
2.1.0

^ permalink raw reply related

* [PATCH 7/8] crypto: AF_ALG: add random number generator support
From: Stephan Mueller @ 2014-11-12  7:05 UTC (permalink / raw)
  To: Herbert Xu; +Cc: ABI/API, linux-crypto, LKML
In-Reply-To: <4738444.A2vZX1nNCo@tachyon.chronox.de>

This patch adds the random number generator support for AF_ALG.

A random number generator's purpose is to generate data without
requiring the caller to provide any data. Therefore, the AF_ALG
interface handler for RNGs only implements a callback handler for
recvmsg.

The following parameters provided with a recvmsg are processed by the
RNG callback handler:

	* sock - to resolve the RNG context data structure accessing the
	  RNG instance private to the socket

	* len - this parameter allows userspace callers to specify how
	  many random bytes the RNG shall produce and return. As the
	  kernel context for the RNG allocates a buffer of 128 bytes to
	  store random numbers before copying them to userspace, the len
	  parameter is checked that it is not larger than 128. If a
	  caller wants more random numbers, a new request for recvmsg
	  shall be made.

The size of 128 bytes is chosen because of the following considerations:

	* not to increase the memory footprint of the kernel too much (note,
	  that would be 128 bytes per open socket)

	* 128 is divisible by any typical cryptographic block size an
	  RNG may have

	* A request for random numbers typically only needs to supply a small
	  amount of data, such as keys or IVs, which should require only
	  one invocation of the recvmsg function.

Note, during instantiation of the RNG, the code checks whether the RNG
implementation requires seeding. If so, the RNG is seeded with output
from get_random_bytes.

A fully working example using all aspects of the RNG interface is
provided at http://www.chronox.de/libkcapi.html

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_rng.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 crypto/algif_rng.c

diff --git a/crypto/algif_rng.c b/crypto/algif_rng.c
new file mode 100644
index 0000000..d1904d7
--- /dev/null
+++ b/crypto/algif_rng.c
@@ -0,0 +1,186 @@
+/*
+ * algif_rng: User-space interface for random number generators
+ *
+ * This file provides the user-space API for random number generators.
+ *
+ * Copyright (C) 2014, Stephan Mueller <smueller@chronox.de>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL2 are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF
+ * WHICH ARE HEREBY DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+ * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <crypto/rng.h>
+#include <linux/random.h>
+#include <crypto/if_alg.h>
+#include <linux/net.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>");
+MODULE_DESCRIPTION("User-space interface for random number generators");
+
+struct rng_ctx {
+#define MAXSIZE 128
+	u8 result[MAXSIZE];
+	unsigned int len;
+	struct crypto_rng *drng;
+};
+
+static int rng_recvmsg(struct kiocb *unused, struct socket *sock,
+		       struct msghdr *msg, size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+	struct rng_ctx *ctx = ask->private;
+	int err = -EFAULT;
+
+	if (0 == len)
+		return 0;
+	if (MAXSIZE < len)
+		len = MAXSIZE;
+
+	lock_sock(sk);
+	len = crypto_rng_get_bytes(ctx->drng, ctx->result, len);
+	if (0 > len)
+		goto unlock;
+
+	err = memcpy_toiovec(msg->msg_iov, ctx->result, len);
+	memset(ctx->result, 0, err);
+
+unlock:
+	release_sock(sk);
+
+	return err ? err : len;
+}
+
+static struct proto_ops algif_rng_ops = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.accept		=	sock_no_accept,
+	.setsockopt	=	sock_no_setsockopt,
+	.poll		=	sock_no_poll,
+	.sendmsg	=	sock_no_sendmsg,
+	.sendpage	=	sock_no_sendpage,
+
+	.release	=	af_alg_release,
+	.recvmsg	=	rng_recvmsg,
+};
+
+static void *rng_bind(const char *name, u32 type, u32 mask)
+{
+	return crypto_alloc_rng(name, type, mask);
+}
+
+static void rng_release(void *private)
+{
+	crypto_free_rng(private);
+}
+
+static void rng_sock_destruct(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	struct rng_ctx *ctx = ask->private;
+
+	memset(ctx->result, 0, MAXSIZE);
+	sock_kfree_s(sk, ctx, ctx->len);
+	af_alg_release_parent(sk);
+}
+
+static int rng_accept_parent(void *private, struct sock *sk)
+{
+	struct rng_ctx *ctx;
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned int len = sizeof(*ctx);
+	int seedsize = crypto_rng_seedsize(private);
+	int ret = -ENOMEM;
+
+	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	memset(ctx->result, 0, MAXSIZE);
+
+	ctx->len = len;
+
+	if (seedsize) {
+		u8 *buf = kmalloc(seedsize, GFP_KERNEL);
+		if (!buf)
+			goto err;
+		get_random_bytes(buf, seedsize);
+		ret = crypto_rng_reset(private, buf, len);
+		kzfree(buf);
+		if (ret)
+			goto err;
+	}
+
+	ctx->drng = private;
+	ask->private = ctx;
+	sk->sk_destruct = rng_sock_destruct;
+
+	return 0;
+
+err:
+	sock_kfree_s(sk, ctx, len);
+	return ret;
+}
+
+static const struct af_alg_type algif_type_rng = {
+	.bind		=	rng_bind,
+	.release	=	rng_release,
+	.accept		=	rng_accept_parent,
+	.ops		=	&algif_rng_ops,
+	.name		=	"rng",
+	.owner		=	THIS_MODULE
+};
+
+static int __init rng_init(void)
+{
+	return af_alg_register_type(&algif_type_rng);
+}
+
+void __exit rng_exit(void)
+{
+	int err = af_alg_unregister_type(&algif_type_rng);
+	BUG_ON(err);
+}
+
+module_init(rng_init);
+module_exit(rng_exit);
-- 
2.1.0

^ permalink raw reply related

* [PATCH 6/8] crypto: AF_ALG: make setkey optional
From: Stephan Mueller @ 2014-11-12  7:05 UTC (permalink / raw)
  To: Herbert Xu; +Cc: ABI/API, linux-crypto, LKML
In-Reply-To: <4738444.A2vZX1nNCo@tachyon.chronox.de>

The current AF_ALG implementation requires that a userspace interface
implementation must provide a callback for setkey. Such a call is not
applicable to random number generators.

To prepare AF_ALG for the addition of a random number generator user
space interface, this function callback invocation is made optional.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/af_alg.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 635140b..47a199c 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -177,6 +177,9 @@ static int alg_setkey(struct sock *sk, char __user *ukey,
 	u8 *key;
 	int err;
 
+	if (!type->setkey)
+		return -EOPNOTSUPP;
+
 	key = sock_kmalloc(sk, keylen, GFP_KERNEL);
 	if (!key)
 		return -ENOMEM;
-- 
2.1.0

^ permalink raw reply related

* [PATCH 5/8] crypto: AF_ALG: add AEAD support
From: Stephan Mueller @ 2014-11-12  7:04 UTC (permalink / raw)
  To: Herbert Xu; +Cc: ABI/API, linux-crypto, LKML
In-Reply-To: <4738444.A2vZX1nNCo@tachyon.chronox.de>

This patch adds the AEAD support for AF_ALG.

The AEAD implementation uses the entire memory handling and
infrastructure of the existing skcipher implementation.

To use AEAD, the user space consumer has to use the salg_type named
"aead". The AEAD extension only uses the bind callback as the key
differentiator. The previously added functions that select whether to
use AEAD or ablkcipher crypto API functions depend on the TFM type
allocated during the bind() call.

The addition of AEAD brings a bit of overhead to calculate the size of
the ciphertext, because the AEAD implementation of the kernel crypto API
makes an implied assumption on the location of the authentication tag. When
performing an encryption, the tag will be added to the created
ciphertext (note, the tag is placed adjacent to the ciphertext). For
decryption, the caller must hand in the ciphertext with the tag appended
to the ciphertext. Therefore, the selection of the used memory
needs to add/subtract the tag size from the source/destination buffers
depending on the encryption type. The code is provided with comments
explaining when and how that operation is performed.

Note: The AF_ALG interface does not support zero length plaintext or
zero length ciphertext. Such zero length input data may be used if one
wants to access the hash implementation of an AEAD directly (e.g. the
GHASH of GCM or CMAC for CCM). However, this is a use case that is not
of interest. GHASH or CMAC is directly available via the hash AF_ALG
interface and we therefore do not need to take precautions for this
use case.

A fully working example using all aspects of AEAD is provided at
http://www.chronox.de/libkcapi.html

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_skcipher.c | 153 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 9 deletions(-)

diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index fb8efc8..1e2763d 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -387,6 +387,17 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
 
 		if (con.iv && con.iv->ivlen != ivsize)
 			return -EINVAL;
+
+		/*
+		 * AEAD associated data is limited to a sensible size
+		 * Size limit is set to some arbitrary length to avoid
+		 * user space eating up memory
+		 */
+		if (ctx->aead &&
+		    (con.aead_assoc->aead_assoclen > MAX_AEAD_ASSOCLEN ||
+		     !con.aead_assoc->aead_assoclen ||
+		     !con.aead_assoc || !con.aead_authsize))
+			return -EINVAL;
 	}
 
 	err = -EINVAL;
@@ -399,6 +410,25 @@ static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
 		ctx->enc = enc;
 		if (con.iv)
 			memcpy(ctx->iv, con.iv->iv, ivsize);
+		/* AEAD authentication data handling */
+		if (ctx->aead) {
+			if (con.aead_authsize)
+				err = crypto_aead_setauthsize(
+					crypto_aead_reqtfm(&ctx->u.aead_req),
+							   con.aead_authsize);
+			if (err)
+				goto unlock;
+			/* set associated data */
+			memcpy(ctx->aead_assoc,
+			       con.aead_assoc->aead_assoc,
+			       con.aead_assoc->aead_assoclen);
+			sg_init_one(&ctx->sg_aead_assoc,
+				    ctx->aead_assoc,
+				    con.aead_assoc->aead_assoclen);
+			aead_request_set_assoc(&ctx->u.aead_req,
+					       &ctx->sg_aead_assoc,
+					       con.aead_assoc->aead_assoclen);
+		}
 	}
 
 	while (size) {
@@ -547,10 +577,41 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 	int err = -EAGAIN;
 	int used;
 	long copied = 0;
+	unsigned int aead_authsize_enc = 0;
+	unsigned int aead_authsize_dec = 0;
 
 	lock_sock(sk);
+	/*
+	* AEAD memory structure: For encryption, the tag is appended to the
+	* ciphertext which implies that the memory allocated for the ciphertext
+	* must be increased by the tag length. For decryption, the tag
+	* is expected to be concatenated to the ciphertext. The plaintext
+	* therefore has a memory size of the ciphertext minus the tag length.
+	*
+	* Note: this memory calculation only works because we require the
+	* user space caller to:
+	*	* perform encryption by invoking the recv function with a buffer
+	*	  length of ciphertext + tag size -- the send function can be
+	*	  invoked normally with just the plaintext.
+	*	* perform a decryption by invoking the write function with
+	*	  a buffer holding the ciphertext + tag (and setting the
+	*	  buffer size accordingly) -- the recv function can be invoked
+	*	  normally with just the space needed for the ciphertext.
+	*	  Though, the caller should check for EBADMSG to catch integrity
+	*	  violations.
+	*/
+	if (ctx->aead) {
+		if (ctx->enc)
+			aead_authsize_enc = crypto_aead_authsize(
+					crypto_aead_reqtfm(&ctx->u.aead_req));
+		else
+			aead_authsize_dec = crypto_aead_authsize(
+					crypto_aead_reqtfm(&ctx->u.aead_req));
+	}
+
 	for (iov = msg->msg_iov, iovlen = msg->msg_iovlen; iovlen > 0;
 	     iovlen--, iov++) {
+		/* size of the output data memory */
 		unsigned long seglen = iov->iov_len;
 		char __user *from = iov->iov_base;
 
@@ -562,6 +623,7 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 			while (!sg->length)
 				sg++;
 
+			/* size of the input data memory */
 			used = ctx->used;
 			if (!used) {
 				err = skcipher_wait_for_data(sk, flags);
@@ -569,7 +631,28 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 					goto unlock;
 			}
 
-			used = min_t(unsigned long, used, seglen);
+			used = min_t(unsigned long,
+					     /*
+					      * In case of encryption, add
+					      * the memory needed for the tag
+					      * to the input data length to
+					      * give the cipher the necessary
+					      * space to add the tag.
+					      */
+					     used + aead_authsize_enc,
+					     /*
+					      * In case of decryption, add the
+					      * memory needed for the tag
+					      * calculations to the output
+					      * buffer.
+					      */
+					     seglen + aead_authsize_dec);
+
+			if (used < aead_authsize_enc ||
+			    seglen < aead_authsize_dec) {
+				err = -ENOMEM;
+				goto unlock;
+			}
 
 			used = af_alg_make_sg(&ctx->rsgl, from, used, 1);
 			err = used;
@@ -583,9 +666,16 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 			if (!used)
 				goto free;
 
-			ablkcipher_request_set_crypt(&ctx->req, sg,
-						     ctx->rsgl.sg, used,
-						     ctx->iv);
+			/*
+			 * See API specification of the AEAD API: for
+			 * encryption, we need to tell the encrypt function
+			 * what the size of the plaintext is. But we have
+			 * ensured that we have sufficient memory allocated for
+			 * the operation.
+			 */
+			skcipher_crypto_set_crypt(ctx, sg, ctx->rsgl.sg,
+						  used - aead_authsize_enc,
+						  ctx->iv);
 
 			err = af_alg_wait_for_completion(
 				ctx->enc ?
@@ -599,10 +689,19 @@ free:
 			if (err)
 				goto unlock;
 
-			copied += used;
-			from += used;
-			seglen -= used;
-			skcipher_pull_sgl(sk, used);
+			/*
+			 * Adjust the output buffer counters by only the size
+			 * needed for the plaintext in case of a decryption
+			 */
+			copied += (used - aead_authsize_dec);
+			from += (used - aead_authsize_dec);
+			seglen -= (used - aead_authsize_dec);
+			/*
+			 * Adjust the input buffer by how much we have encrypted
+			 * or decrypted. In case of encryption, we only credit
+			 * the memory of the plaintext.
+			 */
+			skcipher_pull_sgl(sk, used - aead_authsize_enc);
 		}
 	}
 
@@ -724,6 +823,10 @@ static void skcipher_sock_destruct(struct sock *sk)
 	unsigned int ivlen = skcipher_crypto_ivsize_ctx(ctx);
 
 	skcipher_free_sgl(sk);
+	if (ctx->aead) {
+		memset(ctx->aead_assoc, 0, MAX_AEAD_ASSOCLEN);
+		sock_kfree_s(sk, ctx->aead_assoc, MAX_AEAD_ASSOCLEN);
+	}
 	sock_kfree_s(sk, ctx->iv, ivlen);
 	sock_kfree_s(sk, ctx, ctx->len);
 	af_alg_release_parent(sk);
@@ -748,6 +851,17 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 
 	memset(ctx->iv, 0, ivlen);
 
+	if (skcipher_is_aead(private)) {
+		ctx->aead_assoc = sock_kmalloc(sk, MAX_AEAD_ASSOCLEN,
+					       GFP_KERNEL);
+		if (!ctx->aead_assoc) {
+			sock_kfree_s(sk, ctx->iv, ivlen);
+			sock_kfree_s(sk, ctx, len);
+			return -ENOMEM;
+		}
+		memset(ctx->aead_assoc, 0, MAX_AEAD_ASSOCLEN);
+	}
+
 	INIT_LIST_HEAD(&ctx->tsgl);
 	ctx->len = len;
 	ctx->used = 0;
@@ -778,15 +892,36 @@ static const struct af_alg_type algif_type_skcipher = {
 	.owner		=	THIS_MODULE
 };
 
+static void *aead_bind(const char *name, u32 type, u32 mask)
+{
+	return crypto_alloc_aead(name, type, mask);
+}
+
+static const struct af_alg_type algif_type_aead = {
+	.bind		=	aead_bind,
+	.release	=	skcipher_release,
+	.setkey		=	skcipher_setkey,
+	.accept		=	skcipher_accept_parent,
+	.ops		=	&algif_skcipher_ops,
+	.name		=	"aead",
+	.owner		=	THIS_MODULE
+};
+
 static int __init algif_skcipher_init(void)
 {
-	return af_alg_register_type(&algif_type_skcipher);
+	int ret = af_alg_register_type(&algif_type_skcipher);
+
+	if (ret)
+		return ret;
+	return af_alg_register_type(&algif_type_aead);
 }
 
 static void __exit algif_skcipher_exit(void)
 {
 	int err = af_alg_unregister_type(&algif_type_skcipher);
 	BUG_ON(err);
+	err = af_alg_unregister_type(&algif_type_aead);
+	BUG_ON(err);
 }
 
 module_init(algif_skcipher_init);
-- 
2.1.0

^ permalink raw reply related

* [PATCH 4/8] crypto: AF_ALG: crypto API calls to inline functions
From: Stephan Mueller @ 2014-11-12  7:03 UTC (permalink / raw)
  To: Herbert Xu; +Cc: ABI/API, linux-crypto, LKML
In-Reply-To: <4738444.A2vZX1nNCo@tachyon.chronox.de>

To avoid excessive branches and cluttering the code, all kernel crypto
API calls are extracted into separate inline functions. These functions
invoke either the ablkcipher or the aead crypto API function calls, as
necessary.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
---
 crypto/algif_skcipher.c | 141 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 124 insertions(+), 17 deletions(-)

diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c
index 9286cfc..fb8efc8 100644
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -247,14 +247,121 @@ static void skcipher_data_wakeup(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static inline bool skcipher_is_aead(struct crypto_tfm *tfm)
+{
+	return ((crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_MASK) ==
+		CRYPTO_ALG_TYPE_AEAD);
+}
+
+static inline unsigned int skcipher_crypto_ivsize(void *private)
+{
+	if (skcipher_is_aead(private))
+		return crypto_aead_ivsize(private);
+	else
+		return crypto_ablkcipher_ivsize(private);
+}
+
+static inline unsigned int skcipher_crypto_ivsize_ctx(struct skcipher_ctx *ctx)
+{
+	if (ctx->aead)
+		return crypto_aead_ivsize(crypto_aead_reqtfm(&ctx->u.aead_req));
+	else
+		return crypto_ablkcipher_ivsize(
+			crypto_ablkcipher_reqtfm(&ctx->u.ablkcipher_req));
+}
+
+static inline unsigned int skcipher_crypto_blocksize(struct skcipher_ctx *ctx)
+{
+	if (ctx->aead)
+		return crypto_aead_blocksize(
+			crypto_aead_reqtfm(&ctx->u.aead_req));
+	else
+		return crypto_ablkcipher_blocksize(
+			crypto_ablkcipher_reqtfm(&ctx->u.ablkcipher_req));
+}
+
+static inline unsigned int skcipher_crypto_reqsize(void *private)
+{
+	if (skcipher_is_aead(private))
+		return crypto_aead_reqsize(private);
+	else
+		return crypto_ablkcipher_reqsize(private);
+}
+
+static inline unsigned int skcipher_crypto_setkey(void *private, const u8 *key,
+						  unsigned int keylen)
+{
+	if (skcipher_is_aead(private))
+		return crypto_aead_setkey(private, key, keylen);
+	else
+		return crypto_ablkcipher_setkey(private, key, keylen);
+}
+
+static inline void skcipher_crypto_free(void *private)
+{
+	if (skcipher_is_aead(private))
+		crypto_free_aead(private);
+	else
+		crypto_free_ablkcipher(private);
+}
+
+static inline void skcipher_request_set_tfm(struct skcipher_ctx *ctx, void *tfm)
+{
+	if (ctx->aead)
+		aead_request_set_tfm(&ctx->u.aead_req, tfm);
+	else
+		ablkcipher_request_set_tfm(&ctx->u.ablkcipher_req, tfm);
+}
+
+static inline int skcipher_crypto_encrypt(struct skcipher_ctx *ctx)
+{
+	if (ctx->aead)
+		return crypto_aead_encrypt(&ctx->u.aead_req);
+	else
+		return crypto_ablkcipher_encrypt(&ctx->u.ablkcipher_req);
+}
+
+static inline int skcipher_crypto_decrypt(struct skcipher_ctx *ctx)
+{
+	if (ctx->aead)
+		return crypto_aead_decrypt(&ctx->u.aead_req);
+	else
+		return crypto_ablkcipher_decrypt(&ctx->u.ablkcipher_req);
+}
+
+static inline void skcipher_crypto_set_crypt(struct skcipher_ctx *ctx,
+					     struct scatterlist *src,
+					     struct scatterlist *dst,
+					     unsigned int cryptlen, u8 *iv)
+{
+	if (ctx->aead)
+		return aead_request_set_crypt(&ctx->u.aead_req, src, dst,
+					      cryptlen, iv);
+	else
+		return ablkcipher_request_set_crypt(&ctx->u.ablkcipher_req, src,
+						    dst, cryptlen, iv);
+}
+
+static inline void skcipher_request_set_callback(struct skcipher_ctx *ctx,
+						 u32 flags,
+						 crypto_completion_t complete,
+						 void *data)
+{
+	if (ctx->aead)
+		aead_request_set_callback(&ctx->u.aead_req, flags, complete,
+					  data);
+	else
+		ablkcipher_request_set_callback(&ctx->u.ablkcipher_req, flags,
+						complete, data);
+}
+
 static int skcipher_sendmsg(struct kiocb *unused, struct socket *sock,
 			    struct msghdr *msg, size_t size)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct skcipher_ctx *ctx = ask->private;
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
-	unsigned ivsize = crypto_ablkcipher_ivsize(tfm);
+	unsigned ivsize = skcipher_crypto_ivsize_ctx(ctx);
 	struct skcipher_sg_list *sgl;
 	struct af_alg_control con = {};
 	long copied = 0;
@@ -432,8 +539,7 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	struct skcipher_ctx *ctx = ask->private;
-	unsigned bs = crypto_ablkcipher_blocksize(crypto_ablkcipher_reqtfm(
-		&ctx->req));
+	unsigned bs = skcipher_crypto_blocksize(ctx);
 	struct skcipher_sg_list *sgl;
 	struct scatterlist *sg;
 	unsigned long iovlen;
@@ -483,8 +589,8 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 
 			err = af_alg_wait_for_completion(
 				ctx->enc ?
-					crypto_ablkcipher_encrypt(&ctx->req) :
-					crypto_ablkcipher_decrypt(&ctx->req),
+					skcipher_crypto_encrypt(ctx) :
+					skcipher_crypto_decrypt(ctx),
 				&ctx->completion);
 
 free:
@@ -603,22 +709,22 @@ static void *skcipher_bind(const char *name, u32 type, u32 mask)
 
 static void skcipher_release(void *private)
 {
-	crypto_free_ablkcipher(private);
+	skcipher_crypto_free(private);
 }
 
 static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
 {
-	return crypto_ablkcipher_setkey(private, key, keylen);
+	return skcipher_crypto_setkey(private, key, keylen);
 }
 
 static void skcipher_sock_destruct(struct sock *sk)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	struct skcipher_ctx *ctx = ask->private;
-	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
+	unsigned int ivlen = skcipher_crypto_ivsize_ctx(ctx);
 
 	skcipher_free_sgl(sk);
-	sock_kfree_s(sk, ctx->iv, crypto_ablkcipher_ivsize(tfm));
+	sock_kfree_s(sk, ctx->iv, ivlen);
 	sock_kfree_s(sk, ctx, ctx->len);
 	af_alg_release_parent(sk);
 }
@@ -627,20 +733,20 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 {
 	struct skcipher_ctx *ctx;
 	struct alg_sock *ask = alg_sk(sk);
-	unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(private);
+	unsigned int len = sizeof(*ctx) + skcipher_crypto_reqsize(private);
+	unsigned int ivlen = skcipher_crypto_ivsize(private);
 
 	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
 
-	ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(private),
-			       GFP_KERNEL);
+	ctx->iv = sock_kmalloc(sk, ivlen, GFP_KERNEL);
 	if (!ctx->iv) {
 		sock_kfree_s(sk, ctx, len);
 		return -ENOMEM;
 	}
 
-	memset(ctx->iv, 0, crypto_ablkcipher_ivsize(private));
+	memset(ctx->iv, 0, ivlen);
 
 	INIT_LIST_HEAD(&ctx->tsgl);
 	ctx->len = len;
@@ -648,13 +754,14 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 	ctx->more = 0;
 	ctx->merge = 0;
 	ctx->enc = 0;
+	ctx->aead = skcipher_is_aead(private);
 	af_alg_init_completion(&ctx->completion);
 
 	ask->private = ctx;
 
-	ablkcipher_request_set_tfm(&ctx->req, private);
-	ablkcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-					af_alg_complete, &ctx->completion);
+	skcipher_request_set_tfm(ctx, private);
+	skcipher_request_set_callback(ctx, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				      af_alg_complete, &ctx->completion);
 
 	sk->sk_destruct = skcipher_sock_destruct;
 
-- 
2.1.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox