Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 net-next 2/7] bpf: add hashtable type of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

add new map type BPF_MAP_TYPE_HASH and its implementation

- maps are created/destroyed by userspace. Both userspace and eBPF programs
  can lookup/update/delete elements from the map

- eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism
  for concurrent updates

- key/value are opaque range of bytes (aligned to 8 bytes)

- user space provides 3 configuration attributes via BPF syscall:
  key_size, value_size, max_entries

- map takes care of allocating/freeing key/value pairs

- map_update_elem() must fail to insert new element when max_entries
  limit is reached to make sure that eBPF programs cannot exhaust memory

- map_update_elem() replaces elements in an atomic way

- optimized for speed of lookup() which can be called multiple times from
  eBPF program which itself is triggered by high volume of events
  . in the future JIT compiler may recognize lookup() call and optimize it
    further, since key_size is constant for life of eBPF program

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/hashtab.c     |  362 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/hashtab.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3e9e1b77f29d..03a01fd609aa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -111,6 +111,7 @@ enum bpf_cmd {
 
 enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..2c0ec7f9da78 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..d234a012f046
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,362 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct hlist_head *buckets;
+	spinlock_t lock;
+	u32 count;	/* number of elements in this hashtable */
+	u32 n_buckets;	/* number of hash buckets */
+	u32 elem_size;	/* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct hlist_node hash_node;
+	struct rcu_head rcu;
+	u32 hash;
+	char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int err, i;
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	htab->map.key_size = attr->key_size;
+	htab->map.value_size = attr->value_size;
+	htab->map.max_entries = attr->max_entries;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	err = -EINVAL;
+	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+	    htab->map.value_size == 0)
+		goto free_htab;
+
+	/* hash table size must be power of 2 */
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+	err = -E2BIG;
+	if (htab->map.key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		goto free_htab;
+
+	err = -ENOMEM;
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+				      GFP_USER | __GFP_NOWARN);
+
+	if (!htab->buckets) {
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		if (!htab->buckets)
+			goto free_htab;
+	}
+
+	for (i = 0; i < htab->n_buckets; i++)
+		INIT_HLIST_HEAD(&htab->buckets[i]);
+
+	spin_lock_init(&htab->lock);
+	htab->count = 0;
+
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8) +
+			  htab->map.value_size;
+	return &htab->map;
+
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+					 void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 hash, key_size;
+
+	/* Must be called with rcu_read_lock. */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+
+	return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l, *next_l;
+	u32 hash, key_size;
+	int i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	/* lookup the key */
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (!l) {
+		i = 0;
+		goto find_first_elem;
+	}
+
+	/* key was found, get next key in the same bucket */
+	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+				  struct htab_elem, hash_node);
+
+	if (next_l) {
+		/* if next elem in this hash list is non-zero, just return it */
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		head = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					  struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* itereated over all buckets and all elements */
+	return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 key_size;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+	if (!l_new)
+		return -ENOMEM;
+
+	key_size = map->key_size;
+
+	memcpy(l_new->key, key, key_size);
+	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+	l_new->hash = htab_map_hash(l_new->key, key_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, l_new->hash);
+
+	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	/* add new element to the head of the list, so that concurrent
+	 * search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		hlist_del_rcu(&l_old->hash_node);
+		kfree_rcu(l_old, rcu);
+	} else {
+		htab->count++;
+	}
+	spin_unlock_irqrestore(&htab->lock, flags);
+
+	return 0;
+err:
+	spin_unlock_irqrestore(&htab->lock, flags);
+	kfree(l_new);
+	return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		htab->count--;
+		kfree_rcu(l, rcu);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&htab->lock, flags);
+	return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			htab->count--;
+			kfree(l);
+		}
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	/* some of kfree_rcu() callbacks for elements of this map may not have
+	 * executed. It's ok. Proceed to free residual elements and map itself
+	 */
+	delete_all_elements(htab);
+	kvfree(htab->buckets);
+	kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &htab_ops,
+	.type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_htab_map);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 3/7] bpf: add array type of eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

add new map type BPF_MAP_TYPE_ARRAY and its implementation

- optimized for fastest possible lookup()
  . in the future verifier/JIT may recognize lookup() with constant key
    and optimize it into constant pointer. Can optimize non-constant
    key into direct pointer arithmetic as well, since pointers and
    value_size are constant for the life of the eBPF program.
    In other words array_map_lookup_elem() may be 'inlined' by verifier/JIT
    while preserving concurrent access to this map from user space

- two main use cases for array type:
  . 'global' eBPF variables: array of 1 element with key=0 and value is a
    collection of 'global' variables which programs can use to keep the state
    between events
  . aggregation of tracing events into fixed set of buckets

- all array elements pre-allocated and zero initialized at init time

- key as an index in array and can only be 4 byte

- map_delete_elem() returns EINVAL, since elements cannot be deleted

- map_update_elem() replaces elements in an non-atomic way
  (for atomic updates hashtable type should be used instead)

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

Note, from eBPF program and from user space, all map types are accessed
through the same API.

Example of using array type for 'global' variables from eBPF program:
struct globals {
    u64 lat_ave;
    u64 lat_sum;
    u64 missed;
    u64 max_lat;
    int num_samples;
};

struct bpf_map_def SEC("maps") global_map = {
    .type = BPF_MAP_TYPE_ARRAY,
    .key_size = sizeof(int),
    .value_size = sizeof(struct globals),
    .max_entries = 1,
};

int bpf_prog(struct bpf_context *ctx)
{
    ...
    int ind = 0;
    struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
    if (!g)
            return 0;
    if (g->lat_ave == 0) {
            g->num_samples++;
            g->lat_sum += delta;
            if (g->num_samples >= 100) {
                    g->lat_ave = g->lat_sum / g->num_samples;
    ...

The future verifier/JIT optimization will replace bpf_map_lookup_elem()
call inside eBPF program with const pointer to element value of key=0,
so that eBPF program will have no penalty whatsoever to access such
'global' variables.
At the same time user space can access this 'globals' via common map API.

Full example of both kernel and user side follows in later patches.

The array map is like C array of structures. Nothing protects concurrent access.
It's used in the cases where accuracy is not needed or when there is no
concurrent access. To compute a histogram of events in tracing the array
of integers is used. Every integer is a counter. Program increments it
(may be without using xadd) and user space periodically reads it back.
map_update_elem() is called by userspace once to initialize it if zero-init
is not enough. Programs do lookup() and modify the values.
For array type update() method is used rarely, delete() is never used and
get_next() is needed for completeness to browse maps through common map API.

 include/uapi/linux/bpf.h |    1 +
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/arraymap.c    |  151 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/arraymap.c

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 03a01fd609aa..0d662fe75df5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_cmd {
 enum bpf_map_type {
 	BPF_MAP_TYPE_UNSPEC,
 	BPF_MAP_TYPE_HASH,
+	BPF_MAP_TYPE_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 2c0ec7f9da78..72ec98ba2d42 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..f4f6965f86cb
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array *array;
+	u32 elem_size;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size == 0)
+		return ERR_PTR(-EINVAL);
+
+	elem_size = round_up(attr->value_size, 8);
+
+	/* allocate all map elements and zero-initialize them */
+	array = kzalloc(sizeof(*array) + attr->max_entries * elem_size,
+			GFP_USER | __GFP_NOWARN);
+	if (!array) {
+		array = vzalloc(array->map.max_entries * array->elem_size);
+		if (!array)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* copy mandatory map attributes */
+	array->map.key_size = attr->key_size;
+	array->map.value_size = attr->value_size;
+	array->map.max_entries = attr->max_entries;
+
+	array->elem_size = elem_size;
+
+	return &array->map;
+
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return NULL;
+
+	return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+	
+	if (map_flags == BPF_NOEXIST)
+		/* all elemenets already exist */
+		return -EEXIST;
+
+	memcpy(array->value + array->elem_size * index, value, array->elem_size);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding programs to complete
+	 * and free the array
+	 */
+	synchronize_rcu();
+
+	kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &array_ops,
+	.type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_array_map);
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 4/7] bpf: fix BPF_MAP_LOOKUP_ELEM command return code
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

fix errno of BPF_MAP_LOOKUP_ELEM command as bpf manpage
described it in commit b4fc1a460f30("Merge branch 'bpf-next'"):
-----
BPF_MAP_LOOKUP_ELEM
    int bpf_lookup_elem(int fd, void *key, void *value)
    {
        union bpf_attr attr = {
            .map_fd = fd,
            .key = ptr_to_u64(key),
            .value = ptr_to_u64(value),
        };

        return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
    }
    bpf() syscall looks up an element with given key in  a  map  fd.
    If  element  is found it returns zero and stores element's value
    into value.  If element is not found  it  returns  -1  and  sets
    errno to ENOENT.

and further down in manpage:

   ENOENT For BPF_MAP_LOOKUP_ELEM or BPF_MAP_DELETE_ELEM,  indicates  that
          element with given key was not found.
-----

In general all BPF commands return ENOENT when map element is not found
(including BPF_MAP_GET_NEXT_KEY and BPF_MAP_UPDATE_ELEM with
 flags == BPF_MAP_UPDATE_ONLY)

Subsequent patch adds a testsuite to check return values for all of
these combinations.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

I don't think this patch is needed for 'net', since 'net' has syscall shell
only. Actual map types and their implementations are being introduced by
this set of patches.

 kernel/bpf/syscall.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0d03bf317a2..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
-	err = -ESRCH;
+	err = -ENOENT;
 	rcu_read_lock();
 	value = map->ops->map_lookup_elem(map, key);
 	if (!value)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 5/7] bpf: add a testsuite for eBPF maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

. check error conditions and sanity of hash and array map APIs
. check large maps (that kernel gracefully switches to vmalloc from kmalloc)
. check multi-process parallel access and stress test

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
Eventually it can be moved tools/testing/selftests/bpf/, but for now keep
it in samples/bpf/, since that's where all subsequent samples are coming to.

 samples/bpf/Makefile    |    3 +-
 samples/bpf/libbpf.c    |    3 +-
 samples/bpf/libbpf.h    |    2 +-
 samples/bpf/test_maps.c |  291 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 296 insertions(+), 3 deletions(-)
 create mode 100644 samples/bpf/test_maps.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 634391797856..0718d9ce4619 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -2,9 +2,10 @@
 obj- := dummy.o
 
 # List of programs to build
-hostprogs-y := test_verifier
+hostprogs-y := test_verifier test_maps
 
 test_verifier-objs := test_verifier.o libbpf.o
+test_maps-objs := test_maps.o libbpf.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c
index ff6504420738..17bb520eb57f 100644
--- a/samples/bpf/libbpf.c
+++ b/samples/bpf/libbpf.c
@@ -27,12 +27,13 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 }
 
-int bpf_update_elem(int fd, void *key, void *value)
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
 {
 	union bpf_attr attr = {
 		.map_fd = fd,
 		.key = ptr_to_u64(key),
 		.value = ptr_to_u64(value),
+		.flags = flags,
 	};
 
 	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 8a31babeca5d..f8678e5f48bf 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -6,7 +6,7 @@ struct bpf_insn;
 
 int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
 		   int max_entries);
-int bpf_update_elem(int fd, void *key, void *value);
+int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
 int bpf_lookup_elem(int fd, void *key, void *value);
 int bpf_delete_elem(int fd, void *key);
 int bpf_get_next_key(int fd, void *key, void *next_key);
diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
new file mode 100644
index 000000000000..e286b42307f3
--- /dev/null
+++ b/samples/bpf/test_maps.c
@@ -0,0 +1,291 @@
+/*
+ * Testsuite for eBPF maps
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include "libbpf.h"
+
+/* sanity tests for map API */
+static void test_hashmap_sanity(int i, void *data)
+{
+	long long key, next_key, value;
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create hashmap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+	value = 0;
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       /* key=1 already exists */
+	       errno == EEXIST);
+
+	assert(bpf_update_elem(map_fd, &key, &value, -1) == -1 && errno == EINVAL);
+
+	/* check that key=1 can be found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+	key = 2;
+	/* check that key=2 is not found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	/* insert key=2 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+
+	/* key=1 and key=2 were inserted, check that key=0 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* check that key = 0 doesn't exist */
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+	       next_key == 2);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+	       next_key == 1);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+	       errno == ENOENT);
+
+	/* delete both elements */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	key = 2;
+	assert(bpf_delete_elem(map_fd, &key) == 0);
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == ENOENT);
+
+	key = 0;
+	/* check that map is empty */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == -1 &&
+	       errno == ENOENT);
+	close(map_fd);
+}
+
+static void test_arraymap_sanity(int i, void *data)
+{
+	int key, next_key, map_fd;
+	long long value;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2);
+	if (map_fd < 0) {
+		printf("failed to create arraymap '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	key = 1;
+	value = 1234;
+	/* insert key=1 element */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_ANY) == 0);
+
+	value = 0;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == EEXIST);
+
+	/* check that key=1 can be found */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 1234);
+
+	key = 0;
+	/* check that key=0 is also found and zero initialized */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+
+
+	/* key=0 and key=1 were inserted, check that key=2 cannot be inserted
+	 * due to max_entries limit
+	 */
+	key = 2;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_EXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* check that key = 2 doesn't exist */
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	/* iterate over two elements */
+	assert(bpf_get_next_key(map_fd, &key, &next_key) == 0 &&
+	       next_key == 0);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == 0 &&
+	       next_key == 1);
+	assert(bpf_get_next_key(map_fd, &next_key, &next_key) == -1 &&
+	       errno == ENOENT);
+
+	/* delete shouldn't succeed */
+	key = 1;
+	assert(bpf_delete_elem(map_fd, &key) == -1 && errno == EINVAL);
+
+	close(map_fd);
+}
+
+#define MAP_SIZE (32 * 1024)
+static void test_map_large(void)
+{
+	struct bigkey {
+		int a;
+		char b[116];
+		long long c;
+	} key;
+	int map_fd, i, value;
+
+	/* allocate 4Mbyte of memory */
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+				MAP_SIZE);
+	if (map_fd < 0) {
+		printf("failed to create large map '%s'\n", strerror(errno));
+		exit(1);
+	}
+
+	for (i = 0; i < MAP_SIZE; i++) {
+		key = (struct bigkey) {.c = i};
+		value = i;
+		assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+	}
+	key.c = -1;
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
+
+	/* iterate through all elements */
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+	key.c = 0;
+	assert(bpf_lookup_elem(map_fd, &key, &value) == 0 && value == 0);
+	key.a = 1;
+	assert(bpf_lookup_elem(map_fd, &key, &value) == -1 && errno == ENOENT);
+
+	close(map_fd);
+}
+
+/* fork N children and wait for them to complete */
+static void run_parallel(int tasks, void (*fn)(int i, void *data), void *data)
+{
+	pid_t pid[tasks];
+	int i;
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			fn(i, data);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		assert(status == 0);
+	}
+}
+
+static void test_map_stress(void)
+{
+	run_parallel(100, test_hashmap_sanity, NULL);
+	run_parallel(100, test_arraymap_sanity, NULL);
+}
+
+#define TASKS 1024
+#define DO_UPDATE 1
+#define DO_DELETE 0
+static void do_work(int fn, void *data)
+{
+	int map_fd = ((int *)data)[0];
+	int do_update = ((int *)data)[1];
+	int i;
+	int key, value;
+
+	for (i = fn; i < MAP_SIZE; i += TASKS) {
+		key = value = i;
+		if (do_update)
+			assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == 0);
+		else
+			assert(bpf_delete_elem(map_fd, &key) == 0);
+	}
+}
+
+static void test_map_parallel(void)
+{
+	int i, map_fd, key = 0, value = 0;
+	int data[2];
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+				MAP_SIZE);
+	if (map_fd < 0) {
+		printf("failed to create map for parallel test '%s'\n",
+		       strerror(errno));
+		exit(1);
+	}
+
+	data[0] = map_fd;
+	data[1] = DO_UPDATE;
+	/* use the same map_fd in children to add elements to this map
+	 * child_0 adds key=0, key=1024, key=2048, ...
+	 * child_1 adds key=1, key=1025, key=2049, ...
+	 * child_1023 adds key=1023, ...
+	 */
+	run_parallel(TASKS, do_work, data);
+
+	/* check that key=0 is already there */
+	assert(bpf_update_elem(map_fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == EEXIST);
+
+	/* check that all elements were inserted */
+	key = -1;
+	for (i = 0; i < MAP_SIZE; i++)
+		assert(bpf_get_next_key(map_fd, &key, &key) == 0);
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+
+	/* another check for all elements */
+	for (i = 0; i < MAP_SIZE; i++) {
+		key = MAP_SIZE - i - 1;
+		assert(bpf_lookup_elem(map_fd, &key, &value) == 0 &&
+		       value == key);
+	}
+
+	/* now let's delete all elemenets in parallel */
+	data[1] = DO_DELETE;
+	run_parallel(TASKS, do_work, data);
+
+	/* nothing should be left */
+	key = -1;
+	assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
+}
+
+int main(void)
+{
+	test_hashmap_sanity(0, NULL);
+	test_arraymap_sanity(0, NULL);
+	test_map_large();
+	test_map_parallel();
+	test_map_stress();
+	printf("test_maps: OK\n");
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 6/7] bpf: allow eBPF programs to use maps
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem()
map accessors to eBPF programs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
Note, these helpers are exposed as '.gpl_only = false', so non-GPL eBPF programs
can use them. That was requested by AndyL and DavidL before.

 include/linux/bpf.h      |    5 +++
 include/uapi/linux/bpf.h |    3 ++
 kernel/bpf/Makefile      |    2 +-
 kernel/bpf/helpers.c     |   89 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/helpers.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 51e9242e4803..75e94eaa228b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd);
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog *fp, union bpf_attr *attr);
 
+/* verifier prototypes for helper functions called from eBPF programs */
+extern struct bpf_func_proto bpf_map_lookup_elem_proto;
+extern struct bpf_func_proto bpf_map_update_elem_proto;
+extern struct bpf_func_proto bpf_map_delete_elem_proto;
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d662fe75df5..4a3d0f84f178 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -158,6 +158,9 @@ union bpf_attr {
  */
 enum bpf_func_id {
 	BPF_FUNC_unspec,
+	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
+	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
+	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 72ec98ba2d42..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* verifier checked that R1 contains a valid pointer to bpf_map
+	 * and R2 points to a program stack and map->key_size bytes were
+	 * initialized
+	 */
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	value = map->ops->map_lookup_elem(map, key);
+
+	/* lookup() returns either pointer to element value or NULL
+	 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+	 */
+	return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+	.func = bpf_map_lookup_elem,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value = (void *) (unsigned long) r3;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+	.func = bpf_map_update_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.arg3_type = ARG_PTR_TO_MAP_VALUE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+	.func = bpf_map_delete_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 net-next 7/7] bpf: remove test map scaffolding and user proper types
From: Alexei Starovoitov @ 2014-11-14  1:36 UTC (permalink / raw)
  To: David S. Miller
  Cc: Ingo Molnar, Andy Lutomirski, Daniel Borkmann,
	Hannes Frederic Sowa, Eric Dumazet, linux-api, netdev,
	linux-kernel
In-Reply-To: <1415929010-9361-1-git-send-email-ast@plumgrid.com>

proper types and function helpers are ready. Use them in verifier testsuite.
Remove temporary stubs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 kernel/bpf/test_stub.c      |   56 +++++++------------------------------------
 samples/bpf/test_verifier.c |   14 +++++------
 2 files changed, 16 insertions(+), 54 deletions(-)

diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
 	u64 arg2;
 };
 
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
-	[BPF_FUNC_unspec] = {
-		.func = test_func,
-		.gpl_only = true,
-		.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
-		.arg1_type = ARG_CONST_MAP_PTR,
-		.arg2_type = ARG_PTR_TO_MAP_KEY,
-	},
-};
-
 static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
 {
-	if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
 		return NULL;
-	return &test_funcs[func_id];
+	}
 }
 
 static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
 	.type = BPF_PROG_TYPE_UNSPEC,
 };
 
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_map *map;
-
-	map = kzalloc(sizeof(*map), GFP_USER);
-	if (!map)
-		return ERR_PTR(-ENOMEM);
-
-	map->key_size = attr->key_size;
-	map->value_size = attr->value_size;
-	map->max_entries = attr->max_entries;
-	return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
-	kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
-	.map_alloc = test_map_alloc,
-	.map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
-	.ops = &test_map_ops,
-	.type = BPF_MAP_TYPE_UNSPEC,
-};
-
 static int __init register_test_ops(void)
 {
-	bpf_register_map_type(&tl_map);
 	bpf_register_prog_type(&tl_prog);
 	return 0;
 }
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index 63402742345e..b96175e90363 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -261,7 +261,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_EXIT_INSN(),
 		},
 		.fixup = {2},
@@ -417,7 +417,7 @@ static struct bpf_test tests[] = {
 			BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "fd 0 is not pointing to valid bpf_map",
@@ -430,7 +430,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
 			BPF_EXIT_INSN(),
 		},
@@ -445,7 +445,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
 			BPF_EXIT_INSN(),
@@ -461,7 +461,7 @@ static struct bpf_test tests[] = {
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 			BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
 			BPF_EXIT_INSN(),
@@ -548,7 +548,7 @@ static struct bpf_test tests[] = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56),
 			BPF_LD_MAP_FD(BPF_REG_1, 0),
-			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
 			BPF_EXIT_INSN(),
 		},
 		.fixup = {24},
@@ -659,7 +659,7 @@ static int create_map(void)
 	long long key, value = 0;
 	int map_fd;
 
-	map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024);
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
 	if (map_fd < 0) {
 		printf("failed to create map '%s'\n", strerror(errno));
 	}
-- 
1.7.9.5

^ permalink raw reply related

* Re: [GIT PULL nf] Second Round of IPVS Fixes for v3.18
From: Simon Horman @ 2014-11-14  1:59 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov
In-Reply-To: <20141113113818.GA7570@salvia>

On Thu, Nov 13, 2014 at 12:38:18PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Nov 12, 2014 at 11:21:59AM +0900, Simon Horman wrote:
> > Hi Pablo,
> > 
> > please consider this fix for v3.18.
> > 
> > It fixes handling of skb->sk which may cause incorrect handling
> > of connections from a local process.
> > 
> > This problem was introduced in its current form by 8052ba292559f907e
> > ("ipvs: support ipv4 in ipv6 and ipv6 in ipv4 tunnel forwarding") in
> > v3.18-rc1.
> 
> Pulled, thanks Simon.
> 
> > I believe it also exists in a different form in older kernels.
> > No fix for that is available at this time.
> 
> AFAIK -stable also accepts backports if there's a clear relation
> between this original patch in mainstream and the backported version.

Thanks. I will see about making one.

^ permalink raw reply

* net-next panic in ovs call to arch_fast_hash2 since e5a2c899
From: Jay Vosburgh @ 2014-11-14  2:15 UTC (permalink / raw)
  To: netdev; +Cc: discuss, Pravin Shelar, Or Gerlitz


	I'm having an issue with recent net-next, wherein a call is now
using alternative_call, and this is apparently being mis-compiled for
the "don't have feature" case.

	I'm using gcc (Ubuntu 4.8.2-19ubuntu1) 4.8.2 on an Ubuntu 14.04
system.

	The call is in net/openvswitch/flow_table.c:flow_hash(), which
as of commit

commit e5a2c899957659cd1a9f789bc462f9c0b35f5150
Author: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date:   Wed Nov 5 00:23:04 2014 +0100

    fast_hash: avoid indirect function calls

	uses arch_fast_hash2, which is an alternative_call function,
selecting between __jhash2 and __intel_crc4_2_hash based on the
X86_FEATURE_XMM4_2:

static inline u32 arch_fast_hash2(const u32 *data, u32 len, u32 seed)
{
        u32 hash;

        alternative_call(__jhash2, __intel_crc4_2_hash2, X86_FEATURE_XMM4_2,
#ifdef CONFIG_X86_64
                         "=a" (hash), "D" (data), "S" (len), "d" (seed));
#else
                         "=a" (hash), "a" (data), "d" (len), "c" (seed));
#endif
        return hash;
}

	This is panicing on a system without X86_FEATURE_XMM4_2.

	Reverting just the above commit does make the problem go away.

	It appears that the alternative_call itself is not calling
__jhash2 correctly:

0xffffffffa01a55dd <ovs_flow_tbl_insert+0xcd>:	sub    %ecx,%esi
0xffffffffa01a55df <ovs_flow_tbl_insert+0xcf>:	lea    0x38(%r8,%rax,1),%rdi
0xffffffffa01a55e4 <ovs_flow_tbl_insert+0xd4>:	sar    $0x2,%esi
0xffffffffa01a55e7 <ovs_flow_tbl_insert+0xd7>:	callq  0xffffffff813a75c0 <__jhash2>
0xffffffffa01a55ec <ovs_flow_tbl_insert+0xdc>:	mov    %eax,0x30(%r8)
0xffffffffa01a55f0 <ovs_flow_tbl_insert+0xe0>:	mov    (%rbx),%r13
0xffffffffa01a55f3 <ovs_flow_tbl_insert+0xe3>:	mov    %r8,%rsi
0xffffffffa01a55f6 <ovs_flow_tbl_insert+0xe6>:	mov    %r13,%rdi
0xffffffffa01a55f9 <ovs_flow_tbl_insert+0xe9>:	callq  0xffffffffa01a4ba0 <table_instance_insert>

	but __jhash2 clobbers %r8 (which is not saved), resulting in a
panic on the next instruction at ovs_flow_tbl_insert+0xdc:

[   17.762419] BUG: unable to handle kernel paging request at 00000000f6cc13e5
[   17.765456] IP: [<ffffffffa01a6bec>] ovs_flow_tbl_insert+0xdc/0x1f0 [openvswi
tch]
[   17.765456] PGD b18da067 PUD 0 
[   17.765456] Oops: 0002 [#1] SMP 
[   17.765456] Modules linked in: openvswitch libcrc32c i915 video drm_kms_helpe
r coretemp kvm_intel drm kvm gpio_ich ppdev parport_pc lpc_ich i2c_algo_bit lp s
erio_raw parport mac_hid hid_generic usbhid hid psmouse r8169 mii sky2
[   17.765456] CPU: 0 PID: 901 Comm: ovs-vswitchd Not tainted 3.18.0-rc2-nn-4d3c
9d37+ #19
[   17.765456] Hardware name: LENOVO 0829F3U/To be filled by O.E.M., BIOS 90KT15
AUS 07/21/2010
[   17.765456] task: ffff8800b07c9900 ti: ffff8800b1a04000 task.ti: ffff8800b1a0
4000
[   17.765456] RIP: 0010:[<ffffffffa01a6bec>]  [<ffffffffa01a6bec>] ovs_flow_tbl
_insert+0xdc/0x1f0 [openvswitch]
[   17.765456] RSP: 0018:ffff8800b1a07798  EFLAGS: 00010293
[   17.765456] RAX: 00000000e81d0094 RBX: ffff8800b27a0b20 RCX: 000000007aa02ddf
[   17.765456] RDX: 000000005e013969 RSI: 00000000290f109c RDI: ffff880138d501a4
[   17.765456] RBP: ffff8800b1a077e8 R08: 00000000f6cc13b5 R09: 00000000748df07f
[   17.765456] R10: ffffffffa01a6c96 R11: 0000000000000004 R12: ffff8800b27a0b28
[   17.765456] R13: ffff8800b1a07850 R14: ffff8800b27a0b28 R15: ffff8800a5a99c00
[   17.765456] FS:  00007fcd60b8d980(0000) GS:ffff88013fc00000(0000) knlGS:0000000000000000
[   17.765456] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   17.765456] CR2: 00000000f6cc13e5 CR3: 0000000031846000 CR4: 00000000000407f0
[   17.765456] Stack:
[   17.765456]  ffff880138d50000 ffff8800b1a07a70 ffff880138d50000 0000000000000000
[   17.765456]  ffff880138d501c0 ffff8800b1a07a70 ffff880138d50000 0000000000000000
[   17.765456]  0000000000000000 ffff8800b27a0b20 ffff8800b1a07a38 ffffffffa019e1fe
[   17.765456] Call Trace:
[   17.765456]  [<ffffffffa019e1fe>] ovs_flow_cmd_new+0x23e/0x3c0 [openvswitch]
[   17.765456]  [<ffffffff8165f3e5>] genl_family_rcv_msg+0x1a5/0x3c0

	The "have feature" function, __intel_crc4_2_hash2, does not
clobber %r8, and so the call does not panic on a system with
X86_FEATURE_XMM4_2, although I'm not sure if that's a deliberate
compiler action or just happenstance because __intel_crc4_2_hash2 uses
fewer registers than __jhash2.

	As I said above, reverting the commit in question does resolve
the problem, but it does appear that there is a problem in the compiler
or alternative_call system that is the real root cause.

	I've discussed this with Jesse Gross <jesse@nicira.com> and
Pravin Shelar <pshelar@nicira.com>, who don't see the problem, but I
suspect that's because they have newer cpus with X86_FEATURE_XMM4_2.
Jesse, Pravin, can you confirm whether or not your test systems have
this cpu feature (it's "sse4_2" in /proc/cpuinfo's flags)?

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply

* Re: [PATCH net-next] icmp: Remove some spurious dropped packet profile hits from the ICMP path
From: Eric Dumazet @ 2014-11-14  2:17 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev, davem
In-Reply-To: <20141113225457.A3E502900805@tardy>

On Thu, 2014-11-13 at 14:54 -0800, Rick Jones wrote:
> From: Rick Jones <rick.jones2@hp.com>
> 
> If icmp_rcv() has successfully processed the incoming ICMP datagram, we
> should use consume_skb() rather than kfree_skb() because a hit on the likes
> of perf -e skb:kfree_skb is not called-for.
> 
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
> 
> ---
> 
> A test system hit with a flood ping hits on perf top -e ksb:kfre_skb before
> the change and none after for the normal/success path.  The IPv6 path would
> be somewhat more ugly.  For the time being, just deal with the overlap on
> ping_rcv() between the two to avoid a possible double free of an skb.
> 
> diff --git a/include/net/ping.h b/include/net/ping.h
> index 026479b..f074060 100644
> --- a/include/net/ping.h
> +++ b/include/net/ping.h
> @@ -82,7 +82,7 @@ int  ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
>  int  ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
>  		     size_t len);
>  int  ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
> -void ping_rcv(struct sk_buff *skb);
> +bool ping_rcv(struct sk_buff *skb);
>  
>  #ifdef CONFIG_PROC_FS
>  struct ping_seq_afinfo {
> diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
> index 36b7bfa..b9f3653 100644
> --- a/net/ipv4/icmp.c
> +++ b/net/ipv4/icmp.c
> @@ -190,7 +190,7 @@ EXPORT_SYMBOL(icmp_err_convert);
>   */
>  
>  struct icmp_control {
> -	void (*handler)(struct sk_buff *skb);
> +	bool (*handler)(struct sk_buff *skb);
>  	short   error;		/* This ICMP is classed as an error message */
>  };
>  
> @@ -746,7 +746,7 @@ static bool icmp_tag_validation(int proto)
>   *	ICMP_PARAMETERPROB.
>   */
>  
> -static void icmp_unreach(struct sk_buff *skb)
> +static bool icmp_unreach(struct sk_buff *skb)
>  {
>  	const struct iphdr *iph;
>  	struct icmphdr *icmph;
> @@ -839,10 +839,11 @@ static void icmp_unreach(struct sk_buff *skb)
>  	icmp_socket_deliver(skb, info);
>  
>  out:
> -	return;
> +	return true;
>  out_err:
>  	ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
> -	goto out;
> +	kfree_skb(skb);
> +	return false;
>  }
>  
> 
> @@ -850,17 +851,22 @@ out_err:
>   *	Handle ICMP_REDIRECT.
>   */
>  
> -static void icmp_redirect(struct sk_buff *skb)
> +static bool icmp_redirect(struct sk_buff *skb)
>  {
>  	if (skb->len < sizeof(struct iphdr)) {
>  		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
> -		return;
> +		kfree_skb(skb);
> +		return false;
>  	}
>  
> -	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
> -		return;
> +	if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
> +		/* there aught to be a stat */
> +		kfree_skb(skb);
> +		return false;
> +	}
>  
>  	icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
> +	return true;
>  }
>  
>  /*
> @@ -875,7 +881,7 @@ static void icmp_redirect(struct sk_buff *skb)
>   *	See also WRT handling of options once they are done and working.
>   */
>  
> -static void icmp_echo(struct sk_buff *skb)
> +static bool icmp_echo(struct sk_buff *skb)
>  {
>  	struct net *net;
>  
> @@ -891,6 +897,8 @@ static void icmp_echo(struct sk_buff *skb)
>  		icmp_param.head_len	   = sizeof(struct icmphdr);
>  		icmp_reply(&icmp_param, skb);
>  	}
> +	/* should there be an ICMP stat for ignored echos? */
> +	return true;
>  }
>  
>  /*
> @@ -900,7 +908,7 @@ static void icmp_echo(struct sk_buff *skb)
>   *		  MUST be accurate to a few minutes.
>   *		  MUST be updated at least at 15Hz.
>   */
> -static void icmp_timestamp(struct sk_buff *skb)
> +static bool icmp_timestamp(struct sk_buff *skb)
>  {
>  	struct timespec tv;
>  	struct icmp_bxm icmp_param;
> @@ -927,15 +935,18 @@ static void icmp_timestamp(struct sk_buff *skb)
>  	icmp_param.data_len	   = 0;
>  	icmp_param.head_len	   = sizeof(struct icmphdr) + 12;
>  	icmp_reply(&icmp_param, skb);
> -out:
> -	return;
> +	return true;
> +
>  out_err:
>  	ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
> -	goto out;
> +	kfree_skb(skb);
> +	return false;
>  }
>  
> -static void icmp_discard(struct sk_buff *skb)
> +static bool icmp_discard(struct sk_buff *skb)
>  {
> +	/* pretend it was a success */
> +	return true;
>  }
>  
>  /*
> @@ -946,6 +957,7 @@ int icmp_rcv(struct sk_buff *skb)
>  	struct icmphdr *icmph;
>  	struct rtable *rt = skb_rtable(skb);
>  	struct net *net = dev_net(rt->dst.dev);
> +	bool success;
>  
>  	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
>  		struct sec_path *sp = skb_sec_path(skb);
> @@ -1012,7 +1024,12 @@ int icmp_rcv(struct sk_buff *skb)
>  		}
>  	}
>  
> -	icmp_pointers[icmph->type].handler(skb);
> +	success = icmp_pointers[icmph->type].handler(skb);
> +
> +	if (success) 
> +		consume_skb(skb);
> +
> +	return 0;


This looks quite complicated to me.

Why are you adding kfree_skb() everywhere instead of :

	bool to_consume = icmp_pointers[icmph->type].handler(skb);
	if (ro_consume)
		consume_skb(skb);
	else
		kfree_skb(skb);

>  
>  drop:
>  	kfree_skb(skb);

^ permalink raw reply

* Re: net-next panic in ovs call to arch_fast_hash2 since e5a2c899
From: David Miller @ 2014-11-14  2:45 UTC (permalink / raw)
  To: jay.vosburgh; +Cc: netdev, discuss, pshelar, ogerlitz
In-Reply-To: <12086.1415931332@famine>

From: Jay Vosburgh <jay.vosburgh@canonical.com>
Date: Thu, 13 Nov 2014 18:15:32 -0800

> 	The "have feature" function, __intel_crc4_2_hash2, does not
> clobber %r8, and so the call does not panic on a system with
> X86_FEATURE_XMM4_2, although I'm not sure if that's a deliberate
> compiler action or just happenstance because __intel_crc4_2_hash2 uses
> fewer registers than __jhash2.

Perhaps alternative calls can only be used with assembler routines
that use specific calling conventions, and they therefore generally
don't work with C functions?

^ permalink raw reply

* Re: net-next panic in ovs call to arch_fast_hash2 since e5a2c899
From: Jay Vosburgh @ 2014-11-14  5:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, discuss, pshelar, ogerlitz, Hannes Frederic Sowa
In-Reply-To: <20141113.214549.1520205472319716774.davem@davemloft.net>


	[ adding Hannes to Cc, which I should've done initially ]

David Miller <davem@davemloft.net> wrote:

>From: Jay Vosburgh <jay.vosburgh@canonical.com>
>Date: Thu, 13 Nov 2014 18:15:32 -0800
>
>> 	The "have feature" function, __intel_crc4_2_hash2, does not
>> clobber %r8, and so the call does not panic on a system with
>> X86_FEATURE_XMM4_2, although I'm not sure if that's a deliberate
>> compiler action or just happenstance because __intel_crc4_2_hash2 uses
>> fewer registers than __jhash2.
>
>Perhaps alternative calls can only be used with assembler routines
>that use specific calling conventions, and they therefore generally
>don't work with C functions?

	I don't know the answer to that, but a quick search suggests
that arch_fast_hash and arch_fast_hash2 (both added by commit e5a2c899)
may be the only cases of alternative calls that aren't supplying either
single instructions or assembly language functions.

	From looking at how the alternative calls are implemented (code
patching at boot or module load time from a table stored in a special
section of the object file), I'm skeptical that the compiler could know
what's the right thing to do.

	Hannes, can you shed any light on this?

	-J

---
	-Jay Vosburgh, jay.vosburgh@canonical.com

^ permalink raw reply

* RE: [PATCHv2 net 4/4] qlcnic: Implement ndo_gso_check()
From: Shahed Shaikh @ 2014-11-14  5:08 UTC (permalink / raw)
  To: Joe Stringer, netdev
  Cc: sathya.perla@emulex.com, amirv@mellanox.com,
	Dept-GE Linux NIC Dev, Tom Herbert (Partner - google),
	gerlitz.or@gmail.com, alexander.duyck@gmail.com, linux-kernel
In-Reply-To: <1415925495-59312-5-git-send-email-joestringer@nicira.com>

> -----Original Message-----
> From: Joe Stringer [mailto:joestringer@nicira.com]
> Sent: Friday, November 14, 2014 6:08 AM
> To: netdev
> Cc: sathya.perla@emulex.com; Shahed Shaikh; amirv@mellanox.com; Dept-
> GE Linux NIC Dev; Tom Herbert (Partner - google); gerlitz.or@gmail.com;
> alexander.duyck@gmail.com; linux-kernel
> Subject: [PATCHv2 net 4/4] qlcnic: Implement ndo_gso_check()
> 
> Use vxlan_gso_check() to advertise offload support for this NIC.
> 
> Signed-off-by: Joe Stringer <joestringer@nicira.com>
> ---
> v2: Refactor out vxlan helper.
> ---
>  drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c |    6 ++++++
>  1 file changed, 6 insertions(+)

Acked-by: Shahed Shaikh <shahed.shaikh@qlogic.com>

Thanks Joe.

-Shahed
> 
> diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
> b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
> index f5e29f7..a913b3a 100644
> --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
> +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
> @@ -503,6 +503,11 @@ static void qlcnic_del_vxlan_port(struct net_device
> *netdev,
> 
>  	adapter->flags |= QLCNIC_DEL_VXLAN_PORT;  }
> +
> +static bool qlcnic_gso_check(struct sk_buff *skb, struct net_device
> +*dev) {
> +	return vxlan_gso_check(skb);
> +}
>  #endif
> 
>  static const struct net_device_ops qlcnic_netdev_ops = { @@ -526,6 +531,7
> @@ static const struct net_device_ops qlcnic_netdev_ops = {  #ifdef
> CONFIG_QLCNIC_VXLAN
>  	.ndo_add_vxlan_port	= qlcnic_add_vxlan_port,
>  	.ndo_del_vxlan_port	= qlcnic_del_vxlan_port,
> +	.ndo_gso_check		= qlcnic_gso_check,
>  #endif
>  #ifdef CONFIG_NET_POLL_CONTROLLER
>  	.ndo_poll_controller = qlcnic_poll_controller,
> --
> 1.7.10.4

^ permalink raw reply

* RE: [PATCH net-next 2/2] r8152: adjust rtl_start_rx
From: Hayes Wang @ 2014-11-14  5:14 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, nic_swsd, linux-kernel@vger.kernel.org,
	linux-usb@vger.kernel.org
In-Reply-To: <20141113.162240.1823683928052355016.davem@davemloft.net>

David Miller [mailto:davem@davemloft.net] 
> Sent: Friday, November 14, 2014 5:23 AM
[...]
> What if even the first r8152_submit_rx() fails?  What ever will cause
> any of these retries to trigger at all?

According to the patch #1 "adjust r8152_submit_rx", the
r8152_submit_rx() would add the rx to the list and schedule
the tasklet, when the error occurs. Each time the tasklet is
called, the rx_bottom() would deal with all the rx in the
list. If the actual_length isn't vaild, the rx buffer would be
submitted directly. By this way, the retries would be done.
That is, the retries would be triggered when the tasklet
is called. Therefore, any tx, rx, and tasklet scheduling
would result in the retries.

> Second, why does your patch increment 'i' with 'i++;' in the error
> break path?  You should mark the first failed entry as unallocated
> with actual_length == 0 and place it on the rx_done queue.

Because the r8152_submit_rx() would add the failed rx to
the list, I only have to deal with the remaining ones. That
is why I increase the "i", otherwise the failed one would
be added twice.

I remember the usb_submit_urb() would set actual_length
to 0, so I skip the step. I would check it again.

Best Regards,
Hayes

^ permalink raw reply

* [PATCH] carl9170: Convert byte_rev_table uses to bitrev8
From: Wang, Yalin @ 2014-11-14  5:16 UTC (permalink / raw)
  To: 'chunkeey@googlemail.com',
	'linville@tuxdriver.com',
	'linux-wireless@vger.kernel.org',
	'netdev@vger.kernel.org',
	'linux-kernel@vger.kernel.org', 'joe@perches.com'

Use the inline function instead of directly indexing the array.

This allows some architectures with hardware instructions for bit
reversals to eliminate the array.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Yalin Wang <yalin.wang@sonymobile.com>
---
 drivers/net/wireless/ath/carl9170/phy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/carl9170/phy.c b/drivers/net/wireless/ath/carl9170/phy.c
index b80b213..dca6df1 100644
--- a/drivers/net/wireless/ath/carl9170/phy.c
+++ b/drivers/net/wireless/ath/carl9170/phy.c
@@ -994,7 +994,7 @@ static int carl9170_init_rf_bank4_pwr(struct ar9170 *ar, bool band5ghz,
 			refsel0 = 0;
 			refsel1 = 1;
 		}
-		chansel = byte_rev_table[chansel];
+		chansel = bitrev8(chansel);
 	} else {
 		if (freq == 2484) {
 			chansel = 10 + (freq - 2274) / 5;
@@ -1002,7 +1002,7 @@ static int carl9170_init_rf_bank4_pwr(struct ar9170 *ar, bool band5ghz,
 		} else
 			chansel = 16 + (freq - 2272) / 5;
 		chansel *= 4;
-		chansel = byte_rev_table[chansel];
+		chansel = bitrev8(chansel);
 	}
 
 	d1 =	chansel;
-- 
2.1.1

^ permalink raw reply related

* Re: [PATCH] carl9170: Convert byte_rev_table uses to bitrev8
From: Joe Perches @ 2014-11-14  5:32 UTC (permalink / raw)
  To: Wang, Yalin
  Cc: 'chunkeey@googlemail.com',
	'linville@tuxdriver.com',
	'linux-wireless@vger.kernel.org',
	'netdev@vger.kernel.org',
	'linux-kernel@vger.kernel.org'
In-Reply-To: <35FD53F367049845BC99AC72306C23D103E010D1829D@CNBJMBX05.corpusers.net>

On Fri, 2014-11-14 at 13:16 +0800, Wang, Yalin wrote:
> Use the inline function instead of directly indexing the array.
> 
> This allows some architectures with hardware instructions for bit
> reversals to eliminate the array.

This one is already in -next

commit 7a1283d8f5298437a454ec477384dcd9f9f88bac
Author: Joe Perches <joe@perches.com>
Date:   Tue Oct 28 14:18:58 2014 -0700

    carl9170: Convert byte_rev_table uses to bitrev8
    
    Use the inline function instead of directly indexing the array.
    
    This allows some architectures with hardware instructions
    for bit reversals to eliminate the array.
    
    Signed-off-by: Joe Perches <joe@perches.com>
    Signed-off-by: John W. Linville <linville@tuxdriver.com>

^ permalink raw reply

* RE: [PATCH] carl9170: Convert byte_rev_table uses to bitrev8
From: Wang, Yalin @ 2014-11-14  5:37 UTC (permalink / raw)
  To: 'Joe Perches'
  Cc: 'chunkeey@googlemail.com',
	'linville@tuxdriver.com',
	'linux-wireless@vger.kernel.org',
	'netdev@vger.kernel.org',
	'linux-kernel@vger.kernel.org'
In-Reply-To: <1415943164.5912.4.camel@perches.com>

> From: Joe Perches [mailto:joe@perches.com]
> Sent: Friday, November 14, 2014 1:33 PM
> To: Wang, Yalin
> Cc: 'chunkeey@googlemail.com'; 'linville@tuxdriver.com'; 'linux-
> wireless@vger.kernel.org'; 'netdev@vger.kernel.org'; 'linux-
> kernel@vger.kernel.org'
> Subject: Re: [PATCH] carl9170: Convert byte_rev_table uses to bitrev8
> 
> On Fri, 2014-11-14 at 13:16 +0800, Wang, Yalin wrote:
> > Use the inline function instead of directly indexing the array.
> >
> > This allows some architectures with hardware instructions for bit
> > reversals to eliminate the array.
> 
> This one is already in -next
> 
> commit 7a1283d8f5298437a454ec477384dcd9f9f88bac
> Author: Joe Perches <joe@perches.com>
> Date:   Tue Oct 28 14:18:58 2014 -0700
> 
>     carl9170: Convert byte_rev_table uses to bitrev8
> 
>     Use the inline function instead of directly indexing the array.
> 
>     This allows some architectures with hardware instructions
>     for bit reversals to eliminate the array.
> 
>     Signed-off-by: Joe Perches <joe@perches.com>
>     Signed-off-by: John W. Linville <linville@tuxdriver.com>
> 
Got it ,
So I need wait for your another patch to be accepted.

Thanks!

^ permalink raw reply

* Re: [PATCH net-next 1/1] ipvlan: Initial check-in of the IPVLAN driver.
From: Mahesh Bandewar @ 2014-11-14  5:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: netdev, Eric Dumazet, Maciej Zenczykowski, Laurent Chavey,
	Tim Hockin, David Miller, Brandon Philips, Pavel Emelianov
In-Reply-To: <CAADnVQJn4+0pCOeSu3oT0fvaJ=CtvD3wmczm4Jn6KjWgZoFrLQ@mail.gmail.com>

On Thu, Nov 13, 2014 at 3:25 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Tue, Nov 11, 2014 at 2:29 PM, Mahesh Bandewar <maheshb@google.com> wrote:
>> The device operates in two different modes and the difference
>> in these two modes in primarily in the TX side.
>>
>> (a) L2 mode : In this mode, the device behaves as a L2 device.
>> TX processing upto L2 happens on the stack of the virtual device
>> associated with (namespace). Packets are switched after that
>> into the main device (default-ns) and queued for xmit.
>>
>> RX processing is simple and all multicast, broadcast (if
>> applicable), and unicast belonging to the address(es) are
>> delivered to the virtual devices.
>>
>> (b) L3 mode : In this mode, the device behaves like a L3 device.
>> TX processing upto L3 happens on the stack of the virtual device
>> associated with (namespace). Packets are switched to the
>> main-device (default-ns) for the L2 processing. Hence the routing
>> table of the default-ns will be used in this mode.
>>
>> RX processins is somewhat similar to the L2 mode except that in
>> this mode only Unicast packets are delivered to the virtual device
>> while main-dev will handle all other packets.
>
> great stuff. would be interesting to see a 'typical use'
> scenario of l2 vs l3 mode. Why users would pick one
> or another?
> I can only think of different default ip in different ns
> would force l2. Anything else?
>
The primary difference is the ability to TX/RX multicast/broadcast as
well as control of routing in L2 mode while in L3 mode that belongs to
the default-ns and that means it can not be controlled from the client
namespace. L3 mode would be more restrictive of the two modes because
of that. Your use case would mostly define the mode to choose.

> Few comments:
>
>> +++ b/drivers/net/ipvlan/ipvlan.h
> ...
>> +#include <linux/kernel.h>
>> +#include <linux/types.h>
>> +#include <linux/module.h>
>> +#include <linux/init.h>
>> +#include <linux/errno.h>
>> +#include <linux/slab.h>
>> +#include <linux/string.h>
>> +#include <linux/rculist.h>
>> +#include <linux/notifier.h>
>> +#include <linux/netdevice.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/ethtool.h>
>> +#include <linux/if_arp.h>
>> +#include <linux/if_link.h>
>> +#include <linux/atomic.h>
>> +#include <linux/if_vlan.h>
>> +#include <linux/inet.h>
>> +#include <linux/hash.h>
>> +#include <linux/ip.h>
>> +#include <linux/inetdevice.h>
>> +#include <net/rtnetlink.h>
>> +#include <net/gre.h>
>> +#include <net/route.h>
>> +#include <net/addrconf.h>
>
> I don't think it's a good style to put all headers that all
> .c need into common .h
> Rather put them into individual .c
>
I don't know why it's wrong (also this a driver-private include and
not expecting anyone else to include) but I can definitely see few
advantages in this - (a) by including in the header file it's
available to all the .c files and does not have to specified
separately. (b) This means probably I can avoid some duplication
meaning less lines of include (c) even if I include some extra
definitions, there is no runtime cost that has to be paid since this
is sorted out during compile.

>> +static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type)
>> +{
>> +       void *lyr3h = NULL;
>> +
>> +       switch (skb->protocol) {
>> +       case htons(ETH_P_ARP): {
>> +               struct arphdr *arph;
>> +
>> +               if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
>> +                       return NULL;
>> +
>> +               arph = arp_hdr(skb);
>> +               *type = IPVL_ARP;
>> +               lyr3h = arph;
>> +               break;
>> +       }
> ...
>
>> +static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
>> +                                           void *lyr3h, int addr_type,
>> +                                           bool use_dest)
>> +{
>> +       struct ipvl_addr *addr = NULL;
>> +
>> +       if (addr_type == IPVL_IPV6) {
>> +               struct ipv6hdr *ip6h = NULL;
>> +               struct in6_addr *i6addr;
>> +
>> +               ip6h = (struct ipv6hdr *)lyr3h;
>> +               i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr;
>> +               addr = ipvlan_ht_addr_lookup(port, i6addr, true);
>
> imo it looks very artificial to split logically single
> lookup function into two: get() that returns 'type'/
> 'void * lyr3h' and lookup() that uses them.
> It feels error prone.
> Also everywhere lookup() follows get() immediately.
> I think single lookup() would be much cleaner.

I feel it's clean in the current form. One function is looking into
the packet / frame while the other one is dealing with the hash-table
and making one do both could be error prone. I guess it's the
perspective and probably no one is wrong!

^ permalink raw reply

* [PATCH net-next] openvswitch: Fix build failure.
From: Pravin B Shelar @ 2014-11-14  6:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, Pravin B Shelar

Add dependency on INET to fix following build error. I have also
fixed MPLS dependency.

ERROR: "ip_route_output_flow" [net/openvswitch/openvswitch.ko]
undefined!
make[1]: *** [__modpost] Error 1

Reported-by: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/Kconfig |    6 ++----
 1 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 454ce12..b7d818c 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -4,7 +4,9 @@
 
 config OPENVSWITCH
 	tristate "Open vSwitch"
+	depends on INET
 	select LIBCRC32C
+	select NET_MPLS_GSO
 	---help---
 	  Open vSwitch is a multilayer Ethernet switch targeted at virtualized
 	  environments.  In addition to supporting a variety of features
@@ -30,8 +32,6 @@ config OPENVSWITCH
 
 config OPENVSWITCH_GRE
 	tristate "Open vSwitch GRE tunneling support"
-	select NET_MPLS_GSO
-	depends on INET
 	depends on OPENVSWITCH
 	depends on NET_IPGRE_DEMUX
 	default OPENVSWITCH
@@ -45,7 +45,6 @@ config OPENVSWITCH_GRE
 
 config OPENVSWITCH_VXLAN
 	tristate "Open vSwitch VXLAN tunneling support"
-	depends on INET
 	depends on OPENVSWITCH
 	depends on VXLAN
 	default OPENVSWITCH
@@ -58,7 +57,6 @@ config OPENVSWITCH_VXLAN
 
 config OPENVSWITCH_GENEVE
 	tristate "Open vSwitch Geneve tunneling support"
-	depends on INET
 	depends on OPENVSWITCH
 	depends on GENEVE
 	default OPENVSWITCH
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH net-next] openvswitch: Fix build failure.
From: David Miller @ 2014-11-14  6:24 UTC (permalink / raw)
  To: pshelar; +Cc: netdev
In-Reply-To: <1415946090-1504-1-git-send-email-pshelar@nicira.com>

From: Pravin B Shelar <pshelar@nicira.com>
Date: Thu, 13 Nov 2014 22:21:30 -0800

> Add dependency on INET to fix following build error. I have also
> fixed MPLS dependency.
> 
> ERROR: "ip_route_output_flow" [net/openvswitch/openvswitch.ko]
> undefined!
> make[1]: *** [__modpost] Error 1
> 
> Reported-by: Jim Davis <jim.epost@gmail.com>
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

Applied, thanks Pravin.

^ permalink raw reply

* [PATCH net-next v1] ipvlan: Initial check-in of the IPVLAN driver.
From: Mahesh Bandewar @ 2014-11-14  6:29 UTC (permalink / raw)
  To: netdev
  Cc: Eric Dumazet, Maciej Zenczykowski, Laurent Chavey, Tim Hockin,
	David Miller, Brandon Philips, Pavel Emelianov, Mahesh Bandewar

This driver is very similar to the macvlan driver except that it
uses L3 on the frame to determine the logical interface while
functioning as packet dispatcher. It inherits L2 of the master
device hence the packets on wire will have the same L2 for all
the packets originating from all virtual devices off of the same
master device.

This driver was developed keeping the namespace use-case in
mind. Hence most of the examples given here take that as the
base setup where main-device belongs to the default-ns and
virtual devices are assigned to the additional namespaces.

The device operates in two different modes and the difference
in these two modes in primarily in the TX side.

(a) L2 mode : In this mode, the device behaves as a L2 device.
TX processing upto L2 happens on the stack of the virtual device
associated with (namespace). Packets are switched after that
into the main device (default-ns) and queued for xmit.

RX processing is simple and all multicast, broadcast (if
applicable), and unicast belonging to the address(es) are
delivered to the virtual devices.

(b) L3 mode : In this mode, the device behaves like a L3 device.
TX processing upto L3 happens on the stack of the virtual device
associated with (namespace). Packets are switched to the
main-device (default-ns) for the L2 processing. Hence the routing
table of the default-ns will be used in this mode.

RX processins is somewhat similar to the L2 mode except that in
this mode only Unicast packets are delivered to the virtual device
while main-dev will handle all other packets.

The devices can be added using the "ip" command from the iproute2
package -

	ip link add link <master> <virtual> type ipvlan mode [ l2 | l3 ]

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Laurent Chavey <chavey@google.com>
Cc: Tim Hockin <thockin@google.com>
Cc: Brandon Philips <brandon.philips@coreos.com>
Cc: Pavel Emelianov <xemul@parallels.com>
---
 Documentation/networking/ipvlan.txt | 114 +++++
 drivers/net/Kconfig                 |  18 +
 drivers/net/Makefile                |   1 +
 drivers/net/ipvlan/Makefile         |   7 +
 drivers/net/ipvlan/ipvlan.h         | 157 +++++++
 drivers/net/ipvlan/ipvlan_core.c    | 629 +++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_main.c    | 827 ++++++++++++++++++++++++++++++++++++
 drivers/net/ipvlan/ipvlan_sysfs.c   | 119 ++++++
 include/linux/netdevice.h           |   4 +
 include/uapi/linux/if_link.h        |  15 +
 10 files changed, 1891 insertions(+)
 create mode 100644 Documentation/networking/ipvlan.txt
 create mode 100644 drivers/net/ipvlan/Makefile
 create mode 100644 drivers/net/ipvlan/ipvlan.h
 create mode 100644 drivers/net/ipvlan/ipvlan_core.c
 create mode 100644 drivers/net/ipvlan/ipvlan_main.c
 create mode 100644 drivers/net/ipvlan/ipvlan_sysfs.c

diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt
new file mode 100644
index 000000000000..1f5d34cb4841
--- /dev/null
+++ b/Documentation/networking/ipvlan.txt
@@ -0,0 +1,114 @@
+
+                            IPVLAN Driver HOWTO
+
+Initial Release:
+	Mahesh Bandewar <maheshb AT google.com>
+
+1. Introduction:
+	This is conceptually very similar to the macvlan driver with one major
+exception of using L3 for mux-ing /demux-ing among slaves. This property makes
+the master device share the L2 with it's slave devices. I have developed this
+driver in conjuntion with network namespaces and not sure if there is use case
+outside of it.
+
+
+2. Building and Installation:
+	In order to build the driver, please select the config item CONFIG_IPVLAN.
+The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module
+(CONFIG_IPVLAN=m).
+
+
+3. Configuration:
+	There are no module parameters for this driver and it can be configured
+using IProute2/ip utility.
+
+	ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 }
+
+	e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
+
+
+4. Operating modes:
+	IPvlan has two modes of operation - L2 and L3. For a given master device,
+you can select one of these two modes and all slaves on that master will
+operate in the same (selected) mode. The RX mode is almost identical except
+that in L3 mode the slaves wont receive any multicast / broadcast traffic.
+L3 mode is more restrictive since routing is controlled from the other (mostly)
+default namespace.
+
+4.1 L2 mode:
+	In this mode TX processing happens on the stack instance attached to the
+slave device and packets are switched and queued to the master device to send
+out. In this mode the slaves will RX/TX multicast and broadcast (if applicable)
+as well.
+
+4.2 L3 mode:
+	In this mode TX processing upto L3 happens on the stack instance attached
+to the slave device and packets are switched to the stack instance of the
+master device for the L2 processing and routing from that instance will be
+used before packets are queued on the outbound device. In this mode the slaves
+will not receive nor can send multicast / broadcast traffic.
+
+
+5. Sysfs interface:
+	Currently the mode of operation is available at -
+		 /sys/class/net/<master>/ipvlan/mode
+The value can be 0 or 1; where 0 :=> L2, 1 := L3 mode
+
+
+6. What to choose (macvlan vs. ipvlan)?
+	These two devices are very similar in many regards and the specific use
+case could very well define which device to choose. if one of the following
+situations defines your use case then you can choose to use ipvlan -
+	(a) The Linux host that is connected to the external switch / router has
+policy configured that allows only one mac per port.
+	(b) No of virtual devices created on a master exceed the mac capacity and
+puts the NIC in promiscous mode and degraded performance is a concern.
+	(c) If the slave device is to be put into the hostile / untrusted network
+namespace where L2 on the slave could be changed / misused.
+
+
+7. Example configuration:
+
+  +=============================================================+
+  |  Host: host1                                                |
+  |                                                             |
+  |   +----------------------+      +----------------------+    |
+  |   |   NS:ns0             |      |  NS:ns1              |    |
+  |   |                      |      |                      |    |
+  |   |                      |      |                      |    |
+  |   |        ipvl0         |      |         ipvl1        |    |
+  |   +----------#-----------+      +-----------#----------+    |
+  |              #                              #               |
+  |              ################################               |
+  |                              # eth0                         |
+  +==============================#==============================+
+
+
+	(a) Create two network namespaces - ns0, ns1
+		ip netns add ns0
+		ip netns add ns1
+
+	(b) Create two ipvlan slaves on eth0 (master device)
+		ip link add link eth0 ipvl0 type ipvlan mode l2
+		ip link add link eth0 ipvl1 type ipvlan mode l2
+
+	(c) Assign slaves to the respective network namespaces
+		ip link set dev ipvl0 netns ns0
+		ip link set dev ipvl1 netns ns1
+
+	(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices
+		- For ns0
+			(1) ip netns exec ns0 bash
+			(2) ip link set dev ipvl0 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl0
+			(6) ip -4 route add default via $ROUTER dev ipvl0
+		- For ns1
+			(1) ip netns exec ns1 bash
+			(2) ip link set dev ipvl1 up
+			(3) ip link set dev lo up
+			(4) ip -4 addr add 127.0.0.1 dev lo
+			(5) ip -4 addr add $IPADDR dev ipvl1
+			(6) ip -4 route add default via $ROUTER dev ipvl1
+
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index f9009be3f307..b6d64f546574 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -145,6 +145,24 @@ config MACVTAP
 	  To compile this driver as a module, choose M here: the module
 	  will be called macvtap.
 
+
+config IPVLAN
+    tristate "IP-VLAN support"
+    ---help---
+      This allows one to create virtual devices off of a main interface
+      and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
+      on packets. All interfaces (including the main interface) share L2
+      making it transparent to the connected L2 switch.
+
+      Ipvlan devices can be added using the "ip" command from the
+      iproute2 package starting with the iproute2-X.Y.ZZ release:
+
+      "ip link add link <main-dev> [ NAME ] type ipvlan"
+
+      To compile this driver as a module, choose M here: the module
+      will be called ipvlan.
+
+
 config VXLAN
        tristate "Virtual eXtensible Local Area Network (VXLAN)"
        depends on INET
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 61aefdd1e173..e25fdd7d905e 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -6,6 +6,7 @@
 # Networking Core Drivers
 #
 obj-$(CONFIG_BONDING) += bonding/
+obj-$(CONFIG_IPVLAN) += ipvlan/
 obj-$(CONFIG_DUMMY) += dummy.o
 obj-$(CONFIG_EQUALIZER) += eql.o
 obj-$(CONFIG_IFB) += ifb.o
diff --git a/drivers/net/ipvlan/Makefile b/drivers/net/ipvlan/Makefile
new file mode 100644
index 000000000000..2efff4e9bb40
--- /dev/null
+++ b/drivers/net/ipvlan/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Ethernet Ipvlan driver
+#
+
+obj-$(CONFIG_IPVLAN) += ipvlan.o
+
+ipvlan-objs := ipvlan_core.o ipvlan_main.o ipvlan_sysfs.o
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
new file mode 100644
index 000000000000..78bb1ee9bc68
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+#ifndef __IPVLAN_H
+#define __IPVLAN_H
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rculist.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_arp.h>
+#include <linux/if_link.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/inetdevice.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+#include <net/route.h>
+#include <net/addrconf.h>
+
+#define IPVLAN_DRV	"ipvlan"
+#define IPV_DRV_VER	"0.1"
+
+#define IPVLAN_HASH_SIZE	(1 << BITS_PER_BYTE)
+#define IPVLAN_HASH_MASK	(IPVLAN_HASH_SIZE - 1)
+
+#define IPVLAN_MAC_FILTER_BITS	8
+#define IPVLAN_MAC_FILTER_SIZE	(1 << IPVLAN_MAC_FILTER_BITS)
+#define IPVLAN_MAC_FILTER_MASK	(IPVLAN_MAC_FILTER_SIZE - 1)
+
+/* Define IPVL_DEBUG and set the appropriate dbg_level for debugging. */
+#ifdef	IPVL_DEBUG
+/*
+ * 1 : non-datapath debugging
+ * 2 : Custom
+ * 3 : function enters and exists.
+ * 4 : printk in data path (be careful!)
+ */
+#define IPVL_DBG_LEVEL 1
+#define ipvlan_dbg(level, msg...)	do { \
+						if (level <= IPVL_DBG_LEVEL) \
+						printk(KERN_DEBUG msg); \
+					} while (0)
+#else
+#define ipvlan_dbg(level, msg...) do { ; } while (0)
+#endif
+
+typedef enum {
+	IPVL_IPV6 = 0,
+	IPVL_ICMPV6,
+	IPVL_IPV4,
+	IPVL_ARP,
+} ipvl_hdr_type;
+
+struct ipvl_pcpu_stats {
+	u64			rx_pkts;
+	u64			rx_bytes;
+	u64			rx_mcast;
+	u64			tx_pkts;
+	u64			tx_bytes;
+	struct u64_stats_sync	syncp;
+	u32			rx_errs;
+	u32			tx_drps;
+};
+
+/* Forward declaration */
+struct ipvl_port;
+
+struct ipvl_dev {
+	struct net_device	*dev;
+	struct list_head	pnode;
+	struct ipvl_port	*port;
+	struct net_device	*phy_dev;
+	struct list_head	addrs;
+	int			ipv4cnt;
+	int			ipv6cnt;
+	struct ipvl_pcpu_stats	*pcpu_stats;
+	DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
+	netdev_features_t	sfeatures;
+	u16			mtu_adj;
+};
+
+struct ipvl_addr {
+	struct ipvl_dev		*master; /* Back pointer to master */
+	union {
+		struct in6_addr	ip6;	 /* IPv6 address on logical interface */
+		struct in_addr	ip4;	 /* IPv4 address on logical interface */
+	} ipu;
+#define ip6addr	ipu.ip6
+#define ip4addr ipu.ip4
+	struct hlist_node	hlnode;  /* Hash-table linkage */
+	struct list_head	anode;   /* logical-interface linkage */
+	struct rcu_head		rcu;
+	ipvl_hdr_type		atype;
+};
+
+struct ipvl_port {
+	struct net_device	*dev;
+	struct hlist_head	hlhead[IPVLAN_HASH_SIZE];
+	struct list_head	ipvlans;
+	struct rcu_head		rcu;
+	int			count;
+	struct kobject		kobj;
+	u16			mode;
+};
+
+static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
+{
+	return rcu_dereference(d->rx_handler_data);
+}
+
+static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
+{
+	return rtnl_dereference(d->rx_handler_data);
+}
+
+static inline bool ipvlan_dev_master(struct net_device *d)
+{
+	return d->priv_flags & IFF_IPVLAN_MASTER;
+}
+
+static inline bool ipvlan_dev_slave(struct net_device *d)
+{
+	return d->priv_flags & IFF_IPVLAN_SLAVE;
+}
+
+/* ---- Prototype declarations ---- */
+/* ---- ipvlan_main.c ---- */
+void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev);
+void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval);
+
+/* ---- ipvlan_sysfs.c ---- */
+int ipvlan_add_per_master_sysfs_mode(struct ipvl_port *port,
+				     struct net_device *dev);
+void ipvlan_del_per_master_sysfs_mode(struct ipvl_port *port);
+
+/* ---- ipvlan_core.c ---- */
+void ipvlan_init_secret(void);
+unsigned int ipvlan_mac_hash(const unsigned char *addr);
+rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
+int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
+void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
+bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6);
+struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
+					const void *iaddr, bool is_v6);
+void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync);
+#endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
new file mode 100644
index 000000000000..24d5ccc96eae
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -0,0 +1,629 @@
+/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#include "ipvlan.h"
+
+static u32 ipvlan_jhash_secret;
+
+void ipvlan_init_secret(void)
+{
+	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
+}
+
+static void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
+			    unsigned int len, bool success, bool mcast)
+{
+	if (!ipvlan)
+		return;
+
+	if (likely(success)) {
+		struct ipvl_pcpu_stats *pcptr;
+
+		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
+		u64_stats_update_begin(&pcptr->syncp);
+		pcptr->rx_pkts++;
+		pcptr->rx_bytes += len;
+		if (mcast)
+			pcptr->rx_mcast++;
+		u64_stats_update_end(&pcptr->syncp);
+	} else {
+		this_cpu_inc(ipvlan->pcpu_stats->rx_errs);
+	}
+}
+
+static u8 ipvlan_get_v6_hash(const void *iaddr)
+{
+	const struct in6_addr *ip6_addr = iaddr;
+
+	return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret)
+	       & IPVLAN_HASH_MASK;
+}
+
+static u8 ipvlan_get_v4_hash(const void *iaddr)
+{
+	const struct in_addr *ip4_addr = iaddr;
+	return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret)
+	       & IPVLAN_HASH_MASK;
+}
+
+struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
+					const void *iaddr, bool is_v6)
+{
+	struct ipvl_addr *addr;
+	u8 hash = is_v6 ? ipvlan_get_v6_hash(iaddr) :
+			    ipvlan_get_v4_hash(iaddr);
+
+	hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) {
+		if (is_v6 && addr->atype == IPVL_IPV6 &&
+			ipv6_addr_equal(&addr->ip6addr, iaddr))
+			return addr;
+		else if (!is_v6 && addr->atype == IPVL_IPV4 &&
+			 addr->ip4addr.s_addr ==
+				((struct in_addr *)iaddr)->s_addr)
+			return addr;
+	}
+	return NULL;
+}
+
+void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
+{
+	struct ipvl_port *port = ipvlan->port;
+	u8 hash = (addr->atype == IPVL_IPV6) ?
+		ipvlan_get_v6_hash(&addr->ip6addr) :
+		ipvlan_get_v4_hash(&addr->ip4addr);
+
+	hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
+}
+
+void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync)
+{
+	hlist_del_rcu(&addr->hlnode);
+	if (sync)
+		synchronize_rcu();
+}
+
+bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
+{
+	struct ipvl_port *port = ipvlan->port;
+	struct ipvl_addr *addr;
+
+	list_for_each_entry(addr, &ipvlan->addrs, anode) {
+		if ((is_v6 && addr->atype == IPVL_IPV6 &&
+		     ipv6_addr_equal(&addr->ip6addr, iaddr))
+		   || (!is_v6 && addr->atype == IPVL_IPV4 &&
+		      addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr)
+		  )
+			return true;
+	}
+
+	if (ipvlan_ht_addr_lookup(port, iaddr, is_v6))
+		return true;
+
+	return false;
+}
+
+static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type)
+{
+	void *lyr3h = NULL;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_ARP): {
+		struct arphdr *arph;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*arph))))
+			return NULL;
+
+		arph = arp_hdr(skb);
+		*type = IPVL_ARP;
+		lyr3h = arph;
+		break;
+	}
+
+	case htons(ETH_P_IP): {
+		u32 pktlen;
+		struct iphdr *ip4h;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h))))
+			return NULL;
+
+		ip4h = ip_hdr(skb);
+		pktlen = ntohs(ip4h->tot_len);
+		if (ip4h->ihl < 5 || ip4h->version != 4)
+			return NULL;
+		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
+			return NULL;
+
+		*type = IPVL_IPV4;
+		lyr3h = ip4h;
+		break;
+	}
+	case htons(ETH_P_IPV6): {
+		struct ipv6hdr *ip6h;
+
+		if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h))))
+			return NULL;
+
+		ip6h = ipv6_hdr(skb);
+		if (ip6h->version != 6)
+			return NULL;
+
+		*type = IPVL_IPV6;
+		lyr3h = ip6h;
+		/* Only Neighbour Solicitation pkts need different treatment */
+		if (ipv6_addr_any(&ip6h->saddr) &&
+		    ip6h->nexthdr == NEXTHDR_ICMP) {
+			/* Get to the ICMPv6 header */
+			*type = IPVL_ICMPV6;
+			lyr3h = ip6h + 1;
+		}
+		break;
+	}
+	default:
+		return NULL;
+	}
+
+	return lyr3h;
+}
+
+unsigned int ipvlan_mac_hash(const unsigned char *addr)
+{
+	u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2),
+			       ipvlan_jhash_secret);
+	return hash & IPVLAN_MAC_FILTER_MASK;
+}
+
+static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb,
+				   const struct ipvl_dev *in_dev, bool local)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_dev *ipvlan = NULL;
+	struct sk_buff *nskb;
+	unsigned int len;
+	unsigned int mac_hash;
+	int ret;
+
+	/* If it's a PAUSE frame discard it! */
+	if (skb->protocol == htons(ETH_P_PAUSE))
+		return;
+
+	list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+		if (local && (ipvlan == in_dev))
+			continue;
+
+		mac_hash = ipvlan_mac_hash(eth->h_dest);
+		if (!test_bit(mac_hash, ipvlan->mac_filters))
+			continue;
+
+		ret = NET_RX_DROP;
+		len = skb->len + ETH_HLEN;
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb)
+			goto mcast_acct;
+
+		if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast))
+			nskb->pkt_type = PACKET_BROADCAST;
+		else
+			nskb->pkt_type = PACKET_MULTICAST;
+
+		nskb->dev = ipvlan->dev;
+		if (local)
+			ret = dev_forward_skb(ipvlan->dev, nskb);
+		else
+			ret = netif_rx(nskb);
+mcast_acct:
+		ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
+	}
+
+	/* Locally generated? ...Forward a copy to the main-device as
+	 * well. On the RX side we'll ignore it (wont give it to any
+	 * of the virtual devices.
+	 */
+	if (local) {
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (nskb) {
+			if (ether_addr_equal(eth->h_dest, port->dev->broadcast))
+				nskb->pkt_type = PACKET_BROADCAST;
+			else
+				nskb->pkt_type = PACKET_MULTICAST;
+
+			dev_forward_skb(port->dev, nskb);
+		}
+	}
+}
+
+static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
+			    bool local)
+{
+	struct ipvl_dev *ipvlan = addr->master;
+	struct net_device *dev = ipvlan->dev;
+	unsigned int len;
+	rx_handler_result_t ret = RX_HANDLER_CONSUMED;
+	bool success = false;
+
+	len = skb->len + ETH_HLEN;
+	if (unlikely(!(dev->flags & IFF_UP))) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+
+	skb->dev = dev;
+	skb->pkt_type = PACKET_HOST;
+
+	if (local) {
+		if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
+			success = true;
+	} else {
+		ret = RX_HANDLER_ANOTHER;
+		success = true;
+	}
+
+out:
+	ipvlan_count_rx(ipvlan, len, success, false);
+	return ret;
+}
+
+static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
+					    void *lyr3h, int addr_type,
+					    bool use_dest)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if (addr_type == IPVL_IPV6) {
+		struct ipv6hdr *ip6h = NULL;
+		struct in6_addr *i6addr;
+
+		ip6h = (struct ipv6hdr *)lyr3h;
+		i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr;
+		addr = ipvlan_ht_addr_lookup(port, i6addr, true);
+	} else if (addr_type == IPVL_ICMPV6) {
+		struct nd_msg *ndmh;
+		struct in6_addr *i6addr;
+		ndmh = (struct nd_msg *)lyr3h;
+
+		/* Make sure that the NeighborSolicitation ICMPv6 packets
+		 * are handled to avoid DAD issue.
+		 */
+		if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) {
+			/* Reach the target address */
+			i6addr = &ndmh->target;
+			addr = ipvlan_ht_addr_lookup(port, i6addr, true);
+		}
+	} else if (addr_type == IPVL_IPV4) {
+		struct iphdr *ip4h = NULL;
+		__be32 *i4addr;
+
+		ip4h = (struct iphdr *)lyr3h;
+		i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr;
+		addr = ipvlan_ht_addr_lookup(port, i4addr, false);
+	} else if (addr_type == IPVL_ARP) {
+		struct arphdr *arph = NULL;
+		unsigned char *arp_ptr;
+		__be32 dip;
+
+		arph = (struct arphdr *)lyr3h;
+		arp_ptr = (unsigned char *)(arph + 1);
+		if (use_dest)
+			/* Skip 2 L2 headers + 1 src L3 (IPv4) header */
+			arp_ptr += (2 * port->dev->addr_len) + 4;
+		else
+			/* Skip L2 header to get to src L3 (IPv4) */
+			arp_ptr += port->dev->addr_len;
+
+		memcpy(&dip, arp_ptr, 4); /* Get the dst IPv4 */
+		addr = ipvlan_ht_addr_lookup(port, &dip, false);
+	}
+
+	return addr;
+}
+
+static int ipvlan_process_v4_outbound(struct sk_buff *skb)
+{
+	const struct iphdr *ip4h = ip_hdr(skb);
+	struct net_device *dev = skb->dev;
+	struct rtable *rt;
+	int err, ret = NET_XMIT_DROP;
+	struct flowi4 fl4 = {
+		.flowi4_oif = dev->iflink,
+		.flowi4_tos = RT_TOS(ip4h->tos),
+		.flowi4_flags = FLOWI_FLAG_ANYSRC,
+		.daddr = ip4h->daddr,
+		.saddr = ip4h->saddr,
+	};
+
+	rt = ip_route_output_flow(dev_net(dev), &fl4, NULL);
+	if (IS_ERR(rt))
+		goto err;
+
+	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+		ip_rt_put(rt);
+		goto err;
+	}
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+	err = ip_local_out(skb);
+	if (unlikely(net_xmit_eval(err)))
+		dev->stats.tx_errors++;
+	else
+		ret = NET_XMIT_SUCCESS;
+	goto out;
+err:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+out:
+	return ret;
+}
+
+static int ipvlan_process_v6_outbound(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	struct net_device *dev = skb->dev;
+	struct dst_entry *dst;
+	int err, ret = NET_XMIT_DROP;
+	struct flowi6 fl6 = {
+		.flowi6_iif = skb->dev->ifindex,
+		.daddr = ip6h->daddr,
+		.saddr = ip6h->saddr,
+		.flowi6_flags = FLOWI_FLAG_ANYSRC,
+		.flowlabel = ip6_flowinfo(ip6h),
+		.flowi6_mark = skb->mark,
+		.flowi6_proto = ip6h->nexthdr,
+	};
+
+	dst = ip6_route_output(dev_net(dev), NULL, &fl6);
+	if (IS_ERR(dst))
+		goto err;
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+	err = ip6_local_out(skb);
+	if (unlikely(net_xmit_eval(err)))
+		dev->stats.tx_errors++;
+	else
+		ret = NET_XMIT_SUCCESS;
+	goto out;
+err:
+	dev->stats.tx_errors++;
+	kfree_skb(skb);
+out:
+	return ret;
+}
+
+static int ipvlan_process_outbound(struct sk_buff *skb,
+				   const struct ipvl_dev *ipvlan)
+{
+	struct ethhdr *ethh = eth_hdr(skb);
+	int ret = NET_XMIT_DROP;
+
+	/* In this mode we dont care about multicast and broadcast traffic */
+	if (is_multicast_ether_addr(ethh->h_dest)) {
+		pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
+				    ntohs(skb->protocol));
+		kfree_skb(skb);
+		goto out;
+	}
+
+	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
+	 * will have L2; which need to discarded and processed further
+	 * in the net-ns of the main-device.
+	 */
+	if (skb_mac_header_was_set(skb)) {
+		skb_pull(skb, sizeof(*ethh));
+		skb->mac_header = (typeof(skb->mac_header))~0U;
+		skb_reset_network_header(skb);
+	}
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		ret = ipvlan_process_v6_outbound(skb);
+	else if (skb->protocol == htons(ETH_P_IP))
+		ret = ipvlan_process_v4_outbound(skb);
+	else {
+		pr_warn_ratelimited("Dropped outbound packet type=%x\n",
+				    ntohs(skb->protocol));
+		kfree_skb(skb);
+	}
+out:
+	return ret;
+}
+
+static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	void *lyr3h = NULL;
+	struct ipvl_addr *addr = NULL;
+	int addr_type;
+
+	ipvlan_dbg(4, "L3:Xmit on dev %s,PROT=%x\n", dev->name,
+		   ntohs(skb->protocol));
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+	if (addr)
+		return ipvlan_rcv_frame(addr, skb, true);
+
+out:
+	/* Send it out */
+	skb->dev = ipvlan->phy_dev;
+	return ipvlan_process_outbound(skb, ipvlan);
+}
+
+static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_addr *addr = NULL;
+	void *lyr3h = NULL;
+	int addr_type;
+
+	ipvlan_dbg(4, "L2:Xmit on dev %s,PROT=%x\n", dev->name,
+		   ntohs(skb->protocol));
+	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
+		ipvlan_dbg(4, "Comm betn 2 virt devs PROT=%x\n",
+			   ntohs(skb->protocol));
+		if ((lyr3h = ipvlan_get_L3_hdr(skb, &addr_type)) == NULL)
+			goto to_default;
+
+		addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+		if (addr)
+			return ipvlan_rcv_frame(addr, skb, true);
+
+		/* No matching ipvlan dev! Must be on the Physical device */
+to_default:
+		skb = skb_share_check(skb, GFP_ATOMIC);
+		if (!skb)
+			return RX_HANDLER_CONSUMED;
+
+		/* Packet definitely does not belong to any of the
+		 * virtual devices, but the dest is local. So forward
+		 * the skb for the main-dev. At the RX side we just return
+		 * RX_PASS for it to be processed further on the stack.
+		 */
+		return dev_forward_skb(ipvlan->phy_dev, skb);
+
+	} else if (is_multicast_ether_addr(eth->h_dest)) {
+		u8 ip_summed = skb->ip_summed;
+		/* Packet needs to be multicast-ed. */
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		ipvlan_dbg(4, "%s[%d] Mcast Xmit on [%s], PROT=[%x]\n",
+			   __func__, __LINE__, dev->name,
+			   ntohs(skb->protocol));
+		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
+		skb->ip_summed = ip_summed;
+	}
+
+	/* Send it out */
+	skb->dev = ipvlan->phy_dev;
+	return dev_queue_xmit(skb);
+}
+
+int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev);
+
+	if (!port)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
+		goto out;
+
+	switch(port->mode) {
+	case IPVLAN_MODE_L2:
+		return ipvlan_xmit_mode_l2(skb, dev);
+	case IPVLAN_MODE_L3:
+		return ipvlan_xmit_mode_l3(skb, dev);
+	}
+
+	/* Should not reach here */
+	BUG();
+out:
+	return RX_HANDLER_ANOTHER;
+}
+
+static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port)
+{
+	struct ethhdr *eth = eth_hdr(skb);
+	struct ipvl_addr *addr = NULL;
+	void *lyr3h;
+	int addr_type;
+
+	if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) {
+		if ((lyr3h = ipvlan_get_L3_hdr(skb, &addr_type)) == NULL)
+			return true;
+
+		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false);
+		if (addr)
+			return false;
+	}
+
+	return true;
+}
+
+static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
+						 struct ipvl_port *port)
+{
+	void *lyr3h;
+	int addr_type;
+	struct ipvl_addr *addr = NULL;
+	struct sk_buff *skb = *pskb;
+	rx_handler_result_t ret = RX_HANDLER_PASS;
+
+	lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
+	if (!lyr3h)
+		goto out;
+
+	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+	if (addr) {
+		ipvlan_dbg(4, "%s[%d]L3:Ucast Recv for [%s], PROT=[%x]\n",
+			   __func__, __LINE__, addr->master->dev->name,
+			   ntohs(skb->protocol));
+		ret = ipvlan_rcv_frame(addr, skb, false);
+	}
+out:
+	return ret;
+}
+
+static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
+						 struct ipvl_port *port)
+{
+	struct sk_buff *skb = *pskb;
+	struct ethhdr *eth = eth_hdr(skb);
+	rx_handler_result_t ret = RX_HANDLER_PASS;
+	void *lyr3h;
+	int addr_type;
+
+	/* First Handle multi-cast frames */
+	if (is_multicast_ether_addr(eth->h_dest)) {
+		/* Pass to virtual devs only if they haven't seen the frame. */
+		if (ipvlan_external_frame(skb, port)) {
+			ipvlan_dbg(4, "%s[%d]L2:Mcast Recv:[%s], PROT=[%x]\n",
+				   __func__, __LINE__, port->dev->name,
+				   ntohs(skb->protocol));
+			ipvlan_multicast_frame(port, skb, NULL, false);
+		}
+	} else if ((lyr3h = ipvlan_get_L3_hdr(skb, &addr_type)) != NULL) {
+		struct ipvl_addr *addr;
+
+		addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
+		if (addr) {
+			ipvlan_dbg(4, "%s[%d]L2:Ucast Recv:[%s], PROT=[%x]\n",
+				   __func__, __LINE__, addr->master->dev->name,
+				   ntohs(skb->protocol));
+			ret = ipvlan_rcv_frame(addr, skb, false);
+		}
+	}
+
+	return ret;
+}
+
+rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);
+
+	if (!port)
+		goto out;
+
+	switch (port->mode) {
+	case IPVLAN_MODE_L2:
+		return ipvlan_handle_mode_l2(pskb, port);
+	case IPVLAN_MODE_L3:
+		return ipvlan_handle_mode_l3(pskb, port);
+	}
+
+	/* Should not reach here */
+	BUG();
+out:
+	return RX_HANDLER_PASS;
+}
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
new file mode 100644
index 000000000000..b4fd6e786316
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -0,0 +1,827 @@
+/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#include "ipvlan.h"
+
+void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
+{
+	ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
+}
+
+void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval)
+{
+	struct ipvl_dev *ipvlan;
+
+	if (port->mode != nval) {
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			if (nval == IPVLAN_MODE_L3)
+				ipvlan->dev->flags |= IFF_NOARP;
+			else
+				ipvlan->dev->flags &= ~IFF_NOARP;
+		}
+		port->mode = nval;
+	}
+}
+
+static int ipvlan_port_create(struct net_device *dev)
+{
+	struct ipvl_port *port;
+	int err, idx;
+
+	ipvlan_dbg(3, "%s[%d]: Entering...\n", __func__, __LINE__);
+	if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) {
+		pr_warn("%s[%d]: Returning -EINVAL...\n",
+			__func__, __LINE__);
+		return -EINVAL;
+	}
+	if ((port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL)) == NULL) {
+		pr_warn("%s[%d]: Returning -ENOMEM...\n",
+			__func__, __LINE__);
+		return -ENOMEM;
+	}
+	port->dev = dev;
+	port->mode = IPVLAN_MODE_L3;
+	INIT_LIST_HEAD(&port->ipvlans);
+	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
+		INIT_HLIST_HEAD(&port->hlhead[idx]);
+
+	err = ipvlan_add_per_master_sysfs_mode(port, dev);
+	if (err)
+		goto err;
+
+	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
+	if (err)
+		goto err;
+
+	dev->priv_flags |= IFF_IPVLAN_MASTER;
+	ipvlan_dbg(3, "%s[%d]: Returning (%d)...\n", __func__, __LINE__, err);
+	return 0;
+
+err:
+	kfree_rcu(port, rcu);
+	return err;
+}
+
+static void ipvlan_port_destroy(struct net_device *dev)
+{
+	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
+
+	dev->priv_flags &= ~IFF_IPVLAN_MASTER;
+	ipvlan_del_per_master_sysfs_mode(port);
+	netdev_rx_handler_unregister(dev);
+	kfree_rcu(port, rcu);
+}
+
+/* ipvlan network devices have devices nesting below it and are a special
+ * "super class" of normal network devices; split their locks off into a
+ * separate class since they always nest.
+ */
+static struct lock_class_key ipvlan_netdev_xmit_lock_key;
+static struct lock_class_key ipvlan_netdev_addr_lock_key;
+
+#define IPVLAN_FEATURES \
+	(NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
+	 NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \
+	 NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
+	 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
+
+#define IPVLAN_STATE_MASK \
+	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))
+
+static void ipvlan_set_lockdep_class_one(struct net_device *dev,
+					 struct netdev_queue *txq,
+					 void *_unused)
+{
+	lockdep_set_class(&txq->_xmit_lock, &ipvlan_netdev_xmit_lock_key);
+}
+
+static void ipvlan_set_lockdep_class(struct net_device *dev)
+{
+	lockdep_set_class(&dev->addr_list_lock, &ipvlan_netdev_addr_lock_key);
+	netdev_for_each_tx_queue(dev, ipvlan_set_lockdep_class_one, NULL);
+}
+
+/* ---- IPVLAN Netdev Ops ---- */
+static int ipvlan_init(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	const struct net_device *phy_dev = ipvlan->phy_dev;
+
+	dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
+		     (phy_dev->state & IPVLAN_STATE_MASK);
+	dev->features = phy_dev->features & IPVLAN_FEATURES;
+	dev->features |= NETIF_F_LLTX;
+	dev->gso_max_size = phy_dev->gso_max_size;
+	dev->iflink = phy_dev->ifindex;
+	dev->hard_header_len = phy_dev->hard_header_len;
+
+	ipvlan_set_lockdep_class(dev);
+
+	ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats);
+	if (!ipvlan->pcpu_stats)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void ipvlan_uninit(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan->port;
+
+	if (ipvlan->pcpu_stats)
+		free_percpu(ipvlan->pcpu_stats);
+
+	port->count -= 1;
+	if (!port->count)
+		ipvlan_port_destroy(port->dev);
+}
+
+static int ipvlan_open(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+	struct ipvl_addr *addr;
+
+	if (ipvlan->port->mode == IPVLAN_MODE_L3)
+		dev->flags |= IFF_NOARP;
+	else
+		dev->flags &= ~IFF_NOARP;
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry(addr, &ipvlan->addrs, anode) {
+			ipvlan_ht_addr_add(ipvlan, addr);
+		}
+	}
+	return dev_uc_add(phy_dev, phy_dev->dev_addr);
+}
+
+static int ipvlan_stop(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+	struct ipvl_addr *addr;
+
+	dev_uc_unsync(phy_dev, dev);
+	dev_mc_unsync(phy_dev, dev);
+
+	dev_uc_del(phy_dev, phy_dev->dev_addr);
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry(addr, &ipvlan->addrs, anode) {
+			ipvlan_ht_addr_del(addr, !dev->dismantle);
+		}
+	}
+	return 0;
+}
+
+netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	int skblen = skb->len;
+	int ret;
+
+	ret = ipvlan_queue_xmit(skb, dev);
+	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+		struct ipvl_pcpu_stats *pcptr;
+
+		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
+
+		u64_stats_update_begin(&pcptr->syncp);
+		pcptr->tx_pkts++;
+		pcptr->tx_bytes += skblen;
+		u64_stats_update_end(&pcptr->syncp);
+	} else {
+		this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
+	}
+	return ret;
+}
+
+static netdev_features_t ipvlan_fix_features(struct net_device *dev,
+					     netdev_features_t features)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES);
+}
+
+static void ipvlan_change_rx_flags(struct net_device *dev, int change)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	if (change & IFF_ALLMULTI)
+		dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1);
+}
+
+static void ipvlan_set_broadcast_mac_filter(struct ipvl_dev *ipvlan, bool set)
+{
+	struct net_device *dev = ipvlan->dev;
+	unsigned int hashbit = ipvlan_mac_hash(dev->broadcast);
+
+	if (set && !test_bit(hashbit, ipvlan->mac_filters)) {
+		/* Set broadcast hash-bit (for IPv4) */
+		__set_bit(hashbit, ipvlan->mac_filters);
+	} else if (!set && test_bit(hashbit, ipvlan->mac_filters)) {
+		/* Reset broadcast hash-bit */
+		__clear_bit(hashbit, ipvlan->mac_filters);
+	}
+}
+
+static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
+		bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE);
+	} else {
+		struct netdev_hw_addr *ha;
+		DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE);
+
+		bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE);
+		netdev_for_each_mc_addr(ha, dev) {
+			__set_bit(ipvlan_mac_hash(ha->addr), mc_filters);
+		}
+		bitmap_copy(ipvlan->mac_filters, mc_filters,
+			    IPVLAN_MAC_FILTER_SIZE);
+	}
+	dev_uc_sync(ipvlan->phy_dev, dev);
+	dev_mc_sync(ipvlan->phy_dev, dev);
+}
+
+static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev,
+						struct rtnl_link_stats64 *stats)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	if (ipvlan->pcpu_stats) {
+		struct ipvl_pcpu_stats *pcptr;
+		u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes;
+		u32 rx_errs = 0, tx_drps = 0;
+		u32 strt;
+		int idx;
+
+		for_each_possible_cpu(idx) {
+			pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx);
+			do {
+				strt= u64_stats_fetch_begin_irq(&pcptr->syncp);
+				rx_pkts = pcptr->rx_pkts;
+				rx_bytes = pcptr->rx_bytes;
+				rx_mcast = pcptr->rx_mcast;
+				tx_pkts = pcptr->tx_pkts;
+				tx_bytes = pcptr->tx_bytes;
+			} while(u64_stats_fetch_retry_irq(&pcptr->syncp, strt));
+
+			stats->rx_packets += rx_pkts;
+			stats->rx_bytes += rx_bytes;
+			stats->multicast += rx_mcast;
+			stats->tx_packets += tx_pkts;
+			stats->tx_bytes += tx_bytes;
+
+			/* u32 values are updated without syncp protection. */
+			rx_errs += pcptr->rx_errs;
+			tx_drps += pcptr->tx_drps;
+		}
+		stats->rx_errors = rx_errs;
+		stats->rx_dropped = rx_errs;
+		stats->tx_dropped = tx_drps;
+	}
+	return stats;
+}
+
+static int ipvlan_vlan_rx_add_vid(struct net_device *dev,
+				   __be16 proto, u16 vid)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	return vlan_vid_add(phy_dev, proto, vid);
+}
+
+static int ipvlan_vlan_rx_kill_vid(struct net_device *dev,
+				   __be16 proto, u16 vid)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	vlan_vid_del(phy_dev, proto, vid);
+	return 0;
+}
+
+static const struct net_device_ops ipvlan_netdev_ops = {
+	.ndo_init		= ipvlan_init,
+	.ndo_uninit		= ipvlan_uninit,
+	.ndo_open		= ipvlan_open,
+	.ndo_stop		= ipvlan_stop,
+	.ndo_start_xmit		= ipvlan_start_xmit,
+	.ndo_fix_features	= ipvlan_fix_features,
+	.ndo_change_rx_flags	= ipvlan_change_rx_flags,
+	.ndo_set_rx_mode	= ipvlan_set_multicast_mac_filter,
+	.ndo_get_stats64	= ipvlan_get_stats64,
+	.ndo_vlan_rx_add_vid	= ipvlan_vlan_rx_add_vid,
+	.ndo_vlan_rx_kill_vid	= ipvlan_vlan_rx_kill_vid,
+};
+
+/* ---- Ethernet Header Ops ---- */
+static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
+			      unsigned short type, const void *daddr,
+			      const void *saddr, unsigned len)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct net_device *phy_dev = ipvlan->phy_dev;
+
+	/* TODO Probably use a different field than dev_addr so that the
+	 * mac-address on the virtual device is portable and can be carried
+	 * while the packets use the mac-addr on the physical device.
+	 */
+	return dev_hard_header(skb, phy_dev, type, daddr,
+			       saddr ? : dev->dev_addr, len);
+}
+
+static const struct header_ops ipvlan_header_ops = {
+	.create  	= ipvlan_hard_header,
+	.rebuild	= eth_rebuild_header,
+	.parse		= eth_header_parse,
+	.cache		= eth_header_cache,
+	.cache_update	= eth_header_cache_update,
+};
+
+/* ---- Ethtool ops ---- */
+static int ipvlan_ethtool_get_settings(struct net_device *dev,
+				       struct ethtool_cmd *cmd)
+{
+	const struct ipvl_dev *ipvlan = netdev_priv(dev);
+	return __ethtool_get_settings(ipvlan->phy_dev, cmd);
+}
+
+static void ipvlan_ethtool_get_drvinfo(struct net_device *dev,
+				       struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver));
+	strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version));
+}
+
+static const struct ethtool_ops ipvlan_ethtool_ops = {
+	.get_link	= ethtool_op_get_link,
+	.get_settings	= ipvlan_ethtool_get_settings,
+	.get_drvinfo	= ipvlan_ethtool_get_drvinfo,
+};
+
+/* ---- Link-ops ---- */
+static int ipvlan_nl_changelink(struct net_device *dev,
+				struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+
+	if (data && data[IFLA_IPVLAN_MODE]) {
+		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+		ipvlan_set_port_mode(port, nmode);
+	}
+
+	return 0;
+}
+
+static size_t ipvlan_nl_getsize(const struct net_device *dev)
+{
+	return (0
+		+ nla_total_size(2) /* IFLA_IPVLAN_MODE */
+		);
+}
+
+static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+	if (data && data[IFLA_IPVLAN_MODE]) {
+		u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+
+		if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ipvlan_nl_fillinfo(struct sk_buff *skb,
+			      const struct net_device *dev)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
+	int ret = -EINVAL;
+
+	if (!port)
+		goto err;
+
+	ret = -EMSGSIZE;
+	if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode))
+		goto err;
+
+	return 0;
+
+err:
+	return ret;
+}
+
+static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
+			   struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_port *port;
+	struct net_device *phy_dev;
+	int err;
+
+	ipvlan_dbg(3, "%s[%d]: Entering...\n", __func__, __LINE__);
+	if (!tb[IFLA_LINK]) {
+		ipvlan_dbg(3, "%s[%d]: Returning -EINVAL...\n",
+			   __func__, __LINE__);
+		return -EINVAL;
+	}
+
+	phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+	if (phy_dev == NULL) {
+		ipvlan_dbg(3, "%s[%d]: Returning -ENODEV...\n",
+			   __func__, __LINE__);
+		return -ENODEV;
+	}
+
+	/* TODO will someone try creating ipvlan-dev on an ipvlan-virtual dev?*/
+	if (!ipvlan_dev_master(phy_dev)) {
+		err = ipvlan_port_create(phy_dev);
+		if (err < 0) {
+			ipvlan_dbg(3, "%s[%d]: Returning error (%d)...\n",
+				   __func__, __LINE__, err);
+			return err;
+		}
+	}
+
+	port = ipvlan_port_get_rtnl(phy_dev);
+	/* Get the mode if specified. */
+	if (data && data[IFLA_IPVLAN_MODE])
+		port->mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);
+
+	ipvlan->phy_dev = phy_dev;
+	ipvlan->dev = dev;
+	ipvlan->port = port;
+	ipvlan->sfeatures = IPVLAN_FEATURES;
+	INIT_LIST_HEAD(&ipvlan->addrs);
+	ipvlan->ipv4cnt = 0;
+	ipvlan->ipv6cnt = 0;
+
+	/* Probably put a random address here to be presented to the
+	 * world but keep using the physical-dev address for the outgoing
+	 * packets.
+	 */
+	memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN);
+
+	/* Mark this as a IPVLAN secondary device. */
+	dev->priv_flags |= IFF_IPVLAN_SLAVE;
+
+	port->count += 1;
+	err = register_netdevice(dev);
+	if (err < 0) {
+		ipvlan_dbg(3, "%s[%d]: Returning error...\n",
+			   __func__, __LINE__);
+		goto ipvlan_destroy_port;
+	}
+	err = netdev_upper_dev_link(phy_dev, dev);
+	if (err) {
+		ipvlan_dbg(3, "%s[%d]: Returning error (%d)\n",
+			   __func__, __LINE__, err);
+		goto ipvlan_destroy_port;
+	}
+
+	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
+	netif_stacked_transfer_operstate(phy_dev, dev);
+	ipvlan_dbg(3, "%s[%d]: Returning success...\n", __func__, __LINE__);
+	return 0;
+
+ipvlan_destroy_port:
+	port->count -= 1;
+	if (!port->count)
+		ipvlan_port_destroy(phy_dev);
+
+	ipvlan_dbg(3, "%s[%d]: Return (after Destroying Port)",
+		   __func__, __LINE__);
+	return err;
+}
+
+static void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
+{
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct ipvl_addr *addr, *next;
+
+	if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) {
+		list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) {
+			ipvlan_ht_addr_del(addr, !dev->dismantle);
+			list_del_rcu(&addr->anode);
+		}
+	}
+	list_del_rcu(&ipvlan->pnode);
+	unregister_netdevice_queue(dev, head);
+	netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
+}
+
+static void ipvlan_link_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+	dev->priv_flags |= IFF_UNICAST_FLT;
+	dev->netdev_ops = &ipvlan_netdev_ops;
+	dev->destructor = free_netdev;
+	dev->header_ops = &ipvlan_header_ops;
+	dev->ethtool_ops = &ipvlan_ethtool_ops;
+	dev->tx_queue_len = 0;
+}
+
+static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] =
+{
+	[IFLA_IPVLAN_MODE] = { .type = NLA_U16 },
+};
+
+static struct rtnl_link_ops ipvlan_link_ops = {
+	.kind		= "ipvlan",
+	.priv_size	= sizeof(struct ipvl_dev),
+
+	.get_size	= ipvlan_nl_getsize,
+	.policy		= ipvlan_nl_policy,
+	.validate	= ipvlan_nl_validate,
+	.fill_info	= ipvlan_nl_fillinfo,
+	.changelink	= ipvlan_nl_changelink,
+	.maxtype	= IFLA_IPVLAN_MAX,
+
+	.setup		= ipvlan_link_setup,
+	.newlink	= ipvlan_link_new,
+	.dellink	= ipvlan_link_delete,
+};
+
+int ipvlan_link_register(struct rtnl_link_ops *ops)
+{
+	return rtnl_link_register(ops);
+}
+
+/* ---- IPVLAN event handling ---- */
+static int ipvlan_device_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct ipvl_dev *ipvlan, *next;
+	struct ipvl_port *port;
+	LIST_HEAD(lst_kill);
+
+	if (!ipvlan_dev_master(dev))
+		return NOTIFY_DONE;
+
+	port = ipvlan_port_get_rtnl(dev);
+
+	switch (event) {
+	case NETDEV_CHANGE:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+			netif_stacked_transfer_operstate(ipvlan->phy_dev,
+							 ipvlan->dev);
+		break;
+
+	case NETDEV_UNREGISTER:
+		if (dev->reg_state != NETREG_UNREGISTERING)
+			break;
+
+		list_for_each_entry_safe(ipvlan, next, &port->ipvlans,
+					 pnode)
+			ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev,
+							    &lst_kill);
+		unregister_netdevice_many(&lst_kill);
+		break;
+
+	case NETDEV_FEAT_CHANGE:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
+			ipvlan->dev->gso_max_size = dev->gso_max_size;
+			netdev_features_change(ipvlan->dev);
+		}
+		break;
+
+	case NETDEV_CHANGEMTU:
+		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+			ipvlan_adjust_mtu(ipvlan, dev);
+		}
+		break;
+
+	case NETDEV_PRE_TYPE_CHANGE:
+		/* Forbid underlying device to change its type. */
+		return NOTIFY_BAD;
+	}
+	return NOTIFY_DONE;
+}
+
+static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if (ipvlan_addr_busy(ipvlan, ip6_addr, true)) {
+		pr_warn("%s[%d]: Failed IPv6=%x:%x:%x:%x address for %s intf\n",
+			__func__, __LINE__, ip6_addr->s6_addr32[0],
+			ip6_addr->s6_addr32[1], ip6_addr->s6_addr32[2],
+			ip6_addr->s6_addr32[3], ipvlan->dev->name);
+		return -EINVAL;
+	}
+	if ((addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL)) == NULL)
+		return -ENOMEM;
+
+	ipvlan_dbg(1, "%s[%d]: Adding IPv6=%x:%x:%x:%x address for %s intf\n",
+		   __func__, __LINE__, ip6_addr->s6_addr32[0],
+		   ip6_addr->s6_addr32[1], ip6_addr->s6_addr32[2],
+		   ip6_addr->s6_addr32[3], ipvlan->dev->name);
+	addr->master = ipvlan;
+	memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr));
+	addr->atype = IPVL_IPV6;
+	list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+	ipvlan->ipv6cnt++;
+	ipvlan_ht_addr_add(ipvlan, addr);
+
+	return 0;
+}
+
+static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if ((addr = ipvlan_ht_addr_lookup(ipvlan->port, ip6_addr, true)) ==NULL)
+		return;
+
+	ipvlan_dbg(1,
+		   "%s[%d]: Deleting IPv6=%x:%x:%x:%x address for %s intf.\n",
+		   __func__, __LINE__, ip6_addr->s6_addr32[0],
+		   ip6_addr->s6_addr32[1], ip6_addr->s6_addr32[2],
+		   ip6_addr->s6_addr32[3], ipvlan->dev->name);
+	/* Delete from the hash-table */
+	ipvlan_ht_addr_del(addr, true);
+	/* Delete from the logical's addr list */
+	list_del_rcu(&addr->anode);
+	ipvlan->ipv6cnt--;
+	WARN_ON(ipvlan->ipv6cnt < 0);
+	kfree_rcu(addr, rcu);
+
+	return;
+}
+
+static int ipvlan_addr6_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr;
+	struct net_device *dev = (struct net_device *)if6->idev->dev;
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+
+	ipvlan_dbg(3, "%s[%d]: Entering...\n", __func__, __LINE__);
+	if (!ipvlan_dev_slave(dev))
+		return NOTIFY_DONE;
+
+	if (!ipvlan || !ipvlan->port)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		if (ipvlan_add_addr6(ipvlan, &if6->addr))
+			return NOTIFY_BAD;
+		break;
+
+	case NETDEV_DOWN:
+		ipvlan_del_addr6(ipvlan, &if6->addr);
+		break;
+	}
+
+	ipvlan_dbg(3, "%s[%d]: Leaving...\n", __func__, __LINE__);
+	return NOTIFY_OK;
+}
+
+static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if (ipvlan_addr_busy(ipvlan, ip4_addr, false)) {
+		pr_warn("%s[%d]: Failed to add IPv4=%x on %s intf.\n",
+			__func__, __LINE__, ntohl(ip4_addr->s_addr),
+			   ipvlan->dev->name);
+		return -EINVAL;
+	}
+	if ((addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL)) == NULL)
+		return -ENOMEM;
+
+	ipvlan_dbg(1, "%s[%d]: Adding IPv4=%x address for %s intf.\n",
+		   __func__, __LINE__, ip4_addr->s_addr, ipvlan->dev->name);
+	addr->master = ipvlan;
+	memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr));
+	addr->atype = IPVL_IPV4;
+	list_add_tail_rcu(&addr->anode, &ipvlan->addrs);
+	ipvlan->ipv4cnt++;
+	ipvlan_ht_addr_add(ipvlan, addr);
+	ipvlan_set_broadcast_mac_filter(ipvlan, true);
+
+	return 0;
+}
+
+static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
+{
+	struct ipvl_addr *addr = NULL;
+
+	if ((addr= ipvlan_ht_addr_lookup(ipvlan->port, ip4_addr, false)) ==NULL)
+		return;
+
+	ipvlan_dbg(1, "%s[%d]: Deleting IPv4=%x address for %s intf.\n",
+		   __func__, __LINE__, ip4_addr->s_addr, ipvlan->dev->name);
+	/* Delete from the hash-table */
+	ipvlan_ht_addr_del(addr, true);
+	/* Delete from the logical's addr list */
+	list_del_rcu(&addr->anode);
+	ipvlan->ipv4cnt--;
+	WARN_ON(ipvlan->ipv4cnt < 0);
+	if (!ipvlan->ipv4cnt)
+	    ipvlan_set_broadcast_mac_filter(ipvlan, false);
+	kfree_rcu(addr, rcu);
+
+	return;
+}
+
+static int ipvlan_addr4_event(struct notifier_block *unused,
+			      unsigned long event, void *ptr)
+{
+	struct in_ifaddr *if4 = (struct in_ifaddr *)ptr;
+	struct net_device *dev = (struct net_device *)if4->ifa_dev->dev;
+	struct ipvl_dev *ipvlan = netdev_priv(dev);
+	struct in_addr ip4_addr;
+
+	ipvlan_dbg(3, "%s[%d]: Entering...\n", __func__, __LINE__);
+	if (!ipvlan_dev_slave(dev))
+		return NOTIFY_DONE;
+
+	if (!ipvlan || !ipvlan->port)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		ip4_addr.s_addr = if4->ifa_address;
+		if (ipvlan_add_addr4(ipvlan, &ip4_addr))
+			return NOTIFY_BAD;
+		break;
+
+	case NETDEV_DOWN:
+		ip4_addr.s_addr = if4->ifa_address;
+		ipvlan_del_addr4(ipvlan, &ip4_addr);
+		break;
+	}
+
+	ipvlan_dbg(3, "%s[%d]: Leaving...\n", __func__, __LINE__);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_addr4_event,
+};
+
+static struct notifier_block ipvlan_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_device_event,
+};
+
+static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = {
+	.notifier_call = ipvlan_addr6_event,
+};
+
+static int __init ipvlan_init_module(void)
+{
+	int err;
+
+	ipvlan_init_secret();
+	register_netdevice_notifier(&ipvlan_notifier_block);
+	register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+	register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+
+	err = ipvlan_link_register(&ipvlan_link_ops);
+	if (err < 0)
+		goto error;
+
+	return 0;
+error:
+	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+	unregister_netdevice_notifier(&ipvlan_notifier_block);
+	return err;
+}
+
+static void __exit ipvlan_cleanup_module(void)
+{
+	rtnl_link_unregister(&ipvlan_link_ops);
+	unregister_netdevice_notifier(&ipvlan_notifier_block);
+	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
+	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
+}
+
+module_init(ipvlan_init_module);
+module_exit(ipvlan_cleanup_module);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>");
+MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs");
+MODULE_ALIAS_RTNL_LINK("ipvlan");
diff --git a/drivers/net/ipvlan/ipvlan_sysfs.c b/drivers/net/ipvlan/ipvlan_sysfs.c
new file mode 100644
index 000000000000..ce0a6378d435
--- /dev/null
+++ b/drivers/net/ipvlan/ipvlan_sysfs.c
@@ -0,0 +1,119 @@
+/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#include "ipvlan.h"
+
+/* ---- SysFS entries ---- */
+#define port_of(ko)		container_of(ko, struct ipvl_port, kobj)
+#define ipvl_mode_attr_of(_a)	container_of(_a, struct ipvl_mode_attr, attr)
+
+
+/* -- For Master mode -- */
+struct ipvl_mode_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct ipvl_port *port, char *buf);
+	ssize_t (*store)(struct ipvl_port *port, const char *buf, size_t len);
+};
+
+static ssize_t ipvlan_show_mode(struct ipvl_port *port, char *buf)
+{
+	return sprintf(buf, "%hu\n", port->mode);
+}
+
+static ssize_t ipvlan_store_mode(struct ipvl_port *port,
+				 const char *buf, size_t count)
+{
+	int ret = count;
+	u16 nval;
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	if (!port) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sscanf(buf, "%hu", &nval) != 1) {
+		pr_warn("%s: no mode specified.\n", port->dev->name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (nval != 0 && nval != 1) {
+		pr_warn("%s: mode value can only be 0 or 1.\n",
+			   port->dev->name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ipvlan_set_port_mode(port, nval);
+
+out:
+	rtnl_unlock();
+	return ret;
+}
+
+static struct ipvl_mode_attr mode_attr =
+	__ATTR(mode, S_IRUGO | S_IWUSR, ipvlan_show_mode, ipvlan_store_mode);
+
+static struct attribute *ipvl_mode_attrs[] = {
+	&mode_attr.attr,
+	NULL
+};
+
+static ssize_t ipvlan_sysfs_show_mode(struct kobject *kobj,
+				      struct attribute *attr, char *buf)
+{
+	struct ipvl_mode_attr *attribute = ipvl_mode_attr_of(attr);
+	struct ipvl_port *port = port_of(kobj);
+
+	if (!attribute->show)
+		return -EIO;
+
+	return attribute->show(port, buf);
+}
+
+static ssize_t ipvlan_sysfs_store_mode(struct kobject *kobj,
+				       struct attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct ipvl_mode_attr *attribute = ipvl_mode_attr_of(attr);
+	struct ipvl_port *port = port_of(kobj);
+
+	if (!attribute->store)
+		return -EIO;
+
+	return attribute->store(port, buf, count);
+}
+
+static struct sysfs_ops ipvl_mode_sysfs_ops = {
+	.show  = ipvlan_sysfs_show_mode,
+	.store = ipvlan_sysfs_store_mode,
+};
+
+static struct kobj_type ipvl_master_ktype = {
+#ifdef CONFIG_SYSFS
+	.sysfs_ops = &ipvl_mode_sysfs_ops,
+#endif
+	.default_attrs = ipvl_mode_attrs,
+};
+
+int ipvlan_add_per_master_sysfs_mode(struct ipvl_port *port,
+				     struct net_device *dev)
+{
+	return kobject_init_and_add(&port->kobj, &ipvl_master_ktype,
+			&(dev->dev.kobj), "ipvlan");
+}
+
+void ipvlan_del_per_master_sysfs_mode(struct ipvl_port *port)
+{
+		kobject_put(&port->kobj);
+}
+/* ---- END SysFS entries ---- */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 888d5513fa4a..0b290c04a469 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1228,6 +1228,8 @@ enum netdev_priv_flags {
 	IFF_LIVE_ADDR_CHANGE		= 1<<20,
 	IFF_MACVLAN			= 1<<21,
 	IFF_XMIT_DST_RELEASE_PERM	= 1<<22,
+	IFF_IPVLAN_MASTER		= 1<<23,
+	IFF_IPVLAN_SLAVE		= 1<<24,
 };
 
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
@@ -1253,6 +1255,8 @@ enum netdev_priv_flags {
 #define IFF_LIVE_ADDR_CHANGE		IFF_LIVE_ADDR_CHANGE
 #define IFF_MACVLAN			IFF_MACVLAN
 #define IFF_XMIT_DST_RELEASE_PERM	IFF_XMIT_DST_RELEASE_PERM
+#define IFF_IPVLAN_MASTER		IFF_IPVLAN_MASTER
+#define IFF_IPVLAN_SLAVE		IFF_IPVLAN_SLAVE
 
 /**
  *	struct net_device - The DEVICE structure.
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 7072d8325016..36bddc233633 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -330,6 +330,21 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC	1
 
+/* IPVLAN section */
+enum {
+	IFLA_IPVLAN_UNSPEC,
+	IFLA_IPVLAN_MODE,
+	__IFLA_IPVLAN_MAX
+};
+
+#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1)
+
+enum ipvlan_mode {
+	IPVLAN_MODE_L2 = 0,
+	IPVLAN_MODE_L3,
+	IPVLAN_MODE_MAX
+};
+
 /* VXLAN section */
 enum {
 	IFLA_VXLAN_UNSPEC,
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* [PATCH Iproute2 next] ip link: Add ipvlan support to the iproute2/ip util
From: Mahesh Bandewar @ 2014-11-14  6:56 UTC (permalink / raw)
  To: netdev, Stephen Hemminger
  Cc: Eric Dumazet, Maciej Zenczykowski, Laurent Chavey, Tim Hockin,
	David Miller, Brandon Philips, Pavel Emelianov, Mahesh Bandewar

Adding basic support to create virtual devices using 'ip'
utility. Following is the syntax -

	ip link add link <master> <virtual> mode [ l2 | l3 ]
	e.g. ip link add link eth0 ipvl0 mode l3

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Laurent Chavey <chavey@google.com>
Cc: Tim Hockin <thockin@google.com>
Cc: Brandon Philips <brandon.philips@coreos.com>
Cc: Pavel Emelianov <xemul@parallels.com>
---
 include/linux/if_link.h | 14 ++++++++
 ip/Makefile             |  2 +-
 ip/iplink.c             |  2 +-
 ip/iplink_ipvlan.c      | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 ip/iplink_ipvlan.c

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 47320636361c..ef1e9f73fb15 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -325,6 +325,20 @@ enum macvlan_macaddr_mode {
 	MACVLAN_MACADDR_SET,
 };
 
+/* IPVLAN section */
+enum {
+	IFLA_IPVLAN_UNSPEC,
+	IFLA_IPVLAN_MODE,
+	__IFLA_IPVLAN_MAX,
+};
+
+#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1)
+
+enum ipvlan_mode {
+	IPVLAN_MODE_L2 = 0, /* Process packets all the way upto L2 */
+	IPVLAN_MODE_L3 = 1, /* Process Packets all the way upto L3 */
+};
+
 #define MACVLAN_FLAG_NOPROMISC	1
 
 /* VXLAN section */
diff --git a/ip/Makefile b/ip/Makefile
index fdc82f7286a0..01901bc8571c 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -6,7 +6,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \
     iplink_macvlan.o iplink_macvtap.o ipl2tp.o link_vti.o link_vti6.o \
     iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
     link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
-    iplink_bridge.o iplink_bridge_slave.o ipfou.o
+    iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink.c b/ip/iplink.c
index 43b26f4cea08..2fc7fa2b47c8 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -90,7 +90,7 @@ void iplink_usage(void)
 		fprintf(stderr, "TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n");
 		fprintf(stderr, "          bridge | bond | ipoib | ip6tnl | ipip | sit | vxlan |\n");
 		fprintf(stderr, "          gre | gretap | ip6gre | ip6gretap | vti | nlmon |\n");
-		fprintf(stderr, "          bond_slave }\n");
+		fprintf(stderr, "          bond_slave | ipvlan }\n");
 	}
 	exit(-1);
 }
diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c
new file mode 100644
index 000000000000..6712fdb92fd4
--- /dev/null
+++ b/ip/iplink_ipvlan.c
@@ -0,0 +1,85 @@
+/* iplink_ipvlan.c	IPVLAN device support
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Mahesh Bandewar <maheshb@google.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <linux/if_link.h>
+
+#include "rt_names.h"
+#include "utils.h"
+#include "ip_common.h"
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... ipvlan [ mode { l2 | l3 } ]\n");
+}
+
+static int mode_arg(void)
+{
+	fprintf(stderr, "Error: argument of \"mode\" must be either \"l2\", "
+		"or \"l3\"\n");
+        return -1;
+}
+
+static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
+			  struct nlmsghdr *n)
+{
+	while (argc > 0) {
+	    if (matches(*argv, "mode") == 0) {
+			__u16 mode = 0;
+			NEXT_ARG();
+
+			if (strcmp(*argv, "l2") == 0)
+				mode = IPVLAN_MODE_L2;
+			else if (strcmp(*argv, "l3") == 0)
+				mode = IPVLAN_MODE_L3;
+			else
+				mode_arg();
+
+			addattr16(n, 1024, IFLA_IPVLAN_MODE, mode);
+		} else if (matches(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "ipvlan: unknown option \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--, argv++;
+	}
+
+	return 0;
+}
+
+static void ipvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+
+	if (!tb)
+		return;
+
+	if (tb[IFLA_IPVLAN_MODE]) {
+		if (RTA_PAYLOAD(tb[IFLA_IPVLAN_MODE]) == sizeof(__u16)) {
+			__u16 mode = rta_getattr_u16(tb[IFLA_IPVLAN_MODE]);
+
+			fprintf(f, " mode %s ",
+				mode == IPVLAN_MODE_L2 ? "l2"
+				: mode == IPVLAN_MODE_L3 ? "l3" : "unknown");
+		}
+	}
+}
+
+struct link_util ipvlan_link_util = {
+	.id		= "ipvlan",
+	.maxattr	= IFLA_IPVLAN_MAX,
+	.parse_opt	= ipvlan_parse_opt,
+	.print_opt	= ipvlan_print_opt,
+};
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* Re: [PATCH net-next v1] ipvlan: Initial check-in of the IPVLAN driver.
From: Eric Dumazet @ 2014-11-14  7:16 UTC (permalink / raw)
  To: Mahesh Bandewar
  Cc: netdev, Eric Dumazet, Maciej Zenczykowski, Laurent Chavey,
	Tim Hockin, David Miller, Brandon Philips, Pavel Emelianov
In-Reply-To: <1415946547-25811-1-git-send-email-maheshb@google.com>

On Thu, 2014-11-13 at 22:29 -0800, Mahesh Bandewar wrote:

> +int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct ipvl_dev *ipvlan = netdev_priv(dev);
> +	struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev);
> +
> +	if (!port)
> +		goto out;
> +
> +	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
> +		goto out;
> +
> +	switch(port->mode) {
> +	case IPVLAN_MODE_L2:
> +		return ipvlan_xmit_mode_l2(skb, dev);
> +	case IPVLAN_MODE_L3:
> +		return ipvlan_xmit_mode_l3(skb, dev);
> +	}
> +
> +	/* Should not reach here */
> +	BUG();
> +out:
> +	return RX_HANDLER_ANOTHER;
> +}

RX_HANDLER_ANOTHER makes no sense here...

Also you are leaking skb.

kfree_skb(skb);
return NET_XMIT_DROP;

^ permalink raw reply

* Re: [PATCH net-next v1] ipvlan: Initial check-in of the IPVLAN driver.
From: Eric Dumazet @ 2014-11-14  7:19 UTC (permalink / raw)
  To: Mahesh Bandewar
  Cc: netdev, Eric Dumazet, Maciej Zenczykowski, Laurent Chavey,
	Tim Hockin, David Miller, Brandon Philips, Pavel Emelianov
In-Reply-To: <1415946547-25811-1-git-send-email-maheshb@google.com>

On Thu, 2014-11-13 at 22:29 -0800, Mahesh Bandewar wrote:

> +static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
> +{
> +	const struct ipvl_dev *ipvlan = netdev_priv(dev);
> +	struct ethhdr *eth = eth_hdr(skb);
> +	struct ipvl_addr *addr = NULL;
> +	void *lyr3h = NULL;
> +	int addr_type;
> +
> +	ipvlan_dbg(4, "L2:Xmit on dev %s,PROT=%x\n", dev->name,
> +		   ntohs(skb->protocol));
> +	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
> +		ipvlan_dbg(4, "Comm betn 2 virt devs PROT=%x\n",
> +			   ntohs(skb->protocol));
> +		if ((lyr3h = ipvlan_get_L3_hdr(skb, &addr_type)) == NULL)
> +			goto to_default;
> +
> +		addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
> +		if (addr)
> +			return ipvlan_rcv_frame(addr, skb, true);
> +
> +		/* No matching ipvlan dev! Must be on the Physical device */
> +to_default:
> +		skb = skb_share_check(skb, GFP_ATOMIC);
> +		if (!skb)
> +			return RX_HANDLER_CONSUMED;

Same problem here. RX_HANDLER_CONSUMED makes no sense.

> +
> +		/* Packet definitely does not belong to any of the
> +		 * virtual devices, but the dest is local. So forward
> +		 * the skb for the main-dev. At the RX side we just return
> +		 * RX_PASS for it to be processed further on the stack.
> +		 */
> +		return dev_forward_skb(ipvlan->phy_dev, skb);
> +
> +	} else if (is_multicast_ether_addr(eth->h_dest)) {
> +		u8 ip_summed = skb->ip_summed;
> +		/* Packet needs to be multicast-ed. */
> +		skb->ip_summed = CHECKSUM_UNNECESSARY;
> +		ipvlan_dbg(4, "%s[%d] Mcast Xmit on [%s], PROT=[%x]\n",
> +			   __func__, __LINE__, dev->name,
> +			   ntohs(skb->protocol));
> +		ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true);
> +		skb->ip_summed = ip_summed;
> +	}
> +
> +	/* Send it out */
> +	skb->dev = ipvlan->phy_dev;
> +	return dev_queue_xmit(skb);
> +}
> +

^ permalink raw reply

* Re: Device Tree Binding for Marvell DSA Switch on imx28 board over Mdio Interface
From: Oliver Graute @ 2014-11-14  7:39 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev
In-Reply-To: <54650E9C.7080708@gmail.com>

On Thu, Nov 13, 2014 at 9:03 PM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> On 11/13/2014 07:15 AM, Oliver Graute wrote:
>> Hello Florian,
>>
>> On Wed, Nov 12, 2014 at 8:19 PM, Florian Fainelli <f.fainelli@gmail.com> wrote:
>>> On 11/12/2014 05:07 AM, Oliver Graute wrote:
>>>> Hello,
>>>>
>>>> how do I specify the DSA node and the MDIO node in the Device Tree
>>>> Binding to integrate a Marvell 88e6071 switch with a imx28 board?
>>>>
>>>> On my board the Marvell switch 88e6071 is connected via phy1 (on a
>>>> imx28 PCB) to phy5 on the Marvell switch (on a Switch PCB). All phys
>>>> are connected via the same MDIO Bus.
>>>>
>>>> I enabled the Marvell DSA Support Driver, Gianfar Ethernet Driver and
>>>> Freescale PQ MDIO Driver in the Kernel (I' am not sure if this is the
>>>> right choice for imx28 fec ethernet controller is it?)
>>>>
>>
>> I changed my DeviceTree according to your proposal. Now I got a ENODEV 19
>> in dsa_of_probe. Because  of_find_device_by_node(ethernet) is returning 0.
>> Is my ethernet setting still wrong?
>
> Is your ethernet driver also modular? If so, you will need it to be
> loaded *before* dsa. of_find_device_by_node() also needs the ethernet
> driver to be a platform_driver.

No my Freescale FEC PHY driver is not a module. FEC is a imx28/arm
platform driver or not?

I loaded the DSA as a Kernel module to make sure that the DSA probing
is happening when the switch is really on. I enable the SWITCH ON Pin
on bootup with a systemd started script. Then I write some registers
on the switch with a userspace mii tool. This manually writing of some
switch registers works fine via the MII Bus using ioctl(SIOCGMIIPHY).

But i would like to integrate the switch with a full dsa driver.
currently its failing with dsa_of_probe returns=-19

> NB: I have a patch that looks up a net_device based on the struct
> device_node that might be better to use, since it makes no assumption
> about whether that is a platform_device/pci_device etc...

can you give me a link to this patch?

Best Regards,

Oliver

^ permalink raw reply

* Re: [PATCH net-next] net: introduce SO_INCOMING_CPU
From: Michael Kerrisk @ 2014-11-14  8:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, netdev, Ying Cai, Willem de Bruijn, Neal Cardwell,
	Linux API
In-Reply-To: <1415393472.13896.119.camel-XN9IlZ5yJG9HTL0Zs8A6p/gx64E7kk8eUsxypvmhUTTZJqsBc5GL+g@public.gmane.org>

Hi Eric,

Since this is an API change ( Documentation/SubmitChecklist),
linux-api@ should be CCed.

Thanks,

Michael



On Fri, Nov 7, 2014 at 9:51 PM, Eric Dumazet <eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> From: Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
>
> Alternative to RPS/RFS is to use hardware support for multi queue.
>
> Then split a set of million of sockets into worker threads, each
> one using epoll() to manage events on its own socket pool.
>
> Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
> know after accept() or connect() on which queue/cpu a socket is managed.
>
> We normally use one cpu per RX queue (IRQ smp_affinity being properly
> set), so remembering on socket structure which cpu delivered last packet
> is enough to solve the problem.
>
> After accept(), connect(), or even file descriptor passing around
> processes, applications can use :
>
>  int cpu;
>  socklen_t len = sizeof(cpu);
>
>  getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);
>
> And use this information to put the socket into the right silo
> for optimal performance, as all networking stack should run
> on the appropriate cpu, without need to send IPI (RPS/RFS).
>
> Signed-off-by: Eric Dumazet <edumazet-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
>  arch/alpha/include/uapi/asm/socket.h   |    2 ++
>  arch/avr32/include/uapi/asm/socket.h   |    2 ++
>  arch/cris/include/uapi/asm/socket.h    |    2 ++
>  arch/frv/include/uapi/asm/socket.h     |    2 ++
>  arch/ia64/include/uapi/asm/socket.h    |    2 ++
>  arch/m32r/include/uapi/asm/socket.h    |    2 ++
>  arch/mips/include/uapi/asm/socket.h    |    2 ++
>  arch/mn10300/include/uapi/asm/socket.h |    2 ++
>  arch/parisc/include/uapi/asm/socket.h  |    2 ++
>  arch/powerpc/include/uapi/asm/socket.h |    2 ++
>  arch/s390/include/uapi/asm/socket.h    |    2 ++
>  arch/sparc/include/uapi/asm/socket.h   |    2 ++
>  arch/xtensa/include/uapi/asm/socket.h  |    2 ++
>  include/net/sock.h                     |   12 ++++++++++++
>  include/uapi/asm-generic/socket.h      |    2 ++
>  net/core/sock.c                        |    5 +++++
>  net/ipv4/tcp_ipv4.c                    |    1 +
>  net/ipv4/udp.c                         |    1 +
>  net/ipv6/tcp_ipv6.c                    |    1 +
>  net/ipv6/udp.c                         |    1 +
>  net/sctp/ulpqueue.c                    |    5 +++--
>  21 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index 3de1394bcab821984674e89a3ee022cc6dd5f0f2..e2fe0700b3b442bffc1f606b1b8b0bb7759aa157 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h
> index 6e6cd159924b1855aa5f1811ad4e4c60b403c431..92121b0f5b989a61c008e0be24030725bab88e36 100644
> --- a/arch/avr32/include/uapi/asm/socket.h
> +++ b/arch/avr32/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI__ASM_AVR32_SOCKET_H */
> diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h
> index ed94e5ed0a238c2750e677ccb806a6bc0a94041a..60f60f5b9b35bd219d7a9834fe5394e8ac5fdbab 100644
> --- a/arch/cris/include/uapi/asm/socket.h
> +++ b/arch/cris/include/uapi/asm/socket.h
> @@ -82,6 +82,8 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
>
> diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h
> index ca2c6e6f31c6817780d31a246652adcc9847e373..2c6890209ea60c149bf097c2a1b369519cb8c301 100644
> --- a/arch/frv/include/uapi/asm/socket.h
> +++ b/arch/frv/include/uapi/asm/socket.h
> @@ -80,5 +80,7 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
>
> diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h
> index a1b49bac7951929127ed08db549218c2c16ccf89..09a93fb566f6c6c6fe29c10c95b931881843d1cd 100644
> --- a/arch/ia64/include/uapi/asm/socket.h
> +++ b/arch/ia64/include/uapi/asm/socket.h
> @@ -89,4 +89,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_IA64_SOCKET_H */
> diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h
> index 6c9a24b3aefa3a4f3048c17a7fa06d97b585ec14..e8589819c2743c6e112b15a245fc3ebd146e6313 100644
> --- a/arch/m32r/include/uapi/asm/socket.h
> +++ b/arch/m32r/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_M32R_SOCKET_H */
> diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
> index a14baa218c76f14de988ef106bdac5dadc48aceb..2e9ee8c55a103a0337d9f80f71fe9ef28be1154b 100644
> --- a/arch/mips/include/uapi/asm/socket.h
> +++ b/arch/mips/include/uapi/asm/socket.h
> @@ -98,4 +98,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h
> index 6aa3ce1854aa9523d46bc28851eddabd59edeb37..f3492e8c9f7009c33e07168df916f7337bef3929 100644
> --- a/arch/mn10300/include/uapi/asm/socket.h
> +++ b/arch/mn10300/include/uapi/asm/socket.h
> @@ -80,4 +80,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
> index fe35ceacf0e72cad69a43d9b1ce7b8f5ec3da98a..7984a1cab3da980f1f810827967b4b67616eb89b 100644
> --- a/arch/parisc/include/uapi/asm/socket.h
> +++ b/arch/parisc/include/uapi/asm/socket.h
> @@ -79,4 +79,6 @@
>
>  #define SO_BPF_EXTENSIONS      0x4029
>
> +#define SO_INCOMING_CPU                0x402A
> +
>  #endif /* _UAPI_ASM_SOCKET_H */
> diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h
> index a9c3e2e18c054a1e952fe33599401de57c6a6544..3474e4ef166df4a573773916b325d0fa9f3b45d0 100644
> --- a/arch/powerpc/include/uapi/asm/socket.h
> +++ b/arch/powerpc/include/uapi/asm/socket.h
> @@ -87,4 +87,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_POWERPC_SOCKET_H */
> diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h
> index e031332096d7c7b23b5953680289e8f3bcc3b378..8457636c33e1b67a9b7804daa05627839035a8fb 100644
> --- a/arch/s390/include/uapi/asm/socket.h
> +++ b/arch/s390/include/uapi/asm/socket.h
> @@ -86,4 +86,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _ASM_SOCKET_H */
> diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
> index 54d9608681b6947ae25dab008f808841d96125c0..4a8003a9416348006cfa85d5bcdf7553c8d23958 100644
> --- a/arch/sparc/include/uapi/asm/socket.h
> +++ b/arch/sparc/include/uapi/asm/socket.h
> @@ -76,6 +76,8 @@
>
>  #define SO_BPF_EXTENSIONS      0x0032
>
> +#define SO_INCOMING_CPU                0x0033
> +
>  /* Security levels - as per NRL IPv6 - don't actually do anything */
>  #define SO_SECURITY_AUTHENTICATION             0x5001
>  #define SO_SECURITY_ENCRYPTION_TRANSPORT       0x5002
> diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h
> index 39acec0cf0b1d500c1c40f9b523ef3a9a142c2f1..c46f6a696849c6f7f8a34b2cc522b48e04b17380 100644
> --- a/arch/xtensa/include/uapi/asm/socket.h
> +++ b/arch/xtensa/include/uapi/asm/socket.h
> @@ -91,4 +91,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* _XTENSA_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 6767d75ecb17693eb59a99b8218da4319854ccc0..7789b59c0c400eb99f65d1f0e03cd9773664cf93 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -273,6 +273,7 @@ struct cg_proto;
>    *    @sk_rcvtimeo: %SO_RCVTIMEO setting
>    *    @sk_sndtimeo: %SO_SNDTIMEO setting
>    *    @sk_rxhash: flow hash received from netif layer
> +  *    @sk_incoming_cpu: record cpu processing incoming packets
>    *    @sk_txhash: computed flow hash for use on transmit
>    *    @sk_filter: socket filtering instructions
>    *    @sk_protinfo: private area, net family specific, when not using slab
> @@ -350,6 +351,12 @@ struct sock {
>  #ifdef CONFIG_RPS
>         __u32                   sk_rxhash;
>  #endif
> +       u16                     sk_incoming_cpu;
> +       /* 16bit hole
> +        * Warned : sk_incoming_cpu can be set from softirq,
> +        * Do not use this hole without fully understanding possible issues.
> +        */
> +
>         __u32                   sk_txhash;
>  #ifdef CONFIG_NET_RX_BUSY_POLL
>         unsigned int            sk_napi_id;
> @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>         return sk->sk_backlog_rcv(sk, skb);
>  }
>
> +static inline void sk_incoming_cpu_update(struct sock *sk)
> +{
> +       sk->sk_incoming_cpu = raw_smp_processor_id();
> +}
> +
>  static inline void sock_rps_record_flow_hash(__u32 hash)
>  {
>  #ifdef CONFIG_RPS
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index ea0796bdcf88404ef0f127eb6e64ba00c16ea856..f541ccefd4acbeb4ad757be9dbf4b67f204bf21d 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -82,4 +82,6 @@
>
>  #define SO_BPF_EXTENSIONS      48
>
> +#define SO_INCOMING_CPU                49
> +
>  #endif /* __ASM_GENERIC_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index ac56dd06c306f3712e57ce8e4724c79565589499..0725cf0cb685787b2122606437da53299fb24621 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sk->sk_max_pacing_rate;
>                 break;
>
> +       case SO_INCOMING_CPU:
> +               v.val = sk->sk_incoming_cpu;
> +               break;
> +
>         default:
>                 return -ENOPROTOOPT;
>         }
> @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
>
>                 newsk->sk_err      = 0;
>                 newsk->sk_priority = 0;
> +               newsk->sk_incoming_cpu = raw_smp_processor_id();
>                 /*
>                  * Before updating sk_refcnt, we must commit prior changes to memory
>                  * (Documentation/RCU/rculist_nulls.txt for details)
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 9c7d7621466b1241f404a5ca11de809dcff2d02a..3893f51972f28271a6d27a763c05495c5c2554f7 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1662,6 +1662,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index df19027f44f3d6fbe13dec78d3b085968dbf2329..f52b6081158e87caa5df32e8e5d27dbf314a01b1 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (inet_sk(sk)->inet_daddr) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index ace29b60813cf8a1d7182ad2262cbcbd21810fa7..ac40d23204b5e55da5172c80dafd1d4854b370d5 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1455,6 +1455,7 @@ process:
>                 goto discard_and_relse;
>
>         sk_mark_napi_id(sk, skb);
> +       sk_incoming_cpu_update(sk);
>         skb->dev = NULL;
>
>         bh_lock_sock_nested(sk);
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 9b6809232b178c16d699ce3d152196b8c4cb096b..0125ca3daf47a4a3333e7462a11550d3e2f96875 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>         if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
>                 sock_rps_save_rxhash(sk, skb);
>                 sk_mark_napi_id(sk, skb);
> +               sk_incoming_cpu_update(sk);
>         }
>
>         rc = sock_queue_rcv_skb(sk, skb);
> diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
> index d49dc2ed30adb97a809eb37902b9956c366a2862..ce469d648ffbe166f9ae1c5650f481256f31a7f8 100644
> --- a/net/sctp/ulpqueue.c
> +++ b/net/sctp/ulpqueue.c
> @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
>         if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
>                 goto out_free;
>
> -       if (!sctp_ulpevent_is_notification(event))
> +       if (!sctp_ulpevent_is_notification(event)) {
>                 sk_mark_napi_id(sk, skb);
> -
> +               sk_incoming_cpu_update(sk);
> +       }
>         /* Check if the user wishes to receive this event.  */
>         if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
>                 goto out_free;
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Michael Kerrisk Linux man-pages maintainer;
http://www.kernel.org/doc/man-pages/
Author of "The Linux Programming Interface", http://blog.man7.org/

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox