* [PATCH bpf-next v4 2/4] bpf: sockmap, add hash map support
From: John Fastabend @ 2018-05-03 18:28 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev, John Fastabend
In-Reply-To: <1525372108-8690-1-git-send-email-john.fastabend@gmail.com>
Sockmap is currently backed by an array and enforces keys to be
four bytes. This works well for many use cases and was originally
modeled after devmap which also uses four bytes keys. However,
this has become limiting in larger use cases where a hash would
be more appropriate. For example users may want to use the 5-tuple
of the socket as the lookup key.
To support this add hash support.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
include/linux/bpf.h | 8 +
include/linux/bpf_types.h | 1 +
include/uapi/linux/bpf.h | 53 ++++-
kernel/bpf/core.c | 1 +
kernel/bpf/sockmap.c | 494 ++++++++++++++++++++++++++++++++++++++++++++--
kernel/bpf/verifier.c | 14 +-
net/core/filter.c | 58 ++++++
7 files changed, 611 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 38ebbc6..add768a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -661,6 +661,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
+struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
#else
static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
@@ -668,6 +669,12 @@ static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
return NULL;
}
+static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map,
+ void *key)
+{
+ return NULL;
+}
+
static inline int sock_map_prog(struct bpf_map *map,
struct bpf_prog *prog,
u32 type)
@@ -693,6 +700,7 @@ static inline int sock_map_prog(struct bpf_map *map,
extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
+extern const struct bpf_func_proto bpf_sock_hash_update_proto;
/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b28fcf..3101118 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -47,6 +47,7 @@
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
#endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da77a93..c2613c5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_SOCKHASH,
};
enum bpf_prog_type {
@@ -1767,6 +1768,53 @@ struct bpf_stack_build_id {
* **CONFIG_XFRM** configuration option.
* Return
* 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Add an entry to, or update a sockhash *map* referencing sockets.
+ * The *skops* is used as a new value for the entry associated to
+ * *key*. *flags* is one of:
+ *
+ * **BPF_NOEXIST**
+ * The entry for *key* must not exist in the map.
+ * **BPF_EXIST**
+ * The entry for *key* must already exist in the map.
+ * **BPF_ANY**
+ * No condition on the existence of the entry for *key*.
+ *
+ * If the *map* has eBPF programs (parser and verdict), those will
+ * be inherited by the socket being added. If the socket is
+ * already attached to eBPF programs, this results in an error.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * This helper is used in programs implementing policies at the
+ * socket level. If the message *msg* is allowed to pass (i.e. if
+ * the verdict eBPF program returns **SK_PASS**), redirect it to
+ * the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress path otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * This helper is used in programs implementing policies at the
+ * skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ * if the verdeict eBPF program returns **SK_PASS**), redirect it
+ * to the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -1835,7 +1883,10 @@ struct bpf_stack_build_id {
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
- FN(skb_get_xfrm_state),
+ FN(skb_get_xfrm_state), \
+ FN(sock_hash_update), \
+ FN(msg_redirect_hash), \
+ FN(sk_redirect_hash),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba03ec3..5917cc1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1782,6 +1782,7 @@ void bpf_user_rnd_init_once(void)
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 4eef5b1..306eb5d 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -60,6 +60,28 @@ struct bpf_stab {
struct bpf_sock_progs progs;
};
+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct bucket *buckets;
+ atomic_t count;
+ u32 n_buckets;
+ u32 elem_size;
+ struct bpf_sock_progs progs;
+};
+
+struct htab_elem {
+ struct rcu_head rcu;
+ struct hlist_node hash_node;
+ u32 hash;
+ struct sock *sk;
+ char key[0];
+};
+
enum smap_psock_state {
SMAP_TX_RUNNING,
};
@@ -67,6 +89,8 @@ enum smap_psock_state {
struct smap_psock_map_entry {
struct list_head list;
struct sock **entry;
+ struct htab_elem *hash_link;
+ struct bpf_htab *htab;
};
struct smap_psock {
@@ -195,6 +219,12 @@ static void bpf_tcp_release(struct sock *sk)
rcu_read_unlock();
}
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(l, rcu);
+}
+
static void bpf_tcp_close(struct sock *sk, long timeout)
{
void (*close_fun)(struct sock *sk, long timeout);
@@ -231,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
}
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- osk = cmpxchg(e->entry, sk, NULL);
- if (osk == sk) {
- list_del(&e->list);
- smap_release_sock(psock, sk);
+ if (e->entry) {
+ osk = cmpxchg(e->entry, sk, NULL);
+ if (osk == sk) {
+ list_del(&e->list);
+ smap_release_sock(psock, sk);
+ }
+ } else {
+ hlist_del_rcu(&e->hash_link->hash_node);
+ smap_release_sock(psock, e->hash_link->sk);
+ free_htab_elem(e->htab, e->hash_link);
}
}
write_unlock_bh(&sk->sk_callback_lock);
@@ -1526,12 +1562,14 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}
-static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
+static void smap_list_remove(struct smap_psock *psock,
+ struct sock **entry,
+ struct htab_elem *hash_link)
{
struct smap_psock_map_entry *e, *tmp;
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
- if (e->entry == entry) {
+ if (e->entry == entry || e->hash_link == hash_link) {
list_del(&e->list);
break;
}
@@ -1569,7 +1607,7 @@ static void sock_map_free(struct bpf_map *map)
* to be null and queued for garbage collection.
*/
if (likely(psock)) {
- smap_list_remove(psock, &stab->sock_map[i]);
+ smap_list_remove(psock, &stab->sock_map[i], NULL);
smap_release_sock(psock, sock);
}
write_unlock_bh(&sock->sk_callback_lock);
@@ -1628,7 +1666,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key)
if (psock->bpf_parse)
smap_stop_sock(psock, sock);
- smap_list_remove(psock, &stab->sock_map[k]);
+ smap_list_remove(psock, &stab->sock_map[k], NULL);
smap_release_sock(psock, sock);
out:
write_unlock_bh(&sock->sk_callback_lock);
@@ -1745,10 +1783,12 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
new = true;
}
- e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
- if (!e) {
- err = -ENOMEM;
- goto out_progs;
+ if (map_link) {
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e) {
+ err = -ENOMEM;
+ goto out_progs;
+ }
}
/* 3. At this point we have a reference to a valid psock that is
@@ -1782,6 +1822,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map,
write_unlock_bh(&sock->sk_callback_lock);
return err;
out_free:
+ kfree(e);
smap_release_sock(psock, sock);
out_progs:
if (verdict)
@@ -1828,7 +1869,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
struct smap_psock *opsock = smap_psock_sk(osock);
write_lock_bh(&osock->sk_callback_lock);
- smap_list_remove(opsock, &stab->sock_map[i]);
+ smap_list_remove(opsock, &stab->sock_map[i], NULL);
smap_release_sock(opsock, osock);
write_unlock_bh(&osock->sk_callback_lock);
}
@@ -1845,6 +1886,10 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
progs = &stab->progs;
+ } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
} else {
return -EINVAL;
}
@@ -1905,11 +1950,19 @@ static int sock_map_update_elem(struct bpf_map *map,
static void sock_map_release(struct bpf_map *map)
{
- struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
struct bpf_sock_progs *progs;
struct bpf_prog *orig;
- progs = &stab->progs;
+ if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ progs = &stab->progs;
+ } else {
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ progs = &htab->progs;
+ }
+
orig = xchg(&progs->bpf_parse, NULL);
if (orig)
bpf_prog_put(orig);
@@ -1922,6 +1975,390 @@ static void sock_map_release(struct bpf_map *map)
bpf_prog_put(orig);
}
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int i, err;
+ u64 cost;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->value_size != 4 ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ err = bpf_tcp_ulp_register();
+ if (err && err != -EEXIST)
+ return ERR_PTR(err);
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8);
+
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
+ goto free_htab;
+
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_htab;
+
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
+
+ err = -ENOMEM;
+ htab->buckets = bpf_map_area_alloc(
+ htab->n_buckets * sizeof(struct bucket),
+ htab->map.numa_node);
+ if (!htab->buckets)
+ goto free_htab;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* At this point no update, lookup or delete operations can happen.
+ * However, be aware we can still get a socket state event updates,
+ * and data ready callabacks that reference the psock from sk_user_data
+ * Also psock worker threads are still in-flight. So smap_release_sock
+ * will only free the psock after cancel_sync on the worker threads
+ * and a grace period expire to ensure psock is really safe to remove.
+ */
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get
+ * the sk_callback_lock before this case but after xchg
+ * causing the refcnt to hit zero and sock user data
+ * (psock) to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ kfree(l);
+ }
+ }
+ rcu_read_unlock();
+ bpf_map_area_free(htab->buckets);
+ kfree(htab);
+}
+
+static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
+ void *key, u32 key_size, u32 hash,
+ struct sock *sk,
+ struct htab_elem *old_elem)
+{
+ struct htab_elem *l_new;
+
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ if (!old_elem) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ }
+ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(l_new->key, key, key_size);
+ l_new->sk = sk;
+ l_new->hash = hash;
+ return l_new;
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+ u32 hash, void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node) {
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+ }
+
+ return NULL;
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map,
+ void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l, *next_l;
+ struct hlist_head *h;
+ u32 hash, key_size;
+ int i = 0;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+ hash = htab_map_hash(key, key_size);
+ h = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(h, hash, key, key_size);
+ if (!l)
+ goto find_first_elem;
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ h = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(h)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* iterated over all buckets and all elements */
+ return -ENOENT;
+}
+
+static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
+ struct bpf_map *map,
+ void *key, u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct bpf_sock_progs *progs = &htab->progs;
+ struct htab_elem *l_new = NULL, *l_old;
+ struct smap_psock_map_entry *e = NULL;
+ struct hlist_head *head;
+ struct smap_psock *psock;
+ u32 key_size, hash;
+ struct sock *sock;
+ struct bucket *b;
+ int err;
+
+ sock = skops->sk;
+
+ if (sock->sk_type != SOCK_STREAM ||
+ sock->sk_protocol != IPPROTO_TCP)
+ return -EOPNOTSUPP;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ return -EINVAL;
+
+ e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
+ if (!e)
+ return -ENOMEM;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key);
+ if (err)
+ goto err;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_bh(&b->lock);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+ if (l_old && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto bucket_err;
+ }
+ if (!l_old && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto bucket_err;
+ }
+
+ l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
+ if (IS_ERR(l_new)) {
+ err = PTR_ERR(l_new);
+ goto bucket_err;
+ }
+
+ psock = smap_psock_sk(sock);
+ if (unlikely(!psock)) {
+ err = -EINVAL;
+ goto bucket_err;
+ }
+
+ e->hash_link = l_new;
+ e->htab = container_of(map, struct bpf_htab, map);
+ list_add_tail(&e->list, &psock->maps);
+
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ psock = smap_psock_sk(l_old->sk);
+
+ hlist_del_rcu(&l_old->hash_node);
+ smap_list_remove(psock, NULL, l_old);
+ smap_release_sock(psock, l_old->sk);
+ free_htab_elem(htab, l_old);
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return 0;
+bucket_err:
+ raw_spin_unlock_bh(&b->lock);
+err:
+ kfree(e);
+ psock = smap_psock_sk(sock);
+ if (psock)
+ smap_release_sock(psock, sock);
+ return err;
+}
+
+static int sock_hash_update_elem(struct bpf_map *map,
+ void *key, void *value, u64 flags)
+{
+ struct bpf_sock_ops_kern skops;
+ u32 fd = *(u32 *)value;
+ struct socket *socket;
+ int err;
+
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ skops.sk = socket->sk;
+ if (!skops.sk) {
+ fput(socket->file);
+ return -EINVAL;
+ }
+
+ err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+ fput(socket->file);
+ return err;
+}
+
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (l) {
+ struct sock *sock = l->sk;
+ struct smap_psock *psock;
+
+ hlist_del_rcu(&l->hash_node);
+ write_lock_bh(&sock->sk_callback_lock);
+ psock = smap_psock_sk(sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, NULL, l);
+ smap_release_sock(psock, sock);
+ }
+ write_unlock_bh(&sock->sk_callback_lock);
+ free_htab_elem(htab, l);
+ ret = 0;
+ }
+ raw_spin_unlock_bh(&b->lock);
+ return ret;
+}
+
+struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 key_size, hash;
+ struct bucket *b;
+ struct sock *sk;
+
+ key_size = map->key_size;
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ raw_spin_lock_bh(&b->lock);
+ l = lookup_elem_raw(head, hash, key, key_size);
+ sk = l ? l->sk : NULL;
+ raw_spin_unlock_bh(&b->lock);
+ return sk;
+}
+
const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
@@ -1932,6 +2369,15 @@ static void sock_map_release(struct bpf_map *map)
.map_release_uref = sock_map_release,
};
+const struct bpf_map_ops sock_hash_ops = {
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_lookup_elem = sock_map_lookup,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_hash_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+};
+
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
struct bpf_map *, map, void *, key, u64, flags)
{
@@ -1949,3 +2395,21 @@ static void sock_map_release(struct bpf_map *map)
.arg3_type = ARG_PTR_TO_MAP_KEY,
.arg4_type = ARG_ANYTHING,
};
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596..cd3966d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2078,6 +2078,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_msg_redirect_map)
goto error;
break;
+ case BPF_MAP_TYPE_SOCKHASH:
+ if (func_id != BPF_FUNC_sk_redirect_hash &&
+ func_id != BPF_FUNC_sock_hash_update &&
+ func_id != BPF_FUNC_map_delete_elem &&
+ func_id != BPF_FUNC_msg_redirect_hash)
+ goto error;
+ break;
default:
break;
}
@@ -2114,11 +2121,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_sk_redirect_map:
case BPF_FUNC_msg_redirect_map:
+ case BPF_FUNC_sock_map_update:
if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
goto error;
break;
- case BPF_FUNC_sock_map_update:
- if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+ case BPF_FUNC_sk_redirect_hash:
+ case BPF_FUNC_msg_redirect_hash:
+ case BPF_FUNC_sock_hash_update:
+ if (map->map_type != BPF_MAP_TYPE_SOCKHASH)
goto error;
break;
default:
diff --git a/net/core/filter.c b/net/core/filter.c
index 5623dc8..4cde871 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1851,6 +1851,33 @@ int skb_do_redirect(struct sk_buff *skb)
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!tcb->bpf.sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
@@ -1885,6 +1912,31 @@ struct sock *do_sk_redirect_map(struct sk_buff *skb)
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = __sock_hash_lookup_elem(map, key);
+ if (!msg->sk_redir)
+ return SK_DROP;
+
+ return SK_PASS;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
struct bpf_map *, map, u32, key, u64, flags)
{
@@ -3987,6 +4039,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
return &bpf_sock_map_update_proto;
+ case BPF_FUNC_sock_hash_update:
+ return &bpf_sock_hash_update_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -3998,6 +4052,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
switch (func_id) {
case BPF_FUNC_msg_redirect_map:
return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_redirect_hash:
+ return &bpf_msg_redirect_hash_proto;
case BPF_FUNC_msg_apply_bytes:
return &bpf_msg_apply_bytes_proto;
case BPF_FUNC_msg_cork_bytes:
@@ -4029,6 +4085,8 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
return &bpf_get_socket_uid_proto;
case BPF_FUNC_sk_redirect_map:
return &bpf_sk_redirect_map_proto;
+ case BPF_FUNC_sk_redirect_hash:
+ return &bpf_sk_redirect_hash_proto;
default:
return bpf_base_func_proto(func_id);
}
--
1.9.1
^ permalink raw reply related
* [PATCH bpf-next v4 3/4] bpf: selftest additions for SOCKHASH
From: John Fastabend @ 2018-05-03 18:28 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev, John Fastabend
In-Reply-To: <1525372108-8690-1-git-send-email-john.fastabend@gmail.com>
This runs existing SOCKMAP tests with SOCKHASH map type. To do this
we push programs into include file and build two BPF programs. One
for SOCKHASH and one for SOCKMAP.
We then run the entire test suite with each type.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/include/uapi/linux/bpf.h | 6 ++++-
tools/testing/selftests/bpf/Makefile | 3 ++-
tools/testing/selftests/bpf/test_sockhash_kern.c | 4 ++++
tools/testing/selftests/bpf/test_sockmap.c | 27 ++++++++++++++++------
.../{test_sockmap_kern.c => test_sockmap_kern.h} | 10 ++++----
5 files changed, 36 insertions(+), 14 deletions(-)
create mode 100644 tools/testing/selftests/bpf/test_sockhash_kern.c
rename tools/testing/selftests/bpf/{test_sockmap_kern.c => test_sockmap_kern.h} (97%)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index da77a93..5cb983d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -116,6 +116,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_DEVMAP,
BPF_MAP_TYPE_SOCKMAP,
BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_SOCKHASH,
};
enum bpf_prog_type {
@@ -1835,7 +1836,10 @@ struct bpf_stack_build_id {
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
- FN(skb_get_xfrm_state),
+ FN(skb_get_xfrm_state), \
+ FN(sock_hash_update), \
+ FN(msg_redirect_hash), \
+ FN(sk_redirect_hash),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index b64a7a3..03f9bf3 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
- test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o
+ test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o test_tunnel_kern.o \
+ test_sockmap_kern.o test_sockhash_kern.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/test_sockhash_kern.c
new file mode 100644
index 0000000..3bf4ad4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockhash_kern.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 29c022d..df7afc7 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -47,7 +47,8 @@
#define S1_PORT 10000
#define S2_PORT 10001
-#define BPF_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKHASH_FILENAME "test_sockmap_kern.o"
#define CG_PATH "/sockmap"
/* global sockets */
@@ -1260,9 +1261,8 @@ static int test_start_end(int cgrp)
BPF_PROG_TYPE_SK_MSG,
};
-static int populate_progs(void)
+static int populate_progs(char *bpf_file)
{
- char *bpf_file = BPF_FILENAME;
struct bpf_program *prog;
struct bpf_object *obj;
int i = 0;
@@ -1306,11 +1306,11 @@ static int populate_progs(void)
return 0;
}
-static int test_suite(void)
+static int __test_suite(char *bpf_file)
{
int cg_fd, err;
- err = populate_progs();
+ err = populate_progs(bpf_file);
if (err < 0) {
fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
return err;
@@ -1347,17 +1347,30 @@ static int test_suite(void)
out:
printf("Summary: %i PASSED %i FAILED\n", passed, failed);
+ cleanup_cgroup_environment();
close(cg_fd);
return err;
}
+static int test_suite(void)
+{
+ int err;
+
+ err = __test_suite(BPF_SOCKMAP_FILENAME);
+ if (err)
+ goto out;
+ err = __test_suite(BPF_SOCKHASH_FILENAME);
+out:
+ return err;
+}
+
int main(int argc, char **argv)
{
struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
int iov_count = 1, length = 1024, rate = 1;
struct sockmap_options options = {0};
int opt, longindex, err, cg_fd = 0;
- char *bpf_file = BPF_FILENAME;
+ char *bpf_file = BPF_SOCKMAP_FILENAME;
int test = PING_PONG;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
@@ -1438,7 +1451,7 @@ int main(int argc, char **argv)
return -1;
}
- err = populate_progs();
+ err = populate_progs(bpf_file);
if (err) {
fprintf(stderr, "populate program: (%s) %s\n",
bpf_file, strerror(errno));
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/test_sockmap_kern.h
similarity index 97%
rename from tools/testing/selftests/bpf/test_sockmap_kern.c
rename to tools/testing/selftests/bpf/test_sockmap_kern.h
index 33de97e..2cc57a1 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.c
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -1,5 +1,5 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
#include <stddef.h>
#include <string.h>
#include <linux/bpf.h>
@@ -36,21 +36,21 @@
})
struct bpf_map_def SEC("maps") sock_map = {
- .type = BPF_MAP_TYPE_SOCKMAP,
+ .type = TEST_MAP_TYPE,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 20,
};
struct bpf_map_def SEC("maps") sock_map_txmsg = {
- .type = BPF_MAP_TYPE_SOCKMAP,
+ .type = TEST_MAP_TYPE,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 20,
};
struct bpf_map_def SEC("maps") sock_map_redir = {
- .type = BPF_MAP_TYPE_SOCKMAP,
+ .type = TEST_MAP_TYPE,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 20,
--
1.9.1
^ permalink raw reply related
* [PATCH bpf-next v4 4/4] bpf: bpftool, support for sockhash
From: John Fastabend @ 2018-05-03 18:28 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev, John Fastabend
In-Reply-To: <1525372108-8690-1-git-send-email-john.fastabend@gmail.com>
This adds the SOCKHASH map type to bpftools so that we get correct
pretty printing.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/bpf/bpftool/map.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index a6cdb64..4420b1a 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -67,6 +67,7 @@
[BPF_MAP_TYPE_DEVMAP] = "devmap",
[BPF_MAP_TYPE_SOCKMAP] = "sockmap",
[BPF_MAP_TYPE_CPUMAP] = "cpumap",
+ [BPF_MAP_TYPE_SOCKHASH] = "sockhash",
};
static unsigned int get_possible_cpus(void)
--
1.9.1
^ permalink raw reply related
* Re: [PATCH bpf-next v4 0/4] Hash support for sock
From: David Miller @ 2018-05-03 18:31 UTC (permalink / raw)
To: john.fastabend; +Cc: borkmann, ast, netdev
In-Reply-To: <1525372108-8690-1-git-send-email-john.fastabend@gmail.com>
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 3 May 2018 11:28:24 -0700
> In the original sockmap implementation we got away with using an
> array similar to devmap. However, unlike devmap where an ifindex
> has a nice 1:1 function into the map we have found some use cases
> with sockets that need to be referenced using longer keys.
>
> This series adds support for a sockhash map reusing as much of
> the sockmap code as possible. I made the decision to add sockhash
> specific helpers vs trying to generalize the existing helpers
> because (a) they have sockmap in the name and (b) the keys are
> different types. I prefer to be explicit here rather than play
> type games or do something else tricky.
>
> To test this we duplicate all the sockmap testing except swap out
> the sockmap with a sockhash.
>
> v2: fix file stats and add v2 tag
> v3: move tool updates into test patch, move bpftool updates into
> its own patch, and fixup the test patch stats to catch the
> renamed file and provide only diffs +/- on that.
> v4: Add documentation to UAPI bpf.h
Acked-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply
* Re: [PATCH net-next] net: stmmac: Add support for U32 TC filter using Flexible RX Parser
From: David Miller @ 2018-05-03 18:35 UTC (permalink / raw)
To: Jose.Abreu
Cc: netdev, Joao.Pinto, Vitor.Soares, peppe.cavallaro,
alexandre.torgue
In-Reply-To: <9f57f98ef35360619001882fdcf0d7ef0558863f.1525351447.git.joabreu@synopsys.com>
From: Jose Abreu <Jose.Abreu@synopsys.com>
Date: Thu, 3 May 2018 13:45:30 +0100
> +static int dwmac5_rxp_update_single_entry(void __iomem *ioaddr,
> + struct stmmac_tc_entry *entry, int pos)
Please follow the Linux networking coding style for function arguments
in function declarations and definitions.
Each second and subsequent line of the declaration/definition shall start
precisely at the column after the openning parenthesis of the first line.
Use the appropriate number of TAB then SPACE characters necessary to
achieve this.
Otherwise, this patch looks great.
Thanks.
^ permalink raw reply
* [PATCH rdma-next] MAINTAINERS: Remove bouncing @mellanox.com addresses
From: Leon Romanovsky @ 2018-05-03 18:37 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe; +Cc: Leon Romanovsky, RDMA mailing list, netdev
From: Leon Romanovsky <leonro@mellanox.com>
Delete non-existent @mellanox.com addresses from MAINTAINERS file.
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
MAINTAINERS | 6 ------
1 file changed, 6 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..1b52c7e7fc7d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5391,7 +5391,6 @@ S: Maintained
F: drivers/iommu/exynos-iommu.c
EZchip NPS platform support
-M: Elad Kanfi <eladkan@mellanox.com>
M: Vineet Gupta <vgupta@synopsys.com>
S: Supported
F: arch/arc/plat-eznps
@@ -9012,7 +9011,6 @@ Q: http://patchwork.ozlabs.org/project/netdev/list/
F: drivers/net/ethernet/mellanox/mlx5/core/en_*
MELLANOX ETHERNET INNOVA DRIVER
-M: Ilan Tayari <ilant@mellanox.com>
R: Boris Pismenny <borisp@mellanox.com>
L: netdev@vger.kernel.org
S: Supported
@@ -9022,7 +9020,6 @@ F: drivers/net/ethernet/mellanox/mlx5/core/fpga/*
F: include/linux/mlx5/mlx5_ifc_fpga.h
MELLANOX ETHERNET INNOVA IPSEC DRIVER
-M: Ilan Tayari <ilant@mellanox.com>
R: Boris Pismenny <borisp@mellanox.com>
L: netdev@vger.kernel.org
S: Supported
@@ -9078,7 +9075,6 @@ F: include/uapi/rdma/mlx4-abi.h
MELLANOX MLX5 core VPI driver
M: Saeed Mahameed <saeedm@mellanox.com>
-M: Matan Barak <matanb@mellanox.com>
M: Leon Romanovsky <leonro@mellanox.com>
L: netdev@vger.kernel.org
L: linux-rdma@vger.kernel.org
@@ -9089,7 +9085,6 @@ F: drivers/net/ethernet/mellanox/mlx5/core/
F: include/linux/mlx5/
MELLANOX MLX5 IB driver
-M: Matan Barak <matanb@mellanox.com>
M: Leon Romanovsky <leonro@mellanox.com>
L: linux-rdma@vger.kernel.org
W: http://www.mellanox.com
@@ -9821,7 +9816,6 @@ F: net/netfilter/xt_CONNSECMARK.c
F: net/netfilter/xt_SECMARK.c
NETWORKING [TLS]
-M: Ilya Lesokhin <ilyal@mellanox.com>
M: Aviad Yehezkel <aviadye@mellanox.com>
M: Dave Watson <davejwatson@fb.com>
L: netdev@vger.kernel.org
^ permalink raw reply related
* [PATCH] atm: zatm: Fix potential Spectre v1
From: Gustavo A. R. Silva @ 2018-05-03 18:17 UTC (permalink / raw)
To: Chas Williams, David S. Miller
Cc: linux-atm-general, netdev, linux-kernel, Gustavo A. R. Silva
pool can be indirectly controlled by user-space, hence leading to
a potential exploitation of the Spectre variant 1 vulnerability.
This issue was detected with the help of Smatch:
drivers/atm/zatm.c:1462 zatm_ioctl() warn: potential spectre issue
'zatm_dev->pool_info' (local cap)
Fix this by sanitizing pool before using it to index
zatm_dev->pool_info
Notice that given that speculation windows are large, the policy is
to kill the speculation on the first load and not worry if it can be
completed with a dependent load/store [1].
[1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2
Cc: stable@vger.kernel.org
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
drivers/atm/zatm.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 1ef67db..9c9a229 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -28,6 +28,7 @@
#include <asm/io.h>
#include <linux/atomic.h>
#include <linux/uaccess.h>
+#include <linux/nospec.h>
#include "uPD98401.h"
#include "uPD98402.h"
@@ -1458,6 +1459,8 @@ static int zatm_ioctl(struct atm_dev *dev,unsigned int cmd,void __user *arg)
return -EFAULT;
if (pool < 0 || pool > ZATM_LAST_POOL)
return -EINVAL;
+ pool = array_index_nospec(pool,
+ ZATM_LAST_POOL + 1);
spin_lock_irqsave(&zatm_dev->lock, flags);
info = zatm_dev->pool_info[pool];
if (cmd == ZATM_GETPOOLZ) {
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next v2 00/15] ARM: sun8i: r40: Add Ethernet support
From: David Miller @ 2018-05-03 18:40 UTC (permalink / raw)
To: maxime.ripard
Cc: wens, mturquette, sboyd, peppe.cavallaro, robh+dt, mark.rutland,
broonie, linux-arm-kernel, linux-clk, devicetree, netdev,
clabbe.montjoie, icenowy
In-Reply-To: <20180503131257.rlqxetafejikmnji@flea>
From: Maxime Ripard <maxime.ripard@bootlin.com>
Date: Thu, 3 May 2018 15:12:57 +0200
> Hi Dave,
>
> On Wed, May 02, 2018 at 11:06:17AM -0400, David Miller wrote:
>> From: Chen-Yu Tsai <wens@csie.org>
>> Date: Wed, 2 May 2018 00:33:45 +0800
>>
>> > I should've mentioned that patches 3 ~ 10, and only these, should go
>> > through net-next. sunxi will handle the remaining clk, device tree, and
>> > soc driver patches.
>>
>> Ok, I just noticed this.
>>
>> Why don't you just post those patches separately as a series on their
>> own then, in order to avoid confusion?
>>
>> Then you can adjust the patch series header posting to explain the
>> non-net-next changes, where they got merged, and what they provide
>> in order to faciliate the net-next changes.
>
> I now that we usually have some feedback from non-net maintainers that
> they actually prefer seeing the full picture (and I also tend to
> prefer that as well) and having all the patches relevant to enable a
> particular feature, even if it means getting multiple maintainers
> involved.
>
> Just to make sure we understood you fully, do you want Chen-Yu to
> resend his serie following your comments, or was that just a general
> remark for next time?
Yeah, good questions.
I think it can be argued either way. For review having the complete
context is important.
But from a maintainer's standpoint, when there is any ambiguity
whatsoever about what patches go into this tree or that, it is really
frowned upon and is quite error prone.
Also, that header posting is _SO_ important. It explains the series.
But for these 'partial apply' situations the header posting refers
to patches not in the series.
This looks terrible in the logs, when, as I do, the header posting
text is added to a marge commit for the series. People will read it
and say "where are all of these other changes mentioned in the text?
was this series misapplied?"
That's why, maybe after the review is successful, I want the actual
patch series standalone with appropriately updated header posting
text.
^ permalink raw reply
* KMSAN: uninit-value in strcmp
From: syzbot @ 2018-05-03 18:44 UTC (permalink / raw)
To: davem, jon.maloy, linux-kernel, netdev, syzkaller-bugs,
tipc-discussion, ying.xue
Hello,
syzbot found the following crash on:
HEAD commit: d2d741e5d189 kmsan: add initialization for shmem pages
git tree: https://github.com/google/kmsan.git/master
console output: https://syzkaller.appspot.com/x/log.txt?x=10051497800000
kernel config: https://syzkaller.appspot.com/x/.config?x=48f9de3384bcd0f
dashboard link: https://syzkaller.appspot.com/bug?extid=df0257c92ffd4fcc58cd
compiler: clang version 7.0.0 (trunk 329391)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=11275657800000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17c3d5e7800000
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+df0257c92ffd4fcc58cd@syzkaller.appspotmail.com
==================================================================
BUG: KMSAN: uninit-value in strcmp+0xf7/0x160 lib/string.c:329
CPU: 1 PID: 4527 Comm: syz-executor655 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
__msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
strcmp+0xf7/0x160 lib/string.c:329
tipc_nl_node_get_link+0x220/0x6f0 net/tipc/node.c:1881
genl_family_rcv_msg net/netlink/genetlink.c:599 [inline]
genl_rcv_msg+0x1686/0x1810 net/netlink/genetlink.c:624
netlink_rcv_skb+0x378/0x600 net/netlink/af_netlink.c:2447
genl_rcv+0x63/0x80 net/netlink/genetlink.c:635
netlink_unicast_kernel net/netlink/af_netlink.c:1311 [inline]
netlink_unicast+0x166b/0x1740 net/netlink/af_netlink.c:1337
netlink_sendmsg+0x1048/0x1310 net/netlink/af_netlink.c:1900
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg net/socket.c:640 [inline]
___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
__sys_sendmsg net/socket.c:2080 [inline]
SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091
SyS_sendmsg+0x54/0x80 net/socket.c:2087
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x445589
RSP: 002b:00007fb7ee66cdb8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000006dac24 RCX: 0000000000445589
RDX: 0000000000000000 RSI: 0000000020023000 RDI: 0000000000000003
RBP: 00000000006dac20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fffa2bf3f3f R14: 00007fb7ee66d9c0 R15: 0000000000000001
Uninit was created at:
kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
slab_post_alloc_hook mm/slab.h:445 [inline]
slab_alloc_node mm/slub.c:2737 [inline]
__kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
__kmalloc_reserve net/core/skbuff.c:138 [inline]
__alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
alloc_skb include/linux/skbuff.h:984 [inline]
netlink_alloc_large_skb net/netlink/af_netlink.c:1183 [inline]
netlink_sendmsg+0x9a6/0x1310 net/netlink/af_netlink.c:1875
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg net/socket.c:640 [inline]
___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
__sys_sendmsg net/socket.c:2080 [inline]
SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091
SyS_sendmsg+0x54/0x80 net/socket.c:2087
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
==================================================================
---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug
report.
Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* Re: [PATCH] net/mlx5: fix spelling mistake: "modfiy" -> "modify"
From: David Miller @ 2018-05-03 18:44 UTC (permalink / raw)
To: colin.king
Cc: saeedm, matanb, leon, netdev, linux-rdma, kernel-janitors,
linux-kernel
In-Reply-To: <20180503133503.14508-1-colin.king@canonical.com>
From: Colin King <colin.king@canonical.com>
Date: Thu, 3 May 2018 14:35:03 +0100
> From: Colin Ian King <colin.king@canonical.com>
>
> Trivial fix to spelling mistake in netdev_warn warning message
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
Saeed, please send this to me in your next pull request.
Thanks.
^ permalink raw reply
* [PATCH] net: atm: Fix potential Spectre v1
From: Gustavo A. R. Silva @ 2018-05-03 18:45 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, linux-kernel, Gustavo A. R. Silva
ioc_data.dev_num can be controlled by user-space, hence leading to
a potential exploitation of the Spectre variant 1 vulnerability.
This issue was detected with the help of Smatch:
net/atm/lec.c:702 lec_vcc_attach() warn: potential spectre issue
'dev_lec'
Fix this by sanitizing ioc_data.dev_num before using it to index
dev_lec. Also, notice that there is another instance in which array
dev_lec is being indexed using ioc_data.dev_num at line 705:
lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]),
Notice that given that speculation windows are large, the policy is
to kill the speculation on the first load and not worry if it can be
completed with a dependent load/store [1].
[1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2
Cc: stable@vger.kernel.org
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
net/atm/lec.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 01d5d20..3138a86 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -41,6 +41,9 @@ static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
#include <linux/module.h>
#include <linux/init.h>
+/* Hardening for Spectre-v1 */
+#include <linux/nospec.h>
+
#include "lec.h"
#include "lec_arpc.h"
#include "resources.h"
@@ -687,8 +690,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc));
if (bytes_left != 0)
pr_info("copy from user failed for %d bytes\n", bytes_left);
- if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF ||
- !dev_lec[ioc_data.dev_num])
+ if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF)
+ return -EINVAL;
+ ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF);
+ if (!dev_lec[ioc_data.dev_num])
return -EINVAL;
vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL);
if (!vpriv)
--
2.7.4
^ permalink raw reply related
* Re: [PATCH] qed: fix spelling mistake: "offloded" -> "offloaded"
From: David Miller @ 2018-05-03 18:46 UTC (permalink / raw)
To: colin.king
Cc: Ariel.Elior, everest-linux-l2, netdev, kernel-janitors,
linux-kernel
In-Reply-To: <20180503151932.22458-1-colin.king@canonical.com>
From: Colin King <colin.king@canonical.com>
Date: Thu, 3 May 2018 16:19:32 +0100
> From: Colin Ian King <colin.king@canonical.com>
>
> Trivial fix to spelling mistake in DP_NOTICE message
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
Applied.
^ permalink raw reply
* Re: [PATCH net 0/3] net/smc: fixes 2018/05/03
From: David Miller @ 2018-05-03 18:47 UTC (permalink / raw)
To: ubraun; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl
In-Reply-To: <20180503155739.55616-1-ubraun@linux.ibm.com>
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Thu, 3 May 2018 17:57:36 +0200
> From: Ursula Braun <ursula.braun@de.ibm.com>
>
> Dave,
>
> here are smc fixes for 2 problems:
> * receive buffers in SMC must be registered. If registration fails
> these buffers must not be kept within the link group for reuse.
> Patch 1 is a preparational patch; patch 2 contains the fix.
> * sendpage: do not hold the sock lock when calling kernel_sendpage()
> or sock_no_sendpage()
Series applied, thanks.
^ permalink raw reply
* Re: [PATCH] atm: zatm: Fix potential Spectre v1
From: Randy Dunlap @ 2018-05-03 19:09 UTC (permalink / raw)
To: Gustavo A. R. Silva, Chas Williams, David S. Miller
Cc: linux-atm-general, netdev, linux-kernel
In-Reply-To: <20180503181712.GA7443@embeddedor.com>
On 05/03/2018 11:17 AM, Gustavo A. R. Silva wrote:
> pool can be indirectly controlled by user-space, hence leading to
> a potential exploitation of the Spectre variant 1 vulnerability.
>
> This issue was detected with the help of Smatch:
>
> drivers/atm/zatm.c:1462 zatm_ioctl() warn: potential spectre issue
> 'zatm_dev->pool_info' (local cap)
>
> Fix this by sanitizing pool before using it to index
> zatm_dev->pool_info
>
> Notice that given that speculation windows are large, the policy is
> to kill the speculation on the first load and not worry if it can be
> completed with a dependent load/store [1].
>
> [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2
Hi,
Just for (my) info: all of these types of patches are to prevent
what is loaded in cache when the index is out of range, right?
Not some random pool_info[random], but pool_info[valid, i.e., 0].
Since the value of pool is already sanity checked and -EINVAL is
returned when it's out of range.
Thanks.
> Cc: stable@vger.kernel.org
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
> drivers/atm/zatm.c | 3 +++
> 1 file changed, 3 insertions(+)
>
> diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
> index 1ef67db..9c9a229 100644
> --- a/drivers/atm/zatm.c
> +++ b/drivers/atm/zatm.c
> @@ -28,6 +28,7 @@
> #include <asm/io.h>
> #include <linux/atomic.h>
> #include <linux/uaccess.h>
> +#include <linux/nospec.h>
>
> #include "uPD98401.h"
> #include "uPD98402.h"
> @@ -1458,6 +1459,8 @@ static int zatm_ioctl(struct atm_dev *dev,unsigned int cmd,void __user *arg)
> return -EFAULT;
> if (pool < 0 || pool > ZATM_LAST_POOL)
> return -EINVAL;
> + pool = array_index_nospec(pool,
> + ZATM_LAST_POOL + 1);
> spin_lock_irqsave(&zatm_dev->lock, flags);
> info = zatm_dev->pool_info[pool];
> if (cmd == ZATM_GETPOOLZ) {
>
--
~Randy
^ permalink raw reply
* Re: [PATCH net] dccp: fix tasklet usage
From: David Miller @ 2018-05-03 19:15 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet, gerrit, dccp
In-Reply-To: <20180503163920.102035-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 3 May 2018 09:39:20 -0700
> syzbot reported a crash in tasklet_action_common() caused by dccp.
>
> dccp needs to make sure socket wont disappear before tasklet handler
> has completed.
>
> This patch takes a reference on the socket when arming the tasklet,
> and moves the sock_put() from dccp_write_xmit_timer() to dccp_write_xmitlet()
...
> Fixes: dc841e30eaea ("dccp: Extend CCID packet dequeueing interface")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>
Applied and queued up for -stable, thanks Eric.
^ permalink raw reply
* Re: KMSAN: uninit-value in strcmp
From: David Miller @ 2018-05-03 19:22 UTC (permalink / raw)
To: syzbot+df0257c92ffd4fcc58cd
Cc: jon.maloy, linux-kernel, netdev, syzkaller-bugs, tipc-discussion,
ying.xue
In-Reply-To: <00000000000059f907056b519603@google.com>
From: syzbot <syzbot+df0257c92ffd4fcc58cd@syzkaller.appspotmail.com>
Date: Thu, 03 May 2018 11:44:02 -0700
> Call Trace:
> __dump_stack lib/dump_stack.c:17 [inline]
> dump_stack+0x185/0x1d0 lib/dump_stack.c:53
> kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
> __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
> strcmp+0xf7/0x160 lib/string.c:329
> tipc_nl_node_get_link+0x220/0x6f0 net/tipc/node.c:1881
> genl_family_rcv_msg net/netlink/genetlink.c:599 [inline]
Hmmm, TIPC_NL_LINK_GET uses tipc_nl_policy, which has a proper nesting
entry for TIPC_NLA_LINK. I wonder how the code goes about validating
TIPC_NLA_LINK_NAME in such a case? Does it?
This may be the problem.
^ permalink raw reply
* Re: [PATCH] atm: zatm: Fix potential Spectre v1
From: David Miller @ 2018-05-03 19:25 UTC (permalink / raw)
To: rdunlap; +Cc: gustavo, 3chas3, linux-atm-general, netdev, linux-kernel
In-Reply-To: <98d11d2c-c7c9-e0ae-8a96-320ab9bca765@infradead.org>
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 3 May 2018 12:09:40 -0700
> Just for (my) info: all of these types of patches are to prevent
> what is loaded in cache when the index is out of range, right?
> Not some random pool_info[random], but pool_info[valid, i.e., 0].
>
> Since the value of pool is already sanity checked and -EINVAL is
> returned when it's out of range.
Well, the whole point is that the cpu can speculate execution past the
range check and execute the indexed read anyways. So even if the
value is "sanity checked" the cpu can execute ahead and load things
into the cache, just cancelling the register state updates later when
the range check is fully resolved.
^ permalink raw reply
* [PATCH v6 0/5] PCI: Improve PCIe link status reporting
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
This is based on Tal's recent work to unify the approach for reporting PCIe
link speed/width and whether the device is being limited by a slower
upstream link.
The new pcie_print_link_status() interface appeared in v4.17-rc1; see
9e506a7b5147 ("PCI: Add pcie_print_link_status() to log link speed and
whether it's limited").
That's a good way to replace use of pcie_get_minimum_link(), which gives
misleading results when a path contains both a fast, narrow link and a
slow, wide link: it reports the equivalent of a slow, narrow link.
This series removes the remaining uses of pcie_get_minimum_link() and then
removes the interface itself. I'd like to merge them all through the PCI
tree to make the removal easy.
This does change the dmesg reporting of link speeds, and in the ixgbe case,
it changes the reporting from KERN_WARN level to KERN_INFO. If that's an
issue, let's talk about it. I'm hoping the reduce code size, improved
functionality, and consistency across drivers is enough to make this
worthwhile.
---
Bjorn Helgaas (5):
bnx2x: Report PCIe link properties with pcie_print_link_status()
bnxt_en: Report PCIe link properties with pcie_print_link_status()
cxgb4: Report PCIe link properties with pcie_print_link_status()
ixgbe: Report PCIe link properties with pcie_print_link_status()
PCI: Remove unused pcie_get_minimum_link()
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 23 ++-----
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 ------
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 75 ----------------------
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 47 --------------
drivers/pci/pci.c | 43 -------------
include/linux/pci.h | 2 -
6 files changed, 9 insertions(+), 200 deletions(-)
^ permalink raw reply
* [PATCH v6 1/5] bnx2x: Report PCIe link properties with pcie_print_link_status()
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
In-Reply-To: <152537719056.62474.2571390812509425478.stgit@bhelgaas-glaptop.roam.corp.google.com>
From: Bjorn Helgaas <bhelgaas@google.com>
Previously the driver used pcie_get_minimum_link() to warn when the NIC
is in a slot that can't supply as much bandwidth as the NIC could use.
pcie_get_minimum_link() can be misleading because it finds the slowest link
and the narrowest link (which may be different links) without considering
the total bandwidth of each link. For a path with a 16 GT/s x1 link and a
2.5 GT/s x16 link, it returns 2.5 GT/s x1, which corresponds to 250 MB/s of
bandwidth, not the true available bandwidth of about 1969 MB/s for a
16 GT/s x1 link.
Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself. This finds
the slowest link in the path to the device by computing the total bandwidth
of each link and compares that with the capabilities of the device.
The dmesg change is:
- %s (%c%d) PCI-E x%d %s found at mem %lx, IRQ %d, node addr %pM
+ %s (%c%d) PCI-E found at mem %lx, IRQ %d, node addr %pM
+ %u.%03u Gb/s available PCIe bandwidth (%s x%d link)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 23 ++++++----------------
1 file changed, 6 insertions(+), 17 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c766ae23bc74..5b1ed240bf18 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13922,8 +13922,6 @@ static int bnx2x_init_one(struct pci_dev *pdev,
{
struct net_device *dev = NULL;
struct bnx2x *bp;
- enum pcie_link_width pcie_width;
- enum pci_bus_speed pcie_speed;
int rc, max_non_def_sbs;
int rx_count, tx_count, rss_count, doorbell_size;
int max_cos_est;
@@ -14091,21 +14089,12 @@ static int bnx2x_init_one(struct pci_dev *pdev,
dev_addr_add(bp->dev, bp->fip_mac, NETDEV_HW_ADDR_T_SAN);
rtnl_unlock();
}
- if (pcie_get_minimum_link(bp->pdev, &pcie_speed, &pcie_width) ||
- pcie_speed == PCI_SPEED_UNKNOWN ||
- pcie_width == PCIE_LNK_WIDTH_UNKNOWN)
- BNX2X_DEV_INFO("Failed to determine PCI Express Bandwidth\n");
- else
- BNX2X_DEV_INFO(
- "%s (%c%d) PCI-E x%d %s found at mem %lx, IRQ %d, node addr %pM\n",
- board_info[ent->driver_data].name,
- (CHIP_REV(bp) >> 12) + 'A', (CHIP_METAL(bp) >> 4),
- pcie_width,
- pcie_speed == PCIE_SPEED_2_5GT ? "2.5GHz" :
- pcie_speed == PCIE_SPEED_5_0GT ? "5.0GHz" :
- pcie_speed == PCIE_SPEED_8_0GT ? "8.0GHz" :
- "Unknown",
- dev->base_addr, bp->pdev->irq, dev->dev_addr);
+ BNX2X_DEV_INFO(
+ "%s (%c%d) PCI-E found at mem %lx, IRQ %d, node addr %pM\n",
+ board_info[ent->driver_data].name,
+ (CHIP_REV(bp) >> 12) + 'A', (CHIP_METAL(bp) >> 4),
+ dev->base_addr, bp->pdev->irq, dev->dev_addr);
+ pcie_print_link_status(bp->pdev);
bnx2x_register_phc(bp);
^ permalink raw reply related
* [PATCH v6 2/5] bnxt_en: Report PCIe link properties with pcie_print_link_status()
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
In-Reply-To: <152537719056.62474.2571390812509425478.stgit@bhelgaas-glaptop.roam.corp.google.com>
From: Bjorn Helgaas <bhelgaas@google.com>
Previously the driver used pcie_get_minimum_link() to warn when the NIC
is in a slot that can't supply as much bandwidth as the NIC could use.
pcie_get_minimum_link() can be misleading because it finds the slowest link
and the narrowest link (which may be different links) without considering
the total bandwidth of each link. For a path with a 16 GT/s x1 link and a
2.5 GT/s x16 link, it returns 2.5 GT/s x1, which corresponds to 250 MB/s of
bandwidth, not the true available bandwidth of about 1969 MB/s for a
16 GT/s x1 link.
Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself. This finds
the slowest link in the path to the device by computing the total bandwidth
of each link and compares that with the capabilities of the device.
The dmesg change is:
- PCIe: Speed %s Width x%d
+ %u.%03u Gb/s available PCIe bandwidth (%s x%d link)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index f83769d8047b..34fddb48fecc 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -8621,22 +8621,6 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
return rc;
}
-static void bnxt_parse_log_pcie_link(struct bnxt *bp)
-{
- enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
- enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
-
- if (pcie_get_minimum_link(pci_physfn(bp->pdev), &speed, &width) ||
- speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
- netdev_info(bp->dev, "Failed to determine PCIe Link Info\n");
- else
- netdev_info(bp->dev, "PCIe: Speed %s Width x%d\n",
- speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
- speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
- speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
- "Unknown", width);
-}
-
static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
static int version_printed;
@@ -8851,8 +8835,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
board_info[ent->driver_data].name,
(long)pci_resource_start(pdev, 0), dev->dev_addr);
-
- bnxt_parse_log_pcie_link(bp);
+ pcie_print_link_status(pdev);
return 0;
^ permalink raw reply related
* [PATCH v6 3/5] cxgb4: Report PCIe link properties with pcie_print_link_status()
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
In-Reply-To: <152537719056.62474.2571390812509425478.stgit@bhelgaas-glaptop.roam.corp.google.com>
From: Bjorn Helgaas <bhelgaas@google.com>
Previously the driver used pcie_get_minimum_link() to warn when the NIC
is in a slot that can't supply as much bandwidth as the NIC could use.
pcie_get_minimum_link() can be misleading because it finds the slowest link
and the narrowest link (which may be different links) without considering
the total bandwidth of each link. For a path with a 16 GT/s x1 link and a
2.5 GT/s x16 link, it returns 2.5 GT/s x1, which corresponds to 250 MB/s of
bandwidth, not the true available bandwidth of about 1969 MB/s for a
16 GT/s x1 link.
Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself. This finds
the slowest link in the path to the device by computing the total bandwidth
of each link and compares that with the capabilities of the device.
The dmesg change is:
- PCIe link speed is %s, device supports %s
- PCIe link width is x%d, device supports x%d
+ %u.%03u Gb/s available PCIe bandwidth (%s x%d link)
or, if the device is capable of better performance than is available in the
current slot:
- A slot with more lanes and/or higher speed is suggested for optimal performance.
+ %u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 75 -----------------------
1 file changed, 1 insertion(+), 74 deletions(-)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 24d2865b8806..7328f24ba1dd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5042,79 +5042,6 @@ static int init_rss(struct adapter *adap)
return 0;
}
-static int cxgb4_get_pcie_dev_link_caps(struct adapter *adap,
- enum pci_bus_speed *speed,
- enum pcie_link_width *width)
-{
- u32 lnkcap1, lnkcap2;
- int err1, err2;
-
-#define PCIE_MLW_CAP_SHIFT 4 /* start of MLW mask in link capabilities */
-
- *speed = PCI_SPEED_UNKNOWN;
- *width = PCIE_LNK_WIDTH_UNKNOWN;
-
- err1 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP,
- &lnkcap1);
- err2 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP2,
- &lnkcap2);
- if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */
- if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
- *speed = PCIE_SPEED_8_0GT;
- else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
- *speed = PCIE_SPEED_5_0GT;
- else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
- *speed = PCIE_SPEED_2_5GT;
- }
- if (!err1) {
- *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT;
- if (!lnkcap2) { /* pre-r3.0 */
- if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB)
- *speed = PCIE_SPEED_5_0GT;
- else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB)
- *speed = PCIE_SPEED_2_5GT;
- }
- }
-
- if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN)
- return err1 ? err1 : err2 ? err2 : -EINVAL;
- return 0;
-}
-
-static void cxgb4_check_pcie_caps(struct adapter *adap)
-{
- enum pcie_link_width width, width_cap;
- enum pci_bus_speed speed, speed_cap;
-
-#define PCIE_SPEED_STR(speed) \
- (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \
- speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \
- speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \
- "Unknown")
-
- if (cxgb4_get_pcie_dev_link_caps(adap, &speed_cap, &width_cap)) {
- dev_warn(adap->pdev_dev,
- "Unable to determine PCIe device BW capabilities\n");
- return;
- }
-
- if (pcie_get_minimum_link(adap->pdev, &speed, &width) ||
- speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
- dev_warn(adap->pdev_dev,
- "Unable to determine PCI Express bandwidth.\n");
- return;
- }
-
- dev_info(adap->pdev_dev, "PCIe link speed is %s, device supports %s\n",
- PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap));
- dev_info(adap->pdev_dev, "PCIe link width is x%d, device supports x%d\n",
- width, width_cap);
- if (speed < speed_cap || width < width_cap)
- dev_info(adap->pdev_dev,
- "A slot with more lanes and/or higher speed is "
- "suggested for optimal performance.\n");
-}
-
/* Dump basic information about the adapter */
static void print_adapter_info(struct adapter *adapter)
{
@@ -5750,7 +5677,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
}
/* check for PCI Express bandwidth capabiltites */
- cxgb4_check_pcie_caps(adapter);
+ pcie_print_link_status(pdev);
err = init_rss(adapter);
if (err)
^ permalink raw reply related
* [PATCH v6 4/5] ixgbe: Report PCIe link properties with pcie_print_link_status()
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
In-Reply-To: <152537719056.62474.2571390812509425478.stgit@bhelgaas-glaptop.roam.corp.google.com>
From: Bjorn Helgaas <bhelgaas@google.com>
Previously the driver used pcie_get_minimum_link() to warn when the NIC
is in a slot that can't supply as much bandwidth as the NIC could use.
pcie_get_minimum_link() can be misleading because it finds the slowest link
and the narrowest link (which may be different links) without considering
the total bandwidth of each link. For a path with a 16 GT/s x1 link and a
2.5 GT/s x16 link, it returns 2.5 GT/s x1, which corresponds to 250 MB/s of
bandwidth, not the true available bandwidth of about 1969 MB/s for a
16 GT/s x1 link.
Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself. This finds
the slowest link in the path to the device by computing the total bandwidth
of each link and compares that with the capabilities of the device.
The dmesg change is:
- PCI Express bandwidth of %dGT/s available
- (Speed:%s, Width: x%d, Encoding Loss:%s)
+ %u.%03u Gb/s available PCIe bandwidth (%s x%d link)
or, if the device is capable of better performance than is available in the
current slot:
- This is not sufficient for optimal performance of this card.
- For optimal performance, at least %dGT/s of bandwidth is required.
- A slot with more lanes and/or higher speed is suggested.
+ %u.%03u Gb/s available PCIe bandwidth, limited by %s x%d link at %s (capable of %u.%03u Gb/s with %s x%d link)
Note that the driver previously used dev_warn() to suggest using a
different slot, but pcie_print_link_status() uses dev_info() because if the
platform has no faster slot available, the user can't do anything about the
warning and may not want to be bothered with it.
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 47 +------------------------
1 file changed, 1 insertion(+), 46 deletions(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index afadba99f7b8..8990285f6e12 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -270,9 +270,6 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
int expected_gts)
{
struct ixgbe_hw *hw = &adapter->hw;
- int max_gts = 0;
- enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
- enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
struct pci_dev *pdev;
/* Some devices are not connected over PCIe and thus do not negotiate
@@ -288,49 +285,7 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
else
pdev = adapter->pdev;
- if (pcie_get_minimum_link(pdev, &speed, &width) ||
- speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
- e_dev_warn("Unable to determine PCI Express bandwidth.\n");
- return;
- }
-
- switch (speed) {
- case PCIE_SPEED_2_5GT:
- /* 8b/10b encoding reduces max throughput by 20% */
- max_gts = 2 * width;
- break;
- case PCIE_SPEED_5_0GT:
- /* 8b/10b encoding reduces max throughput by 20% */
- max_gts = 4 * width;
- break;
- case PCIE_SPEED_8_0GT:
- /* 128b/130b encoding reduces throughput by less than 2% */
- max_gts = 8 * width;
- break;
- default:
- e_dev_warn("Unable to determine PCI Express bandwidth.\n");
- return;
- }
-
- e_dev_info("PCI Express bandwidth of %dGT/s available\n",
- max_gts);
- e_dev_info("(Speed:%s, Width: x%d, Encoding Loss:%s)\n",
- (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
- speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
- speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
- "Unknown"),
- width,
- (speed == PCIE_SPEED_2_5GT ? "20%" :
- speed == PCIE_SPEED_5_0GT ? "20%" :
- speed == PCIE_SPEED_8_0GT ? "<2%" :
- "Unknown"));
-
- if (max_gts < expected_gts) {
- e_dev_warn("This is not sufficient for optimal performance of this card.\n");
- e_dev_warn("For optimal performance, at least %dGT/s of bandwidth is required.\n",
- expected_gts);
- e_dev_warn("A slot with more lanes and/or higher speed is suggested.\n");
- }
+ pcie_print_link_status(pdev);
}
static void ixgbe_service_event_schedule(struct ixgbe_adapter *adapter)
^ permalink raw reply related
* [PATCH v6 5/5] PCI: Remove unused pcie_get_minimum_link()
From: Bjorn Helgaas @ 2018-05-03 20:00 UTC (permalink / raw)
To: Jeff Kirsher, Ganesh Goudar, Michael Chan, Ariel Elior
Cc: linux-pci, everest-linux-l2, intel-wired-lan, netdev,
linux-kernel, Tal Gilboa, Tariq Toukan, Jacob Keller,
Jakub Kicinski
In-Reply-To: <152537719056.62474.2571390812509425478.stgit@bhelgaas-glaptop.roam.corp.google.com>
From: Bjorn Helgaas <bhelgaas@google.com>
In some cases pcie_get_minimum_link() returned misleading information
because it found the slowest link and the narrowest link without
considering the total bandwidth of the link.
For example, consider a path with these two links:
- 16.0 GT/s x1 link (16.0 * 10^9 * 128 / 130) * 1 / 8 = 1969 MB/s
- 2.5 GT/s x16 link ( 2.5 * 10^9 * 8 / 10) * 16 / 8 = 4000 MB/s
The available bandwidth of the path is limited by the 16 GT/s link to about
1969 MB/s, but pcie_get_minimum_link() returned 2.5 GT/s x1, which
corresponds to only 250 MB/s.
Callers should use pcie_print_link_status() instead, or
pcie_bandwidth_available() if they need more detailed information.
Remove pcie_get_minimum_link() since there are no callers left.
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
drivers/pci/pci.c | 43 -------------------------------------------
include/linux/pci.h | 2 --
2 files changed, 45 deletions(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655a5643..4bafa817c40a 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5069,49 +5069,6 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
}
EXPORT_SYMBOL(pcie_set_mps);
-/**
- * pcie_get_minimum_link - determine minimum link settings of a PCI device
- * @dev: PCI device to query
- * @speed: storage for minimum speed
- * @width: storage for minimum width
- *
- * This function will walk up the PCI device chain and determine the minimum
- * link width and speed of the device.
- */
-int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
- enum pcie_link_width *width)
-{
- int ret;
-
- *speed = PCI_SPEED_UNKNOWN;
- *width = PCIE_LNK_WIDTH_UNKNOWN;
-
- while (dev) {
- u16 lnksta;
- enum pci_bus_speed next_speed;
- enum pcie_link_width next_width;
-
- ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
- if (ret)
- return ret;
-
- next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
- next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >>
- PCI_EXP_LNKSTA_NLW_SHIFT;
-
- if (next_speed < *speed)
- *speed = next_speed;
-
- if (next_width < *width)
- *width = next_width;
-
- dev = dev->bus->self;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(pcie_get_minimum_link);
-
/**
* pcie_bandwidth_available - determine minimum link settings of a PCIe
* device and its bandwidth limitation
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 73178a2fcee0..230615620a4a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1079,8 +1079,6 @@ int pcie_get_readrq(struct pci_dev *dev);
int pcie_set_readrq(struct pci_dev *dev, int rq);
int pcie_get_mps(struct pci_dev *dev);
int pcie_set_mps(struct pci_dev *dev, int mps);
-int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
- enum pcie_link_width *width);
u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
enum pci_bus_speed *speed,
enum pcie_link_width *width);
^ permalink raw reply related
* Re: [PATCH] net: ethernet: sun: niu set correct packet size in skb
From: David Miller @ 2018-05-03 20:04 UTC (permalink / raw)
To: rob; +Cc: netdev
In-Reply-To: <1525359964.11695.1@server175.web-hosting.com>
From: Rob Taglang <rob@taglang.io>
Date: Thu, 03 May 2018 11:06:04 -0400
> Currently, skb->len and skb->data_len are set to the page size, not
> the packet size. This causes the frame check sequence to not be
> located at the "end" of the packet resulting in ethernet frame check
> errors. The driver does work currently, but stricter kernel facing
> networking solutions like OpenVSwitch will drop these packets as
> invalid.
>
> These changes set the packet size correctly so that these errors no
> longer occur. The length does not include the frame check sequence, so
> that subtraction was removed.
>
> Tested on Oracle/SUN Multithreaded 10-Gigabit Ethernet Network
> Controller [108e:abcd].
>
> This is a resubmission after subscribing to the list; I think it got
> caught in a spam filter since I can't see my message in the archive,
> but if not and this is just pissing off a maintainer I'm really sorry.
>
> Signed-off-by: Rob Taglang <rob@taglang.io>
> ---
> drivers/net/ethernet/sun/niu.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/sun/niu.c
> b/drivers/net/ethernet/sun/niu.c
> index f081de4..88c1247 100644
> --- a/drivers/net/ethernet/sun/niu.c
> +++ b/drivers/net/ethernet/sun/niu.c
> @@ -3443,7 +3443,7 @@ static int niu_process_rx_pkt(struct napi_struct
> *napi, struct niu *np,
>
> len = (val & RCR_ENTRY_L2_LEN) >>
> RCR_ENTRY_L2_LEN_SHIFT;
> - len -= ETH_FCS_LEN;
> + append_size = len + ETH_HLEN + ETH_FCS_LEN;
This patch is severely corrupted by your email client.
Please fix this, send the patch to yourself as a test, and only repost
the patch here on the list once you can successfully apply the patch
contained in the test email.
Thanks.
^ permalink raw reply
* [GIT] Networking
From: David Miller @ 2018-05-03 20:21 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
1) Various sockmap fixes from John Fastabend (pinned map handling, blocking
in recvmsg, double page put, error handling during redirect failures, etc.)
2) Fix dead code handling in x86-64 JIT, from Gianluca Borello.
3) Missing device put in RDS IB code, from Dag Moxnes.
4) Don't process fast open during repair mode in TCP< from Yuchung
Cheng.
5) Move address/port comparison fixes in SCTP, from Xin Long.
6) Handle add a bond slave's master into a bridge properly, from
Hangbin Liu.
7) IPv6 multipath code can operate on unitialized memory due to an
assumption that the icmp header is in the linear SKB area. Fix
from Eric Dumazet.
8) Don't invoke do_tcp_sendpages() recursively via TLS, from Dave
Watson.
9) Fix memory leaks in x86-64 JIT, from Daniel Borkmann.
10) RDS leaks kernel memory to userspace, from Eric Dumazet.
11) DCCP can invoke a tasklet on a freed socket, take a refcount.
Also from Eric Dumazet.
Please pull, thanks a lot!
The following changes since commit 3be4aaf4e2d3eb95cce7835e8df797ae65ae5ac1:
Merge branch 'userns-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace (2018-04-24 17:58:51 -0700)
are available in the Git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
for you to fetch changes up to a8d7aa17bbc970971ccdf71988ea19230ab368b1:
dccp: fix tasklet usage (2018-05-03 15:14:57 -0400)
----------------------------------------------------------------
Alexandre Belloni (1):
net: phy: allow scanning busses with missing phys
Alexei Starovoitov (2):
Merge branch 'x86-bpf-jit-fixes'
Merge branch 'bpf-sockmap-fixes'
Anders Roxell (1):
selftests: net: add in_netns.sh TEST_GEN_PROGS_EXTENDED
Arend Van Spriel (1):
brcmfmac: fix firmware request processing if nvram load fails
Bjørn Mork (1):
qmi_wwan: do not steal interfaces from class drivers
Chris Mi (1):
net/mlx5: Properly deal with flow counters when deleting rules
Colin Ian King (6):
net: systemport: fix spelling mistake: "asymetric" -> "asymmetric"
qed: fix spelling mistake: "checksumed" -> "checksummed"
net: ethernet: ucc: fix spelling mistake: "tx-late-collsion" -> "tx-late-collision"
net/mlx4: fix spelling mistake: "failedi" -> "failed"
net/mlx5e: fix spelling mistake: "loobpack" -> "loopback"
qed: fix spelling mistake: "offloded" -> "offloaded"
Dag Moxnes (1):
rds: ib: Fix missing call to rds_ib_dev_put in rds_ib_setup_qp
Daniel Borkmann (3):
Merge branch 'bpf-sockmap-fixes'
bpf, x64: fix memleak when not converging after image
bpf, x64: fix memleak when not converging on calls
Dave Watson (1):
net/tls: Don't recursively call push_record during tls_write_space callbacks
David S. Miller (7):
Merge git://git.kernel.org/.../bpf/bpf
Merge branch 'mvpp2-fixes'
Merge tag 'wireless-drivers-for-davem-2018-04-26' of git://git.kernel.org/.../kvalo/wireless-drivers
Merge tag 'mlx5-fixes-2018-04-25' of git://git.kernel.org/.../saeed/linux
Merge branch 'sfc-more-ARFS-fixes'
Merge git://git.kernel.org/.../bpf/bpf
Merge branch 'smc-fixes'
Edward Cree (2):
sfc: Use filter index rather than ID for rps_flow_id table
sfc: fix ARFS expiry check on EF10
Eric Dumazet (6):
ipv6: fix uninit-value in ip6_multipath_l3_keys()
tcp: fix TCP_REPAIR_QUEUE bound checking
net_sched: fq: take care of throttled flows before reuse
rds: do not leak kernel memory to user land
tcp: restore autocorking
dccp: fix tasklet usage
Florian Fainelli (1):
net: systemport: Correclty disambiguate driver instances
Gianluca Borello (1):
bpf, x64: fix JIT emission for dead code
Grygorii Strashko (1):
net: ethernet: ti: cpsw: fix packet leaking in dual_mac mode
Haim Dreyfuss (1):
iwlwifi: mvm: query regdb for wmm rule if needed
Hangbin Liu (1):
bridge: check iface upper dev when setting master via ioctl
Huy Nguyen (1):
net/mlx5e: DCBNL fix min inline header size for dscp
Ido Schimmel (2):
mlxsw: spectrum_switchdev: Do not remove mrouter port from MDB's ports list
ipv6: Revert "ipv6: Allow non-gateway ECMP for IPv6"
Ingo Molnar (1):
8139too: Use disable_irq_nosync() in rtl8139_poll_controller()
Israel Rukshin (1):
net/mlx5: Fix mlx5_get_vector_affinity function
Jakub Kicinski (1):
nfp: don't depend on eth_tbl being available
Jianbo Liu (1):
net/mlx5e: Allow offloading ipv4 header re-write for icmp
John Fastabend (10):
bpf: Document sockmap '-target bpf' requirement for PROG_TYPE_SK_MSG
bpf: sockmap sample use clang flag, -target bpf
bpf: sockmap, map_release does not hold refcnt for pinned maps
bpf: sockmap, sk_wait_event needed to handle blocking cases
bpf: sockmap, fix double page_put on ENOMEM error in redirect path
bpf: fix for lex/yacc build error with gcc-5
bpf: fix uninitialized variable in bpf tools
bpf: sockmap, fix scatterlist update on error path in send with apply
bpf: sockmap, zero sg_size on error when buffer is released
bpf: sockmap, fix error handling in redirect failures
John Hurley (1):
nfp: flower: set tunnel ttl value to net default
Jon Maloy (1):
tipc: fix bug in function tipc_nl_node_dump_monitor
Julian Anastasov (1):
ipv4: fix fnhe usage by non-cached routes
Karsten Graul (2):
net/smc: call consolidation
net/smc: handle unregistered buffers
Lance Richardson (1):
net: support compat 64-bit time in {s,g}etsockopt
Luca Coelho (1):
iwlwifi: mvm: fix old scan version sizes
Marcelo Ricardo Leitner (1):
MAINTAINERS: add myself as SCTP co-maintainer
Maxime Chevallier (2):
net: mvpp2: Fix clk error path in mvpp2_probe
net: mvpp2: Fix clock resource by adding missing mg_core_clk
Michael S. Tsirkin (2):
vhost: make msg padding explicit
Revert "vhost: make msg padding explicit"
Neal Cardwell (1):
tcp_bbr: fix to zero idle_restart only upon S/ACKed data
Ping-Ke Shih (1):
rtlwifi: cleanup 8723be ant_sel definition
Roman Gushchin (1):
bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY
SZ Lin (林上智) (1):
NET: usb: qmi_wwan: add support for ublox R410M PID 0x90b2
Shahar Klein (1):
net/mlx5e: Fix traffic between VF and representor
Song Liu (1):
bpf: minor fix to selftest test_stacktrace_build_id()
Stefan Raspl (1):
smc: fix sendpage() call
Talat Batheesh (1):
net/mlx5: Avoid cleaning flow steering table twice during error flow
Tariq Toukan (1):
net/mlx5e: TX, Use correct counter in dma_map error flow
Thomas Winter (1):
ipv6: Allow non-gateway ECMP for IPv6
Ursula Braun (2):
net/smc: keep clcsock reference in smc_tcp_listen_work()
net/smc: restrict non-blocking connect finish
Vivien Didelot (1):
MAINTAINERS: add davem in NETWORKING DRIVERS
Wenwen Wang (1):
ethtool: fix a potential missing-check bug
William Tu (1):
bpf: clear the ip_tunnel_info.
Xin Long (5):
sctp: handle two v4 addrs comparison in sctp_inet6_cmp_addr
sctp: clear the new asoc's stream outcnt in sctp_stream_update
sctp: init active key for the new asoc in dupcook_a and dupcook_b
sctp: use the old asoc when making the cookie-ack chunk in dupcook_d
sctp: fix the issue that the cookie-ack with auth can't get processed
Yuchung Cheng (1):
tcp: ignore Fast Open on repair mode
Documentation/bpf/bpf_devel_QA.txt | 10 +++++++-
MAINTAINERS | 2 ++
arch/x86/net/bpf_jit_comp.c | 18 +++++++++++---
drivers/infiniband/hw/mlx5/main.c | 2 +-
drivers/net/ethernet/broadcom/bcmsysport.c | 18 ++++++++++----
drivers/net/ethernet/freescale/ucc_geth_ethtool.c | 2 +-
drivers/net/ethernet/marvell/mvpp2.c | 30 ++++++++++++++++------
drivers/net/ethernet/mellanox/mlx4/main.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 8 +++---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 5 ++--
drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++-
drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 20 +++++++--------
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 26 +++++++++++--------
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 12 ++++-----
drivers/net/ethernet/netronome/nfp/flower/action.c | 10 ++++++--
drivers/net/ethernet/netronome/nfp/flower/cmsg.h | 5 +++-
drivers/net/ethernet/netronome/nfp/flower/main.c | 2 +-
drivers/net/ethernet/netronome/nfp/nfp_app_nic.c | 2 +-
drivers/net/ethernet/netronome/nfp/nfp_main.h | 4 ++-
drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 31 +++++++++++++----------
drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2 +-
drivers/net/ethernet/qlogic/qed/qed_roce.c | 2 +-
drivers/net/ethernet/realtek/8139too.c | 2 +-
drivers/net/ethernet/sfc/ef10.c | 5 ++--
drivers/net/ethernet/sfc/rx.c | 2 ++
drivers/net/ethernet/ti/cpsw.c | 2 ++
drivers/net/phy/phy_device.c | 11 ++++++++-
drivers/net/usb/qmi_wwan.c | 13 ++++++++++
drivers/net/wireless/broadcom/brcm80211/brcmfmac/firmware.c | 36 +++++++++++++++------------
drivers/net/wireless/intel/iwlwifi/fw/api/scan.h | 13 ++++------
drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.h | 6 +++--
drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 3 ++-
drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c | 15 -----------
drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c | 11 ++++++---
drivers/net/wireless/realtek/rtlwifi/wifi.h | 5 ++++
include/linux/bpf.h | 4 ++-
include/linux/mlx5/driver.h | 12 +++------
include/net/tls.h | 1 +
kernel/bpf/arraymap.c | 3 ++-
kernel/bpf/sockmap.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
kernel/bpf/syscall.c | 4 +--
net/bridge/br_if.c | 4 +--
net/compat.c | 6 +++--
net/core/ethtool.c | 5 ++++
net/core/filter.c | 1 +
net/dccp/ccids/ccid2.c | 14 +++++++++--
net/dccp/timer.c | 2 +-
net/ipv4/route.c | 118 +++++++++++++++++++++++++++++++++++++++------------------------------------------------
net/ipv4/tcp.c | 7 +++---
net/ipv4/tcp_bbr.c | 4 ++-
net/ipv6/route.c | 7 +++++-
net/rds/ib_cm.c | 3 ++-
net/rds/recv.c | 1 +
net/sched/sch_fq.c | 37 ++++++++++++++++++---------
net/sctp/inqueue.c | 2 +-
net/sctp/ipv6.c | 3 +++
net/sctp/sm_statefuns.c | 8 +++++-
net/sctp/stream.c | 2 ++
net/smc/af_smc.c | 61 ++++++++++++++++++++++-----------------------
net/smc/smc_core.c | 22 ++++++++++++++---
net/smc/smc_core.h | 3 ++-
net/tipc/node.c | 2 +-
net/tls/tls_main.c | 7 ++++++
samples/sockmap/Makefile | 7 ++++--
tools/bpf/Makefile | 2 ++
tools/bpf/bpf_dbg.c | 7 ++++--
tools/testing/selftests/bpf/test_progs.c | 4 +--
tools/testing/selftests/net/Makefile | 3 ++-
70 files changed, 602 insertions(+), 316 deletions(-)
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox