* [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 14:40 ` Eric Dumazet
2024-10-22 13:43 ` [PATCH net-next v7 2/6] neighbour: Define neigh_for_each_in_bucket Gilad Naaman
` (5 subsequent siblings)
6 siblings, 1 reply; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Add a doubly-linked node to neighbours, so that they
can be deleted without iterating the entire bucket they're in.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
include/net/neighbour.h | 2 ++
net/core/neighbour.c | 40 ++++++++++++++++++++++++++++++++++++++--
2 files changed, 40 insertions(+), 2 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 3887ed9e5026..0402447854c7 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -136,6 +136,7 @@ struct neigh_statistics {
struct neighbour {
struct neighbour __rcu *next;
+ struct hlist_node hash;
struct neigh_table *tbl;
struct neigh_parms *parms;
unsigned long confirmed;
@@ -191,6 +192,7 @@ struct pneigh_entry {
struct neigh_hash_table {
struct neighbour __rcu **hash_buckets;
+ struct hlist_head *hash_heads;
unsigned int hash_shift;
__u32 hash_rnd[NEIGH_NUM_HASH_RND];
struct rcu_head rcu;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 395ae1626eef..7df4cfc0ac9a 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -217,6 +217,7 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
neigh = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock));
rcu_assign_pointer(*np, neigh);
+ hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
retval = true;
}
@@ -403,6 +404,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
write_lock(&n->lock);
neigh_del_timer(n);
neigh_mark_dead(n);
@@ -530,27 +532,47 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
+ size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head);
size_t size = (1 << shift) * sizeof(struct neighbour *);
- struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
+ struct hlist_head *hash_heads;
+ struct neigh_hash_table *ret;
int i;
+ hash_heads = NULL;
+
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
if (size <= PAGE_SIZE) {
buckets = kzalloc(size, GFP_ATOMIC);
+
+ if (buckets) {
+ hash_heads = kzalloc(hash_heads_size, GFP_ATOMIC);
+ if (!hash_heads)
+ kfree(buckets);
+ }
} else {
buckets = (struct neighbour __rcu **)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
+
+ if (buckets) {
+ hash_heads = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+ get_order(hash_heads_size));
+ kmemleak_alloc(hash_heads, hash_heads_size, 1, GFP_ATOMIC);
+ if (!hash_heads)
+ free_pages((unsigned long)buckets, get_order(size));
+ }
}
- if (!buckets) {
+ if (!buckets || !hash_heads) {
kfree(ret);
return NULL;
}
ret->hash_buckets = buckets;
+ ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
@@ -562,8 +584,10 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
+ size_t hash_heads_size = (1 << nht->hash_shift) * sizeof(struct hlist_head);
size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
struct neighbour __rcu **buckets = nht->hash_buckets;
+ struct hlist_head *hash_heads = nht->hash_heads;
if (size <= PAGE_SIZE) {
kfree(buckets);
@@ -571,6 +595,13 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
kmemleak_free(buckets);
free_pages((unsigned long)buckets, get_order(size));
}
+
+ if (hash_heads_size < PAGE_SIZE) {
+ kfree(hash_heads);
+ } else {
+ kmemleak_free(hash_heads);
+ free_pages((unsigned long)hash_heads, get_order(hash_heads_size));
+ }
kfree(nht);
}
@@ -607,6 +638,8 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
new_nht->hash_buckets[hash],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
@@ -717,6 +750,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
rcu_dereference_protected(nht->hash_buckets[hash_val],
lockdep_is_held(&tbl->lock)));
rcu_assign_pointer(nht->hash_buckets[hash_val], n);
+ hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
@@ -1002,6 +1036,7 @@ static void neigh_periodic_work(struct work_struct *work)
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
@@ -3131,6 +3166,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)));
+ hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
} else
np = &n->next;
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour
2024-10-22 13:43 ` [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour Gilad Naaman
@ 2024-10-22 14:40 ` Eric Dumazet
2024-10-23 5:16 ` Gilad Naaman
0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2024-10-22 14:40 UTC (permalink / raw)
To: Gilad Naaman
Cc: netdev, David S. Miller, Jakub Kicinski, Paolo Abeni,
Kuniyuki Iwashima
On Tue, Oct 22, 2024 at 3:44 PM Gilad Naaman <gnaaman@drivenets.com> wrote:
>
> Add a doubly-linked node to neighbours, so that they
> can be deleted without iterating the entire bucket they're in.
>
> Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
> ---
> include/net/neighbour.h | 2 ++
> net/core/neighbour.c | 40 ++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/neighbour.h b/include/net/neighbour.h
> index 3887ed9e5026..0402447854c7 100644
> --- a/include/net/neighbour.h
> +++ b/include/net/neighbour.h
> @@ -136,6 +136,7 @@ struct neigh_statistics {
>
> struct neighbour {
> struct neighbour __rcu *next;
> + struct hlist_node hash;
> struct neigh_table *tbl;
> struct neigh_parms *parms;
> unsigned long confirmed;
> @@ -191,6 +192,7 @@ struct pneigh_entry {
>
> struct neigh_hash_table {
> struct neighbour __rcu **hash_buckets;
> + struct hlist_head *hash_heads;
> unsigned int hash_shift;
> __u32 hash_rnd[NEIGH_NUM_HASH_RND];
> struct rcu_head rcu;
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 395ae1626eef..7df4cfc0ac9a 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -217,6 +217,7 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
> neigh = rcu_dereference_protected(n->next,
> lockdep_is_held(&tbl->lock));
> rcu_assign_pointer(*np, neigh);
> + hlist_del_rcu(&n->hash);
> neigh_mark_dead(n);
> retval = true;
> }
> @@ -403,6 +404,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
> rcu_assign_pointer(*np,
> rcu_dereference_protected(n->next,
> lockdep_is_held(&tbl->lock)));
> + hlist_del_rcu(&n->hash);
> write_lock(&n->lock);
> neigh_del_timer(n);
> neigh_mark_dead(n);
> @@ -530,27 +532,47 @@ static void neigh_get_hash_rnd(u32 *x)
>
> static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
> {
> + size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head);
> size_t size = (1 << shift) * sizeof(struct neighbour *);
> - struct neigh_hash_table *ret;
> struct neighbour __rcu **buckets;
> + struct hlist_head *hash_heads;
> + struct neigh_hash_table *ret;
> int i;
>
> + hash_heads = NULL;
> +
> ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
> if (!ret)
> return NULL;
> if (size <= PAGE_SIZE) {
> buckets = kzalloc(size, GFP_ATOMIC);
> +
> + if (buckets) {
> + hash_heads = kzalloc(hash_heads_size, GFP_ATOMIC);
> + if (!hash_heads)
> + kfree(buckets);
> + }
Oh well, I strongly suggest we first switch to kvzalloc() and
kvfree(), instead of copy/pasting old work arounds...
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 395ae1626eef2f22f5b81051671371ed67eb5943..a44511218a600ff55513a7255e90641cd7c2e983
100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -538,14 +538,7 @@ static struct neigh_hash_table
*neigh_hash_alloc(unsigned int shift)
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
- if (size <= PAGE_SIZE) {
- buckets = kzalloc(size, GFP_ATOMIC);
- } else {
- buckets = (struct neighbour __rcu **)
- __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
- get_order(size));
- kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
- }
+ buckets = kvzalloc(size, GFP_ATOMIC);
if (!buckets) {
kfree(ret);
return NULL;
@@ -562,15 +555,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets = nht->hash_buckets;
- if (size <= PAGE_SIZE) {
- kfree(buckets);
- } else {
- kmemleak_free(buckets);
- free_pages((unsigned long)buckets, get_order(size));
- }
+ kvfree(nht->hash_buckets);
kfree(nht);
}
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour
2024-10-22 14:40 ` Eric Dumazet
@ 2024-10-23 5:16 ` Gilad Naaman
0 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-23 5:16 UTC (permalink / raw)
To: edumazet; +Cc: davem, gnaaman, kuba, kuniyu, netdev, pabeni
> Oh well, I strongly suggest we first switch to kvzalloc() and
> kvfree(), instead of copy/pasting old work arounds...
Hey, thank you for going over this.
I see you've posted a patch that changes this, I'll rebase and absorb it.
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH net-next v7 2/6] neighbour: Define neigh_for_each_in_bucket
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 3/6] neighbour: Convert seq_file functions to use hlist Gilad Naaman
` (4 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Introduce neigh_for_each_in_bucket in neighbour.h, to help iterate over
the neighbour table more succinctly.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
include/net/neighbour.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 0402447854c7..69aaacd1419f 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -277,6 +277,10 @@ static inline void *neighbour_priv(const struct neighbour *n)
extern const struct nla_policy nda_policy[];
+#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash)
+#define neigh_for_each_in_bucket_safe(pos, tmp, head) \
+ hlist_for_each_entry_safe(pos, tmp, head, hash)
+
static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey)
{
return *(const u32 *)n->primary_key == *(const u32 *)pkey;
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH net-next v7 3/6] neighbour: Convert seq_file functions to use hlist
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 1/6] neighbour: Add hlist_node to struct neighbour Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 2/6] neighbour: Define neigh_for_each_in_bucket Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 4/6] neighbour: Convert iteration to use hlist+macro Gilad Naaman
` (3 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Convert seq_file-related neighbour functionality to use neighbour::hash
and the related for_each macro.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
net/core/neighbour.c | 104 ++++++++++++++++++++-----------------------
1 file changed, 48 insertions(+), 56 deletions(-)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7df4cfc0ac9a..80bb1eef7edf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3226,43 +3226,53 @@ EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
-static struct neighbour *neigh_get_first(struct seq_file *seq)
+static struct neighbour *neigh_get_valid(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
+
+ if (!net_eq(dev_net(n->dev), net))
+ return NULL;
+
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0;
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
+ if (!v)
+ return NULL;
+ if (pos)
+ return v;
+ }
+
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ return n;
+
+ if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
+ return n;
+
+ return NULL;
+}
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
struct neigh_hash_table *nht = state->nht;
- struct neighbour *n = NULL;
- int bucket;
+ struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
- n = rcu_dereference(nht->hash_buckets[bucket]);
-
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- loff_t fakep = 0;
- void *v;
- v = state->neigh_sub_iter(state, n, &fakep);
- if (!v)
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ while (++state->bucket < (1 << nht->hash_shift)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
+ tmp = neigh_get_valid(seq, n, NULL);
+ if (tmp)
+ return tmp;
}
-
- if (n)
- break;
}
- state->bucket = bucket;
- return n;
+ return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -3270,46 +3280,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct net *net = seq_file_net(seq);
- struct neigh_hash_table *nht = state->nht;
+ struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
+
if (v)
return n;
}
- n = rcu_dereference(n->next);
-
- while (1) {
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- void *v = state->neigh_sub_iter(state, n, pos);
- if (v)
- return n;
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference(n->next);
+ hlist_for_each_entry_continue(n, hash) {
+ tmp = neigh_get_valid(seq, n, pos);
+ if (tmp) {
+ n = tmp;
+ goto out;
}
-
- if (n)
- break;
-
- if (++state->bucket >= (1 << nht->hash_shift))
- break;
-
- n = rcu_dereference(nht->hash_buckets[state->bucket]);
}
+ n = neigh_get_first(seq);
+out:
if (n && pos)
--(*pos);
+
return n;
}
@@ -3412,7 +3404,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
- state->bucket = 0;
+ state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
rcu_read_lock();
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH net-next v7 4/6] neighbour: Convert iteration to use hlist+macro
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
` (2 preceding siblings ...)
2024-10-22 13:43 ` [PATCH net-next v7 3/6] neighbour: Convert seq_file functions to use hlist Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 5/6] neighbour: Remove bare neighbour::next pointer Gilad Naaman
` (2 subsequent siblings)
6 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Remove all usage of the bare neighbour::next pointer,
replacing them with neighbour::hash and its for_each macro.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
include/net/neighbour.h | 5 +----
net/core/neighbour.c | 47 ++++++++++++++++-------------------------
2 files changed, 19 insertions(+), 33 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 69aaacd1419f..68b1970d9045 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -309,12 +309,9 @@ static inline struct neighbour *___neigh_lookup_noref(
u32 hash_val;
hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
- for (n = rcu_dereference(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference(n->next)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[hash_val])
if (n->dev == dev && key_eq(n, pkey))
return n;
- }
return NULL;
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 80bb1eef7edf..e2f7699693f0 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -388,11 +388,11 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour *n;
struct neighbour __rcu **np = &nht->hash_buckets[i];
+ struct hlist_node *tmp;
+ struct neighbour *n;
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
if (dev && n->dev != dev) {
np = &n->next;
continue;
@@ -620,18 +620,14 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
- struct neighbour *n, *next;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
- lockdep_is_held(&tbl->lock));
- n != NULL;
- n = next) {
+ neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
- next = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
rcu_assign_pointer(n->next,
rcu_dereference_protected(
@@ -726,11 +722,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_tbl_unlock;
}
- for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock));
- n1 != NULL;
- n1 = rcu_dereference_protected(n1->next,
- lockdep_is_held(&tbl->lock))) {
+ neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
@@ -982,10 +974,11 @@ static void neigh_connect(struct neighbour *neigh)
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
- struct neighbour *n;
+ struct neigh_hash_table *nht;
struct neighbour __rcu **np;
+ struct hlist_node *tmp;
+ struct neighbour *n;
unsigned int i;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -1012,8 +1005,7 @@ static void neigh_periodic_work(struct work_struct *work)
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
np = &nht->hash_buckets[i];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
@@ -2763,9 +2755,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
- n != NULL;
- n = rcu_dereference(n->next)) {
+ idx = 0;
+ neigh_for_each_in_bucket(n, &nht->hash_heads[h]) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
@@ -3132,9 +3123,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = rcu_dereference(nht->hash_buckets[chain]);
- n != NULL;
- n = rcu_dereference(n->next))
+ neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
read_unlock_bh(&tbl->lock);
@@ -3146,18 +3135,18 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
- int chain;
struct neigh_hash_table *nht;
+ int chain;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
- struct neighbour *n;
struct neighbour __rcu **np;
+ struct hlist_node *tmp;
+ struct neighbour *n;
np = &nht->hash_buckets[chain];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH net-next v7 5/6] neighbour: Remove bare neighbour::next pointer
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
` (3 preceding siblings ...)
2024-10-22 13:43 ` [PATCH net-next v7 4/6] neighbour: Convert iteration to use hlist+macro Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 13:43 ` [PATCH net-next v7 6/6] neighbour: Create netdev->neighbour association Gilad Naaman
2024-10-22 23:48 ` [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Stanislav Fomichev
6 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Remove the now-unused neighbour::next pointer, leaving struct neighbour
solely with the hlist_node implementation.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
include/net/neighbour.h | 4 +-
net/core/neighbour.c | 120 ++++++----------------------------------
net/ipv4/arp.c | 2 +-
3 files changed, 18 insertions(+), 108 deletions(-)
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 68b1970d9045..0244fbd22a1f 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -135,7 +135,6 @@ struct neigh_statistics {
#define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field)
struct neighbour {
- struct neighbour __rcu *next;
struct hlist_node hash;
struct neigh_table *tbl;
struct neigh_parms *parms;
@@ -191,7 +190,6 @@ struct pneigh_entry {
#define NEIGH_NUM_HASH_RND 4
struct neigh_hash_table {
- struct neighbour __rcu **hash_buckets;
struct hlist_head *hash_heads;
unsigned int hash_shift;
__u32 hash_rnd[NEIGH_NUM_HASH_RND];
@@ -352,7 +350,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags,
u32 nlmsg_pid);
void __neigh_set_probe_once(struct neighbour *neigh);
-bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl);
+bool neigh_remove_one(struct neighbour *ndel);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e2f7699693f0..02bc1feab611 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -205,18 +205,12 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
}
}
-static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
- struct neigh_table *tbl)
+bool neigh_remove_one(struct neighbour *n)
{
bool retval = false;
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1) {
- struct neighbour *neigh;
-
- neigh = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(*np, neigh);
hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
retval = true;
@@ -227,29 +221,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np,
return retval;
}
-bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
-{
- struct neigh_hash_table *nht;
- void *pkey = ndel->primary_key;
- u32 hash_val;
- struct neighbour *n;
- struct neighbour __rcu **np;
-
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
- hash_val = hash_val >> (32 - nht->hash_shift);
-
- np = &nht->hash_buckets[hash_val];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock)))) {
- if (n == ndel)
- return neigh_del(n, np, tbl);
- np = &n->next;
- }
- return false;
-}
-
static int neigh_forced_gc(struct neigh_table *tbl)
{
int max_clean = atomic_read(&tbl->gc_entries) -
@@ -277,7 +248,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
remove = true;
write_unlock(&n->lock);
- if (remove && neigh_remove_one(n, tbl))
+ if (remove && neigh_remove_one(n))
shrunk++;
if (shrunk >= max_clean)
break;
@@ -388,22 +359,15 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour __rcu **np = &nht->hash_buckets[i];
struct hlist_node *tmp;
struct neighbour *n;
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
- if (dev && n->dev != dev) {
- np = &n->next;
+ if (dev && n->dev != dev)
continue;
- }
- if (skip_perm && n->nud_state & NUD_PERMANENT) {
- np = &n->next;
+ if (skip_perm && n->nud_state & NUD_PERMANENT)
continue;
- }
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
+
hlist_del_rcu(&n->hash);
write_lock(&n->lock);
neigh_del_timer(n);
@@ -532,46 +496,26 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
- size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head);
- size_t size = (1 << shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets;
+ size_t size = (1 << shift) * sizeof(struct hlist_head);
struct hlist_head *hash_heads;
struct neigh_hash_table *ret;
int i;
- hash_heads = NULL;
-
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
if (size <= PAGE_SIZE) {
- buckets = kzalloc(size, GFP_ATOMIC);
-
- if (buckets) {
- hash_heads = kzalloc(hash_heads_size, GFP_ATOMIC);
- if (!hash_heads)
- kfree(buckets);
- }
+ hash_heads = kzalloc(size, GFP_ATOMIC);
} else {
- buckets = (struct neighbour __rcu **)
+ hash_heads = (struct hlist_head *)
__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
get_order(size));
- kmemleak_alloc(buckets, size, 1, GFP_ATOMIC);
-
- if (buckets) {
- hash_heads = (struct hlist_head *)
- __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
- get_order(hash_heads_size));
- kmemleak_alloc(hash_heads, hash_heads_size, 1, GFP_ATOMIC);
- if (!hash_heads)
- free_pages((unsigned long)buckets, get_order(size));
- }
+ kmemleak_alloc(hash_heads, size, 1, GFP_ATOMIC);
}
- if (!buckets || !hash_heads) {
+ if (!hash_heads) {
kfree(ret);
return NULL;
}
- ret->hash_buckets = buckets;
ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
@@ -584,23 +528,14 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t hash_heads_size = (1 << nht->hash_shift) * sizeof(struct hlist_head);
- size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets = nht->hash_buckets;
+ size_t size = (1 << nht->hash_shift) * sizeof(struct hlist_head);
struct hlist_head *hash_heads = nht->hash_heads;
- if (size <= PAGE_SIZE) {
- kfree(buckets);
- } else {
- kmemleak_free(buckets);
- free_pages((unsigned long)buckets, get_order(size));
- }
-
- if (hash_heads_size < PAGE_SIZE) {
+ if (size < PAGE_SIZE) {
kfree(hash_heads);
} else {
kmemleak_free(hash_heads);
- free_pages((unsigned long)hash_heads, get_order(hash_heads_size));
+ free_pages((unsigned long)hash_heads, get_order(size));
}
kfree(nht);
}
@@ -629,11 +564,6 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
hash >>= (32 - new_nht->hash_shift);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(
- new_nht->hash_buckets[hash],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(new_nht->hash_buckets[hash], n);
hlist_del_rcu(&n->hash);
hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
@@ -738,10 +668,6 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(nht->hash_buckets[hash_val], n);
hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
@@ -975,7 +901,6 @@ static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neigh_hash_table *nht;
- struct neighbour __rcu **np;
struct hlist_node *tmp;
struct neighbour *n;
unsigned int i;
@@ -1003,7 +928,6 @@ static void neigh_periodic_work(struct work_struct *work)
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
- np = &nht->hash_buckets[i];
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
@@ -1014,7 +938,7 @@ static void neigh_periodic_work(struct work_struct *work)
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags & NTF_EXT_LEARNED)) {
write_unlock(&n->lock);
- goto next_elt;
+ continue;
}
if (time_before(n->used, n->confirmed) &&
@@ -1025,9 +949,6 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
write_unlock(&n->lock);
@@ -1035,9 +956,6 @@ static void neigh_periodic_work(struct work_struct *work)
continue;
}
write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
@@ -1984,7 +1902,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, extack);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
+ neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
out:
@@ -3141,24 +3059,18 @@ void __neigh_for_each_release(struct neigh_table *tbl,
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
- struct neighbour __rcu **np;
struct hlist_node *tmp;
struct neighbour *n;
- np = &nht->hash_buckets[chain];
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
hlist_del_rcu(&n->hash);
neigh_mark_dead(n);
- } else
- np = &n->next;
+ }
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 11c1519b3699..cb9a7ed8abd3 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1215,7 +1215,7 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
NEIGH_UPDATE_F_ADMIN, 0);
write_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
+ neigh_remove_one(neigh);
write_unlock_bh(&tbl->lock);
}
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH net-next v7 6/6] neighbour: Create netdev->neighbour association
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
` (4 preceding siblings ...)
2024-10-22 13:43 ` [PATCH net-next v7 5/6] neighbour: Remove bare neighbour::next pointer Gilad Naaman
@ 2024-10-22 13:43 ` Gilad Naaman
2024-10-22 23:48 ` [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Stanislav Fomichev
6 siblings, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-22 13:43 UTC (permalink / raw)
To: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Gilad Naaman, Kuniyuki Iwashima
Create a mapping between a netdev and its neighbours,
allowing for much cheaper flushes.
Signed-off-by: Gilad Naaman <gnaaman@drivenets.com>
---
.../networking/net_cachelines/net_device.rst | 1 +
include/linux/netdevice.h | 7 ++
include/net/neighbour.h | 9 +-
include/net/neighbour_tables.h | 12 +++
net/core/neighbour.c | 96 +++++++++++--------
5 files changed, 80 insertions(+), 45 deletions(-)
create mode 100644 include/net/neighbour_tables.h
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index db6192b2bb50..2edb6ac1cab4 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -189,4 +189,5 @@ u64 max_pacing_offload_horizon
struct_napi_config* napi_config
unsigned_long gro_flush_timeout
u32 napi_defer_hard_irqs
+struct hlist_head neighbours[2]
=================================== =========================== =================== =================== ===================================================================================
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8feaca12655e..80bde95cc302 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
#include <net/net_trackers.h>
#include <net/net_debug.h>
#include <net/dropreason-core.h>
+#include <net/neighbour_tables.h>
struct netpoll_info;
struct device;
@@ -2034,6 +2035,9 @@ enum netdev_reg_state {
* @napi_defer_hard_irqs: If not zero, provides a counter that would
* allow to avoid NIC hard IRQ, on busy queues.
*
+ * @neighbours: List heads pointing to this device's neighbours'
+ * dev_list, one per address-family.
+ *
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@@ -2443,6 +2447,9 @@ struct net_device {
*/
struct net_shaper_hierarchy *net_shaper_hierarchy;
#endif
+
+ struct hlist_head neighbours[NEIGH_NR_TABLES];
+
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
} ____cacheline_aligned;
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 0244fbd22a1f..bb345ce8bbf8 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -29,6 +29,7 @@
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/rtnetlink.h>
+#include <net/neighbour_tables.h>
/*
* NUD stands for "neighbor unreachability detection"
@@ -136,6 +137,7 @@ struct neigh_statistics {
struct neighbour {
struct hlist_node hash;
+ struct hlist_node dev_list;
struct neigh_table *tbl;
struct neigh_parms *parms;
unsigned long confirmed;
@@ -236,13 +238,6 @@ struct neigh_table {
struct pneigh_entry **phash_buckets;
};
-enum {
- NEIGH_ARP_TABLE = 0,
- NEIGH_ND_TABLE = 1,
- NEIGH_NR_TABLES,
- NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
-};
-
static inline int neigh_parms_family(struct neigh_parms *p)
{
return p->tbl->family;
diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h
new file mode 100644
index 000000000000..bcffbe8f7601
--- /dev/null
+++ b/include/net/neighbour_tables.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_NEIGHBOUR_TABLES_H
+#define _NET_NEIGHBOUR_TABLES_H
+
+enum {
+ NEIGH_ARP_TABLE = 0,
+ NEIGH_ND_TABLE = 1,
+ NEIGH_NR_TABLES,
+ NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */
+};
+
+#endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 02bc1feab611..f4a772c71f2f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -61,6 +61,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
static const struct seq_operations neigh_stat_seq_ops;
#endif
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+ int i;
+
+ switch (family) {
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ fallthrough; /* to avoid panic by null-ptr-deref */
+ case AF_INET:
+ i = NEIGH_ARP_TABLE;
+ break;
+ case AF_INET6:
+ i = NEIGH_ND_TABLE;
+ break;
+ }
+
+ return &dev->neighbours[i];
+}
+
/*
Neighbour hash table buckets are protected with rwlock tbl->lock.
@@ -212,6 +231,7 @@ bool neigh_remove_one(struct neighbour *n)
write_lock(&n->lock);
if (refcount_read(&n->refcnt) == 1) {
hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
retval = true;
}
@@ -352,48 +372,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm)
{
- int i;
- struct neigh_hash_table *nht;
-
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
+ struct hlist_head *dev_head;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct hlist_node *tmp;
- struct neighbour *n;
+ dev_head = neigh_get_dev_table(dev, tbl->family);
- neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
- if (dev && n->dev != dev)
- continue;
- if (skip_perm && n->nud_state & NUD_PERMANENT)
- continue;
+ hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+ if (skip_perm && n->nud_state & NUD_PERMANENT)
+ continue;
- hlist_del_rcu(&n->hash);
- write_lock(&n->lock);
- neigh_del_timer(n);
- neigh_mark_dead(n);
- if (refcount_read(&n->refcnt) != 1) {
- /* The most unpleasant situation.
- We must destroy neighbour entry,
- but someone still uses it.
-
- The destroy will be delayed until
- the last user releases us, but
- we must kill timers etc. and move
- it to safe state.
- */
- __skb_queue_purge(&n->arp_queue);
- n->arp_queue_len_bytes = 0;
- WRITE_ONCE(n->output, neigh_blackhole);
- if (n->nud_state & NUD_VALID)
- n->nud_state = NUD_NOARP;
- else
- n->nud_state = NUD_NONE;
- neigh_dbg(2, "neigh %p is stray\n", n);
- }
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ write_lock(&n->lock);
+ neigh_del_timer(n);
+ neigh_mark_dead(n);
+ if (refcount_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ * We must destroy neighbour entry,
+ * but someone still uses it.
+ *
+ * The destroy will be delayed until
+ * the last user releases us, but
+ * we must kill timers etc. and move
+ * it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ WRITE_ONCE(n->output, neigh_blackhole);
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+ neigh_dbg(2, "neigh %p is stray\n", n);
}
+ write_unlock(&n->lock);
+ neigh_cleanup_and_release(n);
}
}
@@ -669,6 +683,10 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey,
if (want_ref)
neigh_hold(n);
hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+ hlist_add_head_rcu(&n->dev_list,
+ neigh_get_dev_table(dev, tbl->family));
+
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
@@ -950,6 +968,7 @@ static void neigh_periodic_work(struct work_struct *work)
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
@@ -3069,6 +3088,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
release = cb(n);
if (release) {
hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
}
write_unlock(&n->lock);
--
2.46.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance
2024-10-22 13:43 [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Gilad Naaman
` (5 preceding siblings ...)
2024-10-22 13:43 ` [PATCH net-next v7 6/6] neighbour: Create netdev->neighbour association Gilad Naaman
@ 2024-10-22 23:48 ` Stanislav Fomichev
2024-10-23 5:01 ` Gilad Naaman
6 siblings, 1 reply; 13+ messages in thread
From: Stanislav Fomichev @ 2024-10-22 23:48 UTC (permalink / raw)
To: Gilad Naaman
Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Kuniyuki Iwashima
On 10/22, Gilad Naaman wrote:
> This patchset improves the performance of neigh_flush_dev.
>
> Currently, the only way to implement it requires traversing
> all neighbours known to the kernel, across all network-namespaces.
>
> This means that some flows are slowed down as a function of neighbour-scale,
> even if the specific link they're handling has little to no neighbours.
>
> In order to solve this, this patchset adds a netdev->neighbours list,
> as well as making the original linked-list doubly-linked, so that it is
> possible to unlink neighbours without traversing the hash-bucket to
> obtain the previous neighbour.
>
> The original use-case we encountered was mass-deletion of links (12K
> VLANs) while there are 50K ARPs and 50K NDPs in the system; though the
> slowdowns would also appear when the links are set down.
>
> Changes in v7:
>
> - Fix crash due to use of poisoned hlist_node
> - Apply xmas-tree formatting
>
> Gilad Naaman (6):
> neighbour: Add hlist_node to struct neighbour
> neighbour: Define neigh_for_each_in_bucket
> neighbour: Convert seq_file functions to use hlist
> neighbour: Convert iteration to use hlist+macro
> neighbour: Remove bare neighbour::next pointer
> neighbour: Create netdev->neighbour association
>
> .../networking/net_cachelines/net_device.rst | 1 +
> include/linux/netdevice.h | 7 +
> include/net/neighbour.h | 24 +-
> include/net/neighbour_tables.h | 12 +
> net/core/neighbour.c | 337 ++++++++----------
> net/ipv4/arp.c | 2 +-
> 6 files changed, 174 insertions(+), 209 deletions(-)
> create mode 100644 include/net/neighbour_tables.h
Looks like the test is still unhappy. Can you try to run it on your side
before reposting? Or does it look good?
[ 110.442590][ C2] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x5191
[ 110.443219][ C2] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[ 110.443498][ C2] flags: 0x80000000000040(head|node=0|zone=1)
[ 110.443752][ C2] page_type: f5(slab)
[ 110.443897][ C2] raw: 0080000000000003 ffffea0000146401 ffffffffffffffff 0000000000000000
[ 110.444236][ C2] raw: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
[ 110.444546][ C2] head: 0080000000000040 ffff8880010433c0 ffffea0000256410 ffff8880010410e8
[ 110.444862][ C2] head: 0000000000000000 0000000000020002 00000001f5000000 0000000000000000
[ 110.445175][ C2] head: 0080000000000003 ffffea0000146401 ffffffffffffffff 0000000000000000
[ 110.445890][ C2] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000
[ 110.446197][ C2] page dumped because: VM_BUG_ON_PAGE(page_ref_count(page) == 0)
[ 110.446558][ C2] ------------[ cut here ]------------
[ 110.446754][ C2] kernel BUG at include/linux/mm.h:1140!
[ 110.446972][ C2] Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
[ 110.447210][ C2] CPU: 2 UID: 0 PID: 29 Comm: ksoftirqd/2 Not tainted 6.12.0-rc3-virtme #1
[ 110.447528][ C2] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[ 110.447928][ C2] RIP: 0010:__free_pages+0x1e4/0x220
[ 110.448128][ C2] Code: 0f 94 c3 e9 ba fe ff ff 48 c7 c6 a0 18 58 92 4c 89 e7 e8 df bf f4 ff 90 0f 0b 48 c7 c6 40 29 58 92 4c 89 e7 e8 cd bf f4 ff 90 <0f> 0b 48 89 ef e8 72 03 09 00 e9 c5 fe ff ff e8 98 03 09 00 e9 35
[ 110.448803][ C2] RSP: 0018:ffffc90000217cb0 EFLAGS: 00010246
[ 110.449040][ C2] RAX: 000000000000003e RBX: 0000000000000000 RCX: 1ffffffff263b43c
[ 110.449304][ C2] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000001
[ 110.449565][ C2] RBP: ffffea0000146474 R08: 0000000000000000 R09: fffffbfff263b43c
[ 110.449840][ C2] R10: 0000000000000003 R11: 205d324320202020 R12: ffffea0000146440
[ 110.450101][ C2] R13: ffffc90000217d78 R14: 0000000000000000 R15: 0000000000000008
[ 110.450399][ C2] FS: 0000000000000000(0000) GS:ffff888036100000(0000) knlGS:0000000000000000
[ 110.450787][ C2] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 110.451008][ C2] CR2: 00007f9289e72270 CR3: 000000000a73a005 CR4: 0000000000772ef0
[ 110.451274][ C2] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 110.451564][ C2] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 110.451860][ C2] PKRU: 55555554
[ 110.452015][ C2] Call Trace:
[ 110.452167][ C2] <TASK>
[ 110.452282][ C2] ? die+0x37/0x90
[ 110.452440][ C2] ? do_trap+0x1a3/0x260
[ 110.452589][ C2] ? __free_pages+0x1e4/0x220
[ 110.452786][ C2] ? do_error_trap+0xbe/0x180
[ 110.452970][ C2] ? __free_pages+0x1e4/0x220
[ 110.453152][ C2] ? __free_pages+0x1e4/0x220
[ 110.453342][ C2] ? handle_invalid_op+0x2c/0x40
[ 110.453527][ C2] ? __free_pages+0x1e4/0x220
[ 110.453709][ C2] ? exc_invalid_op+0x30/0x50
[ 110.453934][ C2] ? asm_exc_invalid_op+0x1a/0x20
[ 110.454126][ C2] ? __free_pages+0x1e4/0x220
[ 110.454321][ C2] ? rcu_do_batch+0x34d/0xf20
[ 110.454528][ C2] neigh_hash_free_rcu+0xb7/0xe0
[ 110.454728][ C2] rcu_do_batch+0x34f/0xf20
[ 110.454913][ C2] ? __pfx___lock_release+0x10/0x10
[ 110.455108][ C2] ? __pfx_rcu_do_batch+0x10/0x10
[ 110.455350][ C2] ? lockdep_hardirqs_on_prepare+0x12b/0x410
[ 110.455604][ C2] rcu_core+0x2bd/0x4f0
[ 110.455773][ C2] handle_softirqs+0x1f6/0x5c0
[ 110.455965][ C2] ? __pfx_run_ksoftirqd+0x10/0x10
[ 110.456152][ C2] run_ksoftirqd+0x33/0x60
[ 110.456342][ C2] smpboot_thread_fn+0x306/0x850
[ 110.456533][ C2] ? __pfx_smpboot_thread_fn+0x10/0x10
[ 110.456719][ C2] ? __pfx_smpboot_thread_fn+0x10/0x10
[ 110.456903][ C2] kthread+0x28a/0x350
[ 110.457041][ C2] ? __pfx_kthread+0x10/0x10
[ 110.457369][ C2] ret_from_fork+0x31/0x70
[ 110.457566][ C2] ? __pfx_kthread+0x10/0x10
[ 110.457748][ C2] ret_from_fork_asm+0x1a/0x30
[ 110.457944][ C2] </TASK>
---
pw-bot: cr
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance
2024-10-22 23:48 ` [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance Stanislav Fomichev
@ 2024-10-23 5:01 ` Gilad Naaman
2024-10-23 5:18 ` Kuniyuki Iwashima
2024-10-23 5:38 ` Gilad Naaman
0 siblings, 2 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-23 5:01 UTC (permalink / raw)
To: stfomichev; +Cc: davem, edumazet, gnaaman, kuba, kuniyu, netdev, pabeni
> Looks like the test is still unhappy. Can you try to run it on your side
> before reposting? Or does it look good?
Hey,
Apologies if I missed anything.
I ran this before posting, after applying the entire series, and found no crashes:
sudo make -C tools/testing/selftests run_tests TARGETS=net
Is there more info about this run?
Was this run on an intermediate patch in the series or all of it?
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance
2024-10-23 5:01 ` Gilad Naaman
@ 2024-10-23 5:18 ` Kuniyuki Iwashima
2024-10-23 5:38 ` Gilad Naaman
1 sibling, 0 replies; 13+ messages in thread
From: Kuniyuki Iwashima @ 2024-10-23 5:18 UTC (permalink / raw)
To: gnaaman; +Cc: davem, edumazet, kuba, kuniyu, netdev, pabeni, stfomichev
From: Gilad Naaman <gnaaman@drivenets.com>
Date: Wed, 23 Oct 2024 05:01:10 +0000
> > Looks like the test is still unhappy. Can you try to run it on your side
> > before reposting? Or does it look good?
>
> Hey,
>
> Apologies if I missed anything.
>
> I ran this before posting, after applying the entire series, and found no crashes:
>
> sudo make -C tools/testing/selftests run_tests TARGETS=net
>
> Is there more info about this run?
> Was this ran on an intermediate patch in the series or all of it?
It seems the warning requires CONFIG_DEBUG_VM.
[ 110.446754][ C2] kernel BUG at include/linux/mm.h:1140!
But I guess the issue will disappear if you rebase the series on top of
Eric's patch and avoid calling free_pages() directly ?
https://lore.kernel.org/netdev/20241022150059.1345406-1-edumazet@google.com/
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH net-next v7 0/6] neighbour: Improve neigh_flush_dev performance
2024-10-23 5:01 ` Gilad Naaman
2024-10-23 5:18 ` Kuniyuki Iwashima
@ 2024-10-23 5:38 ` Gilad Naaman
1 sibling, 0 replies; 13+ messages in thread
From: Gilad Naaman @ 2024-10-23 5:38 UTC (permalink / raw)
To: gnaaman; +Cc: davem, edumazet, kuba, kuniyu, netdev, pabeni, stfomichev
> It seems the warning requires CONFIG_DEBUG_VM.
Ah, so that's what I missed. Thank you.
> But I guess the issue will disappear if you rebase the series on top of
> Eric's patch and avoid calling free_pages() directly ?
I hope that's going to be the case, although this warning looks a bit like
I introduced a double-free somewhere,
which I guess is also possible if Eric's changes go through the same changes in my patch.
^ permalink raw reply [flat|nested] 13+ messages in thread