* [PATCH 0/6] Lockless neighbour table
@ 2006-08-28 23:07 Stephen Hemminger
2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
` (5 more replies)
0 siblings, 6 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
This is an updated version of the changes to make the neighbour (ARP)
table lockless for normal accesses. It uses RCU and seqlock in
several places. I separated the hlist conversions from RCU
changes (in case we need to bisect for some reason).
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 1/6] net neighbor: convert top level list to RCU
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
` (4 subsequent siblings)
5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: neigh-top-lock.patch --]
[-- Type: text/plain, Size: 6810 bytes --]
Change the top level list of neighbor tables to use RCU.
Minor change to BUG() if a neighbor table is registered twice.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/net/neighbour.h | 2 -
net/core/neighbour.c | 89 ++++++++++++++++++++++--------------------------
2 files changed, 42 insertions(+), 49 deletions(-)
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -61,7 +61,7 @@ static void neigh_app_notify(struct neig
static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
-static struct neigh_table *neigh_tables;
+static LIST_HEAD(neigh_tables);
#ifdef CONFIG_PROC_FS
static struct file_operations neigh_stat_seq_fops;
#endif
@@ -93,11 +93,11 @@ static struct file_operations neigh_stat
It is supposed, that dev->hard_header is simplistic and does
not make callbacks to neighbour tables.
- The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
- list of neighbour tables. This list is used only in process context,
+ The last lock is neigh_tbl_lock. It controls update to the
+ set of neighbor tables. Typically used during initialization
*/
-static DEFINE_RWLOCK(neigh_tbl_lock);
+static DEFINE_SPINLOCK(neigh_tbl_lock);
static int neigh_blackhole(struct sk_buff *skb)
{
@@ -1387,26 +1387,18 @@ void neigh_table_init(struct neigh_table
struct neigh_table *tmp;
neigh_table_init_no_netlink(tbl);
- write_lock(&neigh_tbl_lock);
- for (tmp = neigh_tables; tmp; tmp = tmp->next) {
- if (tmp->family == tbl->family)
- break;
- }
- tbl->next = neigh_tables;
- neigh_tables = tbl;
- write_unlock(&neigh_tbl_lock);
-
- if (unlikely(tmp)) {
- printk(KERN_ERR "NEIGH: Registering multiple tables for "
- "family %d\n", tbl->family);
- dump_stack();
+
+ spin_lock(&neigh_tbl_lock);
+ list_for_each_entry(tmp, &neigh_tables, list) {
+ BUG_ON(tmp->family == tbl->family);
}
+
+ list_add_rcu(&tbl->list, &neigh_tables);
+ spin_unlock(&neigh_tbl_lock);
}
int neigh_table_clear(struct neigh_table *tbl)
{
- struct neigh_table **tp;
-
/* It is not clean... Fix it to unload IPv6 module safely */
del_timer_sync(&tbl->gc_timer);
del_timer_sync(&tbl->proxy_timer);
@@ -1414,14 +1406,12 @@ int neigh_table_clear(struct neigh_table
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
printk(KERN_CRIT "neighbour leakage\n");
- write_lock(&neigh_tbl_lock);
- for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
- if (*tp == tbl) {
- *tp = tbl->next;
- break;
- }
- }
- write_unlock(&neigh_tbl_lock);
+
+ spin_lock(&neigh_tbl_lock);
+ list_del_rcu(&tbl->list);
+ spin_unlock(&neigh_tbl_lock);
+
+ synchronize_rcu();
neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
tbl->hash_buckets = NULL;
@@ -1456,13 +1446,12 @@ int neigh_delete(struct sk_buff *skb, st
}
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(tbl, &neigh_tables, list) {
struct neighbour *neigh;
if (tbl->family != ndm->ndm_family)
continue;
- read_unlock(&neigh_tbl_lock);
if (nla_len(dst_attr) < tbl->key_len)
goto out_dev_put;
@@ -1487,12 +1476,12 @@ int neigh_delete(struct sk_buff *skb, st
neigh_release(neigh);
goto out_dev_put;
}
- read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
out_dev_put:
if (dev)
dev_put(dev);
+ rcu_read_unlock();
out:
return err;
}
@@ -1525,15 +1514,14 @@ int neigh_add(struct sk_buff *skb, struc
goto out_dev_put;
}
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(tbl, &neigh_tables, list) {
int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
struct neighbour *neigh;
void *dst, *lladdr;
if (tbl->family != ndm->ndm_family)
continue;
- read_unlock(&neigh_tbl_lock);
if (nla_len(tb[NDA_DST]) < tbl->key_len)
goto out_dev_put;
@@ -1578,12 +1566,12 @@ int neigh_add(struct sk_buff *skb, struc
goto out_dev_put;
}
- read_unlock(&neigh_tbl_lock);
err = -EAFNOSUPPORT;
out_dev_put:
if (dev)
dev_put(dev);
+ rcu_read_unlock();
out:
return err;
}
@@ -1788,8 +1776,8 @@ int neightbl_set(struct sk_buff *skb, st
}
ndtmsg = nlmsg_data(nlh);
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(tbl, &neigh_tables, list) {
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
@@ -1889,26 +1877,26 @@ int neightbl_set(struct sk_buff *skb, st
errout_tbl_lock:
write_unlock_bh(&tbl->lock);
errout_locked:
- read_unlock(&neigh_tbl_lock);
+ rcu_read_unlock();
errout:
return err;
}
int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
- int family, tidx, nidx = 0;
+ int family, tidx = 0, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
- read_lock(&neigh_tbl_lock);
- for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(tbl, &neigh_tables, list) {
struct neigh_parms *p;
if (tidx < tbl_skip || (family && tbl->family != family))
- continue;
+ goto next_tbl;
if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
@@ -1928,9 +1916,11 @@ int neightbl_dump_info(struct sk_buff *s
}
neigh_skip = 0;
+next_tbl:
+ ++tidx;
}
out:
- read_unlock(&neigh_tbl_lock);
+ rcu_read_unlock();
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -2020,22 +2010,25 @@ out:
int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
struct neigh_table *tbl;
- int t, family, s_t;
+ int t = 0, family, s_t;
+
- read_lock(&neigh_tbl_lock);
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
s_t = cb->args[0];
- for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(tbl, &neigh_tables, list) {
if (t < s_t || (family && tbl->family != family))
- continue;
+ goto next;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
if (neigh_dump_table(tbl, skb, cb) < 0)
break;
+next:
+ ++t;
}
- read_unlock(&neigh_tbl_lock);
+ rcu_read_unlock();
cb->args[0] = t;
return skb->len;
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -136,7 +136,7 @@ struct pneigh_entry
struct neigh_table
{
- struct neigh_table *next;
+ struct list_head list;
int family;
int entry_size;
int key_len;
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 2/6] neighbour: convert neighbour hash table to hlist
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
` (3 subsequent siblings)
5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: hlist1.patch --]
[-- Type: text/plain, Size: 11860 bytes --]
Change the neighbour table hash list to hlist from list.h
to allow for easier later conversion to RCU.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/net/neighbour.h | 6 -
net/core/neighbour.c | 160 +++++++++++++++++++++++++-----------------------
2 files changed, 88 insertions(+), 78 deletions(-)
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -88,10 +88,10 @@ struct neigh_statistics
struct neighbour
{
- struct neighbour *next;
+ struct hlist_node hlist;
struct neigh_table *tbl;
struct neigh_parms *parms;
- struct net_device *dev;
+ struct net_device *dev;
unsigned long used;
unsigned long confirmed;
unsigned long updated;
@@ -161,7 +161,7 @@ struct neigh_table
unsigned long last_rand;
kmem_cache_t *kmem_cachep;
struct neigh_statistics *stats;
- struct neighbour **hash_buckets;
+ struct hlist_head *hash_buckets;
unsigned int hash_mask;
__u32 hash_rnd;
unsigned int hash_chain_gc;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -126,10 +126,11 @@ static int neigh_forced_gc(struct neigh_
write_lock_bh(&tbl->lock);
for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np;
+ struct neighbour *n;
+ struct hlist_node *node, *tmp;
- np = &tbl->hash_buckets[i];
- while ((n = *np) != NULL) {
+ hlist_for_each_entry_safe(n, node, tmp,
+ &tbl->hash_buckets[i], hlist) {
/* Neighbour record may be discarded if:
* - nobody refers to it.
* - it is not permanent
@@ -137,7 +138,7 @@ static int neigh_forced_gc(struct neigh_
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
- *np = n->next;
+ hlist_del(&n->hlist);
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
@@ -145,7 +146,6 @@ static int neigh_forced_gc(struct neigh_
continue;
}
write_unlock(&n->lock);
- np = &n->next;
}
}
@@ -181,14 +181,15 @@ static void neigh_flush_dev(struct neigh
int i;
for (i = 0; i <= tbl->hash_mask; i++) {
- struct neighbour *n, **np = &tbl->hash_buckets[i];
+ struct hlist_node *node, *tmp;
+ struct neighbour *n;
- while ((n = *np) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
+ hlist_for_each_entry_safe(n, node, tmp,
+ &tbl->hash_buckets[i], hlist) {
+ if (dev && n->dev != dev)
continue;
- }
- *np = n->next;
+
+ hlist_del(&n->hlist);
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
@@ -279,23 +280,20 @@ out_entries:
goto out;
}
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static struct hlist_head *neigh_hash_alloc(unsigned int entries)
{
- unsigned long size = entries * sizeof(struct neighbour *);
- struct neighbour **ret;
+ unsigned long size = entries * sizeof(struct hlist_head);
- if (size <= PAGE_SIZE) {
- ret = kzalloc(size, GFP_ATOMIC);
- } else {
- ret = (struct neighbour **)
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, GFP_ATOMIC);
+ else
+ return (struct hlist_head *)
__get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
- }
- return ret;
}
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free(struct hlist_head *hash, unsigned int entries)
{
- unsigned long size = entries * sizeof(struct neighbour *);
+ unsigned long size = entries * sizeof(struct hlist_head);
if (size <= PAGE_SIZE)
kfree(hash);
@@ -305,7 +303,7 @@ static void neigh_hash_free(struct neigh
static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
{
- struct neighbour **new_hash, **old_hash;
+ struct hlist_head *new_hash, *old_hash;
unsigned int i, new_hash_mask, old_entries;
NEIGH_CACHE_STAT_INC(tbl, hash_grows);
@@ -321,16 +319,15 @@ static void neigh_hash_grow(struct neigh
get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
for (i = 0; i < old_entries; i++) {
- struct neighbour *n, *next;
+ struct neighbour *n;
+ struct hlist_node *node, *tmp;
- for (n = old_hash[i]; n; n = next) {
+ hlist_for_each_entry_safe(n, node, tmp, &old_hash[i], hlist) {
unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
hash_val &= new_hash_mask;
- next = n->next;
-
- n->next = new_hash[hash_val];
- new_hash[hash_val] = n;
+ hlist_del(&n->hlist);
+ hlist_add_head(&n->hlist, &new_hash[hash_val]);
}
}
tbl->hash_buckets = new_hash;
@@ -343,19 +340,22 @@ struct neighbour *neigh_lookup(struct ne
struct net_device *dev)
{
struct neighbour *n;
+ struct hlist_node *tmp;
int key_len = tbl->key_len;
u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
NEIGH_CACHE_STAT_INC(tbl, lookups);
read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+ hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
+ goto found;
}
}
+ n = NULL;
+found:
read_unlock_bh(&tbl->lock);
return n;
}
@@ -363,19 +363,22 @@ struct neighbour *neigh_lookup(struct ne
struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
{
struct neighbour *n;
+ struct hlist_node *tmp;
int key_len = tbl->key_len;
u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask;
NEIGH_CACHE_STAT_INC(tbl, lookups);
read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+ hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (!memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
+ goto found;
}
}
+ n = NULL;
+found:
read_unlock_bh(&tbl->lock);
return n;
}
@@ -387,6 +390,7 @@ struct neighbour *neigh_create(struct ne
int key_len = tbl->key_len;
int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct hlist_node *tmp;
if (!n) {
rc = ERR_PTR(-ENOBUFS);
@@ -424,7 +428,7 @@ struct neighbour *neigh_create(struct ne
goto out_tbl_unlock;
}
- for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+ hlist_for_each_entry(n1, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
neigh_hold(n1);
rc = n1;
@@ -432,8 +436,7 @@ struct neighbour *neigh_create(struct ne
}
}
- n->next = tbl->hash_buckets[hash_val];
- tbl->hash_buckets[hash_val] = n;
+ hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
n->dead = 0;
neigh_hold(n);
write_unlock_bh(&tbl->lock);
@@ -635,7 +638,9 @@ static void neigh_connect(struct neighbo
static void neigh_periodic_timer(unsigned long arg)
{
struct neigh_table *tbl = (struct neigh_table *)arg;
- struct neighbour *n, **np;
+ struct neighbour *n;
+ struct hlist_head *head;
+ struct hlist_node *node, *tmp;
unsigned long expire, now = jiffies;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -654,19 +659,17 @@ static void neigh_periodic_timer(unsigne
neigh_rand_reach_time(p->base_reachable_time);
}
- np = &tbl->hash_buckets[tbl->hash_chain_gc];
+ head = &tbl->hash_buckets[tbl->hash_chain_gc];
tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
- while ((n = *np) != NULL) {
+ hlist_for_each_entry_safe(n, node, tmp, head, hlist) {
unsigned int state;
write_lock(&n->lock);
state = n->nud_state;
- if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
- write_unlock(&n->lock);
+ if (state & (NUD_PERMANENT | NUD_IN_TIMER))
goto next_elt;
- }
if (time_before(n->used, n->confirmed))
n->used = n->confirmed;
@@ -674,16 +677,14 @@ static void neigh_periodic_timer(unsigne
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(now, n->used + n->parms->gc_staletime))) {
- *np = n->next;
+ hlist_del(&n->hlist);
n->dead = 1;
write_unlock(&n->lock);
neigh_release(n);
continue;
}
+ next_elt:
write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
}
/* Cycle through all hash buckets every base_reachable_time/2 ticks.
@@ -1976,27 +1977,29 @@ nla_put_failure:
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct neighbour *n;
int rc, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
for (h = 0; h <= tbl->hash_mask; h++) {
+ struct neighbour *n;
+ struct hlist_node *tmp;
+
if (h < s_h)
continue;
if (h > s_h)
s_idx = 0;
read_lock_bh(&tbl->lock);
- for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) {
- if (idx < s_idx)
- continue;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+ idx = 0;
+ hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+ if (idx >= s_idx &&
+ neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI) <= 0) {
+ RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
read_unlock_bh(&tbl->lock);
rc = -1;
goto out;
}
+ ++idx;
}
read_unlock_bh(&tbl->lock);
}
@@ -2040,10 +2043,10 @@ void neigh_for_each(struct neigh_table *
read_lock_bh(&tbl->lock);
for (chain = 0; chain <= tbl->hash_mask; chain++) {
- struct neighbour *n;
+ struct hlist_node *p;
- for (n = tbl->hash_buckets[chain]; n; n = n->next)
- cb(n, cookie);
+ hlist_for_each(p, &tbl->hash_buckets[chain])
+ cb(hlist_entry(p, struct neighbour, hlist), cookie);
}
read_unlock_bh(&tbl->lock);
}
@@ -2056,19 +2059,19 @@ void __neigh_for_each_release(struct nei
int chain;
for (chain = 0; chain <= tbl->hash_mask; chain++) {
- struct neighbour *n, **np;
+ struct neighbour *n;
+ struct hlist_node *tmp, *next;
- np = &tbl->hash_buckets[chain];
- while ((n = *np) != NULL) {
+ hlist_for_each_entry_safe(n, tmp, next,
+ &tbl->hash_buckets[chain], hlist) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- *np = n->next;
+ hlist_del(&n->hlist);
n->dead = 1;
- } else
- np = &n->next;
+ }
write_unlock(&n->lock);
if (release)
neigh_release(n);
@@ -2088,33 +2091,39 @@ static struct neighbour *neigh_get_first
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
- n = tbl->hash_buckets[bucket];
+ struct hlist_node *tmp;
- while (n) {
+ hlist_for_each_entry(n, tmp, &tbl->hash_buckets[bucket], hlist) {
if (state->neigh_sub_iter) {
loff_t fakep = 0;
void *v;
v = state->neigh_sub_iter(state, n, &fakep);
if (!v)
- goto next;
+ continue;
}
+
if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
+ goto found;
if (n->nud_state & ~NUD_NOARP)
- break;
- next:
- n = n->next;
+ goto found;
}
-
- if (n)
- break;
+ n = NULL;
}
+found:
state->bucket = bucket;
return n;
}
+static struct neighbour *next_neigh(struct hlist_node *node)
+{
+ if (node)
+ return hlist_entry(node, struct neighbour, hlist);
+ else
+ return NULL;
+}
+
static struct neighbour *neigh_get_next(struct seq_file *seq,
struct neighbour *n,
loff_t *pos)
@@ -2127,7 +2136,8 @@ static struct neighbour *neigh_get_next(
if (v)
return n;
}
- n = n->next;
+
+ n = next_neigh(n->hlist.next);
while (1) {
while (n) {
@@ -2143,7 +2153,7 @@ static struct neighbour *neigh_get_next(
if (n->nud_state & ~NUD_NOARP)
break;
next:
- n = n->next;
+ n = next_neigh(n->hlist.next);
}
if (n)
@@ -2152,7 +2162,7 @@ static struct neighbour *neigh_get_next(
if (++state->bucket > tbl->hash_mask)
break;
- n = tbl->hash_buckets[state->bucket];
+ n = next_neigh(tbl->hash_buckets[state->bucket].first);
}
if (n && pos)
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 3/6] neighbour: convert pneigh hash table to hlist
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
` (2 subsequent siblings)
5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: hlist2.patch --]
[-- Type: text/plain, Size: 5376 bytes --]
Change the pneigh_entry table to hlist from list.h
to allow for easier later conversion to RCU.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/net/neighbour.h | 6 ++--
net/core/neighbour.c | 58 ++++++++++++++++++++++++------------------------
2 files changed, 33 insertions(+), 31 deletions(-)
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -124,8 +124,8 @@ struct neigh_ops
struct pneigh_entry
{
- struct pneigh_entry *next;
- struct net_device *dev;
+ struct hlist_node hlist;
+ struct net_device *dev;
u8 key[0];
};
@@ -165,7 +165,7 @@ struct neigh_table
unsigned int hash_mask;
__u32 hash_rnd;
unsigned int hash_chain_gc;
- struct pneigh_entry **phash_buckets;
+ struct hlist_head *phash_buckets;
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *pde;
#endif
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -455,6 +455,7 @@ struct pneigh_entry * pneigh_lookup(stru
struct net_device *dev, int creat)
{
struct pneigh_entry *n;
+ struct hlist_node *tmp;
int key_len = tbl->key_len;
u32 hash_val = *(u32 *)(pkey + key_len - 4);
@@ -465,7 +466,7 @@ struct pneigh_entry * pneigh_lookup(stru
read_lock_bh(&tbl->lock);
- for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
+ hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) &&
(n->dev == dev || !n->dev)) {
read_unlock_bh(&tbl->lock);
@@ -495,8 +496,7 @@ struct pneigh_entry * pneigh_lookup(stru
}
write_lock_bh(&tbl->lock);
- n->next = tbl->phash_buckets[hash_val];
- tbl->phash_buckets[hash_val] = n;
+ hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
write_unlock_bh(&tbl->lock);
out:
return n;
@@ -506,7 +506,8 @@ out:
int pneigh_delete(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
{
- struct pneigh_entry *n, **np;
+ struct pneigh_entry *n;
+ struct hlist_node *tmp;
int key_len = tbl->key_len;
u32 hash_val = *(u32 *)(pkey + key_len - 4);
@@ -516,10 +517,9 @@ int pneigh_delete(struct neigh_table *tb
hash_val &= PNEIGH_HASHMASK;
write_lock_bh(&tbl->lock);
- for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
- np = &n->next) {
+ hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
- *np = n->next;
+ hlist_del(&n->hlist);
write_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
@@ -535,22 +535,21 @@ int pneigh_delete(struct neigh_table *tb
static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
- struct pneigh_entry *n, **np;
u32 h;
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
- np = &tbl->phash_buckets[h];
- while ((n = *np) != NULL) {
+ struct pneigh_entry *n;
+ struct hlist_node *tmp, *nxt;
+
+ hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
if (!dev || n->dev == dev) {
- *np = n->next;
+ hlist_del(&n->hlist);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
kfree(n);
- continue;
}
- np = &n->next;
}
}
return -ENOENT;
@@ -1332,7 +1331,6 @@ void neigh_parms_destroy(struct neigh_pa
void neigh_table_init_no_netlink(struct neigh_table *tbl)
{
unsigned long now = jiffies;
- unsigned long phsize;
atomic_set(&tbl->parms.refcnt, 1);
INIT_RCU_HEAD(&tbl->parms.rcu_head);
@@ -1359,8 +1357,8 @@ void neigh_table_init_no_netlink(struct
tbl->hash_mask = 1;
tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
- phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
- tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+ tbl->phash_buckets = kcalloc(PNEIGH_HASHMASK + 1, sizeof(struct hlist_head),
+ GFP_KERNEL);
if (!tbl->hash_buckets || !tbl->phash_buckets)
panic("cannot allocate neighbour cache hashes");
@@ -2188,18 +2186,18 @@ static struct pneigh_entry *pneigh_get_f
{
struct neigh_seq_state *state = seq->private;
struct neigh_table *tbl = state->tbl;
- struct pneigh_entry *pn = NULL;
+ struct hlist_node *pn = NULL;
int bucket = state->bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
- pn = tbl->phash_buckets[bucket];
+ pn = tbl->phash_buckets[bucket].first;
if (pn)
break;
}
state->bucket = bucket;
- return pn;
+ return pn ? hlist_entry(pn, struct pneigh_entry, hlist) : NULL;
}
static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
@@ -2208,20 +2206,24 @@ static struct pneigh_entry *pneigh_get_n
{
struct neigh_seq_state *state = seq->private;
struct neigh_table *tbl = state->tbl;
+ struct hlist_node *tmp = &pn->hlist;
- pn = pn->next;
- while (!pn) {
- if (++state->bucket > PNEIGH_HASHMASK)
- break;
- pn = tbl->phash_buckets[state->bucket];
- if (pn)
- break;
+ tmp = tmp->next;
+ if (tmp)
+ goto found;
+
+ while (++state->bucket < PNEIGH_HASHMASK) {
+ tmp = tbl->phash_buckets[state->bucket].first;
+ if (tmp)
+ goto found;
}
+ return NULL;
- if (pn && pos)
+found:
+ if (pos)
--(*pos);
- return pn;
+ return hlist_entry(tmp, struct pneigh_entry, hlist);
}
static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 4/6] net neighbour: convert to RCU
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
` (2 preceding siblings ...)
2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
2006-08-29 15:28 ` Alexey Kuznetsov
2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: rcu-neigh.patch --]
[-- Type: text/plain, Size: 15275 bytes --]
Use RCU to allow for lock less access to the neighbour table.
This should speedup the send path because no atomic operations
will be needed to lookup ARP entries, etc.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/net/neighbour.h | 4 -
net/core/neighbour.c | 158 +++++++++++++++++++++++++-----------------------
2 files changed, 87 insertions(+), 75 deletions(-)
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -108,6 +108,7 @@ struct neighbour
struct sk_buff_head arp_queue;
struct timer_list timer;
struct neigh_ops *ops;
+ struct rcu_head rcu;
u8 primary_key[0];
};
@@ -126,6 +127,7 @@ struct pneigh_entry
{
struct hlist_node hlist;
struct net_device *dev;
+ struct rcu_head rcu;
u8 key[0];
};
@@ -157,7 +159,7 @@ struct neigh_table
struct timer_list proxy_timer;
struct sk_buff_head proxy_queue;
atomic_t entries;
- rwlock_t lock;
+ spinlock_t lock;
unsigned long last_rand;
kmem_cache_t *kmem_cachep;
struct neigh_statistics *stats;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -67,9 +67,10 @@ static struct file_operations neigh_stat
#endif
/*
- Neighbour hash table buckets are protected with rwlock tbl->lock.
+ Neighbour hash table buckets are protected with lock tbl->lock.
- - All the scans/updates to hash buckets MUST be made under this lock.
+ - All the scans of hash buckes must be made with RCU read lock (nopreempt)
+ - updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
to protocol backends, no attempts to send something to network.
It will result in deadlocks, if backend/driver wants to use neighbour
@@ -117,6 +118,13 @@ unsigned long neigh_rand_reach_time(unsi
}
+static void neigh_rcu_release(struct rcu_head *head)
+{
+ struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+ neigh_release(neigh);
+}
+
static int neigh_forced_gc(struct neigh_table *tbl)
{
int shrunk = 0;
@@ -124,7 +132,7 @@ static int neigh_forced_gc(struct neigh_
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
for (i = 0; i <= tbl->hash_mask; i++) {
struct neighbour *n;
struct hlist_node *node, *tmp;
@@ -138,11 +146,11 @@ static int neigh_forced_gc(struct neigh_
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
- neigh_release(n);
+ call_rcu(&n->rcu, neigh_rcu_release);
continue;
}
write_unlock(&n->lock);
@@ -151,7 +159,7 @@ static int neigh_forced_gc(struct neigh_
tbl->last_flush = jiffies;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
return shrunk;
}
@@ -189,7 +197,7 @@ static void neigh_flush_dev(struct neigh
if (dev && n->dev != dev)
continue;
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
write_lock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
@@ -220,17 +228,17 @@ static void neigh_flush_dev(struct neigh
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
pneigh_ifdown(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
@@ -326,8 +334,8 @@ static void neigh_hash_grow(struct neigh
unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
hash_val &= new_hash_mask;
- hlist_del(&n->hlist);
- hlist_add_head(&n->hlist, &new_hash[hash_val]);
+ __hlist_del(&n->hlist);
+ hlist_add_head_rcu(&n->hlist, &new_hash[hash_val]);
}
}
tbl->hash_buckets = new_hash;
@@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
@@ -356,7 +364,7 @@ struct neighbour *neigh_lookup(struct ne
}
n = NULL;
found:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return n;
}
@@ -369,8 +377,8 @@ struct neighbour *neigh_lookup_nodev(str
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (!memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
@@ -379,7 +387,7 @@ struct neighbour *neigh_lookup_nodev(str
}
n = NULL;
found:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return n;
}
@@ -416,7 +424,7 @@ struct neighbour *neigh_create(struct ne
n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
@@ -436,21 +444,22 @@ struct neighbour *neigh_create(struct ne
}
}
- hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
n->dead = 0;
neigh_hold(n);
- write_unlock_bh(&tbl->lock);
+ hlist_add_head_rcu(&n->hlist, &tbl->hash_buckets[hash_val]);
+ spin_unlock_bh(&tbl->lock);
NEIGH_PRINTK2("neigh %p is created.\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n);
goto out;
}
+/* Assumes rcu_read_lock is held */
struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, int creat)
{
@@ -464,16 +473,14 @@ struct pneigh_entry * pneigh_lookup(stru
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
- read_lock_bh(&tbl->lock);
-
- hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
+ hlist_for_each_entry_rcu(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) &&
(n->dev == dev || !n->dev)) {
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
goto out;
}
}
- read_unlock_bh(&tbl->lock);
+
n = NULL;
if (!creat)
goto out;
@@ -495,13 +502,18 @@ struct pneigh_entry * pneigh_lookup(stru
goto out;
}
- write_lock_bh(&tbl->lock);
- hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ hlist_add_head_rcu(&n->hlist, &tbl->phash_buckets[hash_val]);
+ spin_unlock_bh(&tbl->lock);
out:
return n;
}
+static void pneigh_destroy(struct rcu_head *head)
+{
+ struct pneigh_entry *n = container_of(head, struct pneigh_entry, rcu);
+ kfree(n);
+}
int pneigh_delete(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
@@ -516,20 +528,20 @@ int pneigh_delete(struct neigh_table *tb
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
- hlist_del(&n->hlist);
- write_unlock_bh(&tbl->lock);
+ hlist_del_rcu(&n->hlist);
+ spin_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
- kfree(n);
+ call_rcu(&n->rcu, pneigh_destroy);
return 0;
}
}
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
return -ENOENT;
}
@@ -543,7 +555,7 @@ static int pneigh_ifdown(struct neigh_ta
hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
if (!dev || n->dev == dev) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
@@ -644,7 +656,7 @@ static void neigh_periodic_timer(unsigne
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock(&tbl->lock);
+ spin_lock(&tbl->lock);
/*
* periodically recompute ReachableTime from random function
@@ -676,7 +688,7 @@ static void neigh_periodic_timer(unsigne
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(now, n->used + n->parms->gc_staletime))) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
write_unlock(&n->lock);
neigh_release(n);
@@ -697,7 +709,7 @@ static void neigh_periodic_timer(unsigne
mod_timer(&tbl->gc_timer, now + expire);
- write_unlock(&tbl->lock);
+ spin_unlock(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1285,10 +1297,10 @@ struct neigh_parms *neigh_parms_alloc(st
p->dev = dev;
}
p->sysctl_table = NULL;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
p->next = tbl->parms.next;
tbl->parms.next = p;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
return p;
}
@@ -1307,19 +1319,19 @@ void neigh_parms_release(struct neigh_ta
if (!parms || parms == &tbl->parms)
return;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
for (p = &tbl->parms.next; *p; p = &(*p)->next) {
if (*p == parms) {
*p = parms->next;
parms->dead = 1;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
if (parms->dev)
dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
return;
}
}
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
NEIGH_PRINTK1("neigh_parms_release: not found\n");
}
@@ -1365,7 +1377,7 @@ void neigh_table_init_no_netlink(struct
get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
- rwlock_init(&tbl->lock);
+ spin_lock_init(&tbl->lock);
init_timer(&tbl->gc_timer);
tbl->gc_timer.data = (unsigned long)tbl;
tbl->gc_timer.function = neigh_periodic_timer;
@@ -1620,7 +1632,7 @@ static int neightbl_fill_info(struct sk_
ndtmsg = nlmsg_data(nlh);
- read_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1680,11 +1692,11 @@ static int neightbl_fill_info(struct sk_
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_end(skb, nlh);
nla_put_failure:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_cancel(skb, nlh);
}
@@ -1703,7 +1715,7 @@ static int neightbl_fill_param_info(stru
ndtmsg = nlmsg_data(nlh);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock(); /* this maybe unnecessary */
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1712,10 +1724,10 @@ static int neightbl_fill_param_info(stru
neightbl_fill_parms(skb, parms) < 0)
goto errout;
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_end(skb, nlh);
errout:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_cancel(skb, nlh);
}
@@ -1793,7 +1805,7 @@ int neightbl_set(struct sk_buff *skb, st
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
@@ -1874,7 +1886,7 @@ int neightbl_set(struct sk_buff *skb, st
err = 0;
errout_tbl_lock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
errout_locked:
rcu_read_unlock();
errout:
@@ -1890,7 +1902,7 @@ int neightbl_dump_info(struct sk_buff *s
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
- rcu_read_lock();
+ rcu_read_lock_bh();
list_for_each_entry_rcu(tbl, &neigh_tables, list) {
struct neigh_parms *p;
@@ -1986,20 +1998,20 @@ static int neigh_dump_table(struct neigh
continue;
if (h > s_h)
s_idx = 0;
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
idx = 0;
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[h], hlist) {
if (idx >= s_idx &&
neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
rc = -1;
goto out;
}
++idx;
}
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
rc = skb->len;
out:
@@ -2039,14 +2051,15 @@ void neigh_for_each(struct neigh_table *
{
int chain;
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ struct neighbour *n;
struct hlist_node *p;
- hlist_for_each(p, &tbl->hash_buckets[chain])
- cb(hlist_entry(p, struct neighbour, hlist), cookie);
+ hlist_for_each_entry_rcu(n, p, &tbl->hash_buckets[chain], hlist)
+ cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -2067,12 +2080,12 @@ void __neigh_for_each_release(struct nei
write_lock(&n->lock);
release = cb(n);
if (release) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
}
write_unlock(&n->lock);
if (release)
- neigh_release(n);
+ call_rcu(&n->rcu, neigh_rcu_release);
}
}
}
@@ -2116,7 +2129,7 @@ found:
static struct neighbour *next_neigh(struct hlist_node *node)
{
- if (node)
+ if (rcu_dereference(node))
return hlist_entry(node, struct neighbour, hlist);
else
return NULL;
@@ -2191,7 +2204,7 @@ static struct pneigh_entry *pneigh_get_f
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
- pn = tbl->phash_buckets[bucket].first;
+ pn = rcu_dereference(tbl->phash_buckets[bucket].first);
if (pn)
break;
}
@@ -2208,12 +2221,12 @@ static struct pneigh_entry *pneigh_get_n
struct neigh_table *tbl = state->tbl;
struct hlist_node *tmp = &pn->hlist;
- tmp = tmp->next;
+ tmp = rcu_dereference(tmp->next);
if (tmp)
goto found;
while (++state->bucket < PNEIGH_HASHMASK) {
- tmp = tbl->phash_buckets[state->bucket].first;
+ tmp = rcu_dereference(tbl->phash_buckets[state->bucket].first);
if (tmp)
goto found;
}
@@ -2261,7 +2274,7 @@ void *neigh_seq_start(struct seq_file *s
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
pos_minus_one = *pos - 1;
return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
@@ -2297,10 +2310,7 @@ EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
{
- struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
-
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_seq_stop);
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 5/6] neighbour: convert lookup to sequence lock
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
` (3 preceding siblings ...)
2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger
5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: neighbour-seqlock.patch --]
[-- Type: text/plain, Size: 17283 bytes --]
The reading of neighbour table entries can be converted from a slow
reader/writer lock to a fast lockless sequence number check.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/net/neighbour.h | 2
net/core/neighbour.c | 117 +++++++++++++++++++++++++++++-------------------
net/ipv4/arp.c | 101 +++++++++++++++++++++++++----------------
net/ipv6/ndisc.c | 16 +++---
net/ipv6/route.c | 12 ++--
net/sched/sch_teql.c | 11 +++-
6 files changed, 155 insertions(+), 104 deletions(-)
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -100,7 +100,7 @@ struct neighbour
__u8 type;
__u8 dead;
atomic_t probes;
- rwlock_t lock;
+ seqlock_t lock;
unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
struct hh_cache *hh;
atomic_t refcnt;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -143,17 +143,17 @@ static int neigh_forced_gc(struct neigh_
* - nobody refers to it.
* - it is not permanent
*/
- write_lock(&n->lock);
+ write_seqlock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
hlist_del_rcu(&n->hlist);
n->dead = 1;
shrunk = 1;
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
call_rcu(&n->rcu, neigh_rcu_release);
continue;
}
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
}
}
@@ -198,7 +198,7 @@ static void neigh_flush_dev(struct neigh
continue;
hlist_del_rcu(&n->hlist);
- write_lock(&n->lock);
+ write_seqlock(&n->lock);
neigh_del_timer(n);
n->dead = 1;
@@ -220,7 +220,7 @@ static void neigh_flush_dev(struct neigh
n->nud_state = NUD_NONE;
NEIGH_PRINTK2("neigh %p is stray.\n", n);
}
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
neigh_release(n);
}
}
@@ -267,7 +267,7 @@ static struct neighbour *neigh_alloc(str
memset(n, 0, tbl->entry_size);
skb_queue_head_init(&n->arp_queue);
- rwlock_init(&n->lock);
+ seqlock_init(&n->lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
@@ -615,7 +615,7 @@ void neigh_destroy(struct neighbour *nei
/* Neighbour state is suspicious;
disable fast path.
- Called with write_locked neigh.
+ Called with locked neigh.
*/
static void neigh_suspect(struct neighbour *neigh)
{
@@ -632,7 +632,7 @@ static void neigh_suspect(struct neighbo
/* Neighbour state is OK;
enable fast path.
- Called with write_locked neigh.
+ Called with locked neigh.
*/
static void neigh_connect(struct neighbour *neigh)
{
@@ -676,7 +676,7 @@ static void neigh_periodic_timer(unsigne
hlist_for_each_entry_safe(n, node, tmp, head, hlist) {
unsigned int state;
- write_lock(&n->lock);
+ write_seqlock(&n->lock);
state = n->nud_state;
if (state & (NUD_PERMANENT | NUD_IN_TIMER))
@@ -690,12 +690,12 @@ static void neigh_periodic_timer(unsigne
time_after(now, n->used + n->parms->gc_staletime))) {
hlist_del_rcu(&n->hlist);
n->dead = 1;
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
neigh_release(n);
continue;
}
next_elt:
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
}
/* Cycle through all hash buckets every base_reachable_time/2 ticks.
@@ -738,7 +738,7 @@ static void neigh_timer_handler(unsigned
unsigned state;
int notify = 0;
- write_lock(&neigh->lock);
+ write_seqlock(&neigh->lock);
state = neigh->nud_state;
now = jiffies;
@@ -748,6 +748,7 @@ static void neigh_timer_handler(unsigned
#ifndef CONFIG_SMP
printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
#endif
+ write_sequnlock(&neigh->lock);
goto out;
}
@@ -808,9 +809,9 @@ static void neigh_timer_handler(unsigned
*/
while (neigh->nud_state == NUD_FAILED &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- write_unlock(&neigh->lock);
+ write_sequnlock(&neigh->lock);
neigh->ops->error_report(neigh, skb);
- write_lock(&neigh->lock);
+ write_sequnlock(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
}
@@ -821,20 +822,22 @@ static void neigh_timer_handler(unsigned
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
+
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
struct sk_buff *skb = skb_peek(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
skb_get(skb);
- write_unlock(&neigh->lock);
+ write_sequnlock(&neigh->lock);
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
if (skb)
kfree_skb(skb);
} else {
-out:
- write_unlock(&neigh->lock);
+ write_sequnlock(&neigh->lock);
}
+
+out:
if (notify)
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -850,11 +853,11 @@ int __neigh_event_send(struct neighbour
int rc;
unsigned long now;
- write_lock_bh(&neigh->lock);
+ write_seqlock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
- goto out_unlock_bh;
+ goto out;
now = jiffies;
@@ -868,7 +871,7 @@ int __neigh_event_send(struct neighbour
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
- write_unlock_bh(&neigh->lock);
+ write_sequnlock_bh(&neigh->lock);
if (skb)
kfree_skb(skb);
@@ -896,8 +899,8 @@ int __neigh_event_send(struct neighbour
}
rc = 1;
}
-out_unlock_bh:
- write_unlock_bh(&neigh->lock);
+out:
+ write_sequnlock_bh(&neigh->lock);
return rc;
}
@@ -948,7 +951,7 @@ int neigh_update(struct neighbour *neigh
struct net_device *dev;
int update_isrouter = 0;
- write_lock_bh(&neigh->lock);
+ write_seqlock_bh(&neigh->lock);
dev = neigh->dev;
old = neigh->nud_state;
@@ -1052,22 +1055,23 @@ int neigh_update(struct neighbour *neigh
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
struct neighbour *n1 = neigh;
- write_unlock_bh(&neigh->lock);
+ write_sequnlock_bh(&neigh->lock);
/* On shaper/eql skb->dst->neighbour != neigh :( */
if (skb->dst && skb->dst->neighbour)
n1 = skb->dst->neighbour;
n1->output(skb);
- write_lock_bh(&neigh->lock);
+ write_seqlock_bh(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
}
-out:
+
if (update_isrouter) {
neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
(neigh->flags | NTF_ROUTER) :
(neigh->flags & ~NTF_ROUTER);
}
- write_unlock_bh(&neigh->lock);
+out:
+ write_sequnlock_bh(&neigh->lock);
if (notify)
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -1144,6 +1148,30 @@ int neigh_compat_output(struct sk_buff *
return dev_queue_xmit(skb);
}
+static int neigh_hard_header(struct sk_buff *skb, struct net_device *dev,
+ const struct neighbour *neigh)
+{
+ int rc;
+
+ unsigned seq;
+
+ for(;;) {
+ seq = read_seqbegin(&neigh->lock);
+ rc = dev->hard_header(skb, dev, ntohs(skb->protocol),
+ neigh->ha, NULL, skb->len);
+
+ if (likely(!read_seqretry(&neigh->lock, seq)))
+ break;
+
+ if (rc < 0)
+ break;
+
+ __skb_pull(skb, rc);
+ }
+
+ return rc;
+}
+
/* Slow and careful. */
int neigh_resolve_output(struct sk_buff *skb)
@@ -1160,19 +1188,17 @@ int neigh_resolve_output(struct sk_buff
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
+
if (dev->hard_header_cache && !dst->hh) {
- write_lock_bh(&neigh->lock);
+ write_seqlock_bh(&neigh->lock);
if (!dst->hh)
neigh_hh_init(neigh, dst, dst->ops->protocol);
err = dev->hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
- write_unlock_bh(&neigh->lock);
- } else {
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
- }
+ write_sequnlock_bh(&neigh->lock);
+ } else
+ err = neigh_hard_header(skb, dev, neigh);
+
if (err >= 0)
rc = neigh->ops->queue_xmit(skb);
else
@@ -1196,14 +1222,11 @@ int neigh_connected_output(struct sk_buf
int err;
struct dst_entry *dst = skb->dst;
struct neighbour *neigh = dst->neighbour;
- struct net_device *dev = neigh->dev;
__skb_pull(skb, skb->nh.raw - skb->data);
- read_lock_bh(&neigh->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, NULL, skb->len);
- read_unlock_bh(&neigh->lock);
+ err = neigh_hard_header(skb, neigh->dev, neigh);
+
if (err >= 0)
err = neigh->ops->queue_xmit(skb);
else {
@@ -1960,11 +1983,15 @@ static int neigh_fill_info(struct sk_buf
NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
- read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
+
+ /* Not really updating this neighbour but don't want to
+ * deal with the unwind case when seqlock needs retry
+ */
+ write_seqlock_bh(&neigh->lock);
if ((neigh->nud_state & NUD_VALID) &&
nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
- read_unlock_bh(&neigh->lock);
+ write_sequnlock_bh(&neigh->lock);
goto nla_put_failure;
}
@@ -1972,7 +1999,7 @@ static int neigh_fill_info(struct sk_buf
ci.ndm_confirmed = now - neigh->confirmed;
ci.ndm_updated = now - neigh->updated;
ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1;
- read_unlock_bh(&neigh->lock);
+ write_sequnlock_bh(&neigh->lock);
NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
@@ -2077,13 +2104,13 @@ void __neigh_for_each_release(struct nei
&tbl->hash_buckets[chain], hlist) {
int release;
- write_lock(&n->lock);
+ write_seqlock(&n->lock);
release = cb(n);
if (release) {
hlist_del_rcu(&n->hlist);
n->dead = 1;
}
- write_unlock(&n->lock);
+ write_sequnlock(&n->lock);
if (release)
call_rcu(&n->rcu, neigh_rcu_release);
}
--- net-2.6.19.orig/net/ipv4/arp.c
+++ net-2.6.19/net/ipv4/arp.c
@@ -328,6 +328,31 @@ static void arp_error_report(struct neig
kfree_skb(skb);
}
+
+static unsigned arp_state_to_flags(const struct neighbour *neigh)
+{
+ unsigned flags = 0;
+ if (neigh->nud_state&NUD_PERMANENT)
+ flags = ATF_PERM|ATF_COM;
+ else if (neigh->nud_state&NUD_VALID)
+ flags = ATF_COM;
+ return flags;
+}
+
+static void arp_get_neigh_addr(u8 *ha, const struct neighbour *neigh,
+ unsigned len, unsigned *flags)
+{
+ unsigned seq;
+
+ do {
+ seq = read_seqbegin(&neigh->lock);
+ memcpy(ha, neigh->ha, len);
+ if (flags)
+ *flags = arp_state_to_flags(neigh);
+ } while (read_seqretry(&neigh->lock, seq));
+
+}
+
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
u32 saddr = 0;
@@ -369,8 +394,12 @@ static void arp_solicit(struct neighbour
if ((probes -= neigh->parms->ucast_probes) < 0) {
if (!(neigh->nud_state&NUD_VALID))
printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
- dst_ha = neigh->ha;
- read_lock_bh(&neigh->lock);
+
+ dst_ha = kmalloc(MAX_ADDR_LEN, GFP_ATOMIC);
+ if (!dst_ha)
+ return;
+
+ arp_get_neigh_addr(dst_ha, neigh, MAX_ADDR_LEN, NULL);
} else if ((probes -= neigh->parms->app_probes) < 0) {
#ifdef CONFIG_ARPD
neigh_app_ns(neigh);
@@ -380,8 +409,9 @@ static void arp_solicit(struct neighbour
arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_ha, dev->dev_addr, NULL);
+
if (dst_ha)
- read_unlock_bh(&neigh->lock);
+ kfree(dst_ha);
}
static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
@@ -489,10 +519,7 @@ int arp_find(unsigned char *haddr, struc
if (n) {
n->used = jiffies;
if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
- read_lock_bh(&n->lock);
- memcpy(haddr, n->ha, dev->addr_len);
- read_unlock_bh(&n->lock);
- neigh_release(n);
+ arp_get_neigh_addr(haddr, n, dev->addr_len, NULL);
return 0;
}
neigh_release(n);
@@ -1047,16 +1074,6 @@ static int arp_req_set(struct arpreq *r,
return err;
}
-static unsigned arp_state_to_flags(struct neighbour *neigh)
-{
- unsigned flags = 0;
- if (neigh->nud_state&NUD_PERMANENT)
- flags = ATF_PERM|ATF_COM;
- else if (neigh->nud_state&NUD_VALID)
- flags = ATF_COM;
- return flags;
-}
-
/*
* Get an ARP cache entry.
*/
@@ -1069,10 +1086,8 @@ static int arp_req_get(struct arpreq *r,
neigh = neigh_lookup(&arp_tbl, &ip, dev);
if (neigh) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
+ arp_get_neigh_addr(r->arp_ha.sa_data, neigh, dev->addr_len,
+ &r->arp_flags);
r->arp_ha.sa_family = dev->type;
strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
neigh_release(neigh);
@@ -1258,7 +1273,7 @@ void __init arp_init(void)
/*
* ax25 -> ASCII conversion
*/
-static char *ax2asc2(ax25_address *a, char *buf)
+static char *ax2asc2(const ax25_address *a, char *buf)
{
char c, *s;
int n;
@@ -1290,35 +1305,41 @@ static char *ax2asc2(ax25_address *a, ch
#define HBUFFERLEN 30
static void arp_format_neigh_entry(struct seq_file *seq,
- struct neighbour *n)
+ const struct neighbour *n)
{
char hbuffer[HBUFFERLEN];
const char hexbuf[] = "0123456789ABCDEF";
int k, j;
+ unsigned hflags, seqno;
char tbuf[16];
struct net_device *dev = n->dev;
int hatype = dev->type;
- read_lock(&n->lock);
- /* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
- if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
- ax2asc2((ax25_address *)n->ha, hbuffer);
- else {
-#endif
- for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
- hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
- hbuffer[k++] = hexbuf[n->ha[j] & 15];
- hbuffer[k++] = ':';
- }
- hbuffer[--k] = 0;
+ do {
+ seqno = read_seqbegin(&n->lock);
+
+ /* Convert hardware address to XX:XX:XX:XX ... form. */
#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
- }
+ if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+ ax2asc2((const ax25_address *)n->ha, hbuffer);
+ else
#endif
- sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+ {
+ for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+ hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+ hbuffer[k++] = hexbuf[n->ha[j] & 15];
+ hbuffer[k++] = ':';
+ }
+ hbuffer[--k] = 0;
+ }
+
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+ hflags = arp_state_to_flags(n);
+ } while (read_seqretry(&n->lock, seqno));
+
seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n",
- tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
- read_unlock(&n->lock);
+ tbuf, hatype, hflags, hbuffer, dev->name);
+
}
static void arp_format_pneigh_entry(struct seq_file *seq,
--- net-2.6.19.orig/net/ipv6/ndisc.c
+++ net-2.6.19/net/ipv6/ndisc.c
@@ -1412,15 +1412,15 @@ void ndisc_send_redirect(struct sk_buff
return;
}
- if (dev->addr_len) {
- read_lock_bh(&neigh->lock);
- if (neigh->nud_state & NUD_VALID) {
+ if (dev->addr_len && (neigh->nud_state & NUD_VALID)) {
+ unsigned seq;
+ do {
+ seq = read_seqbegin(&neigh->lock);
memcpy(ha_buf, neigh->ha, dev->addr_len);
- read_unlock_bh(&neigh->lock);
- ha = ha_buf;
- len += ndisc_opt_addr_space(dev);
- } else
- read_unlock_bh(&neigh->lock);
+ } while (read_seqretry(&neigh->lock, seq));
+
+ ha = ha_buf;
+ len += ndisc_opt_addr_space(dev);
}
rd_len = min_t(unsigned int,
--- net-2.6.19.orig/net/ipv6/route.c
+++ net-2.6.19/net/ipv6/route.c
@@ -279,20 +279,19 @@ static void rt6_probe(struct rt6_info *r
*/
if (!neigh || (neigh->nud_state & NUD_VALID))
return;
- read_lock_bh(&neigh->lock);
+
if (!(neigh->nud_state & NUD_VALID) &&
- time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
+ time_after(jiffies,
+ neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
struct in6_addr mcaddr;
struct in6_addr *target;
neigh->updated = jiffies;
- read_unlock_bh(&neigh->lock);
target = (struct in6_addr *)&neigh->primary_key;
addrconf_addr_solict_mult(target, &mcaddr);
ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
- } else
- read_unlock_bh(&neigh->lock);
+ }
}
#else
static inline void rt6_probe(struct rt6_info *rt)
@@ -323,10 +322,9 @@ static int inline rt6_check_neigh(struct
!(rt->rt6i_flags & RTF_GATEWAY))
m = 1;
else if (neigh) {
- read_lock_bh(&neigh->lock);
+ smp_rmb();
if (neigh->nud_state & NUD_VALID)
m = 2;
- read_unlock_bh(&neigh->lock);
}
return m;
}
--- net-2.6.19.orig/net/sched/sch_teql.c
+++ net-2.6.19/net/sched/sch_teql.c
@@ -248,9 +248,14 @@ __teql_resolve(struct sk_buff *skb, stru
}
if (neigh_event_send(n, skb_res) == 0) {
int err;
- read_lock(&n->lock);
- err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
- read_unlock(&n->lock);
+ unsigned seq;
+
+ do {
+ seq = read_seqbegin(&n->lock);
+ err = dev->hard_header(skb, dev, ntohs(skb->protocol),
+ n->ha, NULL, skb->len);
+ } while (read_seqretry(&n->lock, seq));
+
if (err < 0) {
neigh_release(n);
return -EINVAL;
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 6/6] neighbour: convert hard header cache to sequence number
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
` (4 preceding siblings ...)
2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: hardheader-rcu.patch --]
[-- Type: text/plain, Size: 5181 bytes --]
The reading of the hard header cache in the output path can be
made lockless using seqlock.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/linux/netdevice.h | 3 ++-
include/net/neighbour.h | 2 ++
net/core/neighbour.c | 40 +++++++++++++++++++++++++++++++++++-----
net/ipv4/ip_output.c | 13 +++----------
net/ipv6/ip6_output.c | 13 +++----------
5 files changed, 45 insertions(+), 26 deletions(-)
--- net-2.6.19.orig/include/linux/netdevice.h
+++ net-2.6.19/include/linux/netdevice.h
@@ -193,7 +193,7 @@ struct hh_cache
*/
int hh_len; /* length of header */
int (*hh_output)(struct sk_buff *skb);
- rwlock_t hh_lock;
+ seqlock_t hh_lock;
/* cached hardware header; allow for machine alignment needs. */
#define HH_DATA_MOD 16
@@ -217,6 +217,7 @@ struct hh_cache
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
((((dev)->hard_header_len+extra)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
+
/* These flag bits are private to the generic network queueing
* layer, they may not be explicitly referenced by any other
* code.
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -591,9 +591,11 @@ void neigh_destroy(struct neighbour *nei
while ((hh = neigh->hh) != NULL) {
neigh->hh = hh->hh_next;
hh->hh_next = NULL;
- write_lock_bh(&hh->hh_lock);
+
+ write_seqlock_bh(&hh->hh_lock);
hh->hh_output = neigh_blackhole;
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
+
if (atomic_dec_and_test(&hh->hh_refcnt))
kfree(hh);
}
@@ -912,9 +914,9 @@ static void neigh_update_hhs(struct neig
if (update) {
for (hh = neigh->hh; hh; hh = hh->hh_next) {
- write_lock_bh(&hh->hh_lock);
+ write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
}
}
}
@@ -1105,7 +1107,7 @@ static void neigh_hh_init(struct neighbo
break;
if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
- rwlock_init(&hh->hh_lock);
+ seqlock_init(&hh->hh_lock);
hh->hh_type = protocol;
atomic_set(&hh->hh_refcnt, 0);
hh->hh_next = NULL;
@@ -1128,6 +1130,33 @@ static void neigh_hh_init(struct neighbo
}
}
+
+/*
+ * Add header to skb from hard header cache
+ * Handle case where cache gets changed.
+ */
+int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
+{
+ int len, alen;
+ unsigned seq;
+ int (*output)(struct sk_buff *);
+
+ for(;;) {
+ seq = read_seqbegin(&hh->hh_lock);
+ len = hh->hh_len;
+ alen = HH_DATA_ALIGN(len);
+ output = hh->hh_output;
+ memcpy(skb->data - alen, hh->hh_data, alen);
+ skb_push(skb, len);
+
+ if (likely(!read_seqretry(&hh->hh_lock, seq)))
+ return output(skb);
+
+ /* undo and try again */
+ __skb_pull(skb, len);
+ }
+}
+
/* This function can be used in contexts, where only old dev_queue_xmit
worked, f.e. if you want to override normal output path (eql, shaper),
but resolution is not made yet.
@@ -2767,6 +2796,7 @@ EXPORT_SYMBOL(neigh_delete);
EXPORT_SYMBOL(neigh_destroy);
EXPORT_SYMBOL(neigh_dump_info);
EXPORT_SYMBOL(neigh_event_ns);
+EXPORT_SYMBOL(neigh_hh_output);
EXPORT_SYMBOL(neigh_ifdown);
EXPORT_SYMBOL(neigh_lookup);
EXPORT_SYMBOL(neigh_lookup_nodev);
--- net-2.6.19.orig/net/ipv4/ip_output.c
+++ net-2.6.19/net/ipv4/ip_output.c
@@ -182,16 +182,9 @@ static inline int ip_finish_output2(stru
skb = skb2;
}
- if (hh) {
- int hh_alen;
-
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
+ if (hh)
+ return neigh_hh_output(hh, skb);
+ else if (dst->neighbour)
return dst->neighbour->output(skb);
if (net_ratelimit())
--- net-2.6.19.orig/net/ipv6/ip6_output.c
+++ net-2.6.19/net/ipv6/ip6_output.c
@@ -76,16 +76,9 @@ static inline int ip6_output_finish(stru
struct dst_entry *dst = skb->dst;
struct hh_cache *hh = dst->hh;
- if (hh) {
- int hh_alen;
-
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
+ if (hh)
+ return neigh_hh_output(hh, skb);
+ else if (dst->neighbour)
return dst->neighbour->output(skb);
IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -193,6 +193,8 @@ extern struct neighbour * neigh_create(s
struct net_device *dev);
extern void neigh_destroy(struct neighbour *neigh);
extern int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb);
+extern int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb);
+
extern int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
u32 flags);
extern void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
@ 2006-08-29 15:28 ` Alexey Kuznetsov
2006-08-29 18:22 ` Stephen Hemminger
0 siblings, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 15:28 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
> @@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
>
> NEIGH_CACHE_STAT_INC(tbl, lookups);
>
> - read_lock_bh(&tbl->lock);
> - hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> neigh_hold(n);
> NEIGH_CACHE_STAT_INC(tbl, hits);
Sure? Seems, you cannot grab refcnt here, the entry can be already
released.
Probably, you should do atomic_inc_and_test() here and restart lookup,
if it fails.
Alexey
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 15:28 ` Alexey Kuznetsov
@ 2006-08-29 18:22 ` Stephen Hemminger
2006-08-29 18:34 ` Martin Josefsson
2006-08-29 21:17 ` Alexey Kuznetsov
0 siblings, 2 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 18:22 UTC (permalink / raw)
To: Alexey Kuznetsov; +Cc: David Miller, netdev
On Tue, 29 Aug 2006 19:28:16 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:
> Hello!
>
> > @@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
> >
> > NEIGH_CACHE_STAT_INC(tbl, lookups);
> >
> > - read_lock_bh(&tbl->lock);
> > - hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> > + rcu_read_lock();
> > + hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> > if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> > neigh_hold(n);
> > NEIGH_CACHE_STAT_INC(tbl, hits);
>
> Sure? Seems, you cannot grab refcnt here, the entry can be already
> released.
>
> Probably, you should do atomic_inc_and_test() here and restart lookup,
> if it fails.
>
> Alexey
atomic_inc_and_test is true iff result is zero, so that won't work.
But the following should work:
hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
neigh_release(n);
continue;
}
NEIGH_CACHE_STAT_INC(tbl, hits);
I'll wrap the atomic_inc_return inside neigh_hold_return()
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 18:22 ` Stephen Hemminger
@ 2006-08-29 18:34 ` Martin Josefsson
2006-08-29 20:17 ` Stephen Hemminger
2006-08-29 21:17 ` Alexey Kuznetsov
1 sibling, 1 reply; 19+ messages in thread
From: Martin Josefsson @ 2006-08-29 18:34 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Alexey Kuznetsov, David Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 318 bytes --]
On Tue, 2006-08-29 at 11:22 -0700, Stephen Hemminger wrote:
> > Probably, you should do atomic_inc_and_test() here and restart lookup,
> > if it fails.
> >
> > Alexey
>
> atomic_inc_and_test is true iff result is zero, so that won't work.
Wouldn't atomic_inc_not_zero() do what you want?
--
/Martin
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 18:34 ` Martin Josefsson
@ 2006-08-29 20:17 ` Stephen Hemminger
0 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 20:17 UTC (permalink / raw)
To: Martin Josefsson; +Cc: Alexey Kuznetsov, David Miller, netdev
On Tue, 29 Aug 2006 20:34:01 +0200
Martin Josefsson <gandalf@wlug.westbo.se> wrote:
> On Tue, 2006-08-29 at 11:22 -0700, Stephen Hemminger wrote:
>
> > > Probably, you should do atomic_inc_and_test() here and restart lookup,
> > > if it fails.
> > >
> > > Alexey
> >
> > atomic_inc_and_test is true iff result is zero, so that won't work.
>
> Wouldn't atomic_inc_not_zero() do what you want?
>
Yes, but that code looks more complex for the common case.
It ends up doing a compare exchange and retrying.
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 18:22 ` Stephen Hemminger
2006-08-29 18:34 ` Martin Josefsson
@ 2006-08-29 21:17 ` Alexey Kuznetsov
2006-08-29 21:46 ` Stephen Hemminger
1 sibling, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 21:17 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
> atomic_inc_and_test is true iff result is zero, so that won't work.
I meant atomic_inc_not_zero(), as Martin noticed.
> But the following should work:
>
> hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
> neigh_release(n);
I do not think it will work. It has exactly the same race condition.
Yes, atomic_inc_not_zero() is expensive. But it looks like it is the cheapest
variant, which works correctly without more work.
Another variant would be rework use of refcnt. It can be done like rt cache:
when release of the last reference does not mean anything.
Also, probably, it makes sense to add neigh_lookup_light(), which does
not take refcnt, but required to call
neigh_release_light() (which is just rcu_read_unlock_bh()).
Alexey
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 21:17 ` Alexey Kuznetsov
@ 2006-08-29 21:46 ` Stephen Hemminger
2006-08-29 22:16 ` Alexey Kuznetsov
0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 21:46 UTC (permalink / raw)
To: Alexey Kuznetsov; +Cc: David Miller, netdev
On Wed, 30 Aug 2006 01:17:22 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:
> Hello!
>
> > atomic_inc_and_test is true iff result is zero, so that won't work.
>
> I meant atomic_inc_not_zero(), as Martin noticed.
>
>
> > But the following should work:
> >
> > hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> > if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> > if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
> > neigh_release(n);
>
> I do not think it will work. It has exactly the same race condition.
>
> Yes, atomic_inc_not_zero() is expensive. But it looks like it is the cheapest
> variant, which works correctly without more work.
>
> Another variant would be rework use of refcnt. It can be done like rt cache:
> when release of the last reference does not mean anything.
>
> Also, probably, it makes sense to add neigh_lookup_light(), which does
> not take refcnt, but required to call
> neigh_release_light() (which is just rcu_read_unlock_bh()).
Which code paths would that make sense on?
fib_detect_death (ok)
infiniband (ok)
wireless/strip (ok) -- hey, this code is crap it has
a refcount leak already!
arp_req_get (ok)
ndisc (ok)
Perhaps killing the refcount all together, and just changing
everybody to neigh_lookup_rcu(). Nobody holds a long term reference
to the entries.
>
> Alexey
Or using the existing dead flag?
@@ -346,17 +359,24 @@ struct neighbour *neigh_lookup(struct ne
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
+ /* Don't rescuitate dead entries */
+ if (unlikely(n->dead)) {
+ neigh_release(n);
+ continue;
+ }
+
NEIGH_CACHE_STAT_INC(tbl, hits);
goto found;
}
}
n = NULL;
found:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return n;
}
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 21:46 ` Stephen Hemminger
@ 2006-08-29 22:16 ` Alexey Kuznetsov
2006-08-29 23:00 ` Stephen Hemminger
0 siblings, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 22:16 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
> > Also, probably, it makes sense to add neigh_lookup_light(), which does
> > not take refcnt, but required to call
> > neigh_release_light() (which is just rcu_read_unlock_bh()).
>
> Which code paths would that make sense on?
> fib_detect_death (ok)
> infiniband (ok)
> wireless/strip (ok) -- hey, this code is crap it has
> a refcount leak already!
> arp_req_get (ok)
> ndisc (ok)
>
> Perhaps killing the refcount all together, and just changing
> everybody to neigh_lookup_rcu(). Nobody holds a long term reference
> to the entries.
The only real user of refcnt is destination cache.
It uses __neigh_lookup/__neigh_lookup_errno, which could also
use neigh_lookup_rcu(), then do atomic_inc_not_zero() and, if it fails
fall to neigh_create().
Alexey
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 22:16 ` Alexey Kuznetsov
@ 2006-08-29 23:00 ` Stephen Hemminger
2006-08-29 23:21 ` Alexey Kuznetsov
2006-08-29 23:36 ` Alexey Kuznetsov
0 siblings, 2 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 23:00 UTC (permalink / raw)
To: Alexey Kuznetsov; +Cc: David Miller, netdev
Here is the full version of neigh_lookup_rcu().
The race of interest is what happens when neigh_lookup_rcu() returns a table
entry and it gets deleted at the same time. See Documentation/RCU/listRCU.txt
for more info.
There are a couple different scenario's.
1) update or reader looking at dead entry. The neighbour entry is queued on the
rcu callback list, and will not be invoked until all cpu's have cleared
the barrier. The reader's do see stale data but it is harmless.
After reader has cleared critical section, the neigh_rcu_release is run
and that calls neigh_release which sees last reference and calls neigh_destroy.
neigh_lookup_rcu checks for dead entries, but this is racy and can miss.
It is just an attempt to close the window slightly.
2) caller wants to create or update entry (__neigh_lookup). caller gets
just deleted entry (refcount still 1), then grabs refcount (now 2).
Similar to above, but the actual neigh_destroy() happens either
from rcu callback or neigh_release() in caller, which ever is last.
This should not be any more racy than the existing code.
---
drivers/infiniband/core/addr.c | 7 -
drivers/net/wireless/strip.c | 11 +-
include/net/neighbour.h | 43 +++++---
net/core/neighbour.c | 197 ++++++++++++++++++++++-------------------
net/ipv4/arp.c | 10 +-
net/ipv6/ndisc.c | 7 -
6 files changed, 155 insertions(+), 120 deletions(-)
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -108,6 +108,7 @@ struct neighbour
struct sk_buff_head arp_queue;
struct timer_list timer;
struct neigh_ops *ops;
+ struct rcu_head rcu;
u8 primary_key[0];
};
@@ -126,6 +127,7 @@ struct pneigh_entry
{
struct hlist_node hlist;
struct net_device *dev;
+ struct rcu_head rcu;
u8 key[0];
};
@@ -157,7 +159,7 @@ struct neigh_table
struct timer_list proxy_timer;
struct sk_buff_head proxy_queue;
atomic_t entries;
- rwlock_t lock;
+ spinlock_t lock;
unsigned long last_rand;
kmem_cache_t *kmem_cachep;
struct neigh_statistics *stats;
@@ -181,9 +183,9 @@ struct neigh_table
extern void neigh_table_init(struct neigh_table *tbl);
extern void neigh_table_init_no_netlink(struct neigh_table *tbl);
extern int neigh_table_clear(struct neigh_table *tbl);
-extern struct neighbour * neigh_lookup(struct neigh_table *tbl,
- const void *pkey,
- struct net_device *dev);
+extern struct neighbour * neigh_lookup_rcu(struct neigh_table *tbl,
+ const void *pkey,
+ const struct net_device *dev);
extern struct neighbour * neigh_lookup_nodev(struct neigh_table *tbl,
const void *pkey);
extern struct neighbour * neigh_create(struct neigh_table *tbl,
@@ -313,27 +315,38 @@ static inline int neigh_event_send(struc
static inline struct neighbour *__neigh_lookup(struct neigh_table *tbl,
const void *pkey,
- struct net_device *dev, int creat)
+ struct net_device *dev,
+ int creat)
{
- struct neighbour *n = neigh_lookup(tbl, pkey, dev);
-
- if (n || !creat)
- return n;
+ struct neighbour *n;
- n = neigh_create(tbl, pkey, dev);
- return IS_ERR(n) ? NULL : n;
+ rcu_read_lock();
+ n = neigh_lookup_rcu(tbl, pkey, dev);
+ if (n)
+ neigh_hold(n);
+ else if (creat) {
+ n = neigh_create(tbl, pkey, dev);
+ if (IS_ERR(n))
+ n = NULL;
+ }
+ rcu_read_unlock();
+ return n;
}
static inline struct neighbour *__neigh_lookup_errno(struct neigh_table *tbl,
const void *pkey,
struct net_device *dev)
{
- struct neighbour *n = neigh_lookup(tbl, pkey, dev);
+ struct neighbour *n;
+ rcu_read_lock();
+ n = neigh_lookup_rcu(tbl, pkey, dev);
if (n)
- return n;
-
- return neigh_create(tbl, pkey, dev);
+ neigh_hold(n);
+ else
+ n = neigh_create(tbl, pkey, dev);
+ rcu_read_unlock();
+ return n;
}
struct neighbour_cb {
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -67,9 +67,10 @@ static struct file_operations neigh_stat
#endif
/*
- Neighbour hash table buckets are protected with rwlock tbl->lock.
+ Neighbour hash table buckets are protected with lock tbl->lock.
- - All the scans/updates to hash buckets MUST be made under this lock.
+ - All the scans of hash buckes must be made with RCU read lock (nopreempt)
+ - updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
to protocol backends, no attempts to send something to network.
It will result in deadlocks, if backend/driver wants to use neighbour
@@ -116,6 +117,17 @@ unsigned long neigh_rand_reach_time(unsi
return (base ? (net_random() % base) + (base >> 1) : 0);
}
+/*
+ * After all readers have finished this read-side critical section
+ * decrement ref count and free. If reader raced with us they
+ * may still have harmless dead reference.
+ */
+static void neigh_rcu_release(struct rcu_head *head)
+{
+ struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+ neigh_release(neigh);
+}
static int neigh_forced_gc(struct neigh_table *tbl)
{
@@ -124,7 +136,7 @@ static int neigh_forced_gc(struct neigh_
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
for (i = 0; i <= tbl->hash_mask; i++) {
struct neighbour *n;
struct hlist_node *node, *tmp;
@@ -138,11 +150,11 @@ static int neigh_forced_gc(struct neigh_
write_lock(&n->lock);
if (atomic_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
shrunk = 1;
write_unlock(&n->lock);
- neigh_release(n);
+ call_rcu(&n->rcu, neigh_rcu_release);
continue;
}
write_unlock(&n->lock);
@@ -151,7 +163,7 @@ static int neigh_forced_gc(struct neigh_
tbl->last_flush = jiffies;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
return shrunk;
}
@@ -189,10 +201,10 @@ static void neigh_flush_dev(struct neigh
if (dev && n->dev != dev)
continue;
- hlist_del(&n->hlist);
write_lock(&n->lock);
- neigh_del_timer(n);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
+ neigh_del_timer(n);
if (atomic_read(&n->refcnt) != 1) {
/* The most unpleasant situation.
@@ -213,24 +225,25 @@ static void neigh_flush_dev(struct neigh
NEIGH_PRINTK2("neigh %p is stray.\n", n);
}
write_unlock(&n->lock);
- neigh_release(n);
+
+ call_rcu(&n->rcu, neigh_rcu_release);
}
}
}
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev);
pneigh_ifdown(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
del_timer_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue);
@@ -326,8 +339,8 @@ static void neigh_hash_grow(struct neigh
unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
hash_val &= new_hash_mask;
- hlist_del(&n->hlist);
- hlist_add_head(&n->hlist, &new_hash[hash_val]);
+ __hlist_del(&n->hlist);
+ hlist_add_head_rcu(&n->hlist, &new_hash[hash_val]);
}
}
tbl->hash_buckets = new_hash;
@@ -336,8 +349,12 @@ static void neigh_hash_grow(struct neigh
neigh_hash_free(old_hash, old_entries);
}
-struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev)
+/*
+ * Lookup device and key in neighbour table.
+ * Assumed rcu_read_lock is held
+ */
+struct neighbour *neigh_lookup_rcu(struct neigh_table *tbl, const void *pkey,
+ const struct net_device *dev)
{
struct neighbour *n;
struct hlist_node *tmp;
@@ -346,18 +363,14 @@ struct neighbour *neigh_lookup(struct ne
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
- if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
- neigh_hold(n);
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+ if (!n->dead && dev == n->dev
+ && !memcmp(n->primary_key, pkey, key_len)) {
NEIGH_CACHE_STAT_INC(tbl, hits);
- goto found;
+ return n;
}
}
- n = NULL;
-found:
- read_unlock_bh(&tbl->lock);
- return n;
+ return NULL;
}
struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
@@ -369,9 +382,9 @@ struct neighbour *neigh_lookup_nodev(str
NEIGH_CACHE_STAT_INC(tbl, lookups);
- read_lock_bh(&tbl->lock);
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
- if (!memcmp(n->primary_key, pkey, key_len)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+ if (!n->dead && !memcmp(n->primary_key, pkey, key_len)) {
neigh_hold(n);
NEIGH_CACHE_STAT_INC(tbl, hits);
goto found;
@@ -379,7 +392,7 @@ struct neighbour *neigh_lookup_nodev(str
}
n = NULL;
found:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return n;
}
@@ -416,7 +429,7 @@ struct neighbour *neigh_create(struct ne
n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
@@ -436,21 +449,22 @@ struct neighbour *neigh_create(struct ne
}
}
- hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
n->dead = 0;
neigh_hold(n);
- write_unlock_bh(&tbl->lock);
+ hlist_add_head_rcu(&n->hlist, &tbl->hash_buckets[hash_val]);
+ spin_unlock_bh(&tbl->lock);
NEIGH_PRINTK2("neigh %p is created.\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out_neigh_release:
neigh_release(n);
goto out;
}
+/* Assumes rcu_read_lock is held */
struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, int creat)
{
@@ -464,16 +478,14 @@ struct pneigh_entry * pneigh_lookup(stru
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
- read_lock_bh(&tbl->lock);
-
- hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
+ hlist_for_each_entry_rcu(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) &&
(n->dev == dev || !n->dev)) {
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
goto out;
}
}
- read_unlock_bh(&tbl->lock);
+
n = NULL;
if (!creat)
goto out;
@@ -495,13 +507,18 @@ struct pneigh_entry * pneigh_lookup(stru
goto out;
}
- write_lock_bh(&tbl->lock);
- hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ hlist_add_head_rcu(&n->hlist, &tbl->phash_buckets[hash_val]);
+ spin_unlock_bh(&tbl->lock);
out:
return n;
}
+static void pneigh_destroy(struct rcu_head *head)
+{
+ struct pneigh_entry *n = container_of(head, struct pneigh_entry, rcu);
+ kfree(n);
+}
int pneigh_delete(struct neigh_table *tbl, const void *pkey,
struct net_device *dev)
@@ -516,20 +533,20 @@ int pneigh_delete(struct neigh_table *tb
hash_val ^= hash_val >> 4;
hash_val &= PNEIGH_HASHMASK;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
- hlist_del(&n->hlist);
- write_unlock_bh(&tbl->lock);
+ hlist_del_rcu(&n->hlist);
+ spin_unlock_bh(&tbl->lock);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
dev_put(n->dev);
- kfree(n);
+ call_rcu(&n->rcu, pneigh_destroy);
return 0;
}
}
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
return -ENOENT;
}
@@ -543,7 +560,7 @@ static int pneigh_ifdown(struct neigh_ta
hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
if (!dev || n->dev == dev) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
if (tbl->pdestructor)
tbl->pdestructor(n);
if (n->dev)
@@ -644,7 +661,7 @@ static void neigh_periodic_timer(unsigne
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock(&tbl->lock);
+ spin_lock(&tbl->lock);
/*
* periodically recompute ReachableTime from random function
@@ -676,7 +693,7 @@ static void neigh_periodic_timer(unsigne
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
time_after(now, n->used + n->parms->gc_staletime))) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
write_unlock(&n->lock);
neigh_release(n);
@@ -697,7 +714,7 @@ static void neigh_periodic_timer(unsigne
mod_timer(&tbl->gc_timer, now + expire);
- write_unlock(&tbl->lock);
+ spin_unlock(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1285,10 +1302,10 @@ struct neigh_parms *neigh_parms_alloc(st
p->dev = dev;
}
p->sysctl_table = NULL;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
p->next = tbl->parms.next;
tbl->parms.next = p;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
return p;
}
@@ -1307,19 +1324,19 @@ void neigh_parms_release(struct neigh_ta
if (!parms || parms == &tbl->parms)
return;
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
for (p = &tbl->parms.next; *p; p = &(*p)->next) {
if (*p == parms) {
*p = parms->next;
parms->dead = 1;
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
if (parms->dev)
dev_put(parms->dev);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
return;
}
}
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
NEIGH_PRINTK1("neigh_parms_release: not found\n");
}
@@ -1365,7 +1382,7 @@ void neigh_table_init_no_netlink(struct
get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
- rwlock_init(&tbl->lock);
+ spin_lock_init(&tbl->lock);
init_timer(&tbl->gc_timer);
tbl->gc_timer.data = (unsigned long)tbl;
tbl->gc_timer.function = neigh_periodic_timer;
@@ -1463,7 +1480,8 @@ int neigh_delete(struct sk_buff *skb, st
if (dev == NULL)
goto out_dev_put;
- neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+ rcu_read_lock();
+ neigh = neigh_lookup_rcu(tbl, nla_data(dst_attr), dev);
if (neigh == NULL) {
err = -ENOENT;
goto out_dev_put;
@@ -1472,7 +1490,7 @@ int neigh_delete(struct sk_buff *skb, st
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE |
NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
+ rcu_read_unlock();
goto out_dev_put;
}
err = -EAFNOSUPPORT;
@@ -1537,7 +1555,7 @@ int neigh_add(struct sk_buff *skb, struc
if (dev == NULL)
goto out_dev_put;
- neigh = neigh_lookup(tbl, dst, dev);
+ neigh = neigh_lookup_rcu(tbl, dst, dev);
if (neigh == NULL) {
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
@@ -1552,16 +1570,15 @@ int neigh_add(struct sk_buff *skb, struc
} else {
if (nlh->nlmsg_flags & NLM_F_EXCL) {
err = -EEXIST;
- neigh_release(neigh);
goto out_dev_put;
}
+ neigh_hold(neigh);
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
flags &= ~NEIGH_UPDATE_F_OVERRIDE;
}
err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
- neigh_release(neigh);
goto out_dev_put;
}
@@ -1620,7 +1637,7 @@ static int neightbl_fill_info(struct sk_
ndtmsg = nlmsg_data(nlh);
- read_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1680,11 +1697,11 @@ static int neightbl_fill_info(struct sk_
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_end(skb, nlh);
nla_put_failure:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_cancel(skb, nlh);
}
@@ -1703,7 +1720,7 @@ static int neightbl_fill_param_info(stru
ndtmsg = nlmsg_data(nlh);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock(); /* this maybe unnecessary */
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1712,10 +1729,10 @@ static int neightbl_fill_param_info(stru
neightbl_fill_parms(skb, parms) < 0)
goto errout;
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_end(skb, nlh);
errout:
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
return nlmsg_cancel(skb, nlh);
}
@@ -1793,7 +1810,7 @@ int neightbl_set(struct sk_buff *skb, st
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
@@ -1874,7 +1891,7 @@ int neightbl_set(struct sk_buff *skb, st
err = 0;
errout_tbl_lock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
errout_locked:
rcu_read_unlock();
errout:
@@ -1890,7 +1907,7 @@ int neightbl_dump_info(struct sk_buff *s
family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
- rcu_read_lock();
+ rcu_read_lock_bh();
list_for_each_entry_rcu(tbl, &neigh_tables, list) {
struct neigh_parms *p;
@@ -1986,20 +2003,20 @@ static int neigh_dump_table(struct neigh
continue;
if (h > s_h)
s_idx = 0;
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
idx = 0;
- hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+ hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[h], hlist) {
if (idx >= s_idx &&
neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
rc = -1;
goto out;
}
++idx;
}
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
rc = skb->len;
out:
@@ -2039,14 +2056,15 @@ void neigh_for_each(struct neigh_table *
{
int chain;
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
for (chain = 0; chain <= tbl->hash_mask; chain++) {
+ struct neighbour *n;
struct hlist_node *p;
- hlist_for_each(p, &tbl->hash_buckets[chain])
- cb(hlist_entry(p, struct neighbour, hlist), cookie);
+ hlist_for_each_entry_rcu(n, p, &tbl->hash_buckets[chain], hlist)
+ cb(n, cookie);
}
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -2067,12 +2085,12 @@ void __neigh_for_each_release(struct nei
write_lock(&n->lock);
release = cb(n);
if (release) {
- hlist_del(&n->hlist);
+ hlist_del_rcu(&n->hlist);
n->dead = 1;
}
write_unlock(&n->lock);
if (release)
- neigh_release(n);
+ call_rcu(&n->rcu, neigh_rcu_release);
}
}
}
@@ -2116,7 +2134,7 @@ found:
static struct neighbour *next_neigh(struct hlist_node *node)
{
- if (node)
+ if (rcu_dereference(node))
return hlist_entry(node, struct neighbour, hlist);
else
return NULL;
@@ -2191,7 +2209,7 @@ static struct pneigh_entry *pneigh_get_f
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
- pn = tbl->phash_buckets[bucket].first;
+ pn = rcu_dereference(tbl->phash_buckets[bucket].first);
if (pn)
break;
}
@@ -2208,12 +2226,12 @@ static struct pneigh_entry *pneigh_get_n
struct neigh_table *tbl = state->tbl;
struct hlist_node *tmp = &pn->hlist;
- tmp = tmp->next;
+ tmp = rcu_dereference(tmp->next);
if (tmp)
goto found;
while (++state->bucket < PNEIGH_HASHMASK) {
- tmp = tbl->phash_buckets[state->bucket].first;
+ tmp = rcu_dereference(tbl->phash_buckets[state->bucket].first);
if (tmp)
goto found;
}
@@ -2261,7 +2279,7 @@ void *neigh_seq_start(struct seq_file *s
state->bucket = 0;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- read_lock_bh(&tbl->lock);
+ rcu_read_lock();
pos_minus_one = *pos - 1;
return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
@@ -2297,10 +2315,7 @@ EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
{
- struct neigh_seq_state *state = seq->private;
- struct neigh_table *tbl = state->tbl;
-
- read_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_seq_stop);
@@ -2731,7 +2746,7 @@ EXPORT_SYMBOL(neigh_destroy);
EXPORT_SYMBOL(neigh_dump_info);
EXPORT_SYMBOL(neigh_event_ns);
EXPORT_SYMBOL(neigh_ifdown);
-EXPORT_SYMBOL(neigh_lookup);
+EXPORT_SYMBOL(neigh_lookup_rcu);
EXPORT_SYMBOL(neigh_lookup_nodev);
EXPORT_SYMBOL(neigh_parms_alloc);
EXPORT_SYMBOL(neigh_parms_release);
--- net-2.6.19.orig/drivers/infiniband/core/addr.c
+++ net-2.6.19/drivers/infiniband/core/addr.c
@@ -165,10 +165,11 @@ static int addr_resolve_remote(struct so
goto put;
}
- neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+ rcu_read_lock();
+ neigh = neigh_lookup_rcu(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
if (!neigh) {
ret = -ENODATA;
- goto put;
+ goto release;
}
if (!(neigh->nud_state & NUD_VALID)) {
@@ -183,7 +184,7 @@ static int addr_resolve_remote(struct so
ret = copy_addr(addr, neigh->dev, neigh->ha);
release:
- neigh_release(neigh);
+ rcu_read_unlock();
put:
ip_rt_put(rt);
out:
--- net-2.6.19.orig/drivers/net/wireless/strip.c
+++ net-2.6.19/drivers/net/wireless/strip.c
@@ -467,17 +467,20 @@ static int arp_query(unsigned char *hadd
struct net_device *dev)
{
struct neighbour *neighbor_entry;
+ int ret = 0;
- neighbor_entry = neigh_lookup(&arp_tbl, &paddr, dev);
+ rcu_read_lock();
+ neighbor_entry = neigh_lookup_rcu(&arp_tbl, &paddr, dev);
- if (neighbor_entry != NULL) {
+ if (neighbor_entry) {
neighbor_entry->used = jiffies;
if (neighbor_entry->nud_state & NUD_VALID) {
memcpy(haddr, neighbor_entry->ha, dev->addr_len);
- return 1;
+ ret = 1;
}
}
- return 0;
+ rcu_read_unlock();
+ return ret;
}
static void DumpData(char *msg, struct strip *strip_info, __u8 * ptr,
--- net-2.6.19.orig/net/ipv4/arp.c
+++ net-2.6.19/net/ipv4/arp.c
@@ -1067,7 +1067,8 @@ static int arp_req_get(struct arpreq *r,
struct neighbour *neigh;
int err = -ENXIO;
- neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ rcu_read_lock();
+ neigh = neigh_lookup_rcu(&arp_tbl, &ip, dev);
if (neigh) {
read_lock_bh(&neigh->lock);
memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
@@ -1075,9 +1076,9 @@ static int arp_req_get(struct arpreq *r,
read_unlock_bh(&neigh->lock);
r->arp_ha.sa_family = dev->type;
strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
- neigh_release(neigh);
err = 0;
}
+ rcu_read_unlock();
return err;
}
@@ -1118,14 +1119,15 @@ static int arp_req_delete(struct arpreq
return -EINVAL;
}
err = -ENXIO;
- neigh = neigh_lookup(&arp_tbl, &ip, dev);
+ rcu_read_lock();
+ neigh = neigh_lookup_rcu(&arp_tbl, &ip, dev);
if (neigh) {
if (neigh->nud_state&~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN);
- neigh_release(neigh);
}
+ rcu_read_unlock();
return err;
}
--- net-2.6.19.orig/net/ipv6/ndisc.c
+++ net-2.6.19/net/ipv6/ndisc.c
@@ -944,8 +944,9 @@ static void ndisc_recv_na(struct sk_buff
in6_ifa_put(ifp);
return;
}
- neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
+ rcu_read_lock();
+ neigh = neigh_lookup_rcu(&nd_tbl, &msg->target, dev);
if (neigh) {
u8 old_flags = neigh->flags;
@@ -969,9 +970,9 @@ static void ndisc_recv_na(struct sk_buff
ip6_del_rt(rt);
}
-out:
- neigh_release(neigh);
}
+out:
+ rcu_read_unlock();
}
static void ndisc_recv_rs(struct sk_buff *skb)
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 23:00 ` Stephen Hemminger
@ 2006-08-29 23:21 ` Alexey Kuznetsov
2006-08-29 23:49 ` Stephen Hemminger
2006-08-29 23:36 ` Alexey Kuznetsov
1 sibling, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 23:21 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
> This should not be any more racy than the existing code.
Existing code is not racy.
Critical place is interpretation of refcnt==1. Current code assumes,
that when refcnt=1 and entry is in hash table, nobody can take this
entry (table is locked). So, it can be unlinked from the table.
See? __neigh_lookup()/__neigh_lookup_errno() _MUST_ return alive
and hashed entry. And will stay hashed until the reference is held.
Or until neigh entry is forced for destruction by device going down,
in this case referring dst entries will die as well.
If dst cache grabs an entry, which is purged from table because for some
time it had refcnt==1, you got a valid dst entry referring to dead
neighbour entry.
Alexey
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 23:00 ` Stephen Hemminger
2006-08-29 23:21 ` Alexey Kuznetsov
@ 2006-08-29 23:36 ` Alexey Kuznetsov
1 sibling, 0 replies; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 23:36 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
Yes, I forgot to say I take back my suggestion about atomic_inc_test_zero().
It would not work.
Seems, it is possible to add some barriers around setting n->dead
and testing it in neigh_lookup_rcu(), but it would be scary and ugly.
To be honest, I just do not know how to do this. :-)
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 23:21 ` Alexey Kuznetsov
@ 2006-08-29 23:49 ` Stephen Hemminger
2006-08-30 0:06 ` Alexey Kuznetsov
0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 23:49 UTC (permalink / raw)
To: Alexey Kuznetsov; +Cc: David Miller, netdev
On Wed, 30 Aug 2006 03:21:26 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:
> Hello!
>
> > This should not be any more racy than the existing code.
>
> Existing code is not racy.
Race 1: w/o RCU
Cpu 0: is in neigh_lookup
gets read_lock()
finds entry
++refcount to 2
updates it
Cpu 1: is in forced_gc()
waits at write_lock()
releases read_lock()
drops ref count to 1.
sees ref count is 1
deletes it
Race 1: w RCU
Cpu 0: is in __neigh_lookup
updates it
Cpu 1: is in forced_gc()
leaves refcount=1
sees ref count is 1
deletes it
>
> Critical place is interpretation of refcnt==1. Current code assumes,
> that when refcnt=1 and entry is in hash table, nobody can take this
> entry (table is locked). So, it can be unlinked from the table.
>
> See? __neigh_lookup()/__neigh_lookup_errno() _MUST_ return alive
> and hashed entry. And will stay hashed until the reference is held.
> Or until neigh entry is forced for destruction by device going down,
> in this case referring dst entries will die as well.
Why must it be hashed, it could always get zapped just after the update.
> If dst cache grabs an entry, which is purged from table because for some
> time it had refcnt==1, you got a valid dst entry referring to dead
> neighbour entry.
Hmm.. Since this is a slow path, grabbing the write_lock on the neighbour
entry would block the gc from zapping it.
in __neigh_lookup()
neigh_hold();
write_lock(&n->lock);
if (n->dead) {
write_unlock()
neigh_release()
goto rescan;
}
write_unlock()
--
Stephen Hemminger <shemminger@osdl.org>
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 4/6] net neighbour: convert to RCU
2006-08-29 23:49 ` Stephen Hemminger
@ 2006-08-30 0:06 ` Alexey Kuznetsov
0 siblings, 0 replies; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-30 0:06 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Hello!
> Race 1: w/o RCU
> Cpu 0: is in neigh_lookup
> gets read_lock()
> finds entry
> ++refcount to 2
> updates it
> Cpu 1: is in forced_gc()
> waits at write_lock()
> releases read_lock()
> drops ref count to 1.
> sees ref count is 1
> deletes it
Do you mean it is purged, though it is actually fresh?
It is harmless race condition.
> Why must it be hashed, it could always get zapped just after the update.
Because otherwise we have to check its validity on level of dst_cache
and to rebind. We do not want this. Actually dst->neighbour is supposed
to be immutable.
It means that if some valid dst refers to a neighbour, it must remain
hashed.
> Hmm.. Since this is a slow path,
All the neighbour lookups are in slow path. Actually, lookup
of neighbour entry for use in dst cache is the most loaded path,
the rest of them are pure maintanance paths.
Yes, this would work. RCU is a little spoiled, but at least global
tbl->lock is replaced with per neighbour lock.
Alexey
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2006-08-30 0:06 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
2006-08-29 15:28 ` Alexey Kuznetsov
2006-08-29 18:22 ` Stephen Hemminger
2006-08-29 18:34 ` Martin Josefsson
2006-08-29 20:17 ` Stephen Hemminger
2006-08-29 21:17 ` Alexey Kuznetsov
2006-08-29 21:46 ` Stephen Hemminger
2006-08-29 22:16 ` Alexey Kuznetsov
2006-08-29 23:00 ` Stephen Hemminger
2006-08-29 23:21 ` Alexey Kuznetsov
2006-08-29 23:49 ` Stephen Hemminger
2006-08-30 0:06 ` Alexey Kuznetsov
2006-08-29 23:36 ` Alexey Kuznetsov
2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).