netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/6] Lockless neighbour table
@ 2006-08-28 23:07 Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
                   ` (5 more replies)
  0 siblings, 6 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

This is an updated version of the changes to make the neighbour (ARP)
table lockless for normal accesses.  It uses RCU and seqlock in
several places.  I separated the hlist conversions from RCU
changes (in case we need to bisect for some reason).

--
Stephen Hemminger <shemminger@osdl.org>


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/6] net neighbor: convert top level list to RCU
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: neigh-top-lock.patch --]
[-- Type: text/plain, Size: 6810 bytes --]

Change the top level list of neighbor tables to use RCU.
Minor change to BUG() if a neighbor table is registered twice.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---
 include/net/neighbour.h |    2 -
 net/core/neighbour.c    |   89 ++++++++++++++++++++++--------------------------
 2 files changed, 42 insertions(+), 49 deletions(-)

--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -61,7 +61,7 @@ static void neigh_app_notify(struct neig
 static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev);
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);
 
-static struct neigh_table *neigh_tables;
+static LIST_HEAD(neigh_tables);
 #ifdef CONFIG_PROC_FS
 static struct file_operations neigh_stat_seq_fops;
 #endif
@@ -93,11 +93,11 @@ static struct file_operations neigh_stat
    It is supposed, that dev->hard_header is simplistic and does
    not make callbacks to neighbour tables.
 
-   The last lock is neigh_tbl_lock. It is pure SMP lock, protecting
-   list of neighbour tables. This list is used only in process context,
+   The last lock is neigh_tbl_lock. It controls updates to the
+   set of neighbor tables. Typically used during initialization
  */
 
-static DEFINE_RWLOCK(neigh_tbl_lock);
+static DEFINE_SPINLOCK(neigh_tbl_lock);
 
 static int neigh_blackhole(struct sk_buff *skb)
 {
@@ -1387,26 +1387,18 @@ void neigh_table_init(struct neigh_table
 	struct neigh_table *tmp;
 
 	neigh_table_init_no_netlink(tbl);
-	write_lock(&neigh_tbl_lock);
-	for (tmp = neigh_tables; tmp; tmp = tmp->next) {
-		if (tmp->family == tbl->family)
-			break;
-	}
-	tbl->next	= neigh_tables;
-	neigh_tables	= tbl;
-	write_unlock(&neigh_tbl_lock);
-
-	if (unlikely(tmp)) {
-		printk(KERN_ERR "NEIGH: Registering multiple tables for "
-		       "family %d\n", tbl->family);
-		dump_stack();
+
+	spin_lock(&neigh_tbl_lock);
+	list_for_each_entry(tmp, &neigh_tables, list) {
+		BUG_ON(tmp->family == tbl->family);
 	}
+
+	list_add_rcu(&tbl->list, &neigh_tables);
+	spin_unlock(&neigh_tbl_lock);
 }
 
 int neigh_table_clear(struct neigh_table *tbl)
 {
-	struct neigh_table **tp;
-
 	/* It is not clean... Fix it to unload IPv6 module safely */
 	del_timer_sync(&tbl->gc_timer);
 	del_timer_sync(&tbl->proxy_timer);
@@ -1414,14 +1406,12 @@ int neigh_table_clear(struct neigh_table
 	neigh_ifdown(tbl, NULL);
 	if (atomic_read(&tbl->entries))
 		printk(KERN_CRIT "neighbour leakage\n");
-	write_lock(&neigh_tbl_lock);
-	for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
-		if (*tp == tbl) {
-			*tp = tbl->next;
-			break;
-		}
-	}
-	write_unlock(&neigh_tbl_lock);
+
+	spin_lock(&neigh_tbl_lock);
+	list_del_rcu(&tbl->list);
+	spin_unlock(&neigh_tbl_lock);
+
+	synchronize_rcu();
 
 	neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
 	tbl->hash_buckets = NULL;
@@ -1456,13 +1446,12 @@ int neigh_delete(struct sk_buff *skb, st
 		}
 	}
 
-	read_lock(&neigh_tbl_lock);
-	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		struct neighbour *neigh;
 
 		if (tbl->family != ndm->ndm_family)
 			continue;
-		read_unlock(&neigh_tbl_lock);
 
 		if (nla_len(dst_attr) < tbl->key_len)
 			goto out_dev_put;
@@ -1487,12 +1476,12 @@ int neigh_delete(struct sk_buff *skb, st
 		neigh_release(neigh);
 		goto out_dev_put;
 	}
-	read_unlock(&neigh_tbl_lock);
 	err = -EAFNOSUPPORT;
 
 out_dev_put:
 	if (dev)
 		dev_put(dev);
+	rcu_read_unlock();
 out:
 	return err;
 }
@@ -1525,15 +1514,14 @@ int neigh_add(struct sk_buff *skb, struc
 			goto out_dev_put;
 	}
 
-	read_lock(&neigh_tbl_lock);
-	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
 		struct neighbour *neigh;
 		void *dst, *lladdr;
 
 		if (tbl->family != ndm->ndm_family)
 			continue;
-		read_unlock(&neigh_tbl_lock);
 
 		if (nla_len(tb[NDA_DST]) < tbl->key_len)
 			goto out_dev_put;
@@ -1578,12 +1566,12 @@ int neigh_add(struct sk_buff *skb, struc
 		goto out_dev_put;
 	}
 
-	read_unlock(&neigh_tbl_lock);
 	err = -EAFNOSUPPORT;
 
 out_dev_put:
 	if (dev)
 		dev_put(dev);
+	rcu_read_unlock();
 out:
 	return err;
 }
@@ -1788,8 +1776,8 @@ int neightbl_set(struct sk_buff *skb, st
 	}
 
 	ndtmsg = nlmsg_data(nlh);
-	read_lock(&neigh_tbl_lock);
-	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
 			continue;
 
@@ -1889,26 +1877,26 @@ int neightbl_set(struct sk_buff *skb, st
 errout_tbl_lock:
 	write_unlock_bh(&tbl->lock);
 errout_locked:
-	read_unlock(&neigh_tbl_lock);
+	rcu_read_unlock();
 errout:
 	return err;
 }
 
 int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int family, tidx, nidx = 0;
+	int family, tidx = 0, nidx = 0;
 	int tbl_skip = cb->args[0];
 	int neigh_skip = cb->args[1];
 	struct neigh_table *tbl;
 
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 
-	read_lock(&neigh_tbl_lock);
-	for (tbl = neigh_tables, tidx = 0; tbl; tbl = tbl->next, tidx++) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		struct neigh_parms *p;
 
 		if (tidx < tbl_skip || (family && tbl->family != family))
-			continue;
+			goto next_tbl;
 
 		if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).pid,
 				       cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
@@ -1928,9 +1916,11 @@ int neightbl_dump_info(struct sk_buff *s
 		}
 
 		neigh_skip = 0;
+next_tbl:
+		++tidx;
 	}
 out:
-	read_unlock(&neigh_tbl_lock);
+	rcu_read_unlock();
 	cb->args[0] = tidx;
 	cb->args[1] = nidx;
 
@@ -2020,22 +2010,25 @@ out:
 int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct neigh_table *tbl;
-	int t, family, s_t;
+	int t = 0, family, s_t;
+
 
-	read_lock(&neigh_tbl_lock);
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 	s_t = cb->args[0];
 
-	for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		if (t < s_t || (family && tbl->family != family))
-			continue;
+			goto next;
 		if (t > s_t)
 			memset(&cb->args[1], 0, sizeof(cb->args) -
 						sizeof(cb->args[0]));
 		if (neigh_dump_table(tbl, skb, cb) < 0)
 			break;
+next:
+		++t;
 	}
-	read_unlock(&neigh_tbl_lock);
+	rcu_read_unlock();
 
 	cb->args[0] = t;
 	return skb->len;
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -136,7 +136,7 @@ struct pneigh_entry
 
 struct neigh_table
 {
-	struct neigh_table	*next;
+	struct list_head	list;
 	int			family;
 	int			entry_size;
 	int			key_len;

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 2/6] neighbour: convert neighbour hash table to hlist
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: hlist1.patch --]
[-- Type: text/plain, Size: 11860 bytes --]

Change the neighbour table hash list to hlist from list.h
to allow for easier later conversion to RCU.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---
 include/net/neighbour.h |    6 -
 net/core/neighbour.c    |  160 +++++++++++++++++++++++++-----------------------
 2 files changed, 88 insertions(+), 78 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -88,10 +88,10 @@ struct neigh_statistics
 
 struct neighbour
 {
-	struct neighbour	*next;
+	struct hlist_node	hlist;
 	struct neigh_table	*tbl;
 	struct neigh_parms	*parms;
-	struct net_device		*dev;
+	struct net_device	*dev;
 	unsigned long		used;
 	unsigned long		confirmed;
 	unsigned long		updated;
@@ -161,7 +161,7 @@ struct neigh_table
 	unsigned long		last_rand;
 	kmem_cache_t		*kmem_cachep;
 	struct neigh_statistics	*stats;
-	struct neighbour	**hash_buckets;
+	struct hlist_head	*hash_buckets;
 	unsigned int		hash_mask;
 	__u32			hash_rnd;
 	unsigned int		hash_chain_gc;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -126,10 +126,11 @@ static int neigh_forced_gc(struct neigh_
 
 	write_lock_bh(&tbl->lock);
 	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np;
+		struct neighbour *n;
+		struct hlist_node *node, *tmp;
 
-		np = &tbl->hash_buckets[i];
-		while ((n = *np) != NULL) {
+		hlist_for_each_entry_safe(n, node, tmp,
+					  &tbl->hash_buckets[i], hlist) {
 			/* Neighbour record may be discarded if:
 			 * - nobody refers to it.
 			 * - it is not permanent
@@ -137,7 +138,7 @@ static int neigh_forced_gc(struct neigh_
 			write_lock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
 			    !(n->nud_state & NUD_PERMANENT)) {
-				*np	= n->next;
+				hlist_del(&n->hlist);
 				n->dead = 1;
 				shrunk	= 1;
 				write_unlock(&n->lock);
@@ -145,7 +146,6 @@ static int neigh_forced_gc(struct neigh_
 				continue;
 			}
 			write_unlock(&n->lock);
-			np = &n->next;
 		}
 	}
 
@@ -181,14 +181,15 @@ static void neigh_flush_dev(struct neigh
 	int i;
 
 	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np = &tbl->hash_buckets[i];
+		struct hlist_node *node, *tmp;
+		struct neighbour *n;
 
-		while ((n = *np) != NULL) {
-			if (dev && n->dev != dev) {
-				np = &n->next;
+		hlist_for_each_entry_safe(n, node, tmp,
+					  &tbl->hash_buckets[i], hlist) {
+			if (dev && n->dev != dev)
 				continue;
-			}
-			*np = n->next;
+
+			hlist_del(&n->hlist);
 			write_lock(&n->lock);
 			neigh_del_timer(n);
 			n->dead = 1;
@@ -279,23 +280,20 @@ out_entries:
 	goto out;
 }
 
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static struct hlist_head *neigh_hash_alloc(unsigned int entries)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
-	struct neighbour **ret;
+	unsigned long size = entries * sizeof(struct hlist_head);
 
-	if (size <= PAGE_SIZE) {
-		ret = kzalloc(size, GFP_ATOMIC);
-	} else {
-		ret = (struct neighbour **)
+	if (size <= PAGE_SIZE)
+		return kzalloc(size, GFP_ATOMIC);
+	else
+		return (struct hlist_head *)
 		      __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
-	}
-	return ret;
 }
 
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free(struct hlist_head *hash, unsigned int entries)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
+	unsigned long size = entries * sizeof(struct hlist_head);
 
 	if (size <= PAGE_SIZE)
 		kfree(hash);
@@ -305,7 +303,7 @@ static void neigh_hash_free(struct neigh
 
 static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
 {
-	struct neighbour **new_hash, **old_hash;
+	struct hlist_head *new_hash, *old_hash;
 	unsigned int i, new_hash_mask, old_entries;
 
 	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
@@ -321,16 +319,15 @@ static void neigh_hash_grow(struct neigh
 
 	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 	for (i = 0; i < old_entries; i++) {
-		struct neighbour *n, *next;
+		struct neighbour *n;
+		struct hlist_node *node, *tmp;
 
-		for (n = old_hash[i]; n; n = next) {
+		hlist_for_each_entry_safe(n, node, tmp, &old_hash[i], hlist) {
 			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
 
 			hash_val &= new_hash_mask;
-			next = n->next;
-
-			n->next = new_hash[hash_val];
-			new_hash[hash_val] = n;
+			hlist_del(&n->hlist);
+			hlist_add_head(&n->hlist, &new_hash[hash_val]);
 		}
 	}
 	tbl->hash_buckets = new_hash;
@@ -343,19 +340,22 @@ struct neighbour *neigh_lookup(struct ne
 			       struct net_device *dev)
 {
 	struct neighbour *n;
+	struct hlist_node *tmp;
 	int key_len = tbl->key_len;
 	u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
 	
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
 	read_lock_bh(&tbl->lock);
-	for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
-			break;
+			goto found;
 		}
 	}
+	n = NULL;
+found:
 	read_unlock_bh(&tbl->lock);
 	return n;
 }
@@ -363,19 +363,22 @@ struct neighbour *neigh_lookup(struct ne
 struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
 {
 	struct neighbour *n;
+	struct hlist_node *tmp;
 	int key_len = tbl->key_len;
 	u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask;
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
 	read_lock_bh(&tbl->lock);
-	for (n = tbl->hash_buckets[hash_val]; n; n = n->next) {
+	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (!memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
-			break;
+			goto found;
 		}
 	}
+	n = NULL;
+found:
 	read_unlock_bh(&tbl->lock);
 	return n;
 }
@@ -387,6 +390,7 @@ struct neighbour *neigh_create(struct ne
 	int key_len = tbl->key_len;
 	int error;
 	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct hlist_node *tmp;
 
 	if (!n) {
 		rc = ERR_PTR(-ENOBUFS);
@@ -424,7 +428,7 @@ struct neighbour *neigh_create(struct ne
 		goto out_tbl_unlock;
 	}
 
-	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+	hlist_for_each_entry(n1, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
 			neigh_hold(n1);
 			rc = n1;
@@ -432,8 +436,7 @@ struct neighbour *neigh_create(struct ne
 		}
 	}
 
-	n->next = tbl->hash_buckets[hash_val];
-	tbl->hash_buckets[hash_val] = n;
+	hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
 	n->dead = 0;
 	neigh_hold(n);
 	write_unlock_bh(&tbl->lock);
@@ -635,7 +638,9 @@ static void neigh_connect(struct neighbo
 static void neigh_periodic_timer(unsigned long arg)
 {
 	struct neigh_table *tbl = (struct neigh_table *)arg;
-	struct neighbour *n, **np;
+	struct neighbour *n;
+	struct hlist_head *head;
+	struct hlist_node *node, *tmp;
 	unsigned long expire, now = jiffies;
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
@@ -654,19 +659,17 @@ static void neigh_periodic_timer(unsigne
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	np = &tbl->hash_buckets[tbl->hash_chain_gc];
+	head = &tbl->hash_buckets[tbl->hash_chain_gc];
 	tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
 
-	while ((n = *np) != NULL) {
+	hlist_for_each_entry_safe(n, node, tmp, head, hlist) {
 		unsigned int state;
 
 		write_lock(&n->lock);
 
 		state = n->nud_state;
-		if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
-			write_unlock(&n->lock);
+		if (state & (NUD_PERMANENT | NUD_IN_TIMER))
 			goto next_elt;
-		}
 
 		if (time_before(n->used, n->confirmed))
 			n->used = n->confirmed;
@@ -674,16 +677,14 @@ static void neigh_periodic_timer(unsigne
 		if (atomic_read(&n->refcnt) == 1 &&
 		    (state == NUD_FAILED ||
 		     time_after(now, n->used + n->parms->gc_staletime))) {
-			*np = n->next;
+			hlist_del(&n->hlist);
 			n->dead = 1;
 			write_unlock(&n->lock);
 			neigh_release(n);
 			continue;
 		}
+	next_elt:
 		write_unlock(&n->lock);
-
-next_elt:
-		np = &n->next;
 	}
 
  	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
@@ -1976,27 +1977,29 @@ nla_put_failure:
 static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			    struct netlink_callback *cb)
 {
-	struct neighbour *n;
 	int rc, h, s_h = cb->args[1];
 	int idx, s_idx = idx = cb->args[2];
 
 	for (h = 0; h <= tbl->hash_mask; h++) {
+		struct neighbour *n;
+		struct hlist_node *tmp;
+
 		if (h < s_h)
 			continue;
 		if (h > s_h)
 			s_idx = 0;
 		read_lock_bh(&tbl->lock);
-		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) {
-			if (idx < s_idx)
-				continue;
-			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
+		idx = 0;
+		hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+			if (idx >= s_idx &&
+			    neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 					    cb->nlh->nlmsg_seq,
-					    RTM_NEWNEIGH,
-					    NLM_F_MULTI) <= 0) {
+					    RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
 				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
 			}
+			++idx;
 		}
 		read_unlock_bh(&tbl->lock);
 	}
@@ -2040,10 +2043,10 @@ void neigh_for_each(struct neigh_table *
 
 	read_lock_bh(&tbl->lock);
 	for (chain = 0; chain <= tbl->hash_mask; chain++) {
-		struct neighbour *n;
+		struct hlist_node *p;
 
-		for (n = tbl->hash_buckets[chain]; n; n = n->next)
-			cb(n, cookie);
+		hlist_for_each(p, &tbl->hash_buckets[chain])
+			cb(hlist_entry(p, struct neighbour, hlist), cookie);
 	}
 	read_unlock_bh(&tbl->lock);
 }
@@ -2056,19 +2059,19 @@ void __neigh_for_each_release(struct nei
 	int chain;
 
 	for (chain = 0; chain <= tbl->hash_mask; chain++) {
-		struct neighbour *n, **np;
+		struct neighbour *n;
+		struct hlist_node *tmp, *next;
 
-		np = &tbl->hash_buckets[chain];
-		while ((n = *np) != NULL) {
+		hlist_for_each_entry_safe(n, tmp, next,
+					  &tbl->hash_buckets[chain], hlist) {
 			int release;
 
 			write_lock(&n->lock);
 			release = cb(n);
 			if (release) {
-				*np = n->next;
+				hlist_del(&n->hlist);
 				n->dead = 1;
-			} else
-				np = &n->next;
+			}
 			write_unlock(&n->lock);
 			if (release)
 				neigh_release(n);
@@ -2088,33 +2091,39 @@ static struct neighbour *neigh_get_first
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
-		n = tbl->hash_buckets[bucket];
+		struct hlist_node *tmp;
 
-		while (n) {
+		hlist_for_each_entry(n, tmp, &tbl->hash_buckets[bucket], hlist) {
 			if (state->neigh_sub_iter) {
 				loff_t fakep = 0;
 				void *v;
 
 				v = state->neigh_sub_iter(state, n, &fakep);
 				if (!v)
-					goto next;
+					continue;
 			}
+
 			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
-				break;
+				goto found;
 			if (n->nud_state & ~NUD_NOARP)
-				break;
-		next:
-			n = n->next;
+				goto found;
 		}
-
-		if (n)
-			break;
+		n = NULL;
 	}
+found:
 	state->bucket = bucket;
 
 	return n;
 }
 
+static struct neighbour *next_neigh(struct hlist_node *node)
+{
+	if (node)
+		return hlist_entry(node, struct neighbour, hlist);
+	else
+		return NULL;
+}
+
 static struct neighbour *neigh_get_next(struct seq_file *seq,
 					struct neighbour *n,
 					loff_t *pos)
@@ -2127,7 +2136,8 @@ static struct neighbour *neigh_get_next(
 		if (v)
 			return n;
 	}
-	n = n->next;
+
+	n = next_neigh(n->hlist.next);
 
 	while (1) {
 		while (n) {
@@ -2143,7 +2153,7 @@ static struct neighbour *neigh_get_next(
 			if (n->nud_state & ~NUD_NOARP)
 				break;
 		next:
-			n = n->next;
+			n = next_neigh(n->hlist.next);
 		}
 
 		if (n)
@@ -2152,7 +2162,7 @@ static struct neighbour *neigh_get_next(
 		if (++state->bucket > tbl->hash_mask)
 			break;
 
-		n = tbl->hash_buckets[state->bucket];
+		n = next_neigh(tbl->hash_buckets[state->bucket].first);
 	}
 
 	if (n && pos)

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 3/6] neighbour: convert pneigh hash table to hlist
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: hlist2.patch --]
[-- Type: text/plain, Size: 5376 bytes --]

Change the pneigh_entry table to hlist from list.h
to allow for easier later conversion to RCU.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---
 include/net/neighbour.h |    6 ++--
 net/core/neighbour.c    |   58 ++++++++++++++++++++++++------------------------
 2 files changed, 33 insertions(+), 31 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -124,8 +124,8 @@ struct neigh_ops
 
 struct pneigh_entry
 {
-	struct pneigh_entry	*next;
-	struct net_device		*dev;
+	struct hlist_node	hlist;
+	struct net_device	*dev;
 	u8			key[0];
 };
 
@@ -165,7 +165,7 @@ struct neigh_table
 	unsigned int		hash_mask;
 	__u32			hash_rnd;
 	unsigned int		hash_chain_gc;
-	struct pneigh_entry	**phash_buckets;
+	struct hlist_head	*phash_buckets;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*pde;
 #endif
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -455,6 +455,7 @@ struct pneigh_entry * pneigh_lookup(stru
 				    struct net_device *dev, int creat)
 {
 	struct pneigh_entry *n;
+	struct hlist_node *tmp;
 	int key_len = tbl->key_len;
 	u32 hash_val = *(u32 *)(pkey + key_len - 4);
 
@@ -465,7 +466,7 @@ struct pneigh_entry * pneigh_lookup(stru
 
 	read_lock_bh(&tbl->lock);
 
-	for (n = tbl->phash_buckets[hash_val]; n; n = n->next) {
+	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) &&
 		    (n->dev == dev || !n->dev)) {
 			read_unlock_bh(&tbl->lock);
@@ -495,8 +496,7 @@ struct pneigh_entry * pneigh_lookup(stru
 	}
 
 	write_lock_bh(&tbl->lock);
-	n->next = tbl->phash_buckets[hash_val];
-	tbl->phash_buckets[hash_val] = n;
+	hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
 	write_unlock_bh(&tbl->lock);
 out:
 	return n;
@@ -506,7 +506,8 @@ out:
 int pneigh_delete(struct neigh_table *tbl, const void *pkey,
 		  struct net_device *dev)
 {
-	struct pneigh_entry *n, **np;
+	struct pneigh_entry *n;
+	struct hlist_node *tmp;
 	int key_len = tbl->key_len;
 	u32 hash_val = *(u32 *)(pkey + key_len - 4);
 
@@ -516,10 +517,9 @@ int pneigh_delete(struct neigh_table *tb
 	hash_val &= PNEIGH_HASHMASK;
 
 	write_lock_bh(&tbl->lock);
-	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
-	     np = &n->next) {
+	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
-			*np = n->next;
+			hlist_del(&n->hlist);
 			write_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
@@ -535,22 +535,21 @@ int pneigh_delete(struct neigh_table *tb
 
 static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 {
-	struct pneigh_entry *n, **np;
 	u32 h;
 
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
-		np = &tbl->phash_buckets[h];
-		while ((n = *np) != NULL) {
+		struct pneigh_entry *n;
+		struct hlist_node *tmp, *nxt;
+
+		hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
 			if (!dev || n->dev == dev) {
-				*np = n->next;
+				hlist_del(&n->hlist);
 				if (tbl->pdestructor)
 					tbl->pdestructor(n);
 				if (n->dev)
 					dev_put(n->dev);
 				kfree(n);
-				continue;
 			}
-			np = &n->next;
 		}
 	}
 	return -ENOENT;
@@ -1332,7 +1331,6 @@ void neigh_parms_destroy(struct neigh_pa
 void neigh_table_init_no_netlink(struct neigh_table *tbl)
 {
 	unsigned long now = jiffies;
-	unsigned long phsize;
 
 	atomic_set(&tbl->parms.refcnt, 1);
 	INIT_RCU_HEAD(&tbl->parms.rcu_head);
@@ -1359,8 +1357,8 @@ void neigh_table_init_no_netlink(struct 
 	tbl->hash_mask = 1;
 	tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
 
-	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
-	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
+	tbl->phash_buckets = kcalloc(PNEIGH_HASHMASK + 1, sizeof(struct hlist_head),
+				     GFP_KERNEL);
 
 	if (!tbl->hash_buckets || !tbl->phash_buckets)
 		panic("cannot allocate neighbour cache hashes");
@@ -2188,18 +2186,18 @@ static struct pneigh_entry *pneigh_get_f
 {
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
-	struct pneigh_entry *pn = NULL;
+	struct hlist_node *pn = NULL;
 	int bucket = state->bucket;
 
 	state->flags |= NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
-		pn = tbl->phash_buckets[bucket];
+		pn = tbl->phash_buckets[bucket].first;
 		if (pn)
 			break;
 	}
 	state->bucket = bucket;
 
-	return pn;
+	return pn ? hlist_entry(pn, struct pneigh_entry, hlist) : NULL;
 }
 
 static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
@@ -2208,20 +2206,24 @@ static struct pneigh_entry *pneigh_get_n
 {
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
+	struct hlist_node *tmp = &pn->hlist;
 
-	pn = pn->next;
-	while (!pn) {
-		if (++state->bucket > PNEIGH_HASHMASK)
-			break;
-		pn = tbl->phash_buckets[state->bucket];
-		if (pn)
-			break;
+	tmp = tmp->next;
+	if (tmp)
+		goto found;
+
+	while (++state->bucket < PNEIGH_HASHMASK) {
+		tmp = tbl->phash_buckets[state->bucket].first;
+		if (tmp)
+			goto found;
 	}
+	return NULL;
 
-	if (pn && pos)
+found:
+	if (pos)
 		--(*pos);
 
-	return pn;
+	return hlist_entry(tmp, struct pneigh_entry, hlist);
 }
 
 static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos)

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 4/6] net neighbour: convert to RCU
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
                   ` (2 preceding siblings ...)
  2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  2006-08-29 15:28   ` Alexey Kuznetsov
  2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger
  5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: rcu-neigh.patch --]
[-- Type: text/plain, Size: 15275 bytes --]

Use RCU to allow for lockless access to the neighbour table.
This should speed up the send path because no atomic operations
will be needed to look up ARP entries, etc.


Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---
 include/net/neighbour.h |    4 -
 net/core/neighbour.c    |  158 +++++++++++++++++++++++++-----------------------
 2 files changed, 87 insertions(+), 75 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -108,6 +108,7 @@ struct neighbour
 	struct sk_buff_head	arp_queue;
 	struct timer_list	timer;
 	struct neigh_ops	*ops;
+	struct rcu_head		rcu;
 	u8			primary_key[0];
 };
 
@@ -126,6 +127,7 @@ struct pneigh_entry
 {
 	struct hlist_node	hlist;
 	struct net_device	*dev;
+	struct rcu_head		rcu;
 	u8			key[0];
 };
 
@@ -157,7 +159,7 @@ struct neigh_table
 	struct timer_list 	proxy_timer;
 	struct sk_buff_head	proxy_queue;
 	atomic_t		entries;
-	rwlock_t		lock;
+	spinlock_t		lock;
 	unsigned long		last_rand;
 	kmem_cache_t		*kmem_cachep;
 	struct neigh_statistics	*stats;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -67,9 +67,10 @@ static struct file_operations neigh_stat
 #endif
 
 /*
-   Neighbour hash table buckets are protected with rwlock tbl->lock.
+   Neighbour hash table buckets are protected with lock tbl->lock.
 
-   - All the scans/updates to hash buckets MUST be made under this lock.
+   - All the scans of hash buckets must be made with RCU read lock (nopreempt)
+   - updates to hash buckets MUST be made under this lock.
    - NOTHING clever should be made under this lock: no callbacks
      to protocol backends, no attempts to send something to network.
      It will result in deadlocks, if backend/driver wants to use neighbour
@@ -117,6 +118,13 @@ unsigned long neigh_rand_reach_time(unsi
 }
 
 
+static void neigh_rcu_release(struct rcu_head *head)
+{
+	struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+	neigh_release(neigh);
+}
+
 static int neigh_forced_gc(struct neigh_table *tbl)
 {
 	int shrunk = 0;
@@ -124,7 +132,7 @@ static int neigh_forced_gc(struct neigh_
 
 	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	for (i = 0; i <= tbl->hash_mask; i++) {
 		struct neighbour *n;
 		struct hlist_node *node, *tmp;
@@ -138,11 +146,11 @@ static int neigh_forced_gc(struct neigh_
 			write_lock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
 			    !(n->nud_state & NUD_PERMANENT)) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 				shrunk	= 1;
 				write_unlock(&n->lock);
-				neigh_release(n);
+				call_rcu(&n->rcu, neigh_rcu_release);
 				continue;
 			}
 			write_unlock(&n->lock);
@@ -151,7 +159,7 @@ static int neigh_forced_gc(struct neigh_
 
 	tbl->last_flush = jiffies;
 
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	return shrunk;
 }
@@ -189,7 +197,7 @@ static void neigh_flush_dev(struct neigh
 			if (dev && n->dev != dev)
 				continue;
 
-			hlist_del(&n->hlist);
+			hlist_del_rcu(&n->hlist);
 			write_lock(&n->lock);
 			neigh_del_timer(n);
 			n->dead = 1;
@@ -220,17 +228,17 @@ static void neigh_flush_dev(struct neigh
 
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 }
 
 int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev);
 	pneigh_ifdown(tbl, dev);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
@@ -326,8 +334,8 @@ static void neigh_hash_grow(struct neigh
 			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
 
 			hash_val &= new_hash_mask;
-			hlist_del(&n->hlist);
-			hlist_add_head(&n->hlist, &new_hash[hash_val]);
+			__hlist_del(&n->hlist);
+			hlist_add_head_rcu(&n->hlist, &new_hash[hash_val]);
 		}
 	}
 	tbl->hash_buckets = new_hash;
@@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
 	
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
@@ -356,7 +364,7 @@ struct neighbour *neigh_lookup(struct ne
 	}
 	n = NULL;
 found:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return n;
 }
 
@@ -369,8 +377,8 @@ struct neighbour *neigh_lookup_nodev(str
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (!memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
@@ -379,7 +387,7 @@ struct neighbour *neigh_lookup_nodev(str
 	}
 	n = NULL;
 found:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return n;
 }
 
@@ -416,7 +424,7 @@ struct neighbour *neigh_create(struct ne
 
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
 		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
@@ -436,21 +444,22 @@ struct neighbour *neigh_create(struct ne
 		}
 	}
 
-	hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
 	n->dead = 0;
 	neigh_hold(n);
-	write_unlock_bh(&tbl->lock);
+	hlist_add_head_rcu(&n->hlist, &tbl->hash_buckets[hash_val]);
+	spin_unlock_bh(&tbl->lock);
 	NEIGH_PRINTK2("neigh %p is created.\n", n);
 	rc = n;
 out:
 	return rc;
 out_tbl_unlock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 out_neigh_release:
 	neigh_release(n);
 	goto out;
 }
 
+/* Assumes rcu_read_lock is held */
 struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
 				    struct net_device *dev, int creat)
 {
@@ -464,16 +473,14 @@ struct pneigh_entry * pneigh_lookup(stru
 	hash_val ^= hash_val >> 4;
 	hash_val &= PNEIGH_HASHMASK;
 
-	read_lock_bh(&tbl->lock);
-
-	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
+	hlist_for_each_entry_rcu(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) &&
 		    (n->dev == dev || !n->dev)) {
-			read_unlock_bh(&tbl->lock);
+			rcu_read_unlock();
 			goto out;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+
 	n = NULL;
 	if (!creat)
 		goto out;
@@ -495,13 +502,18 @@ struct pneigh_entry * pneigh_lookup(stru
 		goto out;
 	}
 
-	write_lock_bh(&tbl->lock);
-	hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
-	write_unlock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
+	hlist_add_head_rcu(&n->hlist, &tbl->phash_buckets[hash_val]);
+	spin_unlock_bh(&tbl->lock);
 out:
 	return n;
 }
 
+static void pneigh_destroy(struct rcu_head *head)
+{
+	struct pneigh_entry *n = container_of(head, struct pneigh_entry, rcu);
+	kfree(n);
+}
 
 int pneigh_delete(struct neigh_table *tbl, const void *pkey,
 		  struct net_device *dev)
@@ -516,20 +528,20 @@ int pneigh_delete(struct neigh_table *tb
 	hash_val ^= hash_val >> 4;
 	hash_val &= PNEIGH_HASHMASK;
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
-			hlist_del(&n->hlist);
-			write_unlock_bh(&tbl->lock);
+			hlist_del_rcu(&n->hlist);
+			spin_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
 			if (n->dev)
 				dev_put(n->dev);
-			kfree(n);
+			call_rcu(&n->rcu, pneigh_destroy);
 			return 0;
 		}
 	}
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	return -ENOENT;
 }
 
@@ -543,7 +555,7 @@ static int pneigh_ifdown(struct neigh_ta
 
 		hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
 			if (!dev || n->dev == dev) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				if (tbl->pdestructor)
 					tbl->pdestructor(n);
 				if (n->dev)
@@ -644,7 +656,7 @@ static void neigh_periodic_timer(unsigne
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
-	write_lock(&tbl->lock);
+	spin_lock(&tbl->lock);
 
 	/*
 	 *	periodically recompute ReachableTime from random function
@@ -676,7 +688,7 @@ static void neigh_periodic_timer(unsigne
 		if (atomic_read(&n->refcnt) == 1 &&
 		    (state == NUD_FAILED ||
 		     time_after(now, n->used + n->parms->gc_staletime))) {
-			hlist_del(&n->hlist);
+			hlist_del_rcu(&n->hlist);
 			n->dead = 1;
 			write_unlock(&n->lock);
 			neigh_release(n);
@@ -697,7 +709,7 @@ static void neigh_periodic_timer(unsigne
 
  	mod_timer(&tbl->gc_timer, now + expire);
 
-	write_unlock(&tbl->lock);
+	spin_unlock(&tbl->lock);
 }
 
 static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1285,10 +1297,10 @@ struct neigh_parms *neigh_parms_alloc(st
 			p->dev = dev;
 		}
 		p->sysctl_table = NULL;
-		write_lock_bh(&tbl->lock);
+		spin_lock_bh(&tbl->lock);
 		p->next		= tbl->parms.next;
 		tbl->parms.next = p;
-		write_unlock_bh(&tbl->lock);
+		spin_unlock_bh(&tbl->lock);
 	}
 	return p;
 }
@@ -1307,19 +1319,19 @@ void neigh_parms_release(struct neigh_ta
 
 	if (!parms || parms == &tbl->parms)
 		return;
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	for (p = &tbl->parms.next; *p; p = &(*p)->next) {
 		if (*p == parms) {
 			*p = parms->next;
 			parms->dead = 1;
-			write_unlock_bh(&tbl->lock);
+			spin_unlock_bh(&tbl->lock);
 			if (parms->dev)
 				dev_put(parms->dev);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
 	}
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	NEIGH_PRINTK1("neigh_parms_release: not found\n");
 }
 
@@ -1365,7 +1377,7 @@ void neigh_table_init_no_netlink(struct 
 
 	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 
-	rwlock_init(&tbl->lock);
+	spin_lock_init(&tbl->lock);
 	init_timer(&tbl->gc_timer);
 	tbl->gc_timer.data     = (unsigned long)tbl;
 	tbl->gc_timer.function = neigh_periodic_timer;
@@ -1620,7 +1632,7 @@ static int neightbl_fill_info(struct sk_
 
 	ndtmsg = nlmsg_data(nlh);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();
 	ndtmsg->ndtm_family = tbl->family;
 	ndtmsg->ndtm_pad1   = 0;
 	ndtmsg->ndtm_pad2   = 0;
@@ -1680,11 +1692,11 @@ static int neightbl_fill_info(struct sk_
 	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
 		goto nla_put_failure;
 
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_cancel(skb, nlh);
 }
 
@@ -1703,7 +1715,7 @@ static int neightbl_fill_param_info(stru
 
 	ndtmsg = nlmsg_data(nlh);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();		/* this may be unnecessary */
 	ndtmsg->ndtm_family = tbl->family;
 	ndtmsg->ndtm_pad1   = 0;
 	ndtmsg->ndtm_pad2   = 0;
@@ -1712,10 +1724,10 @@ static int neightbl_fill_param_info(stru
 	    neightbl_fill_parms(skb, parms) < 0)
 		goto errout;
 
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_end(skb, nlh);
 errout:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_cancel(skb, nlh);
 }
  
@@ -1793,7 +1805,7 @@ int neightbl_set(struct sk_buff *skb, st
 	 * We acquire tbl->lock to be nice to the periodic timers and
 	 * make sure they always see a consistent set of values.
 	 */
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	if (tb[NDTA_PARMS]) {
 		struct nlattr *tbp[NDTPA_MAX+1];
@@ -1874,7 +1886,7 @@ int neightbl_set(struct sk_buff *skb, st
 	err = 0;
 
 errout_tbl_lock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 errout_locked:
 	rcu_read_unlock();
 errout:
@@ -1890,7 +1902,7 @@ int neightbl_dump_info(struct sk_buff *s
 
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		struct neigh_parms *p;
 
@@ -1986,20 +1998,20 @@ static int neigh_dump_table(struct neigh
 			continue;
 		if (h > s_h)
 			s_idx = 0;
-		read_lock_bh(&tbl->lock);
+		rcu_read_lock();
 		idx = 0;
-		hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+		hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[h], hlist) {
 			if (idx >= s_idx &&
 			    neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
-				read_unlock_bh(&tbl->lock);
+				rcu_read_unlock();
 				rc = -1;
 				goto out;
 			}
 			++idx;
 		}
-		read_unlock_bh(&tbl->lock);
+		rcu_read_unlock();
 	}
 	rc = skb->len;
 out:
@@ -2039,14 +2051,15 @@ void neigh_for_each(struct neigh_table *
 {
 	int chain;
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();
 	for (chain = 0; chain <= tbl->hash_mask; chain++) {
+		struct neighbour *n;
 		struct hlist_node *p;
 
-		hlist_for_each(p, &tbl->hash_buckets[chain])
-			cb(hlist_entry(p, struct neighbour, hlist), cookie);
+		hlist_for_each_entry_rcu(n, p, &tbl->hash_buckets[chain], hlist)
+			cb(n, cookie);
 	}
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -2067,12 +2080,12 @@ void __neigh_for_each_release(struct nei
 			write_lock(&n->lock);
 			release = cb(n);
 			if (release) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 			}
 			write_unlock(&n->lock);
 			if (release)
-				neigh_release(n);
+				call_rcu(&n->rcu, neigh_rcu_release);
 		}
 	}
 }
@@ -2116,7 +2129,7 @@ found:
 
 static struct neighbour *next_neigh(struct hlist_node *node)
 {
-	if (node)
+	if (rcu_dereference(node))
 		return hlist_entry(node, struct neighbour, hlist);
 	else
 		return NULL;
@@ -2191,7 +2204,7 @@ static struct pneigh_entry *pneigh_get_f
 
 	state->flags |= NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
-		pn = tbl->phash_buckets[bucket].first;
+		pn = rcu_dereference(tbl->phash_buckets[bucket].first);
 		if (pn)
 			break;
 	}
@@ -2208,12 +2221,12 @@ static struct pneigh_entry *pneigh_get_n
 	struct neigh_table *tbl = state->tbl;
 	struct hlist_node *tmp = &pn->hlist;
 
-	tmp = tmp->next;
+	tmp = rcu_dereference(tmp->next);
 	if (tmp)
 		goto found;
 
 	while (++state->bucket < PNEIGH_HASHMASK) {
-		tmp = tbl->phash_buckets[state->bucket].first;
+		tmp = rcu_dereference(tbl->phash_buckets[state->bucket].first);
 		if (tmp)
 			goto found;
 	}
@@ -2261,7 +2274,7 @@ void *neigh_seq_start(struct seq_file *s
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();
 
 	pos_minus_one = *pos - 1;
 	return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
@@ -2297,10 +2310,7 @@ EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
 {
-	struct neigh_seq_state *state = seq->private;
-	struct neigh_table *tbl = state->tbl;
-
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 5/6] neighbour: convert lookup to sequence lock
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
                   ` (3 preceding siblings ...)
  2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger
  5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: neighbour-seqlock.patch --]
[-- Type: text/plain, Size: 17283 bytes --]

The reading of neighbour table entries can be converted from a slow
reader/writer lock to a fast lockless sequence number check.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>


---
 include/net/neighbour.h |    2 
 net/core/neighbour.c    |  117 +++++++++++++++++++++++++++++-------------------
 net/ipv4/arp.c          |  101 +++++++++++++++++++++++++----------------
 net/ipv6/ndisc.c        |   16 +++---
 net/ipv6/route.c        |   12 ++--
 net/sched/sch_teql.c    |   11 +++-
 6 files changed, 155 insertions(+), 104 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -100,7 +100,7 @@ struct neighbour
 	__u8			type;
 	__u8			dead;
 	atomic_t		probes;
-	rwlock_t		lock;
+	seqlock_t		lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
 	struct hh_cache		*hh;
 	atomic_t		refcnt;
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -143,17 +143,17 @@ static int neigh_forced_gc(struct neigh_
 			 * - nobody refers to it.
 			 * - it is not permanent
 			 */
-			write_lock(&n->lock);
+			write_seqlock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
 			    !(n->nud_state & NUD_PERMANENT)) {
 				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 				shrunk	= 1;
-				write_unlock(&n->lock);
+				write_sequnlock(&n->lock);
 				call_rcu(&n->rcu, neigh_rcu_release);
 				continue;
 			}
-			write_unlock(&n->lock);
+			write_sequnlock(&n->lock);
 		}
 	}
 
@@ -198,7 +198,7 @@ static void neigh_flush_dev(struct neigh
 				continue;
 
 			hlist_del_rcu(&n->hlist);
-			write_lock(&n->lock);
+			write_seqlock(&n->lock);
 			neigh_del_timer(n);
 			n->dead = 1;
 
@@ -220,7 +220,7 @@ static void neigh_flush_dev(struct neigh
 					n->nud_state = NUD_NONE;
 				NEIGH_PRINTK2("neigh %p is stray.\n", n);
 			}
-			write_unlock(&n->lock);
+			write_sequnlock(&n->lock);
 			neigh_release(n);
 		}
 	}
@@ -267,7 +267,7 @@ static struct neighbour *neigh_alloc(str
 	memset(n, 0, tbl->entry_size);
 
 	skb_queue_head_init(&n->arp_queue);
-	rwlock_init(&n->lock);
+	seqlock_init(&n->lock);
 	n->updated	  = n->used = now;
 	n->nud_state	  = NUD_NONE;
 	n->output	  = neigh_blackhole;
@@ -615,7 +615,7 @@ void neigh_destroy(struct neighbour *nei
 /* Neighbour state is suspicious;
    disable fast path.
 
-   Called with write_locked neigh.
+   Called with locked neigh.
  */
 static void neigh_suspect(struct neighbour *neigh)
 {
@@ -632,7 +632,7 @@ static void neigh_suspect(struct neighbo
 /* Neighbour state is OK;
    enable fast path.
 
-   Called with write_locked neigh.
+   Called with locked neigh.
  */
 static void neigh_connect(struct neighbour *neigh)
 {
@@ -676,7 +676,7 @@ static void neigh_periodic_timer(unsigne
 	hlist_for_each_entry_safe(n, node, tmp, head, hlist) {
 		unsigned int state;
 
-		write_lock(&n->lock);
+		write_seqlock(&n->lock);
 
 		state = n->nud_state;
 		if (state & (NUD_PERMANENT | NUD_IN_TIMER))
@@ -690,12 +690,12 @@ static void neigh_periodic_timer(unsigne
 		     time_after(now, n->used + n->parms->gc_staletime))) {
 			hlist_del_rcu(&n->hlist);
 			n->dead = 1;
-			write_unlock(&n->lock);
+			write_sequnlock(&n->lock);
 			neigh_release(n);
 			continue;
 		}
 	next_elt:
-		write_unlock(&n->lock);
+		write_sequnlock(&n->lock);
 	}
 
  	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
@@ -738,7 +738,7 @@ static void neigh_timer_handler(unsigned
 	unsigned state;
 	int notify = 0;
 
-	write_lock(&neigh->lock);
+	write_seqlock(&neigh->lock);
 
 	state = neigh->nud_state;
 	now = jiffies;
@@ -748,6 +748,7 @@ static void neigh_timer_handler(unsigned
 #ifndef CONFIG_SMP
 		printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
 #endif
+		write_sequnlock(&neigh->lock);
 		goto out;
 	}
 
@@ -808,9 +809,9 @@ static void neigh_timer_handler(unsigned
 		 */
 		while (neigh->nud_state == NUD_FAILED &&
 		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
-			write_unlock(&neigh->lock);
+			write_sequnlock(&neigh->lock);
 			neigh->ops->error_report(neigh, skb);
-			write_lock(&neigh->lock);
+			write_seqlock(&neigh->lock);
 		}
 		skb_queue_purge(&neigh->arp_queue);
 	}
@@ -821,20 +822,22 @@ static void neigh_timer_handler(unsigned
 		if (!mod_timer(&neigh->timer, next))
 			neigh_hold(neigh);
 	}
+
 	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
 		struct sk_buff *skb = skb_peek(&neigh->arp_queue);
 		/* keep skb alive even if arp_queue overflows */
 		if (skb)
 			skb_get(skb);
-		write_unlock(&neigh->lock);
+		write_sequnlock(&neigh->lock);
 		neigh->ops->solicit(neigh, skb);
 		atomic_inc(&neigh->probes);
 		if (skb)
 			kfree_skb(skb);
 	} else {
-out:
-		write_unlock(&neigh->lock);
+		write_sequnlock(&neigh->lock);
 	}
+
+out:
 	if (notify)
 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
 
@@ -850,11 +853,11 @@ int __neigh_event_send(struct neighbour 
 	int rc;
 	unsigned long now;
 
-	write_lock_bh(&neigh->lock);
+	write_seqlock_bh(&neigh->lock);
 
 	rc = 0;
 	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
-		goto out_unlock_bh;
+		goto out;
 
 	now = jiffies;
 	
@@ -868,7 +871,7 @@ int __neigh_event_send(struct neighbour 
 		} else {
 			neigh->nud_state = NUD_FAILED;
 			neigh->updated = jiffies;
-			write_unlock_bh(&neigh->lock);
+			write_sequnlock_bh(&neigh->lock);
 
 			if (skb)
 				kfree_skb(skb);
@@ -896,8 +899,8 @@ int __neigh_event_send(struct neighbour 
 		}
 		rc = 1;
 	}
-out_unlock_bh:
-	write_unlock_bh(&neigh->lock);
+out:
+	write_sequnlock_bh(&neigh->lock);
 	return rc;
 }
 
@@ -948,7 +951,7 @@ int neigh_update(struct neighbour *neigh
 	struct net_device *dev;
 	int update_isrouter = 0;
 
-	write_lock_bh(&neigh->lock);
+	write_seqlock_bh(&neigh->lock);
 
 	dev    = neigh->dev;
 	old    = neigh->nud_state;
@@ -1052,22 +1055,23 @@ int neigh_update(struct neighbour *neigh
 		while (neigh->nud_state & NUD_VALID &&
 		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
 			struct neighbour *n1 = neigh;
-			write_unlock_bh(&neigh->lock);
+			write_sequnlock_bh(&neigh->lock);
 			/* On shaper/eql skb->dst->neighbour != neigh :( */
 			if (skb->dst && skb->dst->neighbour)
 				n1 = skb->dst->neighbour;
 			n1->output(skb);
-			write_lock_bh(&neigh->lock);
+			write_seqlock_bh(&neigh->lock);
 		}
 		skb_queue_purge(&neigh->arp_queue);
 	}
-out:
+
 	if (update_isrouter) {
 		neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
 			(neigh->flags | NTF_ROUTER) :
 			(neigh->flags & ~NTF_ROUTER);
 	}
-	write_unlock_bh(&neigh->lock);
+out:
+	write_sequnlock_bh(&neigh->lock);
 
 	if (notify)
 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
@@ -1144,6 +1148,30 @@ int neigh_compat_output(struct sk_buff *
 	return dev_queue_xmit(skb);
 }
 
+static int neigh_hard_header(struct sk_buff *skb, struct net_device *dev,
+			     const struct neighbour *neigh)
+{
+	int rc;
+
+	unsigned seq;
+
+	for(;;) {
+		seq = read_seqbegin(&neigh->lock);
+		rc = dev->hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+
+		if (likely(!read_seqretry(&neigh->lock, seq)))
+			break;
+
+		if (rc < 0)
+			break;
+
+		__skb_pull(skb, rc);
+	}
+
+	return rc;
+}
+
 /* Slow and careful. */
 
 int neigh_resolve_output(struct sk_buff *skb)
@@ -1160,19 +1188,17 @@ int neigh_resolve_output(struct sk_buff 
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
+
 		if (dev->hard_header_cache && !dst->hh) {
-			write_lock_bh(&neigh->lock);
+			write_seqlock_bh(&neigh->lock);
 			if (!dst->hh)
 				neigh_hh_init(neigh, dst, dst->ops->protocol);
 			err = dev->hard_header(skb, dev, ntohs(skb->protocol),
 					       neigh->ha, NULL, skb->len);
-			write_unlock_bh(&neigh->lock);
-		} else {
-			read_lock_bh(&neigh->lock);
-			err = dev->hard_header(skb, dev, ntohs(skb->protocol),
-					       neigh->ha, NULL, skb->len);
-			read_unlock_bh(&neigh->lock);
-		}
+			write_sequnlock_bh(&neigh->lock);
+		} else
+			err = neigh_hard_header(skb, dev, neigh);
+
 		if (err >= 0)
 			rc = neigh->ops->queue_xmit(skb);
 		else
@@ -1196,14 +1222,11 @@ int neigh_connected_output(struct sk_buf
 	int err;
 	struct dst_entry *dst = skb->dst;
 	struct neighbour *neigh = dst->neighbour;
-	struct net_device *dev = neigh->dev;
 
 	__skb_pull(skb, skb->nh.raw - skb->data);
 
-	read_lock_bh(&neigh->lock);
-	err = dev->hard_header(skb, dev, ntohs(skb->protocol),
-			       neigh->ha, NULL, skb->len);
-	read_unlock_bh(&neigh->lock);
+	err = neigh_hard_header(skb, neigh->dev, neigh);
+
 	if (err >= 0)
 		err = neigh->ops->queue_xmit(skb);
 	else {
@@ -1960,11 +1983,15 @@ static int neigh_fill_info(struct sk_buf
 
 	NLA_PUT(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key);
 
-	read_lock_bh(&neigh->lock);
 	ndm->ndm_state	 = neigh->nud_state;
+
+	/* Not really updating this neighbour but don't want to
+	 * deal with the unwind case when seqlock needs retry
+	 */
+	write_seqlock_bh(&neigh->lock);
 	if ((neigh->nud_state & NUD_VALID) &&
 	    nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
-		read_unlock_bh(&neigh->lock);
+		write_sequnlock_bh(&neigh->lock);
 		goto nla_put_failure;
 	}
 
@@ -1972,7 +1999,7 @@ static int neigh_fill_info(struct sk_buf
 	ci.ndm_confirmed = now - neigh->confirmed;
 	ci.ndm_updated	 = now - neigh->updated;
 	ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1;
-	read_unlock_bh(&neigh->lock);
+	write_sequnlock_bh(&neigh->lock);
 
 	NLA_PUT_U32(skb, NDA_PROBES, atomic_read(&neigh->probes));
 	NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci);
@@ -2077,13 +2104,13 @@ void __neigh_for_each_release(struct nei
 					  &tbl->hash_buckets[chain], hlist) {
 			int release;
 
-			write_lock(&n->lock);
+			write_seqlock(&n->lock);
 			release = cb(n);
 			if (release) {
 				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 			}
-			write_unlock(&n->lock);
+			write_sequnlock(&n->lock);
 			if (release)
 				call_rcu(&n->rcu, neigh_rcu_release);
 		}
--- net-2.6.19.orig/net/ipv4/arp.c
+++ net-2.6.19/net/ipv4/arp.c
@@ -328,6 +328,31 @@ static void arp_error_report(struct neig
 	kfree_skb(skb);
 }
 
+
+static unsigned arp_state_to_flags(const struct neighbour *neigh)
+{
+	unsigned flags = 0;
+	if (neigh->nud_state&NUD_PERMANENT)
+		flags = ATF_PERM|ATF_COM;
+	else if (neigh->nud_state&NUD_VALID)
+		flags = ATF_COM;
+	return flags;
+}
+
+static void arp_get_neigh_addr(u8 *ha, const struct neighbour *neigh,
+			       unsigned len, unsigned *flags)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqbegin(&neigh->lock);
+		memcpy(ha, neigh->ha, len);
+		if (flags)
+			*flags = arp_state_to_flags(neigh);
+	} while (read_seqretry(&neigh->lock, seq));
+
+}
+
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 {
 	u32 saddr = 0;
@@ -369,8 +394,12 @@ static void arp_solicit(struct neighbour
 	if ((probes -= neigh->parms->ucast_probes) < 0) {
 		if (!(neigh->nud_state&NUD_VALID))
 			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
-		dst_ha = neigh->ha;
-		read_lock_bh(&neigh->lock);
+
+		dst_ha = kmalloc(MAX_ADDR_LEN, GFP_ATOMIC);
+		if (!dst_ha)
+			return;
+
+		arp_get_neigh_addr(dst_ha, neigh, MAX_ADDR_LEN, NULL);
 	} else if ((probes -= neigh->parms->app_probes) < 0) {
 #ifdef CONFIG_ARPD
 		neigh_app_ns(neigh);
@@ -380,8 +409,9 @@ static void arp_solicit(struct neighbour
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
 		 dst_ha, dev->dev_addr, NULL);
+
 	if (dst_ha)
-		read_unlock_bh(&neigh->lock);
+		kfree(dst_ha);
 }
 
 static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
@@ -489,10 +519,7 @@ int arp_find(unsigned char *haddr, struc
 	if (n) {
 		n->used = jiffies;
 		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-			read_lock_bh(&n->lock);
- 			memcpy(haddr, n->ha, dev->addr_len);
-			read_unlock_bh(&n->lock);
-			neigh_release(n);
+			arp_get_neigh_addr(haddr, n, dev->addr_len, NULL);
 			return 0;
 		}
 		neigh_release(n);
@@ -1047,16 +1074,6 @@ static int arp_req_set(struct arpreq *r,
 	return err;
 }
 
-static unsigned arp_state_to_flags(struct neighbour *neigh)
-{
-	unsigned flags = 0;
-	if (neigh->nud_state&NUD_PERMANENT)
-		flags = ATF_PERM|ATF_COM;
-	else if (neigh->nud_state&NUD_VALID)
-		flags = ATF_COM;
-	return flags;
-}
-
 /*
  *	Get an ARP cache entry.
  */
@@ -1069,10 +1086,8 @@ static int arp_req_get(struct arpreq *r,
 
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		read_lock_bh(&neigh->lock);
-		memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
-		r->arp_flags = arp_state_to_flags(neigh);
-		read_unlock_bh(&neigh->lock);
+		arp_get_neigh_addr(r->arp_ha.sa_data, neigh, dev->addr_len,
+				   &r->arp_flags);
 		r->arp_ha.sa_family = dev->type;
 		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
 		neigh_release(neigh);
@@ -1258,7 +1273,7 @@ void __init arp_init(void)
 /*
  *	ax25 -> ASCII conversion
  */
-static char *ax2asc2(ax25_address *a, char *buf)
+static char *ax2asc2(const ax25_address *a, char *buf)
 {
 	char c, *s;
 	int n;
@@ -1290,35 +1305,41 @@ static char *ax2asc2(ax25_address *a, ch
 #define HBUFFERLEN 30
 
 static void arp_format_neigh_entry(struct seq_file *seq,
-				   struct neighbour *n)
+				   const struct neighbour *n)
 {
 	char hbuffer[HBUFFERLEN];
 	const char hexbuf[] = "0123456789ABCDEF";
 	int k, j;
+	unsigned hflags, seqno;
 	char tbuf[16];
 	struct net_device *dev = n->dev;
 	int hatype = dev->type;
 
-	read_lock(&n->lock);
-	/* Convert hardware address to XX:XX:XX:XX ... form. */
-#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
-		ax2asc2((ax25_address *)n->ha, hbuffer);
-	else {
-#endif
-	for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
-		hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
-		hbuffer[k++] = hexbuf[n->ha[j] & 15];
-		hbuffer[k++] = ':';
-	}
-	hbuffer[--k] = 0;
+	do {
+		seqno = read_seqbegin(&n->lock);
+
+		/* Convert hardware address to XX:XX:XX:XX ... form. */
 #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-	}
+		if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+			ax2asc2((const ax25_address *)n->ha, hbuffer);
+		else
 #endif
-	sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+		{
+			for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+				hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+				hbuffer[k++] = hexbuf[n->ha[j] & 15];
+				hbuffer[k++] = ':';
+			}
+			hbuffer[--k] = 0;
+		}
+
+		sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+		hflags = arp_state_to_flags(n);
+	} while (read_seqretry(&n->lock, seqno));
+
 	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
-		   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
-	read_unlock(&n->lock);
+		   tbuf, hatype, hflags, hbuffer, dev->name);
+
 }
 
 static void arp_format_pneigh_entry(struct seq_file *seq,
--- net-2.6.19.orig/net/ipv6/ndisc.c
+++ net-2.6.19/net/ipv6/ndisc.c
@@ -1412,15 +1412,15 @@ void ndisc_send_redirect(struct sk_buff 
 		return;
 	}
 
-	if (dev->addr_len) {
-		read_lock_bh(&neigh->lock);
-		if (neigh->nud_state & NUD_VALID) {
+	if (dev->addr_len && (neigh->nud_state & NUD_VALID)) {
+		unsigned seq;
+		do {
+			seq = read_seqbegin(&neigh->lock);
 			memcpy(ha_buf, neigh->ha, dev->addr_len);
-			read_unlock_bh(&neigh->lock);
-			ha = ha_buf;
-			len += ndisc_opt_addr_space(dev);
-		} else
-			read_unlock_bh(&neigh->lock);
+		} while (read_seqretry(&neigh->lock, seq));
+
+		ha = ha_buf;
+		len += ndisc_opt_addr_space(dev);
 	}
 
 	rd_len = min_t(unsigned int,
--- net-2.6.19.orig/net/ipv6/route.c
+++ net-2.6.19/net/ipv6/route.c
@@ -279,20 +279,19 @@ static void rt6_probe(struct rt6_info *r
 	 */
 	if (!neigh || (neigh->nud_state & NUD_VALID))
 		return;
-	read_lock_bh(&neigh->lock);
+
 	if (!(neigh->nud_state & NUD_VALID) &&
-	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
+	    time_after(jiffies,
+		       neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 		struct in6_addr mcaddr;
 		struct in6_addr *target;
 
 		neigh->updated = jiffies;
-		read_unlock_bh(&neigh->lock);
 
 		target = (struct in6_addr *)&neigh->primary_key;
 		addrconf_addr_solict_mult(target, &mcaddr);
 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
-	} else
-		read_unlock_bh(&neigh->lock);
+	}
 }
 #else
 static inline void rt6_probe(struct rt6_info *rt)
@@ -323,10 +322,9 @@ static int inline rt6_check_neigh(struct
 	    !(rt->rt6i_flags & RTF_GATEWAY))
 		m = 1;
 	else if (neigh) {
-		read_lock_bh(&neigh->lock);
+		smp_rmb();
 		if (neigh->nud_state & NUD_VALID)
 			m = 2;
-		read_unlock_bh(&neigh->lock);
 	}
 	return m;
 }
--- net-2.6.19.orig/net/sched/sch_teql.c
+++ net-2.6.19/net/sched/sch_teql.c
@@ -248,9 +248,14 @@ __teql_resolve(struct sk_buff *skb, stru
 	}
 	if (neigh_event_send(n, skb_res) == 0) {
 		int err;
-		read_lock(&n->lock);
-		err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len);
-		read_unlock(&n->lock);
+		unsigned seq;
+
+		do {
+			seq = read_seqbegin(&n->lock);
+			err = dev->hard_header(skb, dev, ntohs(skb->protocol),
+					       n->ha, NULL, skb->len);
+		} while (read_seqretry(&n->lock, seq));
+
 		if (err < 0) {
 			neigh_release(n);
 			return -EINVAL;

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 6/6] neighbour: convert hard header cache to sequence number
  2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
                   ` (4 preceding siblings ...)
  2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
@ 2006-08-28 23:07 ` Stephen Hemminger
  5 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-28 23:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: hardheader-rcu.patch --]
[-- Type: text/plain, Size: 5181 bytes --]

The reading of the hard header cache in the output path can be
made lockless using seqlock.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>

---
 include/linux/netdevice.h |    3 ++-
 include/net/neighbour.h   |    2 ++
 net/core/neighbour.c      |   40 +++++++++++++++++++++++++++++++++++-----
 net/ipv4/ip_output.c      |   13 +++----------
 net/ipv6/ip6_output.c     |   13 +++----------
 5 files changed, 45 insertions(+), 26 deletions(-)

--- net-2.6.19.orig/include/linux/netdevice.h
+++ net-2.6.19/include/linux/netdevice.h
@@ -193,7 +193,7 @@ struct hh_cache
                                          */
 	int		hh_len;		/* length of header */
 	int		(*hh_output)(struct sk_buff *skb);
-	rwlock_t	hh_lock;
+	seqlock_t	hh_lock;
 
 	/* cached hardware header; allow for machine alignment needs.        */
 #define HH_DATA_MOD	16
@@ -217,6 +217,7 @@ struct hh_cache
 #define LL_RESERVED_SPACE_EXTRA(dev,extra) \
 	((((dev)->hard_header_len+extra)&~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
 
+
 /* These flag bits are private to the generic network queueing
  * layer, they may not be explicitly referenced by any other
  * code.
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -591,9 +591,11 @@ void neigh_destroy(struct neighbour *nei
 	while ((hh = neigh->hh) != NULL) {
 		neigh->hh = hh->hh_next;
 		hh->hh_next = NULL;
-		write_lock_bh(&hh->hh_lock);
+
+		write_seqlock_bh(&hh->hh_lock);
 		hh->hh_output = neigh_blackhole;
-		write_unlock_bh(&hh->hh_lock);
+		write_sequnlock_bh(&hh->hh_lock);
+
 		if (atomic_dec_and_test(&hh->hh_refcnt))
 			kfree(hh);
 	}
@@ -912,9 +914,9 @@ static void neigh_update_hhs(struct neig
 
 	if (update) {
 		for (hh = neigh->hh; hh; hh = hh->hh_next) {
-			write_lock_bh(&hh->hh_lock);
+			write_seqlock_bh(&hh->hh_lock);
 			update(hh, neigh->dev, neigh->ha);
-			write_unlock_bh(&hh->hh_lock);
+			write_sequnlock_bh(&hh->hh_lock);
 		}
 	}
 }
@@ -1105,7 +1107,7 @@ static void neigh_hh_init(struct neighbo
 			break;
 
 	if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
-		rwlock_init(&hh->hh_lock);
+		seqlock_init(&hh->hh_lock);
 		hh->hh_type = protocol;
 		atomic_set(&hh->hh_refcnt, 0);
 		hh->hh_next = NULL;
@@ -1128,6 +1130,33 @@ static void neigh_hh_init(struct neighbo
 	}
 }
 
+
+/*
+ * Add header to skb from hard header cache
+ * Handle case where cache gets changed.
+ */
+int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
+{
+	int len, alen;
+	unsigned seq;
+	int (*output)(struct sk_buff *);
+
+	for(;;) {
+		seq = read_seqbegin(&hh->hh_lock);
+		len = hh->hh_len;
+		alen = HH_DATA_ALIGN(len);
+		output = hh->hh_output;
+		memcpy(skb->data - alen, hh->hh_data, alen);
+		skb_push(skb, len);
+
+		if (likely(!read_seqretry(&hh->hh_lock, seq)))
+			return output(skb);
+
+		/* undo and try again */
+		__skb_pull(skb, len);
+	}
+}
+
 /* This function can be used in contexts, where only old dev_queue_xmit
    worked, f.e. if you want to override normal output path (eql, shaper),
    but resolution is not made yet.
@@ -2767,6 +2796,7 @@ EXPORT_SYMBOL(neigh_delete);
 EXPORT_SYMBOL(neigh_destroy);
 EXPORT_SYMBOL(neigh_dump_info);
 EXPORT_SYMBOL(neigh_event_ns);
+EXPORT_SYMBOL(neigh_hh_output);
 EXPORT_SYMBOL(neigh_ifdown);
 EXPORT_SYMBOL(neigh_lookup);
 EXPORT_SYMBOL(neigh_lookup_nodev);
--- net-2.6.19.orig/net/ipv4/ip_output.c
+++ net-2.6.19/net/ipv4/ip_output.c
@@ -182,16 +182,9 @@ static inline int ip_finish_output2(stru
 		skb = skb2;
 	}
 
-	if (hh) {
-		int hh_alen;
-
-		read_lock_bh(&hh->hh_lock);
-		hh_alen = HH_DATA_ALIGN(hh->hh_len);
-  		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
-		read_unlock_bh(&hh->hh_lock);
-	        skb_push(skb, hh->hh_len);
-		return hh->hh_output(skb);
-	} else if (dst->neighbour)
+	if (hh)
+		return neigh_hh_output(hh, skb);
+	else if (dst->neighbour)
 		return dst->neighbour->output(skb);
 
 	if (net_ratelimit())
--- net-2.6.19.orig/net/ipv6/ip6_output.c
+++ net-2.6.19/net/ipv6/ip6_output.c
@@ -76,16 +76,9 @@ static inline int ip6_output_finish(stru
 	struct dst_entry *dst = skb->dst;
 	struct hh_cache *hh = dst->hh;
 
-	if (hh) {
-		int hh_alen;
-
-		read_lock_bh(&hh->hh_lock);
-		hh_alen = HH_DATA_ALIGN(hh->hh_len);
-		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
-		read_unlock_bh(&hh->hh_lock);
-	        skb_push(skb, hh->hh_len);
-		return hh->hh_output(skb);
-	} else if (dst->neighbour)
+	if (hh)
+		return neigh_hh_output(hh, skb);
+	else if (dst->neighbour)
 		return dst->neighbour->output(skb);
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -193,6 +193,8 @@ extern struct neighbour *	neigh_create(s
 					     struct net_device *dev);
 extern void			neigh_destroy(struct neighbour *neigh);
 extern int			__neigh_event_send(struct neighbour *neigh, struct sk_buff *skb);
+extern int			neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb);
+
 extern int			neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, 
 					     u32 flags);
 extern void			neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev);

--
Stephen Hemminger <shemminger@osdl.org>



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
@ 2006-08-29 15:28   ` Alexey Kuznetsov
  2006-08-29 18:22     ` Stephen Hemminger
  0 siblings, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 15:28 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

> @@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
>  	
>  	NEIGH_CACHE_STAT_INC(tbl, lookups);
>  
> -	read_lock_bh(&tbl->lock);
> -	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> +	rcu_read_lock();
> +	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
>  		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
>  			neigh_hold(n);
>  			NEIGH_CACHE_STAT_INC(tbl, hits);

Sure? Seems, you cannot grab refcnt here, the entry can be already
released.

Probably, you should do atomic_inc_and_test() here and restart lookup,
if it fails.

Alexey

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 15:28   ` Alexey Kuznetsov
@ 2006-08-29 18:22     ` Stephen Hemminger
  2006-08-29 18:34       ` Martin Josefsson
  2006-08-29 21:17       ` Alexey Kuznetsov
  0 siblings, 2 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 18:22 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, netdev

On Tue, 29 Aug 2006 19:28:16 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:

> Hello!
> 
> > @@ -346,8 +354,8 @@ struct neighbour *neigh_lookup(struct ne
> >  	
> >  	NEIGH_CACHE_STAT_INC(tbl, lookups);
> >  
> > -	read_lock_bh(&tbl->lock);
> > -	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> > +	rcu_read_lock();
> > +	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> >  		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> >  			neigh_hold(n);
> >  			NEIGH_CACHE_STAT_INC(tbl, hits);
> 
> Sure? Seems, you cannot grab refcnt here, the entry can be already
> released.
> 
> Probably, you should do atomic_inc_and_test() here and restart lookup,
> if it fails.
> 
> Alexey

atomic_inc_and_test is true iff result is zero, so that won't work.
But the following should work:

	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
			if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
				neigh_release(n);
				continue;
			}
 			NEIGH_CACHE_STAT_INC(tbl, hits);

I'll wrap the atomic_inc_return inside neigh_hold_return()


-- 
Stephen Hemminger <shemminger@osdl.org>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 18:22     ` Stephen Hemminger
@ 2006-08-29 18:34       ` Martin Josefsson
  2006-08-29 20:17         ` Stephen Hemminger
  2006-08-29 21:17       ` Alexey Kuznetsov
  1 sibling, 1 reply; 19+ messages in thread
From: Martin Josefsson @ 2006-08-29 18:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Alexey Kuznetsov, David Miller, netdev

[-- Attachment #1: Type: text/plain, Size: 318 bytes --]

On Tue, 2006-08-29 at 11:22 -0700, Stephen Hemminger wrote:

> > Probably, you should do atomic_inc_and_test() here and restart lookup,
> > if it fails.
> > 
> > Alexey
> 
> atomic_inc_and_test is true iff result is zero, so that won't work.

Wouldn't atomic_inc_not_zero() do what you want?

-- 
/Martin

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 18:34       ` Martin Josefsson
@ 2006-08-29 20:17         ` Stephen Hemminger
  0 siblings, 0 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 20:17 UTC (permalink / raw)
  To: Martin Josefsson; +Cc: Alexey Kuznetsov, David Miller, netdev

On Tue, 29 Aug 2006 20:34:01 +0200
Martin Josefsson <gandalf@wlug.westbo.se> wrote:

> On Tue, 2006-08-29 at 11:22 -0700, Stephen Hemminger wrote:
> 
> > > Probably, you should do atomic_inc_and_test() here and restart lookup,
> > > if it fails.
> > > 
> > > Alexey
> > 
> > atomic_inc_and_test is true iff result is zero, so that won't work.
> 
> Wouldn't atomic_inc_not_zero() do what you want?
> 

Yes, but that code looks more complex for the common case.
It ends up doing a compare exchange and retrying.

-- 
Stephen Hemminger <shemminger@osdl.org>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 18:22     ` Stephen Hemminger
  2006-08-29 18:34       ` Martin Josefsson
@ 2006-08-29 21:17       ` Alexey Kuznetsov
  2006-08-29 21:46         ` Stephen Hemminger
  1 sibling, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 21:17 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

> atomic_inc_and_test is true iff result is zero, so that won't work.

I meant atomic_inc_not_zero(), as Martin noticed.


> But the following should work:
> 
> 	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> 			if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
> 				neigh_release(n);

I do not think it will work. It has exactly the same race condition.

Yes, atomic_inc_not_zero() is expensive. But it looks like it is the cheapest
variant, which works correctly without more work.

Another variant would be rework use of refcnt. It can be done like rt cache:
when release of the last reference does not mean anything.

Also, probably, it makes sense to add neigh_lookup_light(), which does
not take refcnt, but required to call
neigh_release_light() (which is just rcu_read_unlock_bh()).

Alexey

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 21:17       ` Alexey Kuznetsov
@ 2006-08-29 21:46         ` Stephen Hemminger
  2006-08-29 22:16           ` Alexey Kuznetsov
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 21:46 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, netdev

On Wed, 30 Aug 2006 01:17:22 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:

> Hello!
> 
> > atomic_inc_and_test is true iff result is zero, so that won't work.
> 
> I meant atomic_inc_not_zero(), as Martin noticed.
> 
> 
> > But the following should work:
> > 
> > 	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
> > 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
> > 			if (unlikely(&atomic_inc_return(&n->refcnt) == 1)) {
> > 				neigh_release(n);
> 
> I do not think it will work. It has exactly the same race condition.
> 
> Yes, atomic_inc_not_zero() is expensive. But it looks like it is the cheapest
> variant, which works correctly without more work.
> 
> Another variant would be rework use of refcnt. It can be done like rt cache:
> when release of the last reference does not mean anything.
> 
> Also, probably, it makes sense to add neigh_lookup_light(), which does
> not take refcnt, but required to call
> neigh_release_light() (which is just rcu_read_unlock_bh()).

Which code paths would that make sense on?
	fib_detect_death (ok)
	infiniband (ok)
	wireless/strip (ok) -- hey, this code is crap it has
		a refcount leak already!
	arp_req_get (ok)
	ndisc (ok)

Perhaps killing the refcount all together, and just changing
everybody to neigh_lookup_rcu(). Nobody holds a long term reference
to the entries.

> 
> Alexey

Or using the existing dead flag?

@@ -346,17 +359,24 @@ struct neighbour *neigh_lookup(struct ne
 	
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
+			/* Don't resuscitate dead entries */
+			if (unlikely(n->dead)) {
+				neigh_release(n);
+				continue;
+			}
+
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			goto found;
 		}
 	}
 	n = NULL;
 found:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return n;
 }
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 21:46         ` Stephen Hemminger
@ 2006-08-29 22:16           ` Alexey Kuznetsov
  2006-08-29 23:00             ` Stephen Hemminger
  0 siblings, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 22:16 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

> > Also, probably, it makes sense to add neigh_lookup_light(), which does
> > not take refcnt, but required to call
> > neigh_release_light() (which is just rcu_read_unlock_bh()).
> 
> Which code paths would that make sense on?
> 	fib_detect_death (ok)
> 	infiniband (ok)
> 	wireless/strip (ok) -- hey, this code is crap it has
> 		a refcount leak already!
> 	arp_req_get (ok)
> 	ndisc (ok)
> 
> Perhaps killing the refcount all together, and just changing
> everybody to neigh_lookup_rcu(). Nobody holds a long term reference
> to the entries.

The only real user of refcnt is destination cache.

It uses __neigh_lookup/__neigh_lookup_errno, which could also
use neigh_lookup_rcu(), then do atomic_inc_not_zero() and, if it fails
fall to neigh_create().

Alexey

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 22:16           ` Alexey Kuznetsov
@ 2006-08-29 23:00             ` Stephen Hemminger
  2006-08-29 23:21               ` Alexey Kuznetsov
  2006-08-29 23:36               ` Alexey Kuznetsov
  0 siblings, 2 replies; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 23:00 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, netdev

Here is the full version of neigh_lookup_rcu().

The race of interest is what happens when neigh_lookup_rcu() returns a table
entry and it gets deleted at the same time. See Documentation/RCU/listRCU.txt
for more info.

There are a couple different scenario's.

1) update or reader looking at dead entry.  The neighbour entry is queued on the
   rcu callback list, and will not be invoked until all cpu's have cleared
   the barrier.  The readers do see stale data but it is harmless.
   After reader has cleared critical section, the neigh_rcu_release is run
   and that calls neigh_release which sees last reference and calls neigh_destroy.

   neigh_lookup_rcu checks for dead entries, but this is racy and can miss.
   It is just an attempt to close the window slightly.

2) caller wants to create or update entry (__neigh_lookup).  caller gets
   just deleted entry (refcount still 1), then grabs refcount (now 2).
   Similar to above, but the actual neigh_destroy() happens either
   from rcu callback or neigh_release() in caller, whichever is last.


This should not be any more racy than the existing code.

---
 drivers/infiniband/core/addr.c |    7 -
 drivers/net/wireless/strip.c   |   11 +-
 include/net/neighbour.h        |   43 +++++---
 net/core/neighbour.c           |  197 ++++++++++++++++++++++-------------------
 net/ipv4/arp.c                 |   10 +-
 net/ipv6/ndisc.c               |    7 -
 6 files changed, 155 insertions(+), 120 deletions(-)

--- net-2.6.19.orig/include/net/neighbour.h
+++ net-2.6.19/include/net/neighbour.h
@@ -108,6 +108,7 @@ struct neighbour
 	struct sk_buff_head	arp_queue;
 	struct timer_list	timer;
 	struct neigh_ops	*ops;
+	struct rcu_head		rcu;
 	u8			primary_key[0];
 };
 
@@ -126,6 +127,7 @@ struct pneigh_entry
 {
 	struct hlist_node	hlist;
 	struct net_device	*dev;
+	struct rcu_head		rcu;
 	u8			key[0];
 };
 
@@ -157,7 +159,7 @@ struct neigh_table
 	struct timer_list 	proxy_timer;
 	struct sk_buff_head	proxy_queue;
 	atomic_t		entries;
-	rwlock_t		lock;
+	spinlock_t		lock;
 	unsigned long		last_rand;
 	kmem_cache_t		*kmem_cachep;
 	struct neigh_statistics	*stats;
@@ -181,9 +183,9 @@ struct neigh_table
 extern void			neigh_table_init(struct neigh_table *tbl);
 extern void			neigh_table_init_no_netlink(struct neigh_table *tbl);
 extern int			neigh_table_clear(struct neigh_table *tbl);
-extern struct neighbour *	neigh_lookup(struct neigh_table *tbl,
-					     const void *pkey,
-					     struct net_device *dev);
+extern struct neighbour *	neigh_lookup_rcu(struct neigh_table *tbl,
+						 const void *pkey,
+						 const struct net_device *dev);
 extern struct neighbour *	neigh_lookup_nodev(struct neigh_table *tbl,
 						   const void *pkey);
 extern struct neighbour *	neigh_create(struct neigh_table *tbl,
@@ -313,27 +315,38 @@ static inline int neigh_event_send(struc
 
 static inline struct neighbour *__neigh_lookup(struct neigh_table *tbl,
 					       const void *pkey,
-					       struct net_device *dev, int creat)
+					       struct net_device *dev,
+					       int creat)
 {
-	struct neighbour *n = neigh_lookup(tbl, pkey, dev);
-
-	if (n || !creat)
-		return n;
+	struct neighbour *n;
 
-	n = neigh_create(tbl, pkey, dev);
-	return IS_ERR(n) ? NULL : n;
+	rcu_read_lock();
+	n = neigh_lookup_rcu(tbl, pkey, dev);
+	if (n)
+		neigh_hold(n);
+	else if (creat) {
+		n = neigh_create(tbl, pkey, dev);
+		if (IS_ERR(n))
+			n = NULL;
+	}
+	rcu_read_unlock();
+	return n;
 }
 
 static inline struct neighbour *__neigh_lookup_errno(struct neigh_table *tbl,
 						     const void *pkey,
 						     struct net_device *dev)
 {
-	struct neighbour *n = neigh_lookup(tbl, pkey, dev);
+	struct neighbour *n;
 
+	rcu_read_lock();
+	n = neigh_lookup_rcu(tbl, pkey, dev);
 	if (n)
-		return n;
-
-	return neigh_create(tbl, pkey, dev);
+		neigh_hold(n);
+	else
+		n = neigh_create(tbl, pkey, dev);
+	rcu_read_unlock();
+	return n;
 }
 
 struct neighbour_cb {
--- net-2.6.19.orig/net/core/neighbour.c
+++ net-2.6.19/net/core/neighbour.c
@@ -67,9 +67,10 @@ static struct file_operations neigh_stat
 #endif
 
 /*
-   Neighbour hash table buckets are protected with rwlock tbl->lock.
+   Neighbour hash table buckets are protected with lock tbl->lock.
 
-   - All the scans/updates to hash buckets MUST be made under this lock.
+   - All the scans of hash buckets must be made with RCU read lock (nopreempt)
+   - updates to hash buckets MUST be made under this lock.
    - NOTHING clever should be made under this lock: no callbacks
      to protocol backends, no attempts to send something to network.
      It will result in deadlocks, if backend/driver wants to use neighbour
@@ -116,6 +117,17 @@ unsigned long neigh_rand_reach_time(unsi
 	return (base ? (net_random() % base) + (base >> 1) : 0);
 }
 
+/*
+ * After all readers have finished this read-side critical section
+ * decrement ref count and free. If reader raced with us they
+ * may still have harmless dead reference.
+ */
+static void neigh_rcu_release(struct rcu_head *head)
+{
+	struct neighbour *neigh = container_of(head, struct neighbour, rcu);
+
+	neigh_release(neigh);
+}
 
 static int neigh_forced_gc(struct neigh_table *tbl)
 {
@@ -124,7 +136,7 @@ static int neigh_forced_gc(struct neigh_
 
 	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	for (i = 0; i <= tbl->hash_mask; i++) {
 		struct neighbour *n;
 		struct hlist_node *node, *tmp;
@@ -138,11 +150,11 @@ static int neigh_forced_gc(struct neigh_
 			write_lock(&n->lock);
 			if (atomic_read(&n->refcnt) == 1 &&
 			    !(n->nud_state & NUD_PERMANENT)) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 				shrunk	= 1;
 				write_unlock(&n->lock);
-				neigh_release(n);
+				call_rcu(&n->rcu, neigh_rcu_release);
 				continue;
 			}
 			write_unlock(&n->lock);
@@ -151,7 +163,7 @@ static int neigh_forced_gc(struct neigh_
 
 	tbl->last_flush = jiffies;
 
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	return shrunk;
 }
@@ -189,10 +201,10 @@ static void neigh_flush_dev(struct neigh
 			if (dev && n->dev != dev)
 				continue;
 
-			hlist_del(&n->hlist);
 			write_lock(&n->lock);
-			neigh_del_timer(n);
+			hlist_del_rcu(&n->hlist);
 			n->dead = 1;
+			neigh_del_timer(n);
 
 			if (atomic_read(&n->refcnt) != 1) {
 				/* The most unpleasant situation.
@@ -213,24 +225,25 @@ static void neigh_flush_dev(struct neigh
 				NEIGH_PRINTK2("neigh %p is stray.\n", n);
 			}
 			write_unlock(&n->lock);
-			neigh_release(n);
+
+			call_rcu(&n->rcu, neigh_rcu_release);
 		}
 	}
 }
 
 void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 }
 
 int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
 {
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev);
 	pneigh_ifdown(tbl, dev);
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
@@ -326,8 +339,8 @@ static void neigh_hash_grow(struct neigh
 			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
 
 			hash_val &= new_hash_mask;
-			hlist_del(&n->hlist);
-			hlist_add_head(&n->hlist, &new_hash[hash_val]);
+			__hlist_del(&n->hlist);
+			hlist_add_head_rcu(&n->hlist, &new_hash[hash_val]);
 		}
 	}
 	tbl->hash_buckets = new_hash;
@@ -336,8 +349,12 @@ static void neigh_hash_grow(struct neigh
 	neigh_hash_free(old_hash, old_entries);
 }
 
-struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
-			       struct net_device *dev)
+/*
+ * Lookup device and key in neighbour table.
+ * Assumed rcu_read_lock is held
+ */
+struct neighbour *neigh_lookup_rcu(struct neigh_table *tbl, const void *pkey,
+				   const struct net_device *dev)
 {
 	struct neighbour *n;
 	struct hlist_node *tmp;
@@ -346,18 +363,14 @@ struct neighbour *neigh_lookup(struct ne
 	
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
-		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
-			neigh_hold(n);
+	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+		if (!n->dead && dev == n->dev
+		    && !memcmp(n->primary_key, pkey, key_len)) {
 			NEIGH_CACHE_STAT_INC(tbl, hits);
-			goto found;
+			return n;
 		}
 	}
-	n = NULL;
-found:
-	read_unlock_bh(&tbl->lock);
-	return n;
+	return NULL;
 }
 
 struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey)
@@ -369,9 +382,9 @@ struct neighbour *neigh_lookup_nodev(str
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hlist_for_each_entry(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
-		if (!memcmp(n->primary_key, pkey, key_len)) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[hash_val], hlist) {
+		if (!n->dead && !memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			goto found;
@@ -379,7 +392,7 @@ struct neighbour *neigh_lookup_nodev(str
 	}
 	n = NULL;
 found:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return n;
 }
 
@@ -416,7 +429,7 @@ struct neighbour *neigh_create(struct ne
 
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
 		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
@@ -436,21 +449,22 @@ struct neighbour *neigh_create(struct ne
 		}
 	}
 
-	hlist_add_head(&n->hlist, &tbl->hash_buckets[hash_val]);
 	n->dead = 0;
 	neigh_hold(n);
-	write_unlock_bh(&tbl->lock);
+	hlist_add_head_rcu(&n->hlist, &tbl->hash_buckets[hash_val]);
+	spin_unlock_bh(&tbl->lock);
 	NEIGH_PRINTK2("neigh %p is created.\n", n);
 	rc = n;
 out:
 	return rc;
 out_tbl_unlock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 out_neigh_release:
 	neigh_release(n);
 	goto out;
 }
 
+/* Assumes rcu_read_lock is held */
 struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey,
 				    struct net_device *dev, int creat)
 {
@@ -464,16 +478,14 @@ struct pneigh_entry * pneigh_lookup(stru
 	hash_val ^= hash_val >> 4;
 	hash_val &= PNEIGH_HASHMASK;
 
-	read_lock_bh(&tbl->lock);
-
-	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
+	hlist_for_each_entry_rcu(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) &&
 		    (n->dev == dev || !n->dev)) {
-			read_unlock_bh(&tbl->lock);
+			rcu_read_unlock();
 			goto out;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+
 	n = NULL;
 	if (!creat)
 		goto out;
@@ -495,13 +507,18 @@ struct pneigh_entry * pneigh_lookup(stru
 		goto out;
 	}
 
-	write_lock_bh(&tbl->lock);
-	hlist_add_head(&n->hlist, &tbl->phash_buckets[hash_val]);
-	write_unlock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
+	hlist_add_head_rcu(&n->hlist, &tbl->phash_buckets[hash_val]);
+	spin_unlock_bh(&tbl->lock);
 out:
 	return n;
 }
 
+static void pneigh_destroy(struct rcu_head *head)
+{
+	struct pneigh_entry *n = container_of(head, struct pneigh_entry, rcu);
+	kfree(n);
+}
 
 int pneigh_delete(struct neigh_table *tbl, const void *pkey,
 		  struct net_device *dev)
@@ -516,20 +533,20 @@ int pneigh_delete(struct neigh_table *tb
 	hash_val ^= hash_val >> 4;
 	hash_val &= PNEIGH_HASHMASK;
 
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	hlist_for_each_entry(n, tmp, &tbl->phash_buckets[hash_val], hlist) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev) {
-			hlist_del(&n->hlist);
-			write_unlock_bh(&tbl->lock);
+			hlist_del_rcu(&n->hlist);
+			spin_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
 			if (n->dev)
 				dev_put(n->dev);
-			kfree(n);
+			call_rcu(&n->rcu, pneigh_destroy);
 			return 0;
 		}
 	}
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	return -ENOENT;
 }
 
@@ -543,7 +560,7 @@ static int pneigh_ifdown(struct neigh_ta
 
 		hlist_for_each_entry_safe(n, tmp, nxt, &tbl->phash_buckets[h], hlist) {
 			if (!dev || n->dev == dev) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				if (tbl->pdestructor)
 					tbl->pdestructor(n);
 				if (n->dev)
@@ -644,7 +661,7 @@ static void neigh_periodic_timer(unsigne
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
-	write_lock(&tbl->lock);
+	spin_lock(&tbl->lock);
 
 	/*
 	 *	periodically recompute ReachableTime from random function
@@ -676,7 +693,7 @@ static void neigh_periodic_timer(unsigne
 		if (atomic_read(&n->refcnt) == 1 &&
 		    (state == NUD_FAILED ||
 		     time_after(now, n->used + n->parms->gc_staletime))) {
-			hlist_del(&n->hlist);
+			hlist_del_rcu(&n->hlist);
 			n->dead = 1;
 			write_unlock(&n->lock);
 			neigh_release(n);
@@ -697,7 +714,7 @@ static void neigh_periodic_timer(unsigne
 
  	mod_timer(&tbl->gc_timer, now + expire);
 
-	write_unlock(&tbl->lock);
+	spin_unlock(&tbl->lock);
 }
 
 static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1285,10 +1302,10 @@ struct neigh_parms *neigh_parms_alloc(st
 			p->dev = dev;
 		}
 		p->sysctl_table = NULL;
-		write_lock_bh(&tbl->lock);
+		spin_lock_bh(&tbl->lock);
 		p->next		= tbl->parms.next;
 		tbl->parms.next = p;
-		write_unlock_bh(&tbl->lock);
+		spin_unlock_bh(&tbl->lock);
 	}
 	return p;
 }
@@ -1307,19 +1324,19 @@ void neigh_parms_release(struct neigh_ta
 
 	if (!parms || parms == &tbl->parms)
 		return;
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	for (p = &tbl->parms.next; *p; p = &(*p)->next) {
 		if (*p == parms) {
 			*p = parms->next;
 			parms->dead = 1;
-			write_unlock_bh(&tbl->lock);
+			spin_unlock_bh(&tbl->lock);
 			if (parms->dev)
 				dev_put(parms->dev);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
 	}
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 	NEIGH_PRINTK1("neigh_parms_release: not found\n");
 }
 
@@ -1365,7 +1382,7 @@ void neigh_table_init_no_netlink(struct 
 
 	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 
-	rwlock_init(&tbl->lock);
+	spin_lock_init(&tbl->lock);
 	init_timer(&tbl->gc_timer);
 	tbl->gc_timer.data     = (unsigned long)tbl;
 	tbl->gc_timer.function = neigh_periodic_timer;
@@ -1463,7 +1480,8 @@ int neigh_delete(struct sk_buff *skb, st
 		if (dev == NULL)
 			goto out_dev_put;
 
-		neigh = neigh_lookup(tbl, nla_data(dst_attr), dev);
+		rcu_read_lock();
+		neigh = neigh_lookup_rcu(tbl, nla_data(dst_attr), dev);
 		if (neigh == NULL) {
 			err = -ENOENT;
 			goto out_dev_put;
@@ -1472,7 +1490,7 @@ int neigh_delete(struct sk_buff *skb, st
 		err = neigh_update(neigh, NULL, NUD_FAILED,
 				   NEIGH_UPDATE_F_OVERRIDE |
 				   NEIGH_UPDATE_F_ADMIN);
-		neigh_release(neigh);
+		rcu_read_unlock();
 		goto out_dev_put;
 	}
 	err = -EAFNOSUPPORT;
@@ -1537,7 +1555,7 @@ int neigh_add(struct sk_buff *skb, struc
 		if (dev == NULL)
 			goto out_dev_put;
 
-		neigh = neigh_lookup(tbl, dst, dev);
+		neigh = neigh_lookup_rcu(tbl, dst, dev);
 		if (neigh == NULL) {
 			if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
 				err = -ENOENT;
@@ -1552,16 +1570,15 @@ int neigh_add(struct sk_buff *skb, struc
 		} else {
 			if (nlh->nlmsg_flags & NLM_F_EXCL) {
 				err = -EEXIST;
-				neigh_release(neigh);
 				goto out_dev_put;
 			}
 
+			neigh_hold(neigh);
 			if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
 				flags &= ~NEIGH_UPDATE_F_OVERRIDE;
 		}
 
 		err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
-		neigh_release(neigh);
 		goto out_dev_put;
 	}
 
@@ -1620,7 +1637,7 @@ static int neightbl_fill_info(struct sk_
 
 	ndtmsg = nlmsg_data(nlh);
 
-	read_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
 	ndtmsg->ndtm_pad1   = 0;
 	ndtmsg->ndtm_pad2   = 0;
@@ -1680,11 +1697,11 @@ static int neightbl_fill_info(struct sk_
 	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
 		goto nla_put_failure;
 
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_cancel(skb, nlh);
 }
 
@@ -1703,7 +1720,7 @@ static int neightbl_fill_param_info(stru
 
 	ndtmsg = nlmsg_data(nlh);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();		/* this maybe unnecessary */
 	ndtmsg->ndtm_family = tbl->family;
 	ndtmsg->ndtm_pad1   = 0;
 	ndtmsg->ndtm_pad2   = 0;
@@ -1712,10 +1729,10 @@ static int neightbl_fill_param_info(stru
 	    neightbl_fill_parms(skb, parms) < 0)
 		goto errout;
 
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_end(skb, nlh);
 errout:
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 	return nlmsg_cancel(skb, nlh);
 }
  
@@ -1793,7 +1810,7 @@ int neightbl_set(struct sk_buff *skb, st
 	 * We acquire tbl->lock to be nice to the periodic timers and
 	 * make sure they always see a consistent set of values.
 	 */
-	write_lock_bh(&tbl->lock);
+	spin_lock_bh(&tbl->lock);
 
 	if (tb[NDTA_PARMS]) {
 		struct nlattr *tbp[NDTPA_MAX+1];
@@ -1874,7 +1891,7 @@ int neightbl_set(struct sk_buff *skb, st
 	err = 0;
 
 errout_tbl_lock:
-	write_unlock_bh(&tbl->lock);
+	spin_unlock_bh(&tbl->lock);
 errout_locked:
 	rcu_read_unlock();
 errout:
@@ -1890,7 +1907,7 @@ int neightbl_dump_info(struct sk_buff *s
 
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 
-	rcu_read_lock();
+	rcu_read_lock_bh();
 	list_for_each_entry_rcu(tbl, &neigh_tables, list) {
 		struct neigh_parms *p;
 
@@ -1986,20 +2003,20 @@ static int neigh_dump_table(struct neigh
 			continue;
 		if (h > s_h)
 			s_idx = 0;
-		read_lock_bh(&tbl->lock);
+		rcu_read_lock();
 		idx = 0;
-		hlist_for_each_entry(n, tmp, &tbl->hash_buckets[h], hlist) {
+		hlist_for_each_entry_rcu(n, tmp, &tbl->hash_buckets[h], hlist) {
 			if (idx >= s_idx &&
 			    neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH, NLM_F_MULTI) <= 0) {
-				read_unlock_bh(&tbl->lock);
+				rcu_read_unlock();
 				rc = -1;
 				goto out;
 			}
 			++idx;
 		}
-		read_unlock_bh(&tbl->lock);
+		rcu_read_unlock();
 	}
 	rc = skb->len;
 out:
@@ -2039,14 +2056,15 @@ void neigh_for_each(struct neigh_table *
 {
 	int chain;
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();
 	for (chain = 0; chain <= tbl->hash_mask; chain++) {
+		struct neighbour *n;
 		struct hlist_node *p;
 
-		hlist_for_each(p, &tbl->hash_buckets[chain])
-			cb(hlist_entry(p, struct neighbour, hlist), cookie);
+		hlist_for_each_entry_rcu(n, p, &tbl->hash_buckets[chain], hlist)
+			cb(n, cookie);
 	}
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -2067,12 +2085,12 @@ void __neigh_for_each_release(struct nei
 			write_lock(&n->lock);
 			release = cb(n);
 			if (release) {
-				hlist_del(&n->hlist);
+				hlist_del_rcu(&n->hlist);
 				n->dead = 1;
 			}
 			write_unlock(&n->lock);
 			if (release)
-				neigh_release(n);
+				call_rcu(&n->rcu, neigh_rcu_release);
 		}
 	}
 }
@@ -2116,7 +2134,7 @@ found:
 
 static struct neighbour *next_neigh(struct hlist_node *node)
 {
-	if (node)
+	if (rcu_dereference(node))
 		return hlist_entry(node, struct neighbour, hlist);
 	else
 		return NULL;
@@ -2191,7 +2209,7 @@ static struct pneigh_entry *pneigh_get_f
 
 	state->flags |= NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
-		pn = tbl->phash_buckets[bucket].first;
+		pn = rcu_dereference(tbl->phash_buckets[bucket].first);
 		if (pn)
 			break;
 	}
@@ -2208,12 +2226,12 @@ static struct pneigh_entry *pneigh_get_n
 	struct neigh_table *tbl = state->tbl;
 	struct hlist_node *tmp = &pn->hlist;
 
-	tmp = tmp->next;
+	tmp = rcu_dereference(tmp->next);
 	if (tmp)
 		goto found;
 
 	while (++state->bucket < PNEIGH_HASHMASK) {
-		tmp = tbl->phash_buckets[state->bucket].first;
+		tmp = rcu_dereference(tbl->phash_buckets[state->bucket].first);
 		if (tmp)
 			goto found;
 	}
@@ -2261,7 +2279,7 @@ void *neigh_seq_start(struct seq_file *s
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	read_lock_bh(&tbl->lock);
+	rcu_read_lock();
 
 	pos_minus_one = *pos - 1;
 	return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN;
@@ -2297,10 +2315,7 @@ EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
 {
-	struct neigh_seq_state *state = seq->private;
-	struct neigh_table *tbl = state->tbl;
-
-	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 
@@ -2731,7 +2746,7 @@ EXPORT_SYMBOL(neigh_destroy);
 EXPORT_SYMBOL(neigh_dump_info);
 EXPORT_SYMBOL(neigh_event_ns);
 EXPORT_SYMBOL(neigh_ifdown);
-EXPORT_SYMBOL(neigh_lookup);
+EXPORT_SYMBOL(neigh_lookup_rcu);
 EXPORT_SYMBOL(neigh_lookup_nodev);
 EXPORT_SYMBOL(neigh_parms_alloc);
 EXPORT_SYMBOL(neigh_parms_release);
--- net-2.6.19.orig/drivers/infiniband/core/addr.c
+++ net-2.6.19/drivers/infiniband/core/addr.c
@@ -165,10 +165,11 @@ static int addr_resolve_remote(struct so
 		goto put;
 	}
 
-	neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
+	rcu_read_lock();
+	neigh = neigh_lookup_rcu(&arp_tbl, &rt->rt_gateway, rt->idev->dev);
 	if (!neigh) {
 		ret = -ENODATA;
-		goto put;
+		goto release;
 	}
 
 	if (!(neigh->nud_state & NUD_VALID)) {
@@ -183,7 +184,7 @@ static int addr_resolve_remote(struct so
 
 	ret = copy_addr(addr, neigh->dev, neigh->ha);
 release:
-	neigh_release(neigh);
+	rcu_read_unlock();
 put:
 	ip_rt_put(rt);
 out:
--- net-2.6.19.orig/drivers/net/wireless/strip.c
+++ net-2.6.19/drivers/net/wireless/strip.c
@@ -467,17 +467,20 @@ static int arp_query(unsigned char *hadd
 		     struct net_device *dev)
 {
 	struct neighbour *neighbor_entry;
+	int ret = 0;
 
-	neighbor_entry = neigh_lookup(&arp_tbl, &paddr, dev);
+	rcu_read_lock();
+	neighbor_entry = neigh_lookup_rcu(&arp_tbl, &paddr, dev);
 
-	if (neighbor_entry != NULL) {
+	if (neighbor_entry) {
 		neighbor_entry->used = jiffies;
 		if (neighbor_entry->nud_state & NUD_VALID) {
 			memcpy(haddr, neighbor_entry->ha, dev->addr_len);
-			return 1;
+			ret = 1;
 		}
 	}
-	return 0;
+	rcu_read_unlock();
+	return ret;
 }
 
 static void DumpData(char *msg, struct strip *strip_info, __u8 * ptr,
--- net-2.6.19.orig/net/ipv4/arp.c
+++ net-2.6.19/net/ipv4/arp.c
@@ -1067,7 +1067,8 @@ static int arp_req_get(struct arpreq *r,
 	struct neighbour *neigh;
 	int err = -ENXIO;
 
-	neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	rcu_read_lock();
+	neigh = neigh_lookup_rcu(&arp_tbl, &ip, dev);
 	if (neigh) {
 		read_lock_bh(&neigh->lock);
 		memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
@@ -1075,9 +1076,9 @@ static int arp_req_get(struct arpreq *r,
 		read_unlock_bh(&neigh->lock);
 		r->arp_ha.sa_family = dev->type;
 		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
-		neigh_release(neigh);
 		err = 0;
 	}
+	rcu_read_unlock();
 	return err;
 }
 
@@ -1118,14 +1119,15 @@ static int arp_req_delete(struct arpreq 
 			return -EINVAL;
 	}
 	err = -ENXIO;
-	neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	rcu_read_lock();
+	neigh = neigh_lookup_rcu(&arp_tbl, &ip, dev);
 	if (neigh) {
 		if (neigh->nud_state&~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED, 
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN);
-		neigh_release(neigh);
 	}
+	rcu_read_unlock();
 	return err;
 }
 
--- net-2.6.19.orig/net/ipv6/ndisc.c
+++ net-2.6.19/net/ipv6/ndisc.c
@@ -944,8 +944,9 @@ static void ndisc_recv_na(struct sk_buff
 		in6_ifa_put(ifp);
 		return;
 	}
-	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
 
+	rcu_read_lock();
+	neigh = neigh_lookup_rcu(&nd_tbl, &msg->target, dev);
 	if (neigh) {
 		u8 old_flags = neigh->flags;
 
@@ -969,9 +970,9 @@ static void ndisc_recv_na(struct sk_buff
 				ip6_del_rt(rt);
 		}
 
-out:
-		neigh_release(neigh);
 	}
+out:
+	rcu_read_unlock();
 }
 
 static void ndisc_recv_rs(struct sk_buff *skb)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 23:00             ` Stephen Hemminger
@ 2006-08-29 23:21               ` Alexey Kuznetsov
  2006-08-29 23:49                 ` Stephen Hemminger
  2006-08-29 23:36               ` Alexey Kuznetsov
  1 sibling, 1 reply; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 23:21 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

> This should not be any more racy than the existing code.

Existing code is not racy.

Critical place is interpretation of refcnt==1. Current code assumes,
that when refcnt=1 and entry is in hash table, nobody can take this
entry (table is locked). So, it can be unlinked from the table.

See? __neigh_lookup()/__neigh_lookup_errno() _MUST_ return an alive
and hashed entry. And it will stay hashed as long as the reference is held.
Or until neigh entry is forced for destruction by device going down,
in this case referring dst entries will die as well.

If dst cache grabs an entry, which is purged from table because for some
time it had refcnt==1, you got a valid dst entry referring to dead
neighbour entry.

Alexey

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 23:00             ` Stephen Hemminger
  2006-08-29 23:21               ` Alexey Kuznetsov
@ 2006-08-29 23:36               ` Alexey Kuznetsov
  1 sibling, 0 replies; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-29 23:36 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

Yes, I forgot to say I take back my suggestion about atomic_inc_test_zero().
It would not work.

Seems, it is possible to add some barriers around setting n->dead
and testing it in neigh_lookup_rcu(), but it would be scary and ugly.
To be honest, I just do not know how to do this. :-)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 23:21               ` Alexey Kuznetsov
@ 2006-08-29 23:49                 ` Stephen Hemminger
  2006-08-30  0:06                   ` Alexey Kuznetsov
  0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2006-08-29 23:49 UTC (permalink / raw)
  To: Alexey Kuznetsov; +Cc: David Miller, netdev

On Wed, 30 Aug 2006 03:21:26 +0400
Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> wrote:

> Hello!
> 
> > This should not be any more racy than the existing code.
> 
> Existing code is not racy.

Race 1: w/o RCU
					Cpu 0: is in neigh_lookup
						gets read_lock()
						finds entry
						++refcount to 2
						updates it
	Cpu 1: is in forced_gc()
		waits at write_lock()
						releases read_lock()
						drops ref count to 1.
		sees ref count is 1
		deletes it

Race 1: w RCU
					Cpu 0: is in __neigh_lookup
						updates it
	Cpu 1: is in forced_gc()
						leaves refcount=1
		sees ref count is 1
		deletes it


> 
> Critical place is interpretation of refcnt==1. Current code assumes,
> that when refcnt=1 and entry is in hash table, nobody can take this
> entry (table is locked). So, it can be unlinked from the table.
> 
> See? __neigh_lookup()/__neigh_lookup_errno() _MUST_ return alive
> and hashed entry. And will stay hashed until the reference is held.
> Or until neigh entry is forced for destruction by device going down,
> in this case referring dst entries will die as well.

Why must it be hashed, it could always get zapped just after the update.

> If dst cache grabs an entry, which is purged from table because for some
> time it had refcnt==1, you got a valid dst entry referring to dead
> neighbour entry.

Hmm.. Since this is a slow path, grabbing the write_lock on the neighbour
entry would block the gc from zapping it.

in __neigh_lookup()
		neigh_hold();
		write_lock(&n->lock);
		if (n->dead) {
			write_unlock()
			neigh_release()
			goto rescan;
		}

		write_unlock()





-- 
Stephen Hemminger <shemminger@osdl.org>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 4/6] net neighbour: convert to RCU
  2006-08-29 23:49                 ` Stephen Hemminger
@ 2006-08-30  0:06                   ` Alexey Kuznetsov
  0 siblings, 0 replies; 19+ messages in thread
From: Alexey Kuznetsov @ 2006-08-30  0:06 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Hello!

> Race 1: w/o RCU
> 					Cpu 0: is in neigh_lookup
> 						gets read_lock()
> 						finds entry
> 						++refcount to 2
> 						updates it
> 	Cpu 1: is in forced_gc()
> 		waits at write_lock()
> 						releases read_lock()
> 						drops ref count to 1.
> 		sees ref count is 1
> 		deletes it

Do you mean it is purged, though it is actually fresh?
It is harmless race condition.


> Why must it be hashed, it could always get zapped just after the update.

Because otherwise we have to check its validity on level of dst_cache
and to rebind. We do not want this. Actually dst->neighbour is supposed
to be immutable.

It means that if some valid dst refers to a neighbour, it must remain
hashed.


> Hmm.. Since this is a slow path,

All the neighbour lookups are in slow path. Actually, lookup
of neighbour entry for use in dst cache is the most loaded path,
the rest of them are pure maintenance paths.

Yes, this would work. RCU is a little spoiled, but at least global
tbl->lock is replaced with per neighbour lock.

Alexey

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2006-08-30  0:06 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2006-08-28 23:07 [PATCH 0/6] Lockless neighbour table Stephen Hemminger
2006-08-28 23:07 ` [PATCH 1/6] net neighbor: convert top level list to RCU Stephen Hemminger
2006-08-28 23:07 ` [PATCH 2/6] neighbour: convert neighbour hash table to hlist Stephen Hemminger
2006-08-28 23:07 ` [PATCH 3/6] neighbour: convert pneigh " Stephen Hemminger
2006-08-28 23:07 ` [PATCH 4/6] net neighbour: convert to RCU Stephen Hemminger
2006-08-29 15:28   ` Alexey Kuznetsov
2006-08-29 18:22     ` Stephen Hemminger
2006-08-29 18:34       ` Martin Josefsson
2006-08-29 20:17         ` Stephen Hemminger
2006-08-29 21:17       ` Alexey Kuznetsov
2006-08-29 21:46         ` Stephen Hemminger
2006-08-29 22:16           ` Alexey Kuznetsov
2006-08-29 23:00             ` Stephen Hemminger
2006-08-29 23:21               ` Alexey Kuznetsov
2006-08-29 23:49                 ` Stephen Hemminger
2006-08-30  0:06                   ` Alexey Kuznetsov
2006-08-29 23:36               ` Alexey Kuznetsov
2006-08-28 23:07 ` [PATCH 5/6] neighbour: convert lookup to sequence lock Stephen Hemminger
2006-08-28 23:07 ` [PATCH 6/6] neighbour: convert hard header cache to sequence number Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).