[patch 7/11] net: Use bigrefs for net

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch 7/11] net: Use bigrefs for net_device.refcount
       [not found] <20050913155112.GB3570@localhost.localdomain>
@ 2005-09-13 16:10 ` Ravikiran G Thirumalai
  2005-09-13 16:26   ` Stephen Hemminger
  2005-09-13 18:27   ` Eric Dumazet
  2005-09-13 16:12 ` [patch 8/11] net: dst_abstraction macros Ravikiran G Thirumalai
  2005-09-13 16:17 ` [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu Ravikiran G Thirumalai
  2 siblings, 2 replies; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 16:10 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, dipankar, bharata, shai, Rusty Russell, netdev,
	davem

The net_device has a refcnt used to keep track of its uses.
This is used at the time of unregistering the network device
(module unloading, etc.; see netdev_wait_allrefs).
For loopback_dev, this refcnt increment/decrement is causing
unnecessary traffic on the interlink for NUMA systems,
affecting its performance.  This patch improves tbench numbers by 6% on an
8way x86 Xeon (x445).

This patch depends on the bigref patch.

Signed-off-by: Niraj Kumar <nirajk@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>

Index: alloc_percpu-2.6.13/drivers/net/loopback.c
===================================================================
--- alloc_percpu-2.6.13.orig/drivers/net/loopback.c	2005-08-28 16:41:01.000000000 -0700
+++ alloc_percpu-2.6.13/drivers/net/loopback.c	2005-09-12 12:04:25.000000000 -0700
@@ -226,6 +226,12 @@
 		loopback_dev.priv = stats;
 		loopback_dev.get_stats = &get_stats;
 	}
+
+	/*
+	 * loopback_dev is not allocated by alloc_netdev(), which would
+	 * otherwise init the bigref, so explicitly init it here.
+	 */
+	bigref_init(&loopback_dev.netdev_refcnt);
 	
 	return register_netdev(&loopback_dev);
 };
Index: alloc_percpu-2.6.13/include/linux/netdevice.h
===================================================================
--- alloc_percpu-2.6.13.orig/include/linux/netdevice.h	2005-08-28 16:41:01.000000000 -0700
+++ alloc_percpu-2.6.13/include/linux/netdevice.h	2005-09-12 11:54:21.000000000 -0700
@@ -37,6 +37,7 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#include <linux/bigref.h>
 
 struct divert_blk;
 struct vlan_group;
@@ -377,7 +378,7 @@
 	/* device queue lock */
 	spinlock_t		queue_lock;
 	/* Number of references to this device */
-	atomic_t		refcnt;
+	struct bigref	        netdev_refcnt;	
 	/* delayed register/unregister */
 	struct list_head	todo_list;
 	/* device name hash chain */
@@ -677,11 +678,11 @@
 
 static inline void dev_put(struct net_device *dev)
 {
-	atomic_dec(&dev->refcnt);
+	bigref_put(&dev->netdev_refcnt, NULL);
 }
 
-#define __dev_put(dev) atomic_dec(&(dev)->refcnt)
-#define dev_hold(dev) atomic_inc(&(dev)->refcnt)
+#define __dev_put(dev) bigref_put(&(dev)->netdev_refcnt, NULL)
+#define dev_hold(dev) bigref_get(&(dev)->netdev_refcnt)
 
 /* Carrier loss detection, dial on demand. The functions netif_carrier_on
  * and _off may be called from IRQ context, but it is caller
Index: alloc_percpu-2.6.13/net/core/dev.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/core/dev.c	2005-08-28 16:41:01.000000000 -0700
+++ alloc_percpu-2.6.13/net/core/dev.c	2005-09-12 11:54:21.000000000 -0700
@@ -2658,6 +2658,7 @@
 		goto out;
 
 	dev->iflink = -1;
+	bigref_set(&dev->netdev_refcnt, 0);
 
 	/* Init, if this function is available */
 	if (dev->init) {
@@ -2808,7 +2809,7 @@
 	unsigned long rebroadcast_time, warning_time;
 
 	rebroadcast_time = warning_time = jiffies;
-	while (atomic_read(&dev->refcnt) != 0) {
+	while ( bigref_val(&dev->netdev_refcnt) != 0) {
 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
 			rtnl_shlock();
 
@@ -2838,7 +2839,7 @@
 			printk(KERN_EMERG "unregister_netdevice: "
 			       "waiting for %s to become free. Usage "
 			       "count = %d\n",
-			       dev->name, atomic_read(&dev->refcnt));
+			       dev->name, bigref_val(&dev->netdev_refcnt));
 			warning_time = jiffies;
 		}
 	}
@@ -2909,7 +2910,7 @@
 			netdev_wait_allrefs(dev);
 
 			/* paranoia */
-			BUG_ON(atomic_read(&dev->refcnt));
+			BUG_ON(bigref_val(&dev->netdev_refcnt));
 			BUG_TRAP(!dev->ip_ptr);
 			BUG_TRAP(!dev->ip6_ptr);
 			BUG_TRAP(!dev->dn_ptr);
@@ -2969,6 +2970,7 @@
 
 	setup(dev);
 	strcpy(dev->name, name);
+	bigref_init(&dev->netdev_refcnt);
 	return dev;
 }
 EXPORT_SYMBOL(alloc_netdev);
@@ -2986,6 +2988,7 @@
 #ifdef CONFIG_SYSFS
 	/*  Compatiablity with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
+		bigref_destroy(&dev->netdev_refcnt);
 		kfree((char *)dev - dev->padded);
 		return;
 	}
@@ -2996,6 +2999,7 @@
 	/* will free via class release */
 	class_device_put(&dev->class_dev);
 #else
+	bigref_destroy(&dev->netdev_refcnt);
 	kfree((char *)dev - dev->padded);
 #endif
 }
@@ -3210,7 +3214,7 @@
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
 		queue->backlog_dev.weight = weight_p;
 		queue->backlog_dev.poll = process_backlog;
-		atomic_set(&queue->backlog_dev.refcnt, 1);
+		bigref_init(&queue->backlog_dev.netdev_refcnt);
 	}
 
 	dev_boot_phase = 0;
Index: alloc_percpu-2.6.13/net/core/net-sysfs.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/core/net-sysfs.c	2005-08-28 16:41:01.000000000 -0700
+++ alloc_percpu-2.6.13/net/core/net-sysfs.c	2005-09-12 11:54:21.000000000 -0700
@@ -16,6 +16,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/wireless.h>
+#include <linux/bigref.h>
 
 #define to_class_dev(obj) container_of(obj,struct class_device,kobj)
 #define to_net_dev(class) container_of(class, struct net_device, class_dev)
@@ -400,6 +401,8 @@
 		= container_of(cd, struct net_device, class_dev);
 
 	BUG_ON(dev->reg_state != NETREG_RELEASED);
+	
+	bigref_destroy(&dev->netdev_refcnt);
 
 	kfree((char *)dev - dev->padded);
 }

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [patch 8/11] net: dst_abstraction macros
       [not found] <20050913155112.GB3570@localhost.localdomain>
  2005-09-13 16:10 ` [patch 7/11] net: Use bigrefs for net_device.refcount Ravikiran G Thirumalai
@ 2005-09-13 16:12 ` Ravikiran G Thirumalai
  2005-09-13 16:17 ` [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu Ravikiran G Thirumalai
  2 siblings, 0 replies; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 16:12 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, dipankar, bharata, shai, Rusty Russell, netdev,
	davem

This patch introduces macros to handle the use, lastuse and refcnt fields in
the dst_entry structure. Having macros manipulate these fields
allows cleaner source code and provides an easy way to modify the
way these performance-critical fields are handled.

The introduction of macros removes some code that is repeated in
various places. The patch also makes the following changes:

- net/decnet/dn_route.c: introduce dn_dst_useful to check the usefulness of
		a dst entry. dst_update_rtu is used to reduce code duplication.

- net/ipv4/route.c: add ip_rt_copy. dst_update_rtu is used to reduce code duplication.

The patch is a prerequisite for the dst numa patch. 

Signed-off-by: Pravin B. Shelar <pravins@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>

Index: alloc_percpu-2.6.13-rc6/include/net/dst.h
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/include/net/dst.h	2005-08-15 17:54:34.721623250 -0400
+++ alloc_percpu-2.6.13-rc6/include/net/dst.h	2005-08-15 17:58:14.499358500 -0400
@@ -103,6 +103,30 @@
 
 #ifdef __KERNEL__
 
+#define dst_use(__dst) (__dst)->__use
+#define dst_use_inc(__dst) (__dst)->__use++
+
+#define dst_lastuse(__dst) (__dst)->lastuse
+#define dst_lastuse_set(__dst) (__dst)->lastuse = jiffies
+
+#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
+#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
+
+#define dst_refcnt(__dst) atomic_read(&(__dst)->__refcnt)
+#define dst_refcnt_one(__dst) atomic_set(&(__dst)->__refcnt, 1)
+#define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__refcnt)
+#define dst_hold(__dst) atomic_inc(&(__dst)->__refcnt)
+
+static inline
+void dst_release(struct dst_entry * dst)
+{
+	if (dst) {
+		WARN_ON(dst_refcnt(dst) < 1);
+		smp_mb__before_atomic_dec();
+		dst_refcnt_dec(dst);
+	}
+}
+
 static inline u32
 dst_metric(const struct dst_entry *dst, int metric)
 {
@@ -134,29 +158,14 @@
 	return dst_metric(dst, RTAX_LOCK) & (1<<metric);
 }
 
-static inline void dst_hold(struct dst_entry * dst)
-{
-	atomic_inc(&dst->__refcnt);
-}
-
 static inline
 struct dst_entry * dst_clone(struct dst_entry * dst)
 {
 	if (dst)
-		atomic_inc(&dst->__refcnt);
+		dst_hold(dst);
 	return dst;
 }
 
-static inline
-void dst_release(struct dst_entry * dst)
-{
-	if (dst) {
-		WARN_ON(atomic_read(&dst->__refcnt) < 1);
-		smp_mb__before_atomic_dec();
-		atomic_dec(&dst->__refcnt);
-	}
-}
-
 /* Children define the path of the packet through the
  * Linux networking.  Thus, destinations are stackable.
  */
@@ -177,7 +186,7 @@
 {
 	if (dst->obsolete > 1)
 		return;
-	if (!atomic_read(&dst->__refcnt)) {
+	if (!dst_refcnt(dst)) {
 		dst = dst_destroy(dst);
 		if (!dst)
 			return;
Index: alloc_percpu-2.6.13-rc6/net/core/dst.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/core/dst.c	2005-08-15 17:54:34.761625750 -0400
+++ alloc_percpu-2.6.13-rc6/net/core/dst.c	2005-08-15 17:58:14.499358500 -0400
@@ -57,7 +57,7 @@
 	dstp = &dst_garbage_list;
 	work_performed = 0;
 	while ((dst = *dstp) != NULL) {
-		if (atomic_read(&dst->__refcnt)) {
+		if (dst_refcnt(dst)) {
 			dstp = &dst->next;
 			delayed++;
 			continue;
@@ -176,9 +176,8 @@
 	struct neighbour *neigh;
 	struct hh_cache *hh;
 
-	smp_rmb();
-
 again:
+	smp_rmb();
 	neigh = dst->neighbour;
 	hh = dst->hh;
 	child = dst->child;
@@ -206,16 +205,16 @@
 	dst = child;
 	if (dst) {
 		int nohash = dst->flags & DST_NOHASH;
-
-		if (atomic_dec_and_test(&dst->__refcnt)) {
-			/* We were real parent of this dst, so kill child. */
-			if (nohash)
+		dst_refcnt_dec(dst);
+		if (nohash) {
+			if (!dst_refcnt(dst)) {
+				/* We were real parent of this dst, so kill child. */
 				goto again;
-		} else {
-			/* Child is still referenced, return it for freeing. */
-			if (nohash)
+			} else {
+				/* Child is still referenced, return it for freeing. */
 				return dst;
-			/* Child is still in his hash table */
+				/* Child is still in his hash table */
+			}
 		}
 	}
 	return NULL;
Index: alloc_percpu-2.6.13-rc6/net/decnet/dn_route.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/decnet/dn_route.c	2005-08-15 17:54:34.789627500 -0400
+++ alloc_percpu-2.6.13-rc6/net/decnet/dn_route.c	2005-08-15 17:58:14.503358750 -0400
@@ -155,6 +155,11 @@
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
+static inline int dn_dst_useful(struct dn_route *rth, unsigned long now, unsigned long expire)
+{
+	return dst_refcnt(&rth->u.dst) || (now - dst_lastuse(&rth->u.dst)) < expire;
+}
+
 static void dn_dst_check_expire(unsigned long dummy)
 {
 	int i;
@@ -167,8 +172,7 @@
 
 		spin_lock(&dn_rt_hash_table[i].lock);
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
+			if (dn_dst_useful(rt, now, expire)) {
 				rtp = &rt->u.rt_next;
 				continue;
 			}
@@ -198,8 +202,7 @@
 		rtp = &dn_rt_hash_table[i].chain;
 
 		while((rt=*rtp) != NULL) {
-			if (atomic_read(&rt->u.dst.__refcnt) ||
-					(now - rt->u.dst.lastuse) < expire) {
+			if (dn_dst_useful(rt, now, expire)) {
 				rtp = &rt->u.rt_next;
 				continue;
 			}
@@ -277,10 +280,8 @@
 static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp)
 {
 	struct dn_route *rth, **rthp;
-	unsigned long now = jiffies;
-
-	rthp = &dn_rt_hash_table[hash].chain;
 
+ 	rthp = &dn_rt_hash_table[hash].chain;
 	spin_lock_bh(&dn_rt_hash_table[hash].lock);
 	while((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
@@ -290,9 +291,7 @@
 					   dn_rt_hash_table[hash].chain);
 			rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
 
-			rth->u.dst.__use++;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.lastuse = now;
+			dst_update_rtu(&rth->u.dst);
 			spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 
 			dnrt_drop(rt);
@@ -304,10 +303,8 @@
 
 	rcu_assign_pointer(rt->u.rt_next, dn_rt_hash_table[hash].chain);
 	rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
-	
-	dst_hold(&rt->u.dst);
-	rt->u.dst.__use++;
-	rt->u.dst.lastuse = now;
+
+	dst_update_rtu(&rt->u.dst);
 	spin_unlock_bh(&dn_rt_hash_table[hash].lock);
 	*rp = rt;
 	return 0;
@@ -1091,7 +1088,7 @@
 	if (rt == NULL)
 		goto e_nobufs;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 	rt->u.dst.flags   = DST_HOST;
 
 	rt->fl.fld_src    = oldflp->fld_src;
@@ -1115,7 +1112,7 @@
 	rt->u.dst.neighbour = neigh;
 	neigh = NULL;
 
-	rt->u.dst.lastuse = jiffies;
+	dst_lastuse_set(&rt->u.dst);
 	rt->u.dst.output  = dn_output;
 	rt->u.dst.input   = dn_rt_bug;
 	rt->rt_flags      = flags;
@@ -1173,9 +1170,7 @@
 #endif
 			    (rt->fl.iif == 0) &&
 			    (rt->fl.oif == flp->oif)) {
-				rt->u.dst.lastuse = jiffies;
-				dst_hold(&rt->u.dst);
-				rt->u.dst.__use++;
+				dst_update_rtu(&rt->u.dst);
 				rcu_read_unlock_bh();
 				*pprt = &rt->u.dst;
 				return 0;
@@ -1381,7 +1376,7 @@
 	rt->u.dst.flags = DST_HOST;
 	rt->u.dst.neighbour = neigh;
 	rt->u.dst.dev = out_dev;
-	rt->u.dst.lastuse = jiffies;
+	dst_lastuse_set(&rt->u.dst);
 	rt->u.dst.output = dn_rt_bug;
 	switch(res.type) {
 		case RTN_UNICAST:
@@ -1452,9 +1447,7 @@
 		    (rt->fl.fld_fwmark == skb->nfmark) &&
 #endif
 		    (rt->fl.iif == cb->iif)) {
-			rt->u.dst.lastuse = jiffies;
-			dst_hold(&rt->u.dst);
-			rt->u.dst.__use++;
+			dst_update_rtu(&rt->u.dst);
 			rcu_read_unlock();
 			skb->dst = (struct dst_entry *)rt;
 			return 0;
@@ -1504,9 +1497,9 @@
 		RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway);
 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
 		goto rtattr_failure;
-	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
-	ci.rta_used     = rt->u.dst.__use;
-	ci.rta_clntref  = atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_lastuse = jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
+	ci.rta_used     =  dst_use(&rt->u.dst);
+	ci.rta_clntref  = dst_refcnt(&rt->u.dst);
 	if (rt->u.dst.expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
 	else
@@ -1729,8 +1722,8 @@
 			rt->u.dst.dev ? rt->u.dst.dev->name : "*",
 			dn_addr2asc(dn_ntohs(rt->rt_daddr), buf1),
 			dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2),
-			atomic_read(&rt->u.dst.__refcnt),
-			rt->u.dst.__use,
+			dst_refcnt(&rt->u.dst),
+			dst_use(&rt->u.dst),
 			(int) dst_metric(&rt->u.dst, RTAX_RTT));
 	return 0;
 } 
Index: alloc_percpu-2.6.13-rc6/net/ipv4/ipvs/ip_vs_xmit.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/ipvs/ip_vs_xmit.c	2005-08-15 17:54:34.837630500 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/ipvs/ip_vs_xmit.c	2005-08-15 17:55:23.980701750 -0400
@@ -88,7 +88,7 @@
 			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
 			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
 				  NIPQUAD(dest->addr),
-				  atomic_read(&rt->u.dst.__refcnt), rtos);
+				  dst_refcnt(&rt->u.dst), rtos);
 		}
 		spin_unlock(&dest->dst_lock);
 	} else {
Index: alloc_percpu-2.6.13-rc6/net/ipv4/multipath_drr.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/multipath_drr.c	2005-08-15 17:54:34.905634750 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/multipath_drr.c	2005-08-15 17:55:23.980701750 -0400
@@ -149,8 +149,7 @@
 		    multipath_comparekeys(&nh->fl, flp)) {
 			int nh_ifidx = nh->u.dst.dev->ifindex;
 
-			nh->u.dst.lastuse = jiffies;
-			nh->u.dst.__use++;
+			dst_update_tu(&nh->u.dst);
 			if (result != NULL)
 				continue;
 
Index: alloc_percpu-2.6.13-rc6/net/ipv4/multipath_random.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/multipath_random.c	2005-08-15 17:54:34.909635000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/multipath_random.c	2005-08-15 17:55:23.980701750 -0400
@@ -94,7 +94,8 @@
 		for (rt = first; rt; rt = rt->u.rt_next) {
 			if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
 			    multipath_comparekeys(&rt->fl, flp)) {
-				rt->u.dst.lastuse = jiffies;
+
+				dst_lastuse_set(&rt->u.dst);
 
 				if (i == candidate_no)
 					decision = rt;
@@ -107,7 +108,7 @@
 		}
 	}
 
-	decision->u.dst.__use++;
+	dst_use_inc(&decision->u.dst);
 	*rp = decision;
 }
 
Index: alloc_percpu-2.6.13-rc6/net/ipv4/multipath_rr.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/multipath_rr.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/multipath_rr.c	2005-08-15 17:55:24.056706500 -0400
@@ -62,10 +62,11 @@
  	     nh = rcu_dereference(nh->u.rt_next)) {
 		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
 		    multipath_comparekeys(&nh->fl, flp)) {
-			nh->u.dst.lastuse = jiffies;
+			int __use = dst_use(&nh->u.dst);
+			dst_lastuse_set(&nh->u.dst);
 
-			if (min_use == -1 || nh->u.dst.__use < min_use) {
-				min_use = nh->u.dst.__use;
+			if (min_use == -1 || __use < min_use) {
+				min_use = __use;
 				min_use_cand = nh;
 			}
 		}
@@ -74,7 +75,7 @@
 	if (!result)
 		result = first;
 
-	result->u.dst.__use++;
+	dst_use_inc(&result->u.dst);
 	*rp = result;
 }
 
Index: alloc_percpu-2.6.13-rc6/net/ipv4/multipath_wrandom.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/multipath_wrandom.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/multipath_wrandom.c	2005-08-15 17:55:24.056706500 -0400
@@ -202,7 +202,7 @@
 	decision = first;
 	last_mpc = NULL;
 	for (mpc = first_mpc; mpc; mpc = mpc->next) {
-		mpc->rt->u.dst.lastuse = jiffies;
+		dst_lastuse_set(&mpc->rt->u.dst);
 		if (last_power <= selector && selector < mpc->power)
 			decision = mpc->rt;
 
@@ -217,8 +217,7 @@
 		/* concurrent __multipath_flush may lead to !last_mpc */
 		kfree(last_mpc);
 	}
-
-	decision->u.dst.__use++;
+	dst_use_inc(&decision->u.dst);
 	*rp = decision;
 }
 
Index: alloc_percpu-2.6.13-rc6/net/ipv4/route.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/route.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/route.c	2005-08-15 17:58:14.503358750 -0400
@@ -334,8 +334,8 @@
 			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 			r->u.dst.dev ? r->u.dst.dev->name : "*",
 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
-			r->u.dst.__use, 0, (unsigned long)r->rt_src,
+			r->rt_flags, dst_refcnt(&r->u.dst),
+			dst_use(&r->u.dst), 0, (unsigned long)r->rt_src,
 			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 			dst_metric(&r->u.dst, RTAX_WINDOW),
@@ -512,7 +512,7 @@
 	unsigned long age;
 	int ret = 0;
 
-	if (atomic_read(&rth->u.dst.__refcnt))
+	if (dst_refcnt(&rth->u.dst))
 		goto out;
 
 	ret = 1;
@@ -536,7 +536,7 @@
  */
 static inline u32 rt_score(struct rtable *rt)
 {
-	u32 score = jiffies - rt->u.dst.lastuse;
+	u32 score = jiffies - dst_lastuse(&rt->u.dst);
 
 	score = ~score & ~(3<<30);
 
@@ -943,9 +943,7 @@
 			 */
 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
-			rth->u.dst.__use++;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.lastuse = now;
+			dst_update_rtu(&rth->u.dst);
 			spin_unlock_bh(rt_hash_lock_addr(hash));
 
 			rt_drop(rt);
@@ -953,7 +951,7 @@
 			return 0;
 		}
 
-		if (!atomic_read(&rth->u.dst.__refcnt)) {
+		if (!dst_refcnt(&rth->u.dst)) {
 			u32 score = rt_score(rth);
 
 			if (score <= min_score) {
@@ -1108,6 +1106,12 @@
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
+void ip_rt_copy(struct rtable *to, struct rtable *from)
+{
+	*to = *from;
+	to->u.dst.__use 	= 1;
+}
+
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
 		    u32 saddr, u8 tos, struct net_device *dev)
 {
@@ -1175,17 +1179,17 @@
 				}
 
 				/* Copy all the information. */
-				*rt = *rth;
- 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
-				rt->u.dst.__use		= 1;
-				atomic_set(&rt->u.dst.__refcnt, 1);
+				ip_rt_copy(rt, rth);
+
+				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
+				dst_lastuse_set(&rt->u.dst);
+				dst_refcnt_one(&rt->u.dst);
 				rt->u.dst.child		= NULL;
 				if (rt->u.dst.dev)
 					dev_hold(rt->u.dst.dev);
 				if (rt->idev)
 					in_dev_hold(rt->idev);
 				rt->u.dst.obsolete	= 0;
-				rt->u.dst.lastuse	= jiffies;
 				rt->u.dst.path		= &rt->u.dst;
 				rt->u.dst.neighbour	= NULL;
 				rt->u.dst.hh		= NULL;
@@ -1619,7 +1623,7 @@
 
 	rth->u.dst.output= ip_rt_bug;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 	rth->u.dst.flags= DST_HOST;
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
@@ -1818,7 +1822,7 @@
 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
 	if (err)
 		return err;
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 
 	/* put it into the cache */
 	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
@@ -1876,7 +1880,7 @@
 		 * outside
 		 */
 		if (hop == lasthop)
-			atomic_set(&(skb->dst->__refcnt), 1);
+			dst_refcnt_one(skb->dst);
 	}
 	return err;
 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
@@ -2012,7 +2016,7 @@
 
 	rth->u.dst.output= ip_rt_bug;
 
-	atomic_set(&rth->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rth->u.dst);
 	rth->u.dst.flags= DST_HOST;
 	if (in_dev->cnf.no_policy)
 		rth->u.dst.flags |= DST_NOPOLICY;
@@ -2102,9 +2106,7 @@
 		    rth->fl.fl4_fwmark == skb->nfmark &&
 #endif
 		    rth->fl.fl4_tos == tos) {
-			rth->u.dst.lastuse = jiffies;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.__use++;
+			dst_update_rtu(&rth->u.dst);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
 			skb->dst = (struct dst_entry*)rth;
@@ -2288,7 +2290,7 @@
 	if (err == 0) {
 		u32 tos = RT_FL_TOS(oldflp);
 
-		atomic_set(&rth->u.dst.__refcnt, 1);
+		dst_refcnt_one(&rth->u.dst);
 		
 		hash = rt_hash_code(oldflp->fl4_dst, 
 				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
@@ -2348,7 +2350,7 @@
 			if (err != 0)
 				return err;
 		}
-		atomic_set(&(*rp)->u.dst.__refcnt, 1);
+		dst_refcnt_one(&(*rp)->u.dst);
 		return err;
 	} else {
 		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
@@ -2584,10 +2586,7 @@
 				rcu_read_unlock_bh();
 				return 0;
 			}
-
-			rth->u.dst.lastuse = jiffies;
-			dst_hold(&rth->u.dst);
-			rth->u.dst.__use++;
+			dst_update_rtu(&rth->u.dst);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
 			*rp = rth;
@@ -2673,9 +2672,9 @@
 		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
 		goto rtattr_failure;
-	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
-	ci.rta_used	= rt->u.dst.__use;
-	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
+	ci.rta_used	= dst_use(&rt->u.dst);
+	ci.rta_clntref	= dst_refcnt(&rt->u.dst);
 	if (rt->u.dst.expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
 	else
Index: alloc_percpu-2.6.13-rc6/net/ipv4/xfrm4_policy.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv4/xfrm4_policy.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv4/xfrm4_policy.c	2005-08-15 17:55:24.060706750 -0400
@@ -135,7 +135,7 @@
 			dev_hold(rt->u.dst.dev);
 		dst_prev->obsolete	= -1;
 		dst_prev->flags	       |= DST_HOST;
-		dst_prev->lastuse	= jiffies;
+		dst_lastuse_set(dst_prev);
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
Index: alloc_percpu-2.6.13-rc6/net/ipv6/ip6_fib.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv6/ip6_fib.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv6/ip6_fib.c	2005-08-15 17:58:14.503358750 -0400
@@ -1160,8 +1160,8 @@
 		}
 		gc_args.more++;
 	} else if (rt->rt6i_flags & RTF_CACHE) {
-		if (atomic_read(&rt->u.dst.__refcnt) == 0 &&
-		    time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) {
+		if (dst_refcnt(&rt->u.dst) == 0 &&
+		    time_after_eq(now, dst_lastuse(&rt->u.dst) + gc_args.timeout)) {
 			RT6_TRACE("aging clone %p\n", rt);
 			return -1;
 		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
Index: alloc_percpu-2.6.13-rc6/net/ipv6/route.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv6/route.c	2005-08-15 17:54:34.973639000 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv6/route.c	2005-08-15 17:58:14.507359000 -0400
@@ -368,10 +368,9 @@
 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
 	rt = rt6_device_match(fn->leaf, oif, strict);
 	dst_hold(&rt->u.dst);
-	rt->u.dst.__use++;
-	read_unlock_bh(&rt6_lock);
 
-	rt->u.dst.lastuse = jiffies;
+	read_unlock_bh(&rt6_lock);
+	dst_update_tu(&rt->u.dst);
 	if (rt->u.dst.error == 0)
 		return rt;
 	dst_release(&rt->u.dst);
@@ -512,8 +511,7 @@
 out:
 	read_unlock_bh(&rt6_lock);
 out2:
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.__use++;
+	dst_update_tu(&rt->u.dst);
 	skb->dst = (struct dst_entry *) rt;
 }
 
@@ -572,8 +570,7 @@
 out:
 	read_unlock_bh(&rt6_lock);
 out2:
-	rt->u.dst.lastuse = jiffies;
-	rt->u.dst.__use++;
+	dst_update_tu(&rt->u.dst);
 	return &rt->u.dst;
 }
 
@@ -685,7 +682,7 @@
 	rt->rt6i_dev	  = dev;
 	rt->rt6i_idev     = idev;
 	rt->rt6i_nexthop  = neigh;
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
@@ -719,7 +716,7 @@
 	pprev = &ndisc_dst_gc_list;
 	freed = 0;
 	while ((dst = *pprev) != NULL) {
-		if (!atomic_read(&dst->__refcnt)) {
+		if (!dst_refcnt(dst)) {
 			*pprev = dst->next;
 			dst_free(dst);
 			freed++;
@@ -1261,7 +1258,7 @@
 		rt->rt6i_idev = ort->rt6i_idev;
 		if (rt->rt6i_idev)
 			in6_dev_hold(rt->rt6i_idev);
-		rt->u.dst.lastuse = jiffies;
+		dst_lastuse_set(&rt->u.dst);
 		rt->rt6i_expires = 0;
 
 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
@@ -1424,7 +1421,7 @@
 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
 	rt->rt6i_dst.plen = 128;
 
-	atomic_set(&rt->u.dst.__refcnt, 1);
+	dst_refcnt_one(&rt->u.dst);
 
 	return rt;
 }
@@ -1637,13 +1634,13 @@
 	if (rt->u.dst.dev)
 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
-	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
+	ci.rta_lastuse = jiffies_to_clock_t(jiffies - dst_lastuse(&rt->u.dst));
 	if (rt->rt6i_expires)
 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
 	else
 		ci.rta_expires = 0;
-	ci.rta_used = rt->u.dst.__use;
-	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
+	ci.rta_used = dst_use(&rt->u.dst);
+	ci.rta_clntref = dst_refcnt(&rt->u.dst);
 	ci.rta_error = rt->u.dst.error;
 	ci.rta_id = 0;
 	ci.rta_ts = 0;
@@ -1927,8 +1924,8 @@
 	}
 	arg->len += sprintf(arg->buffer + arg->len,
 			    " %08x %08x %08x %08x %8s\n",
-			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
-			    rt->u.dst.__use, rt->rt6i_flags, 
+			    rt->rt6i_metric, dst_refcnt(&rt->u.dst),
+			    dst_use(&rt->u.dst), rt->rt6i_flags,
 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
 	return 0;
 }
Index: alloc_percpu-2.6.13-rc6/net/ipv6/xfrm6_policy.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/ipv6/xfrm6_policy.c	2005-08-15 17:54:34.977639250 -0400
+++ alloc_percpu-2.6.13-rc6/net/ipv6/xfrm6_policy.c	2005-08-15 17:55:24.160713000 -0400
@@ -156,7 +156,7 @@
 			dev_hold(rt->u.dst.dev);
 		dst_prev->obsolete	= -1;
 		dst_prev->flags	       |= DST_HOST;
-		dst_prev->lastuse	= jiffies;
+		dst_lastuse_set(dst_prev);
 		dst_prev->header_len	= header_len;
 		dst_prev->trailer_len	= trailer_len;
 		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
Index: alloc_percpu-2.6.13-rc6/net/xfrm/xfrm_policy.c
===================================================================
--- alloc_percpu-2.6.13-rc6.orig/net/xfrm/xfrm_policy.c	2005-08-15 17:54:34.977639250 -0400
+++ alloc_percpu-2.6.13-rc6/net/xfrm/xfrm_policy.c	2005-08-15 17:55:24.184714500 -0400
@@ -1090,7 +1090,7 @@
 
 static int unused_bundle(struct dst_entry *dst)
 {
-	return !atomic_read(&dst->__refcnt);
+	return !dst_refcnt(dst);
 }
 
 static void __xfrm_garbage_collect(void)

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
       [not found] <20050913155112.GB3570@localhost.localdomain>
  2005-09-13 16:10 ` [patch 7/11] net: Use bigrefs for net_device.refcount Ravikiran G Thirumalai
  2005-09-13 16:12 ` [patch 8/11] net: dst_abstraction macros Ravikiran G Thirumalai
@ 2005-09-13 16:17 ` Ravikiran G Thirumalai
  2005-09-13 20:24   ` David S. Miller
  2 siblings, 1 reply; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 16:17 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, dipankar, bharata, shai, Rusty Russell, netdev,
	davem

Patch to use alloc_percpu for the dst_entry refcnt, use and lastuse fields.
This patch reduces the cacheline bouncing of the atomic_t dst_entry.__refcnt.
This patch gets us 55% better tbench throughput on an 8way x445 box.

Signed-off-by: Pravin B. Shelar <pravins@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Ravikiran Thirumalai <kirant@scalex86.org>

Index: alloc_percpu-2.6.13/include/net/dst.h
===================================================================
--- alloc_percpu-2.6.13.orig/include/net/dst.h	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/include/net/dst.h	2005-09-12 16:44:05.000000000 -0700
@@ -35,11 +35,33 @@
 
 struct sk_buff;
 
+#ifdef CONFIG_NUMA
+
+/*	A per-cpu instance of this exists for every dst_entry.
+ *	These are the most frequently written fields of dst_entry.
+ */
+struct per_cpu_cnt
+{
+	int 		refcnt;
+	int 		use;
+	unsigned long	lastuse;
+};
+
+#endif
+
 struct dst_entry
 {
 	struct dst_entry        *next;
+#ifdef CONFIG_NUMA
+	/* first cpu that should be checked for time-out */
+	int 			s_cpu;
+	/* per cpu client references   */
+	struct per_cpu_cnt	*pcc;
+#else
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
+	unsigned long		lastuse;
+#endif
 	struct dst_entry	*child;
 	struct net_device       *dev;
 	short			error;
@@ -50,7 +72,6 @@
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
 #define DST_BALANCED            0x10
-	unsigned long		lastuse;
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
@@ -103,25 +124,94 @@
 
 #ifdef __KERNEL__
 
+#ifdef CONFIG_NUMA
+
+static inline int dst_use(struct dst_entry *dst)
+{
+	int total = 0, cpu;
+
+	for_each_online_cpu(cpu)
+		total += per_cpu_ptr(dst->pcc, cpu)->use;
+	return total;
+}
+
+#define dst_use_inc(__dst) do {					\
+		per_cpu_ptr((__dst)->pcc, get_cpu())->use++ ;	\
+		put_cpu();					\
+	} while(0);
+
+static inline unsigned long dst_lastuse(struct dst_entry *dst)
+{
+	unsigned long max = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		if (max < per_cpu_ptr(dst->pcc, cpu)->lastuse)
+			max = per_cpu_ptr(dst->pcc, cpu)->lastuse;
+	return max;
+}
+
+#define dst_lastuse_set(__dst)  do {					  \
+		per_cpu_ptr((__dst)->pcc, get_cpu())->lastuse = jiffies ; \
+		put_cpu();						  \
+	} while(0);
+
+static inline int dst_refcnt(struct dst_entry *dst)
+{
+	int cpu, sum = 0;
+
+	for_each_online_cpu(cpu)
+		sum += per_cpu_ptr(dst->pcc, cpu)->refcnt;
+
+	return sum;
+}
+
+#define dst_refcnt_one(__dst) do { 					  \
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt = 1; \
+			put_cpu();		  			  \
+		} while(0);
+
+#define dst_refcnt_dec(__dst) do { 					\
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt--;	\
+			put_cpu();					\
+		} while(0);
+#define dst_hold(__dst) do { 						 \
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt++ ; \
+			put_cpu();					 \
+		} while(0);
+
+#else
+
 #define dst_use(__dst) (__dst)->__use
 #define dst_use_inc(__dst) (__dst)->__use++
 
 #define dst_lastuse(__dst) (__dst)->lastuse
 #define dst_lastuse_set(__dst) (__dst)->lastuse = jiffies
 
-#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
-#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
-
 #define dst_refcnt(__dst) atomic_read(&(__dst)->__refcnt)
 #define dst_refcnt_one(__dst) atomic_set(&(__dst)->__refcnt, 1)
 #define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__refcnt)
 #define dst_hold(__dst) atomic_inc(&(__dst)->__refcnt)
 
+#endif
+#define dst_update_tu(__dst) do { 		\
+		dst_lastuse_set(__dst);		\
+		dst_use_inc(__dst); 		\
+	} while (0);
+
+#define dst_update_rtu(__dst) do { 		\
+		dst_lastuse_set(__dst);		\
+		dst_hold(__dst);		\
+		dst_use_inc(__dst); 		\
+	} while (0)
+
 static inline
 void dst_release(struct dst_entry * dst)
 {
 	if (dst) {
+#if  (!defined (CONFIG_NUMA) || (RT_CACHE_DEBUG >= 2 ))
 		WARN_ON(dst_refcnt(dst) < 1);
+#endif
 		smp_mb__before_atomic_dec();
 		dst_refcnt_dec(dst);
 	}
@@ -271,6 +361,48 @@
 
 extern void		dst_init(void);
 
+/*	This function allocates and initializes rtu array of given dst-entry.
+ */
+static inline int dst_init_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	int cpu;
+	dst->pcc = alloc_percpu(struct per_cpu_cnt, GFP_ATOMIC);
+	if(!dst->pcc)
+		return -ENOMEM;
+
+	for_each_cpu(cpu) {
+		per_cpu_ptr(dst->pcc, cpu)->use = 0;
+		per_cpu_ptr(dst->pcc, cpu)->refcnt = 0;
+		per_cpu_ptr(dst->pcc, cpu)->lastuse = jiffies;
+	}
+	dst->s_cpu = smp_processor_id();
+#else
+	atomic_set(&dst->__refcnt, 0);
+	dst->lastuse = jiffies;
+#endif
+	return 0;
+}
+
+static inline void dst_free_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	free_percpu(dst->pcc);
+#endif
+}
+
+#if	defined (CONFIG_HOTPLUG_CPU) && defined (CONFIG_NUMA)
+inline static void dst_ref_xfr_cpu_down(struct dst_entry *__dst, int cpu)
+{
+	int refcnt = per_cpu_ptr((__dst)->pcc, cpu)->refcnt;
+	if (refcnt) {
+		per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt += refcnt;
+		put_cpu();
+		per_cpu_ptr((__dst)->pcc, cpu)->refcnt = 0;
+	}
+}
+#endif
+
 struct flowi;
 #ifndef CONFIG_XFRM
 static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
Index: alloc_percpu-2.6.13/net/bridge/br_netfilter.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/bridge/br_netfilter.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/bridge/br_netfilter.c	2005-09-12 12:24:01.000000000 -0700
@@ -85,7 +85,6 @@
 static struct rtable __fake_rtable = {
 	.u = {
 		.dst = {
-			.__refcnt		= ATOMIC_INIT(1),
 			.dev			= &__fake_net_device,
 			.path			= &__fake_rtable.u.dst,
 			.metrics		= {[RTAX_MTU - 1] = 1500},
@@ -1010,6 +1009,10 @@
 {
 	int i;
 
+	if (dst_init_rtu_array(&__fake_rtable.u.dst) < 0)
+		panic("br_netfilter : cannot allocate memory for dst-entry rtu array");
+	dst_refcnt_one(&__fake_rtable.u.dst);
+
 	for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) {
 		int ret;
 
@@ -1046,4 +1049,5 @@
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(brnf_sysctl_header);
 #endif
+	dst_free_rtu_array(&__fake_rtable.u.dst);
 }
Index: alloc_percpu-2.6.13/net/core/dst.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/core/dst.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/core/dst.c	2005-09-12 12:24:01.000000000 -0700
@@ -131,9 +131,9 @@
 	if (!dst)
 		return NULL;
 	memset(dst, 0, ops->entry_size);
-	atomic_set(&dst->__refcnt, 0);
+	if (dst_init_rtu_array(dst) < 0)
+		return NULL;
 	dst->ops = ops;
-	dst->lastuse = jiffies;
 	dst->path = dst;
 	dst->input = dst_discard_in;
 	dst->output = dst_discard_out;
@@ -200,6 +200,7 @@
 #if RT_CACHE_DEBUG >= 2 
 	atomic_dec(&dst_total);
 #endif
+	dst_free_rtu_array(dst);
 	kmem_cache_free(dst->ops->kmem_cachep, dst);
 
 	dst = child;
Index: alloc_percpu-2.6.13/net/decnet/dn_route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/decnet/dn_route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/decnet/dn_route.c	2005-09-12 12:24:01.000000000 -0700
@@ -77,6 +77,7 @@
 #include <linux/netfilter_decnet.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <asm/errno.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
@@ -157,7 +158,29 @@
 
 static inline int dn_dst_useful(struct dn_route *rth, unsigned long now, unsigned long expire)
 {
+#ifdef CONFIG_NUMA
+	{
+		int max, sum = 0, age, cpu;
+		struct dst_entry *dst = &rth->u.dst;
+
+		cpu = dst->s_cpu;
+		max = cpu + NR_CPUS;
+		for(sum = 0; cpu < max; cpu++) {
+			int cpu_ = cpu % NR_CPUS;
+			if (cpu_online(cpu_)) {
+				sum += per_cpu_ptr(dst->pcc, cpu_)->refcnt;
+				age = now - per_cpu_ptr(dst->pcc, cpu_)->lastuse;
+				if (age <= expire) {
+					dst->s_cpu = cpu_ ;
+					return 1;
+				}
+			}
+		}
+		return (sum != 0);
+	}
+#else
 	return  (atomic_read(&rth->u.dst.__refcnt) || (now - rth->u.dst.lastuse) < expire) ;
+#endif
 }
 
 static void dn_dst_check_expire(unsigned long dummy)
@@ -1766,6 +1789,43 @@
 
 #endif /* CONFIG_PROC_FS */
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit dn_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int cpu = (int) hcpu;
+
+	switch(action) {
+		int i;
+		struct dn_route *rt, *next;
+
+		case CPU_DEAD:
+
+		for(i = 0; i < dn_rt_hash_mask; i++) {
+			spin_lock_bh(&dn_rt_hash_table[i].lock);
+
+			if ((rt = dn_rt_hash_table[i].chain) == NULL)
+				goto nothing_to_do;
+
+			for(; rt; rt=next) {
+				dst_ref_xfr_cpu_down(&rt->u.dst, cpu);
+				next = rt->u.rt_next;
+			}
+nothing_to_do:
+			spin_unlock_bh(&dn_rt_hash_table[i].lock);
+		}
+
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dn_rtcache_cpu_notifier =
+			{ &dn_rtcache_cpu_callback, NULL, 0 };
+
+#endif
+
 void __init dn_route_init(void)
 {
 	int i, goal, order;
@@ -1822,10 +1882,16 @@
         dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
 
 	proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
+#if	defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 }
 
 void __exit dn_route_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 	del_timer(&dn_route_timer);
 	dn_run_flush(0);
 
Index: alloc_percpu-2.6.13/net/ipv4/route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv4/route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv4/route.c	2005-09-12 12:24:01.000000000 -0700
@@ -92,6 +92,7 @@
 #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -507,6 +508,54 @@
 		rth->u.dst.expires;
 }
 
+#ifdef CONFIG_NUMA
+
+/*
+ * For NUMA systems, we do not want to sum up all local cpu refcnts every
+ * time. So we consider lastuse element of the dst_entry and start loop
+ * with the cpu where this entry was allocated. If dst_entry is not timed
+ * out then update s_cpu of this dst_entry so that next time we can start from
+ * that cpu.
+ */
+static inline int rt_check_age(struct rtable *rth,
+			unsigned long tmo1, unsigned long tmo2)
+{
+	int max, sum = 0, age, idx;
+	struct dst_entry *dst = &rth->u.dst;
+	unsigned long now = jiffies;
+
+	idx = dst->s_cpu;
+	max = idx + NR_CPUS;
+	for(sum = 0; idx < max; idx++) {
+		int cpu_ = idx % NR_CPUS;
+		if (cpu_online(cpu_)) {
+			sum += per_cpu_ptr(dst->pcc, cpu_)->refcnt;
+			age = now - per_cpu_ptr(dst->pcc, cpu_)->lastuse;
+			if ((age <= tmo1 && !rt_fast_clean(rth)) ||
+					(age <= tmo2 && rt_valuable(rth))) {
+				dst->s_cpu = cpu_ ;
+				return 0;
+			}
+		}
+	}
+	return (sum == 0);
+}
+
+/*
+ * In this function order of examining three factors (ref_cnt, expires,
+ * lastuse) is changed, considering the cost of analyzing refcnt and lastuse
+ * which are localized for each cpu on NUMA.
+ */
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+	if (rth->u.dst.expires && time_after_eq(jiffies, rth->u.dst.expires))
+		return (dst_refcnt(&rth->u.dst) == 0) ;
+
+	return rt_check_age(rth, tmo1, tmo2);
+}
+
+#else
+
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
@@ -529,6 +578,8 @@
 out:	return ret;
 }
 
+#endif
+
 /* Bits of score are:
  * 31: very valuable
  * 30: not quite useless
@@ -1108,8 +1159,19 @@
 
 void ip_rt_copy(struct rtable *to, struct rtable *from)
 {
+#ifdef CONFIG_NUMA
+	struct per_cpu_cnt *tmp_pnc;
+	tmp_pnc = to->u.dst.pcc;
+
+	*to = *from;
+	to->u.dst.pcc = tmp_pnc;
+	per_cpu_ptr(to->u.dst.pcc,get_cpu())->use = 1;
+	to->u.dst.s_cpu = smp_processor_id();
+	put_cpu();
+#else
 	*to = *from;
 	to->u.dst.__use 	= 1;
+#endif
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3108,6 +3170,33 @@
 }
 __setup("rhash_entries=", set_rhash_entries);
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+				   void *hcpu)
+{
+	int cpu = (int) hcpu;
+
+	switch(action) {
+		int i ;
+		struct rtable *rth;
+		case CPU_DEAD:
+			for(i = rt_hash_mask; i >= 0; i--) {
+				spin_lock_irq(rt_hash_lock_addr(i));
+				rth = rt_hash_table[i].chain;
+				while(rth) {
+					dst_ref_xfr_cpu_down(&rth->u.dst, cpu);
+					rth = rth->u.rt_next;
+				}
+				spin_unlock_irq(rt_hash_lock_addr(i));
+			}
+			break;
+	}
+	return NOTIFY_OK;
+}
+static struct notifier_block rtcache_cpu_notifier = { &rtcache_cpu_callback, NULL, 0 };
+#endif
+
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3197,6 +3286,9 @@
 	xfrm_init();
 	xfrm4_init();
 #endif
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&rtcache_cpu_notifier);
+#endif
 	return rc;
 }
 
Index: alloc_percpu-2.6.13/net/ipv6/ip6_fib.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv6/ip6_fib.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv6/ip6_fib.c	2005-09-12 12:24:01.000000000 -0700
@@ -1209,6 +1209,35 @@
 	spin_unlock_bh(&fib6_gc_lock);
 }
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+#include <linux/cpu.h>
+inline static int rt6_ref_xfr_cpu_down(struct rt6_info *rt, void *arg)
+{
+	dst_ref_xfr_cpu_down(&rt->u.dst, (int)arg);
+	return 0;
+}
+
+static int __devinit ipv6_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int cpu = (int) hcpu;
+
+	switch(action) {
+		case CPU_DEAD:
+			write_lock_bh(&rt6_lock);
+			fib6_clean_tree(&ip6_routing_table, rt6_ref_xfr_cpu_down,
+					0, (void *)cpu);
+			write_unlock_bh(&rt6_lock);
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipv6_rtcache_cpu_notifier =
+				{ &ipv6_rtcache_cpu_callback, NULL, 0 };
+#endif
+
 void __init fib6_init(void)
 {
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
@@ -1217,10 +1246,16 @@
 					   NULL, NULL);
 	if (!fib6_node_kmem)
 		panic("cannot create fib6_nodes cache");
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 }
 
 void fib6_gc_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 	del_timer(&ip6_fib_timer);
 	kmem_cache_destroy(fib6_node_kmem);
 }
Index: alloc_percpu-2.6.13/net/ipv6/route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv6/route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv6/route.c	2005-09-12 12:24:01.000000000 -0700
@@ -110,8 +110,6 @@
 struct rt6_info ip6_null_entry = {
 	.u = {
 		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
 			.dev		= &loopback_dev,
 			.obsolete	= -1,
 			.error		= -ENETUNREACH,
@@ -2104,6 +2102,10 @@
 						     NULL, NULL);
 	if (!ip6_dst_ops.kmem_cachep)
 		panic("cannot create ip6_dst_cache");
+	if (dst_init_rtu_array(&ip6_null_entry.u.dst) < 0)
+		panic("ip6_route : can't allocate memory for dst-entry array");
+	dst_use_inc(&ip6_null_entry.u.dst);
+	dst_refcnt_one(&ip6_null_entry.u.dst);
 
 	fib6_init();
 #ifdef 	CONFIG_PROC_FS
@@ -2130,4 +2132,5 @@
 	rt6_ifdown(NULL);
 	fib6_gc_cleanup();
 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+	dst_free_rtu_array(&ip6_null_entry.u.dst);
 }

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 16:10 ` [patch 7/11] net: Use bigrefs for net_device.refcount Ravikiran G Thirumalai
@ 2005-09-13 16:26   ` Stephen Hemminger
  2005-09-13 16:35     ` Ben Greear
  2005-09-13 20:26     ` David S. Miller
  2005-09-13 18:27   ` Eric Dumazet
  1 sibling, 2 replies; 16+ messages in thread
From: Stephen Hemminger @ 2005-09-13 16:26 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, linux-kernel, dipankar, bharata, shai,
	Rusty Russell, netdev, davem

On Tue, 13 Sep 2005 09:10:12 -0700
Ravikiran G Thirumalai <kiran@scalex86.org> wrote:

> The net_device has a refcnt used to keep track of it's uses.
> This is used at the time of unregistering the network device
> (module unloading ..) (see netdev_wait_allrefs) .
> For loopback_dev , this refcnt increment/decrement  is causing
> unnecessary traffic on the interlink for NUMA system
> affecting it's performance.  This patch improves tbench numbers by 6% on a
> 8way x86 Xeon (x445).
> 

Since when is bringing a network device up/down performance critical?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 16:26   ` Stephen Hemminger
@ 2005-09-13 16:35     ` Ben Greear
  2005-09-13 16:46       ` Stephen Hemminger
  2005-09-13 20:26     ` David S. Miller
  1 sibling, 1 reply; 16+ messages in thread
From: Ben Greear @ 2005-09-13 16:35 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Ravikiran G Thirumalai, Andrew Morton, linux-kernel, dipankar,
	bharata, shai, Rusty Russell, netdev, davem

Stephen Hemminger wrote:
> On Tue, 13 Sep 2005 09:10:12 -0700
> Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
> 
> 
>>The net_device has a refcnt used to keep track of it's uses.
>>This is used at the time of unregistering the network device
>>(module unloading ..) (see netdev_wait_allrefs) .
>>For loopback_dev , this refcnt increment/decrement  is causing
>>unnecessary traffic on the interlink for NUMA system
>>affecting it's performance.  This patch improves tbench numbers by 6% on a
>>8way x86 Xeon (x445).
>>
> 
> 
> Since when is bringing a network device up/down performance critical?

We grab and drop a reference for each poll of a device, roughly.

See dev_hold in _netif_rx_schedule(struct net_device *dev)
in include/netdevice.h, for instance.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 16:35     ` Ben Greear
@ 2005-09-13 16:46       ` Stephen Hemminger
  0 siblings, 0 replies; 16+ messages in thread
From: Stephen Hemminger @ 2005-09-13 16:46 UTC (permalink / raw)
  To: Ben Greear
  Cc: Ravikiran G Thirumalai, Andrew Morton, linux-kernel, dipankar,
	bharata, shai, Rusty Russell, netdev, davem

On Tue, 13 Sep 2005 09:35:14 -0700
Ben Greear <greearb@candelatech.com> wrote:

> Stephen Hemminger wrote:
> > On Tue, 13 Sep 2005 09:10:12 -0700
> > Ravikiran G Thirumalai <kiran@scalex86.org> wrote:
> > 
> > 
> >>The net_device has a refcnt used to keep track of it's uses.
> >>This is used at the time of unregistering the network device
> >>(module unloading ..) (see netdev_wait_allrefs) .
> >>For loopback_dev , this refcnt increment/decrement  is causing
> >>unnecessary traffic on the interlink for NUMA system
> >>affecting it's performance.  This patch improves tbench numbers by 6% on a
> >>8way x86 Xeon (x445).
> >>
> > 
> > 
> > Since when is bringing a network device up/down performance critical?
> 
> We grab and drop a reference for each poll of a device, roughly.
> 
> See dev_hold in _netif_rx_schedule(struct net_device *dev)
> in include/netdevice.h, for instance.

Yeah, that would be an issue, especially since the rest of that
path is nicely per-cpu

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 16:10 ` [patch 7/11] net: Use bigrefs for net_device.refcount Ravikiran G Thirumalai
  2005-09-13 16:26   ` Stephen Hemminger
@ 2005-09-13 18:27   ` Eric Dumazet
  2005-09-13 18:53     ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 16+ messages in thread
From: Eric Dumazet @ 2005-09-13 18:27 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andrew Morton, linux-kernel, dipankar, bharata, shai,
	Rusty Russell, netdev, davem

Ravikiran G Thirumalai a écrit :
> The net_device has a refcnt used to keep track of it's uses.
> This is used at the time of unregistering the network device
> (module unloading ..) (see netdev_wait_allrefs) .
> For loopback_dev , this refcnt increment/decrement  is causing
> unnecessary traffic on the interlink for NUMA system
> affecting it's performance.  This patch improves tbench numbers by 6% on a
> 8way x86 Xeon (x445).
  ===================================================================
> --- alloc_percpu-2.6.13.orig/include/linux/netdevice.h	2005-08-28 16:41:01.000000000 -0700
> +++ alloc_percpu-2.6.13/include/linux/netdevice.h	2005-09-12 11:54:21.000000000 -0700
> @@ -37,6 +37,7 @@
>  #include <linux/config.h>
>  #include <linux/device.h>
>  #include <linux/percpu.h>
> +#include <linux/bigref.h>
>  
>  struct divert_blk;
>  struct vlan_group;
> @@ -377,7 +378,7 @@
>  	/* device queue lock */
>  	spinlock_t		queue_lock;
>  	/* Number of references to this device */
> -	atomic_t		refcnt;
> +	struct bigref	        netdev_refcnt;	
>  	/* delayed register/unregister */
>  	struct list_head	todo_list;
>  	/* device name hash chain */
> @@ -677,11 +678,11 @@

Hum...

Did you try placing refcnt/netdev_refcnt in a separate cache line from
queue_lock? I got good results too...

 >  	/* device queue lock */
 >  	spinlock_t		queue_lock;
 >  	/* Number of references to this device */
 > -	atomic_t		refcnt;
 > +	struct bigref	        netdev_refcnt ____cacheline_aligned_in_smp ;	
 >  	/* delayed register/unregister */
 >  	struct list_head	todo_list;
 >  	/* device name hash chain */

Every time a cpu takes the queue_lock spinlock, it exclusively gets one cache
line. If another cpu tries to access netdev_refcnt, it has to grab this cache
line (even if properly designed as per_cpu, there is still one shared field). In
fact the whole struct net_device should be re-ordered for SMP/NUMA performance.

Eric

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 18:27   ` Eric Dumazet
@ 2005-09-13 18:53     ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 18:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrew Morton, linux-kernel, dipankar, bharata, shai,
	Rusty Russell, netdev, davem

On Tue, Sep 13, 2005 at 08:27:52PM +0200, Eric Dumazet wrote:
> Ravikiran G Thirumalai a écrit :
> 
> Hum...
> 
> Did you try placing refcnt/netdev_refcnt in a separate cache line from
> queue_lock? I got good results too...
> 
> >  	/* device queue lock */
> >  	spinlock_t		queue_lock;
> >  	/* Number of references to this device */
> > -	atomic_t		refcnt;
> > +	struct bigref	        netdev_refcnt ____cacheline_aligned_in_smp ; 
> >  	/* delayed register/unregister */
> >  	struct list_head	todo_list;
> >  	/* device name hash chain */
> 
> Every time a cpu takes the queue_lock spinlock, it exclusively gets one
> cache line. If another cpu tries to access netdev_refcnt, it has to grab this
> cache line (even if properly designed as per_cpu, there is still one shared
> field). In fact the whole struct net_device should be re-ordered for
> SMP/NUMA performance.

I agree. Maybe placing the queue_lock in a different cacheline is the 
right approach?

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 16:17 ` [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu Ravikiran G Thirumalai
@ 2005-09-13 20:24   ` David S. Miller
  2005-09-13 22:07     ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 16+ messages in thread
From: David S. Miller @ 2005-09-13 20:24 UTC (permalink / raw)
  To: kiran; +Cc: akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev


There is no way in the world this enormous amount of NUMA
complexity is being added to the destination cache layer.

Sorry.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 16:26   ` Stephen Hemminger
  2005-09-13 16:35     ` Ben Greear
@ 2005-09-13 20:26     ` David S. Miller
  2005-09-13 22:16       ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 16+ messages in thread
From: David S. Miller @ 2005-09-13 20:26 UTC (permalink / raw)
  To: shemminger
  Cc: kiran, akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev

From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 13 Sep 2005 09:26:59 -0700

> Since when is bringing a network device up/down performance critical?

The issue is the dev_get()'s that occur all over the place
during packet transmit/receive; that's what they are
trying to address.

I'm still against all of these invasive NUMA changes to the
networking though, they are simply too ugly and special cased
to consider seriously.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 20:24   ` David S. Miller
@ 2005-09-13 22:07     ` Ravikiran G Thirumalai
  2005-09-13 22:12       ` David S. Miller
  0 siblings, 1 reply; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 22:07 UTC (permalink / raw)
  To: David S. Miller
  Cc: akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev

On Tue, Sep 13, 2005 at 01:24:42PM -0700, David S. Miller wrote:
> 
> There is no way in the world this enormous amount of NUMA
> complexity is being added to the destination cache layer.

Agreed the dst changes are ugly; that can be worked on.  But the 
cacheline bouncing problem on the atomic_t dst_entry refcounter has been 
around for quite a while -- even on SMPs, not just NUMA.  We need a solution 
for that.  I thought you were against the dst_entry bloat caused by the 
previous version of the dst patch.  alloc_percpu takes that away.  You had
concerns about workloads with low route locality. Unfortunately we don't have
access to infrastructure setup for such tests :( 

As for the ugliness, would something on the lines of net_device refcounter 
patch in the series above be acceptable?

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 22:07     ` Ravikiran G Thirumalai
@ 2005-09-13 22:12       ` David S. Miller
  2005-09-13 23:17         ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 16+ messages in thread
From: David S. Miller @ 2005-09-13 22:12 UTC (permalink / raw)
  To: kiran; +Cc: akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev

From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 13 Sep 2005 15:07:37 -0700

> Agreed the dst changes are ugly; that can be worked on.  But the
> cacheline bouncing problem on the atomic_t dst_entry refcounter has
> been around for quite a while -- even on SMPs, not just NUMA.  We
> need a solution for that.  I thought you were against the dst_entry
> bloat caused by the previous version of the dst patch.  alloc_percpu
> takes that away.  You had concerns about workloads with low route
> locality. Unfortunately we don't have access to infrastructure setup
> for such tests :(

You don't have two computers connected on a network?

All you need is that, load a bunch of routes into one system that
point to an IP address which you just force an ARP entry for (so it
just gets lost in the ether) and then generate a rDOS workload through
it from another machine using pktgen.

I'm fine with funny per-cpu memory allocation strategies, perhaps
(would have to see a patch doing _only_ that to be sure).

But using bigrefs, no way.  We have enough trouble making the data
structures small without adding bloat like that.  A busy server can
have hundreds of thousands of dst cache entries active on it, and they
chew up enough memory as is.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 7/11] net: Use bigrefs for net_device.refcount
  2005-09-13 20:26     ` David S. Miller
@ 2005-09-13 22:16       ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 22:16 UTC (permalink / raw)
  To: David S. Miller
  Cc: shemminger, akpm, linux-kernel, dipankar, bharata, shai, rusty,
	netdev

On Tue, Sep 13, 2005 at 01:26:07PM -0700, David S. Miller wrote:
> From: Stephen Hemminger <shemminger@osdl.org>
> Date: Tue, 13 Sep 2005 09:26:59 -0700
> 
> > Since when is bringing a network device up/down performance critical?
> 
> The issue is the dev_get()'s that occur all over the place
> during packet transmit/receive; that's what they are
> trying to address.
> 
> I'm still against all of these invasive NUMA changes to the
> networking though, they are simply too ugly and special cased
> to consider seriously.

All of them or the dst ones?  Hopefully the netdevice refcounter patch
is not ugly or complicated as the dst ones? And why are they special cased?
Are networking workloads with high route locality not interesting?

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 22:12       ` David S. Miller
@ 2005-09-13 23:17         ` Ravikiran G Thirumalai
  2005-09-13 23:27           ` David S. Miller
  0 siblings, 1 reply; 16+ messages in thread
From: Ravikiran G Thirumalai @ 2005-09-13 23:17 UTC (permalink / raw)
  To: David S. Miller
  Cc: akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev

On Tue, Sep 13, 2005 at 03:12:16PM -0700, David S. Miller wrote:
> From: Ravikiran G Thirumalai <kiran@scalex86.org>
> Date: Tue, 13 Sep 2005 15:07:37 -0700
> ...
> But using bigrefs, no way.  We have enough trouble making the data
> structures small without adding bloat like that.  A busy server can
> have hundreds of thousands of dst cache entries active on it, and they
> chew up enough memory as is.
> 

But even 1 Million dst cache entries would be 16+4 MB additional for a 4 cpu 
box....is that too much?  The alloc_percpu reimplementation interleaves
objects on cache lines, unlike the existing implementation which pads per-cpu
objects to cache lines...

If you are referring to embedded routing devices,
would they use CONFIG_NUMA or CONFIG_SMP?? (bigrefs nicely fold back to
regular atomic_t s on UPs)

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 23:17         ` Ravikiran G Thirumalai
@ 2005-09-13 23:27           ` David S. Miller
  2005-09-14  7:21             ` Rusty Russell
  0 siblings, 1 reply; 16+ messages in thread
From: David S. Miller @ 2005-09-13 23:27 UTC (permalink / raw)
  To: kiran; +Cc: akpm, linux-kernel, dipankar, bharata, shai, rusty, netdev

From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 13 Sep 2005 16:17:17 -0700

> But even 1 Million dst cache entries would be 16+4 MB additional for
> a 4 cpu box....is that too much?

Absolutely.

Per-cpu counters are great for things like single instance
statistics et al.  But once you start doing them per-object
that's out of control bloat as far as I'm concerned.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu
  2005-09-13 23:27           ` David S. Miller
@ 2005-09-14  7:21             ` Rusty Russell
  0 siblings, 0 replies; 16+ messages in thread
From: Rusty Russell @ 2005-09-14  7:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: kiran, akpm, linux-kernel, dipankar, bharata, shai, netdev

On Tue, 2005-09-13 at 16:27 -0700, David S. Miller wrote:
> From: Ravikiran G Thirumalai <kiran@scalex86.org>
> Date: Tue, 13 Sep 2005 16:17:17 -0700
> 
> > But even 1 Million dst cache entries would be 16+4 MB additional for
> > a 4 cpu box....is that too much?
> 
> Absolutely.
> 
> Per-cpu counters are great for things like single instance
> statistics et al.  But once you start doing them per-object
> that's out of control bloat as far as I'm concerned.

This is why my original per-cpu allocator patch was damn slow, and
GFP_KERNEL only.  I wasn't convinced that high-churn objects are a good
fit for spreading across cpus.

I thought that net devices and modules (which use a primitive
hard-coded "bigref" currently) were fair uses for bigrefs, though I'd
like to see some stats.

Cheers,
Rusty.
-- 
A bad analogy is like a leaky screwdriver -- Richard Braakman

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2005-09-14  7:21 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20050913155112.GB3570@localhost.localdomain>
2005-09-13 16:10 ` [patch 7/11] net: Use bigrefs for net_device.refcount Ravikiran G Thirumalai
2005-09-13 16:26   ` Stephen Hemminger
2005-09-13 16:35     ` Ben Greear
2005-09-13 16:46       ` Stephen Hemminger
2005-09-13 20:26     ` David S. Miller
2005-09-13 22:16       ` Ravikiran G Thirumalai
2005-09-13 18:27   ` Eric Dumazet
2005-09-13 18:53     ` Ravikiran G Thirumalai
2005-09-13 16:12 ` [patch 8/11] net: dst_abstraction macros Ravikiran G Thirumalai
2005-09-13 16:17 ` [patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu Ravikiran G Thirumalai
2005-09-13 20:24   ` David S. Miller
2005-09-13 22:07     ` Ravikiran G Thirumalai
2005-09-13 22:12       ` David S. Miller
2005-09-13 23:17         ` Ravikiran G Thirumalai
2005-09-13 23:27           ` David S. Miller
2005-09-14  7:21             ` Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).