netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 0/4] more stuff for 2.6.22
@ 2007-03-12 21:08 Stephen Hemminger
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
                   ` (3 more replies)
  0 siblings, 4 replies; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Minor section rearrangements and /proc/net/ptype 

--
Stephen Hemminger <shemminger@linux-foundation.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 1/4] network dev read_mostly
  2007-03-12 21:08 [patch 0/4] more stuff for 2.6.22 Stephen Hemminger
@ 2007-03-12 21:08 ` Stephen Hemminger
  2007-03-12 21:34   ` David Miller
                     ` (2 more replies)
  2007-03-12 21:08 ` [patch 2/4] net: make seq_operations const Stephen Hemminger
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: dev-read-mostly.patch --]
[-- Type: text/plain, Size: 1293 bytes --]

For Eric, mark packet type and network device watermarks
as read mostly.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
 net/core/dev.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3f0c468..c82a56b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,8 +146,8 @@ #include <linux/ctype.h>
  */
 
 static DEFINE_SPINLOCK(ptype_lock);
-static struct list_head ptype_base[16];	/* 16 way hashed list */
-static struct list_head ptype_all;		/* Taps */
+static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
+static struct list_head ptype_all __read_mostly;	/* Taps */
 
 #ifdef CONFIG_NET_DMA
 static struct dma_client *net_dma_client;
@@ -1535,9 +1535,9 @@ out:
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 1000;
-int netdev_budget = 300;
-int weight_p = 64;            /* old backlog weight */
+int netdev_max_backlog __read_mostly = 1000;
+int netdev_budget __read_mostly = 300;
+int weight_p __read_mostly = 64;            /* old backlog weight */
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
-- 
1.4.1

--
Stephen Hemminger <shemminger@linux-foundation.org>


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [patch 2/4] net: make seq_operations const
  2007-03-12 21:08 [patch 0/4] more stuff for 2.6.22 Stephen Hemminger
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
@ 2007-03-12 21:08 ` Stephen Hemminger
  2007-03-12 21:34   ` David Miller
  2007-03-12 21:08 ` [patch 3/4] net: show bound packet types Stephen Hemminger
  2007-03-12 21:08 ` [patch 4/4] tcp: statistics not read_mostly Stephen Hemminger
  3 siblings, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: net-seq-ops-const.patch --]
[-- Type: text/plain, Size: 6357 bytes --]

The seq_file operations stuff can be marked constant to
get it out of dirty cache.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
 net/core/dev.c       |    4 ++--
 net/core/dev_mcast.c |    2 +-
 net/core/neighbour.c |    2 +-
 net/core/sock.c      |    2 +-
 net/core/wireless.c  |    2 +-
 net/ipv4/arp.c       |    2 +-
 net/ipv4/fib_hash.c  |    2 +-
 net/ipv4/fib_trie.c  |    4 ++--
 net/ipv4/igmp.c      |    4 ++--
 net/ipv4/ipmr.c      |    4 ++--
 net/ipv4/raw.c       |    2 +-
 net/ipv4/route.c     |    4 ++--
 12 files changed, 17 insertions(+), 17 deletions(-)

--- net-2.6.22.orig/net/core/dev.c	2007-03-12 12:29:38.000000000 -0700
+++ net-2.6.22/net/core/dev.c	2007-03-12 12:29:39.000000000 -0700
@@ -2177,7 +2177,7 @@
 	return 0;
 }
 
-static struct seq_operations dev_seq_ops = {
+static const struct seq_operations dev_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
 	.stop  = dev_seq_stop,
@@ -2197,7 +2197,7 @@
 	.release = seq_release,
 };
 
-static struct seq_operations softnet_seq_ops = {
+static const struct seq_operations softnet_seq_ops = {
 	.start = softnet_seq_start,
 	.next  = softnet_seq_next,
 	.stop  = softnet_seq_stop,
--- net-2.6.22.orig/net/core/dev_mcast.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/core/dev_mcast.c	2007-03-12 12:29:39.000000000 -0700
@@ -264,7 +264,7 @@
 	return 0;
 }
 
-static struct seq_operations dev_mc_seq_ops = {
+static const struct seq_operations dev_mc_seq_ops = {
 	.start = dev_mc_seq_start,
 	.next  = dev_mc_seq_next,
 	.stop  = dev_mc_seq_stop,
--- net-2.6.22.orig/net/core/neighbour.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/core/neighbour.c	2007-03-12 12:29:39.000000000 -0700
@@ -2384,7 +2384,7 @@
 	return 0;
 }
 
-static struct seq_operations neigh_stat_seq_ops = {
+static const struct seq_operations neigh_stat_seq_ops = {
 	.start	= neigh_stat_seq_start,
 	.next	= neigh_stat_seq_next,
 	.stop	= neigh_stat_seq_stop,
--- net-2.6.22.orig/net/core/sock.c	2007-03-12 12:05:58.000000000 -0700
+++ net-2.6.22/net/core/sock.c	2007-03-12 12:29:39.000000000 -0700
@@ -1925,7 +1925,7 @@
 	return 0;
 }
 
-static struct seq_operations proto_seq_ops = {
+static const struct seq_operations proto_seq_ops = {
 	.start  = proto_seq_start,
 	.next   = proto_seq_next,
 	.stop   = proto_seq_stop,
--- net-2.6.22.orig/net/core/wireless.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/core/wireless.c	2007-03-12 12:29:39.000000000 -0700
@@ -660,7 +660,7 @@
 	return 0;
 }
 
-static struct seq_operations wireless_seq_ops = {
+static const struct seq_operations wireless_seq_ops = {
 	.start = dev_seq_start,
 	.next  = dev_seq_next,
 	.stop  = dev_seq_stop,
--- net-2.6.22.orig/net/ipv4/arp.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/arp.c	2007-03-12 12:29:39.000000000 -0700
@@ -1360,7 +1360,7 @@
 
 /* ------------------------------------------------------------------------ */
 
-static struct seq_operations arp_seq_ops = {
+static const struct seq_operations arp_seq_ops = {
 	.start  = arp_seq_start,
 	.next   = neigh_seq_next,
 	.stop   = neigh_seq_stop,
--- net-2.6.22.orig/net/ipv4/fib_hash.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/fib_hash.c	2007-03-12 12:29:39.000000000 -0700
@@ -1027,7 +1027,7 @@
 	return 0;
 }
 
-static struct seq_operations fib_seq_ops = {
+static const struct seq_operations fib_seq_ops = {
 	.start  = fib_seq_start,
 	.next   = fib_seq_next,
 	.stop   = fib_seq_stop,
--- net-2.6.22.orig/net/ipv4/fib_trie.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/fib_trie.c	2007-03-12 12:29:39.000000000 -0700
@@ -2332,7 +2332,7 @@
 	return 0;
 }
 
-static struct seq_operations fib_trie_seq_ops = {
+static const struct seq_operations fib_trie_seq_ops = {
 	.start  = fib_trie_seq_start,
 	.next   = fib_trie_seq_next,
 	.stop   = fib_trie_seq_stop,
@@ -2453,7 +2453,7 @@
 	return 0;
 }
 
-static struct seq_operations fib_route_seq_ops = {
+static const struct seq_operations fib_route_seq_ops = {
 	.start  = fib_trie_seq_start,
 	.next   = fib_trie_seq_next,
 	.stop   = fib_trie_seq_stop,
--- net-2.6.22.orig/net/ipv4/igmp.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/igmp.c	2007-03-12 12:29:39.000000000 -0700
@@ -2397,7 +2397,7 @@
 	return 0;
 }
 
-static struct seq_operations igmp_mc_seq_ops = {
+static const struct seq_operations igmp_mc_seq_ops = {
 	.start	=	igmp_mc_seq_start,
 	.next	=	igmp_mc_seq_next,
 	.stop	=	igmp_mc_seq_stop,
@@ -2571,7 +2571,7 @@
 	return 0;
 }
 
-static struct seq_operations igmp_mcf_seq_ops = {
+static const struct seq_operations igmp_mcf_seq_ops = {
 	.start	=	igmp_mcf_seq_start,
 	.next	=	igmp_mcf_seq_next,
 	.stop	=	igmp_mcf_seq_stop,
--- net-2.6.22.orig/net/ipv4/ipmr.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/ipmr.c	2007-03-12 12:29:39.000000000 -0700
@@ -1677,7 +1677,7 @@
 	return 0;
 }
 
-static struct seq_operations ipmr_vif_seq_ops = {
+static const struct seq_operations ipmr_vif_seq_ops = {
 	.start = ipmr_vif_seq_start,
 	.next  = ipmr_vif_seq_next,
 	.stop  = ipmr_vif_seq_stop,
@@ -1840,7 +1840,7 @@
 	return 0;
 }
 
-static struct seq_operations ipmr_mfc_seq_ops = {
+static const struct seq_operations ipmr_mfc_seq_ops = {
 	.start = ipmr_mfc_seq_start,
 	.next  = ipmr_mfc_seq_next,
 	.stop  = ipmr_mfc_seq_stop,
--- net-2.6.22.orig/net/ipv4/raw.c	2007-03-12 12:05:34.000000000 -0700
+++ net-2.6.22/net/ipv4/raw.c	2007-03-12 12:29:39.000000000 -0700
@@ -887,7 +887,7 @@
 	return 0;
 }
 
-static struct seq_operations raw_seq_ops = {
+static const struct seq_operations raw_seq_ops = {
 	.start = raw_seq_start,
 	.next  = raw_seq_next,
 	.stop  = raw_seq_stop,
--- net-2.6.22.orig/net/ipv4/route.c	2007-03-12 12:05:58.000000000 -0700
+++ net-2.6.22/net/ipv4/route.c	2007-03-12 12:29:39.000000000 -0700
@@ -364,7 +364,7 @@
 	return 0;
 }
 
-static struct seq_operations rt_cache_seq_ops = {
+static const struct seq_operations rt_cache_seq_ops = {
 	.start  = rt_cache_seq_start,
 	.next   = rt_cache_seq_next,
 	.stop   = rt_cache_seq_stop,
@@ -470,7 +470,7 @@
 	return 0;
 }
 
-static struct seq_operations rt_cpu_seq_ops = {
+static const struct seq_operations rt_cpu_seq_ops = {
 	.start  = rt_cpu_seq_start,
 	.next   = rt_cpu_seq_next,
 	.stop   = rt_cpu_seq_stop,

--
Stephen Hemminger <shemminger@linux-foundation.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 3/4] net: show bound packet types
  2007-03-12 21:08 [patch 0/4] more stuff for 2.6.22 Stephen Hemminger
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
  2007-03-12 21:08 ` [patch 2/4] net: make seq_operations const Stephen Hemminger
@ 2007-03-12 21:08 ` Stephen Hemminger
  2007-03-12 21:35   ` David Miller
  2007-03-12 21:08 ` [patch 4/4] tcp: statistics not read_mostly Stephen Hemminger
  3 siblings, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: ptype-proc.patch --]
[-- Type: text/plain, Size: 3854 bytes --]

Show what protocols are bound to what packet types in /proc/net/ptype
Uses kallsyms to decode function pointers if possible.
Example:
	Type Device      Function
	ALL  eth1     packet_rcv_spkt+0x0
	0800          ip_rcv+0x0
	0806          arp_rcv+0x0
	86dd          :ipv6:ipv6_rcv+0x0

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
 net/core/dev.c |  134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)

--- net-2.6.22.orig/net/core/dev.c	2007-03-12 12:29:39.000000000 -0700
+++ net-2.6.22/net/core/dev.c	2007-03-12 12:29:45.000000000 -0700
@@ -2217,6 +2217,135 @@
 	.release = seq_release,
 };
 
+static void *ptype_get_idx(loff_t pos)
+{
+	struct packet_type *pt = NULL;
+	loff_t i = 0;
+	int t;
+
+	list_for_each_entry_rcu(pt, &ptype_all, list) {
+		if (i == pos)
+			return pt;
+		++i;
+	}
+
+	for (t = 0; t < 16; t++) {
+		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
+			if (i == pos)
+				return pt;
+			++i;
+		}
+	}
+	return NULL;
+}
+
+static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	rcu_read_lock();
+	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct packet_type *pt;
+	struct list_head *nxt;
+	int hash;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ptype_get_idx(0);
+
+	pt = v;
+	nxt = pt->list.next;
+	if (pt->type == htons(ETH_P_ALL)) {
+		if (nxt != &ptype_all)
+			goto found;
+		hash = 0;
+		nxt = ptype_base[0].next;
+	} else
+		hash = ntohs(pt->type) & 15;
+
+	while (nxt == &ptype_base[hash]) {
+		if (++hash >= 16)
+			return NULL;
+		nxt = ptype_base[hash].next;
+	}
+found:
+	return list_entry(nxt, struct packet_type, list);
+}
+
+static void ptype_seq_stop(struct seq_file *seq, void *v)
+{
+	rcu_read_unlock();
+}
+
+static void ptype_seq_decode(struct seq_file *seq, void *sym)
+{
+#ifdef CONFIG_KALLSYMS
+	unsigned long offset = 0, symsize;
+	const char *symname;
+	char *modname;
+	char namebuf[128];
+
+	symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset,
+				  &modname, namebuf);
+
+	if (symname) {
+		char *delim = ":";
+
+		if (!modname)
+			modname = delim = "";
+		seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim,
+			   symname, offset);
+		return;
+	}
+#endif
+
+	seq_printf(seq, "[%p]", sym);
+}
+
+static int ptype_seq_show(struct seq_file *seq, void *v)
+{
+	struct packet_type *pt = v;
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, "Type Device      Function\n");
+	else {
+		if (pt->type == htons(ETH_P_ALL))
+			seq_puts(seq, "ALL ");
+		else
+			seq_printf(seq, "%04x", ntohs(pt->type));
+
+		seq_printf(seq, " %-8s ",
+			   pt->dev ? pt->dev->name : "");
+		ptype_seq_decode(seq,  pt->func);
+		seq_putc(seq, '\n');
+	}
+
+	return 0;
+}
+
+static const struct seq_operations ptype_seq_ops = {
+	.start = ptype_seq_start,
+	.next  = ptype_seq_next,
+	.stop  = ptype_seq_stop,
+	.show  = ptype_seq_show,
+};
+
+static int ptype_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ptype_seq_ops);
+}
+
+static const struct file_operations ptype_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ptype_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+
 #ifdef CONFIG_WIRELESS_EXT
 extern int wireless_proc_init(void);
 #else
@@ -2231,6 +2360,9 @@
 		goto out;
 	if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
 		goto out_dev;
+	if (!proc_net_fops_create("ptype", S_IRUGO, &ptype_seq_fops))
+		goto out_dev2;
+
 	if (wireless_proc_init())
 		goto out_softnet;
 	rc = 0;
@@ -2238,6 +2370,8 @@
 	return rc;
 out_softnet:
 	proc_net_remove("softnet_stat");
+out_dev2:
+	proc_net_remove("ptype");
 out_dev:
 	proc_net_remove("dev");
 	goto out;

--
Stephen Hemminger <shemminger@linux-foundation.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* [patch 4/4] tcp: statistics not read_mostly
  2007-03-12 21:08 [patch 0/4] more stuff for 2.6.22 Stephen Hemminger
                   ` (2 preceding siblings ...)
  2007-03-12 21:08 ` [patch 3/4] net: show bound packet types Stephen Hemminger
@ 2007-03-12 21:08 ` Stephen Hemminger
  2007-03-12 21:15   ` David Miller
  3 siblings, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: tcp_stats-not-mostly --]
[-- Type: text/plain, Size: 948 bytes --]

The TCP statistics shouldn't be located in the middle of the
read_mostly section surrounded by sysctl values.
Move EXPORT_SYMBOL next to data like other declarations near by.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

---
 net/ipv4/tcp.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- net-2.6.22.orig/net/ipv4/tcp.c	2007-03-12 14:05:26.000000000 -0700
+++ net-2.6.22/net/ipv4/tcp.c	2007-03-12 14:06:08.000000000 -0700
@@ -271,7 +271,8 @@
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
-DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
+EXPORT_SYMBOL(tcp_statistics);
 
 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 
@@ -2492,4 +2493,3 @@
 EXPORT_SYMBOL(tcp_sendpage);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
-EXPORT_SYMBOL(tcp_statistics);

--
Stephen Hemminger <shemminger@linux-foundation.org>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 4/4] tcp: statistics not read_mostly
  2007-03-12 21:08 ` [patch 4/4] tcp: statistics not read_mostly Stephen Hemminger
@ 2007-03-12 21:15   ` David Miller
  2007-03-12 21:26     ` Stephen Hemminger
  0 siblings, 1 reply; 23+ messages in thread
From: David Miller @ 2007-03-12 21:15 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 12 Mar 2007 14:08:21 -0700

> The TCP statistics shouldn't be located in the middle of the
> read_mostly section surrounded by sysctl values.
> Move EXPORT_SYMBOL next to data like other declarations near by.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Stephen, they are pointers to the statistics, not the statistic
counters themselves.  We're marking the _pointers_ as __read_mostly
here.

When Eric originally submitted the change to add __read_mostly here I
didn't understand it either.

Look at the definition of DEFINE_SNMP_STAT().

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 4/4] tcp: statistics not read_mostly
  2007-03-12 21:15   ` David Miller
@ 2007-03-12 21:26     ` Stephen Hemminger
  2007-03-12 21:33       ` David Miller
  2007-03-13 20:09       ` Andi Kleen
  0 siblings, 2 replies; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-12 21:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Mon, 12 Mar 2007 14:15:50 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <shemminger@linux-foundation.org>
> Date: Mon, 12 Mar 2007 14:08:21 -0700
> 
> > The TCP statistics shouldn't be located in the middle of the
> > read_mostly section surrounded by sysctl values.
> > Move EXPORT_SYMBOL next to data like other declarations near by.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> 
> Stephen, they are pointers to the statistics, not the statistic
> counters themselves.  We're marking the _pointers_ as __read_mostly
> here.
> 
> When Eric originally submitted the change to add __read_mostly here I
> didn't understand it either.
> 
> Look at the definition of DEFINE_SNMP_STAT().

Okay, that's confusing. And maybe the comment suggests future work:



/* 
 * FIXME: On x86 and some other CPUs the split into user and softirq parts
 * is not needed because addl $1,memory is atomic against interrupts (but 
 * atomic_inc would be overkill because of the lock cycles). Wants new 
 * nonlocked_atomic_inc() primitives -AK
 */ 
-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 4/4] tcp: statistics not read_mostly
  2007-03-12 21:26     ` Stephen Hemminger
@ 2007-03-12 21:33       ` David Miller
  2007-03-13 20:09       ` Andi Kleen
  1 sibling, 0 replies; 23+ messages in thread
From: David Miller @ 2007-03-12 21:33 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 12 Mar 2007 14:26:47 -0700

> On Mon, 12 Mar 2007 14:15:50 -0700 (PDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > Look at the definition of DEFINE_SNMP_STAT().
> 
> Okay, that's confusing. And maybe the comment suggests future work:

Indeed.

> /* 
>  * FIXME: On x86 and some other CPUs the split into user and softirq parts
>  * is not needed because addl $1,memory is atomic against interrupts (but 
>  * atomic_inc would be overkill because of the lock cycles). Wants new 
>  * nonlocked_atomic_inc() primitives -AK
>  */ 

Yep.  And another interesting case are straight loads and stores of
per-cpu values on platforms that have a PDA'ish thing like x86_64 and
sparc64.  The latter has the per-cpu base in a register so in several
cases the usual preemption protection simply does not matter and we
could optimize them away.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
@ 2007-03-12 21:34   ` David Miller
  2007-03-13  5:37   ` Eric Dumazet
  2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
  2 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2007-03-12 21:34 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 12 Mar 2007 14:08:18 -0700

> For Eric, mark packet type and network device watermarks
> as read mostly.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 2/4] net: make seq_operations const
  2007-03-12 21:08 ` [patch 2/4] net: make seq_operations const Stephen Hemminger
@ 2007-03-12 21:34   ` David Miller
  0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2007-03-12 21:34 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 12 Mar 2007 14:08:19 -0700

> The seq_file operations stuff can be marked constant to
> get it out of dirty cache.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 3/4] net: show bound packet types
  2007-03-12 21:08 ` [patch 3/4] net: show bound packet types Stephen Hemminger
@ 2007-03-12 21:35   ` David Miller
  0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2007-03-12 21:35 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Mon, 12 Mar 2007 14:08:20 -0700

> Show what protocols are bound to what packet types in /proc/net/ptype
> Uses kallsyms to decode function pointers if possible.
> Example:
> 	Type Device      Function
> 	ALL  eth1     packet_rcv_spkt+0x0
> 	0800          ip_rcv+0x0
> 	0806          arp_rcv+0x0
> 	86dd          :ipv6:ipv6_rcv+0x0
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

Applied, thanks for adding the kallsyms bits.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
  2007-03-12 21:34   ` David Miller
@ 2007-03-13  5:37   ` Eric Dumazet
  2007-03-13 21:26     ` [RFC] Get rid of netdev_nit Stephen Hemminger
  2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
  2 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2007-03-13  5:37 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> For Eric, mark packet type and network device watermarks
> as read mostly.

;)

>  
>  static DEFINE_SPINLOCK(ptype_lock);
> -static struct list_head ptype_base[16];	/* 16 way hashed list */
> -static struct list_head ptype_all;		/* Taps */
> +static struct list_head ptype_base[16] __read_mostly;	/* 16 way hashed list */
> +static struct list_head ptype_all __read_mostly;	/* Taps */


what about netdev_nit ?


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 4/4] tcp: statistics not read_mostly
  2007-03-12 21:26     ` Stephen Hemminger
  2007-03-12 21:33       ` David Miller
@ 2007-03-13 20:09       ` Andi Kleen
  1 sibling, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2007-03-13 20:09 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger <shemminger@linux-foundation.org> writes:
> 
> /* 
>  * FIXME: On x86 and some other CPUs the split into user and softirq parts
>  * is not needed because addl $1,memory is atomic against interrupts (but 
>  * atomic_inc would be overkill because of the lock cycles). Wants new 
>  * nonlocked_atomic_inc() primitives -AK
>  */ 

That exists now as local_t.  And in fact the generic (non x86) local_t
is implemented in the same way as the current network statistics
(although I'm not convinced that's the best portable way to do this)

-Andi

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [RFC] Get rid of netdev_nit
  2007-03-13  5:37   ` Eric Dumazet
@ 2007-03-13 21:26     ` Stephen Hemminger
  2007-04-21  0:02       ` David Miller
  0 siblings, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-13 21:26 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev

It isn't any faster to test a boolean global variable than do a 
simple check for empty list.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
---
 net/core/dev.c |   18 +++++-------------
 1 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3a8590c..f2ae2c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,12 +226,6 @@ #endif
 *******************************************************************************/
 
 /*
- *	For efficiency
- */
-
-static int netdev_nit;
-
-/*
  *	Add a protocol ID to the list. Now that the input handler is
  *	smarter we can dispense with all the messy stuff that used to be
  *	here.
@@ -265,10 +259,9 @@ void dev_add_pack(struct packet_type *pt
 	int hash;
 
 	spin_lock_bh(&ptype_lock);
-	if (pt->type == htons(ETH_P_ALL)) {
-		netdev_nit++;
+	if (pt->type == htons(ETH_P_ALL))
 		list_add_rcu(&pt->list, &ptype_all);
-	} else {
+	else {
 		hash = ntohs(pt->type) & 15;
 		list_add_rcu(&pt->list, &ptype_base[hash]);
 	}
@@ -295,10 +288,9 @@ void __dev_remove_pack(struct packet_typ
 
 	spin_lock_bh(&ptype_lock);
 
-	if (pt->type == htons(ETH_P_ALL)) {
-		netdev_nit--;
+	if (pt->type == htons(ETH_P_ALL))
 		head = &ptype_all;
-	} else
+	else
 		head = &ptype_base[ntohs(pt->type) & 15];
 
 	list_for_each_entry(pt1, head, list) {
@@ -1333,7 +1325,7 @@ static int dev_gso_segment(struct sk_buf
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	if (likely(!skb->next)) {
-		if (netdev_nit)
+		if (!list_empty(&ptype_all))
 			dev_queue_xmit_nit(skb, dev);
 
 		if (netif_needs_gso(dev, skb)) {
-- 
1.4.1


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
  2007-03-12 21:34   ` David Miller
  2007-03-13  5:37   ` Eric Dumazet
@ 2007-03-15  2:18   ` Benjamin LaHaise
  2007-03-15  4:54     ` David Miller
                       ` (2 more replies)
  2 siblings, 3 replies; 23+ messages in thread
From: Benjamin LaHaise @ 2007-03-15  2:18 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

On Mon, Mar 12, 2007 at 02:08:18PM -0700, Stephen Hemminger wrote:
> For Eric, mark packet type and network device watermarks
> as read mostly.

The following x86-64 bits might be interesting, as they allow you to 
completely eliminate the memory access for run time defined constants.  
Note that read_always writes are non-atomic, so some other form of 
protection is necessary for readers (and rcu won't cut it).  That can be 
fixed somewhat by specifying the alignment for the mov instruction to 
ensure writes are atomic, but for many uses that is overkill.  This kind 
of change can make the biggest difference for high-latency cases, like L1 
cache misses on the Prescott P4.  I've not benched it on a P4 of late, 
though.

		-ben


diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 5f197b0..022ee38 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -70,6 +70,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
 	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
 
+	init_read_always();
+
  	for (i = 0; i < NR_CPUS; i++)
  		cpu_pda(i) = &boot_cpu_pda[i];
 
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index b73212c..19852dc 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -50,6 +50,10 @@ SECTIONS
   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
   __stop___ex_table = .;
 
+  start__read_always = .;
+  __read_always : AT(ADDR(__read_always) - LOAD_OFFSET) { *(__read_always) }
+  stop__read_always = .;
+
   RODATA
 
   BUG_TABLE
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada723..d48415e 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -31,6 +31,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm/kdebug.h>
+#include <asm/read_always.h>
 #include <asm-generic/sections.h>
 
 /* Page fault error code bits */
@@ -41,11 +42,67 @@
 #define PF_INSTR	(1<<4)
 
 static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+static DEFINE_READ_ALWAYS(char, notify_page_fault_active);
+static DEFINE_READ_ALWAYS(char, page_fault_trace);
+
+void init_read_always(void)
+{
+	extern unsigned int start__read_always[], stop__read_always[];
+	unsigned int *fixup;
+
+	fixup = start__read_always;
+	while (fixup < stop__read_always) {
+		void *where = (void *)(fixup[0] - 0x100000000L);
+		void *which = (void *)(fixup[1] - 0x100000000L);
+		long size = fixup[2];
+		fixup += 3;
+
+		switch (size) {
+		case 1:		*(u8 *)where = *(u8 *)which;	break;
+		case 2:		*(u16 *)where = *(u16 *)which;	break;
+		case 4:		*(u32 *)where = *(u32 *)which;	break;
+		case 8:		*(u64 *)where = *(u64 *)which;	break;
+		}
+	}
+}
+
+void set_read_always_size(void *ptr, long val, int size)
+{
+	extern unsigned int start__read_always[], stop__read_always[];
+	unsigned int *fixup;
+
+	switch(size) {
+	case 1:	*(u8 *)ptr = val;	break;
+	case 2:	*(u16 *)ptr = val;	break;
+	case 4:	*(u32 *)ptr = val;	break;
+	case 8:	*(u64 *)ptr = val;	break;
+	}
+
+	fixup = start__read_always;
+	while (fixup < stop__read_always) {
+		void *where = (void *)(fixup[0] - 0x100000000L);
+		void *which = (void *)(fixup[1] - 0x100000000L);
+		long actual_size = fixup[2];
+		fixup += 3;
+
+		if (which != ptr)
+			continue;
+
+		BUG_ON(size != actual_size);
+		switch(size) {
+		case 1:	*(u8 *)where = val;	break;
+		case 2:	*(u16 *)where = val;	break;
+		case 4:	*(u32 *)where = val;	break;
+		case 8:	*(u64 *)where = val;	break;
+		}
+	}
+}
 
 /* Hook to register for page fault notifications */
 int register_page_fault_notifier(struct notifier_block *nb)
 {
 	vmalloc_sync_all();
+	set_read_always(notify_page_fault_active, 1);
 	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_page_fault_notifier);
@@ -56,7 +113,7 @@ int unregister_page_fault_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
 
-static inline int notify_page_fault(struct pt_regs *regs, long err)
+static int notify_page_fault(struct pt_regs *regs, long err)
 {
 	struct die_args args = {
 		.regs = regs,
@@ -301,7 +358,6 @@ static int vmalloc_fault(unsigned long address)
 	return 0;
 }
 
-int page_fault_trace = 0;
 int exception_trace = 1;
 
 /*
@@ -355,7 +411,8 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 			if (vmalloc_fault(address) >= 0)
 				return;
 		}
-		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+		if (read_always(notify_page_fault_active) &&
+		    notify_page_fault(regs, error_code) == NOTIFY_STOP)
 			return;
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
@@ -364,13 +421,14 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+	if (read_always(notify_page_fault_active) &&
+	    notify_page_fault(regs, error_code) == NOTIFY_STOP)
 		return;
 
 	if (likely(regs->eflags & X86_EFLAGS_IF))
 		local_irq_enable();
 
-	if (unlikely(page_fault_trace))
+	if (unlikely(read_always(page_fault_trace)))
 		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
 		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); 
 
@@ -628,7 +686,7 @@ void vmalloc_sync_all(void)
 
 static int __init enable_pagefaulttrace(char *str)
 {
-	page_fault_trace = 1;
+	set_read_always(page_fault_trace, 1);
 	return 1;
 }
 __setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/fs/exec.c b/fs/exec.c
index 7e36c6f..018ac4a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -65,7 +65,7 @@ int suid_dumpable = 0;
 EXPORT_SYMBOL(suid_dumpable);
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
-static struct linux_binfmt *formats;
+static DEFINE_READ_ALWAYS(struct linux_binfmt *, formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
 int register_binfmt(struct linux_binfmt * fmt)
@@ -85,7 +85,7 @@ int register_binfmt(struct linux_binfmt * fmt)
 		tmp = &(*tmp)->next;
 	}
 	fmt->next = formats;
-	formats = fmt;
+	set_read_always(formats, fmt);
 	write_unlock(&binfmt_lock);
 	return 0;	
 }
@@ -100,6 +100,8 @@ int unregister_binfmt(struct linux_binfmt * fmt)
 	while (*tmp) {
 		if (fmt == *tmp) {
 			*tmp = fmt->next;
+			if (tmp == &formats)
+				set_read_always(formats, fmt->next);
 			write_unlock(&binfmt_lock);
 			return 0;
 		}
@@ -150,7 +152,7 @@ asmlinkage long sys_uselib(const char __user * library)
 		struct linux_binfmt * fmt;
 
 		read_lock(&binfmt_lock);
-		for (fmt = formats ; fmt ; fmt = fmt->next) {
+		for (fmt = read_always(formats); fmt ; fmt = fmt->next) {
 			if (!fmt->load_shlib)
 				continue;
 			if (!try_module_get(fmt->module))
@@ -1068,7 +1070,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = -ENOENT;
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
-		for (fmt = formats ; fmt ; fmt = fmt->next) {
+		for (fmt = read_always(formats) ; fmt ; fmt = fmt->next) {
 			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
 			if (!fn)
 				continue;
diff --git a/include/asm-x86_64/read_always.h b/include/asm-x86_64/read_always.h
index 166b2a2..389e6e6 100644
--- a/include/asm-x86_64/read_always.h
+++ b/include/asm-x86_64/read_always.h
@@ -1 +1,60 @@
-#include <asm-generic/read_always.h>
+#ifndef __ASM__READ_ALWAYS_H
+#define __ASM__READ_ALWAYS_H
+
+#ifdef MODULE
+/* FIXME: making modules implement this optimization requires more work. */
+#define DEFINE_READ_ALWAYS(type, var)	type var
+#define read_always(var)		(var)
+
+#else
+
+#define DEFINE_READ_ALWAYS(type, var)					\
+	type var;							\
+	extern inline type read_always_##var(void)			\
+	{								\
+		extern void __size_is_unsupported(void) __attribute__((noreturn));		\
+		type ret;						\
+		switch (sizeof(ret)) {					\
+		case 1:							\
+			__asm__ __volatile__(				\
+				"movb $0x12,%0\n1:\n"			\
+				".section __read_always,\"a\"\n"	\
+				"	.long	1b-1+0x100000000\n"	\
+				"	.long	" #var "+0x100000000\n"	\
+				"	.long	1\n"			\
+				".previous\n"				\
+				: "=r" (ret));				\
+			break;						\
+		case 4:							\
+			__asm__ __volatile__(				\
+				"movl $0x12345678,%0\n1:\n"		\
+				".section __read_always,\"a\"\n"	\
+				"	.long	1b-4+0x100000000\n"	\
+				"	.long	" #var "+0x100000000\n"	\
+				"	.long	1\n"			\
+				".previous\n"				\
+				: "=r" (ret));				\
+			break;						\
+		case 8:							\
+			__asm__ __volatile__(				\
+				"movq $0x123456789abcdef0,%0\n1:\n"	\
+				".section __read_always,\"a\"\n"	\
+				"	.long	1b-8+0x100000000\n"	\
+				"	.long	" #var "+0x100000000\n"	\
+				"	.long	8\n"			\
+				".previous\n"				\
+				: "=r" (ret));				\
+			break;						\
+		default:						\
+			__size_is_unsupported();			\
+		}							\
+		return ret;						\
+	}
+
+#define read_always(var)	read_always_##var()
+#endif
+
+#define set_read_always(var, val) set_read_always_size(&var, (long)(val), sizeof var)
+extern void set_read_always_size(void *var, long val, int size);
+
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 60e0e4a..f0c3908 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -114,7 +114,9 @@ struct vm_area_struct {
 #endif
 };
 
-extern struct kmem_cache *vm_area_cachep;
+#include <asm/read_always.h>
+extern DEFINE_READ_ALWAYS(struct kmem_cache *, __vm_area_cachep);
+#define vm_area_cachep	read_always(__vm_area_cachep)
 
 /*
  * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ee9e314..629ce04 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -407,8 +407,10 @@ struct node_active_region {
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 #ifndef CONFIG_DISCONTIGMEM
+#include <asm/read_always.h>
 /* The array of struct pages - for discontigmem use pgdat->lmem_map */
-extern struct page *mem_map;
+extern DEFINE_READ_ALWAYS(struct page *, __mem_map);
+#define mem_map		read_always(__mem_map)
 #endif
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index d154cc7..c7d8b7f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -81,13 +81,13 @@ int nr_processes(void)
 }
 
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static struct kmem_cache *task_struct_cachep;
+# define alloc_task_struct()	kmem_cache_alloc(read_always(task_struct_cachep), GFP_KERNEL)
+# define free_task_struct(tsk)	kmem_cache_free(read_always(task_struct_cachep), (tsk))
+static DEFINE_READ_ALWAYS(struct kmem_cache *, task_struct_cachep);
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-static struct kmem_cache *signal_cachep;
+static DEFINE_READ_ALWAYS(struct kmem_cache *, signal_cachep);
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
 struct kmem_cache *sighand_cachep;
@@ -99,10 +99,10 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+struct kmem_cache *__vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
-static struct kmem_cache *mm_cachep;
+static DEFINE_READ_ALWAYS(struct kmem_cache *, mm_cachep);
 
 void free_task(struct task_struct *tsk)
 {
@@ -134,9 +134,9 @@ void __init fork_init(unsigned long mempages)
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
-	task_struct_cachep =
+	set_read_always(task_struct_cachep,
 		kmem_cache_create("task_struct", sizeof(struct task_struct),
-			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
+			ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL));
 #endif
 
 	/*
@@ -320,8 +320,8 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+#define allocate_mm()	(kmem_cache_alloc(read_always(mm_cachep), GFP_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(read_always(mm_cachep), (mm)))
 
 #include <linux/init_task.h>
 
@@ -836,14 +836,14 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 		atomic_inc(&current->signal->live);
 		return 0;
 	}
-	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+	sig = kmem_cache_alloc(read_always(signal_cachep), GFP_KERNEL);
 	tsk->signal = sig;
 	if (!sig)
 		return -ENOMEM;
 
 	ret = copy_thread_group_keys(tsk);
 	if (ret < 0) {
-		kmem_cache_free(signal_cachep, sig);
+		kmem_cache_free(read_always(signal_cachep), sig);
 		return ret;
 	}
 
@@ -900,7 +900,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
-	kmem_cache_free(signal_cachep, sig);
+	kmem_cache_free(read_always(signal_cachep), sig);
 }
 
 static inline void cleanup_signal(struct task_struct *tsk)
@@ -1434,21 +1434,21 @@ void __init proc_caches_init(void)
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU,
 			sighand_ctor, NULL);
-	signal_cachep = kmem_cache_create("signal_cache",
+	set_read_always(signal_cachep, kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL));
 	files_cachep = kmem_cache_create("files_cache", 
 			sizeof(struct files_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 	fs_cachep = kmem_cache_create("fs_cache", 
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
-	vm_area_cachep = kmem_cache_create("vm_area_struct",
+	set_read_always(__vm_area_cachep, kmem_cache_create("vm_area_struct",
 			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL, NULL);
-	mm_cachep = kmem_cache_create("mm_struct",
+			SLAB_PANIC, NULL, NULL));
+	set_read_always(mm_cachep, kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL));
 }
 
 
diff --git a/mm/memory.c b/mm/memory.c
index e7066e7..36c062e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,12 +61,13 @@
 #include <linux/elf.h>
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
+#include <asm/read_always.h>
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
-struct page *mem_map;
+struct page *__mem_map;
 
 EXPORT_SYMBOL(max_mapnr);
-EXPORT_SYMBOL(mem_map);
+EXPORT_SYMBOL(__mem_map);
 #endif
 
 unsigned long num_physpages;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 353ce90..b312e5c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2718,11 +2718,16 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
 	if (pgdat == NODE_DATA(0)) {
-		mem_map = NODE_DATA(0)->node_mem_map;
+		struct page *map = NODE_DATA(0)->node_mem_map;
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
-		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-			mem_map -= pgdat->node_start_pfn;
+		if (page_to_pfn(map) != pgdat->node_start_pfn)
+			map -= pgdat->node_start_pfn;
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+#ifdef mem_map
+		set_read_always(__mem_map, map);
+#else
+		mem_map = map;
+#endif
 	}
 #endif
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
diff --git a/net/core/dev.c b/net/core/dev.c
index cf71614..e3975cf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -229,7 +229,7 @@ extern void netdev_unregister_sysfs(struct net_device *);
  *	For efficiency
  */
 
-static int netdev_nit;
+static DEFINE_READ_ALWAYS(int, netdev_nit);
 
 /*
  *	Add a protocol ID to the list. Now that the input handler is
@@ -266,7 +266,7 @@ void dev_add_pack(struct packet_type *pt)
 
 	spin_lock_bh(&ptype_lock);
 	if (pt->type == htons(ETH_P_ALL)) {
-		netdev_nit++;
+		set_read_always(netdev_nit, netdev_nit + 1);
 		list_add_rcu(&pt->list, &ptype_all);
 	} else {
 		hash = ntohs(pt->type) & 15;
@@ -296,7 +296,7 @@ void __dev_remove_pack(struct packet_type *pt)
 	spin_lock_bh(&ptype_lock);
 
 	if (pt->type == htons(ETH_P_ALL)) {
-		netdev_nit--;
+		set_read_always(netdev_nit, netdev_nit - 1);
 		head = &ptype_all;
 	} else
 		head = &ptype_base[ntohs(pt->type) & 15];
@@ -1343,7 +1343,7 @@ static int dev_gso_segment(struct sk_buff *skb)
 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	if (likely(!skb->next)) {
-		if (netdev_nit)
+		if (read_always(netdev_nit))
 			dev_queue_xmit_nit(skb, dev);
 
 		if (netif_needs_gso(dev, skb)) {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 820761f..2595a97 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -64,11 +64,14 @@
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
+#include <asm/read_always.h>
 
 #include "kmap_skb.h"
 
-static struct kmem_cache *skbuff_head_cache __read_mostly;
-static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+static DEFINE_READ_ALWAYS(struct kmem_cache *, __skbuff_head_cache);
+#define skbuff_head_cache	read_always(__skbuff_head_cache)
+static DEFINE_READ_ALWAYS(struct kmem_cache *, __skbuff_fclone_cache);
+#define skbuff_fclone_cache	read_always(__skbuff_fclone_cache)
 
 /*
  *	Keep out-of-line to prevent kernel bloat.
@@ -2046,17 +2049,19 @@ EXPORT_SYMBOL_GPL(skb_segment);
 
 void __init skb_init(void)
 {
-	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+	set_read_always(__skbuff_head_cache, kmem_cache_create(
+					      "skbuff_head_cache",
 					      sizeof(struct sk_buff),
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					      NULL, NULL);
-	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+					      NULL, NULL));
+	set_read_always(__skbuff_fclone_cache, kmem_cache_create(
+						"skbuff_fclone_cache",
 						(2*sizeof(struct sk_buff)) +
 						sizeof(atomic_t),
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-						NULL, NULL);
+						NULL, NULL));
 }
 
 EXPORT_SYMBOL(___pskb_trim);

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
@ 2007-03-15  4:54     ` David Miller
  2007-03-15  6:28     ` Eric Dumazet
  2007-03-15 15:10     ` Andi Kleen
  2 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2007-03-15  4:54 UTC (permalink / raw)
  To: bcrl; +Cc: shemminger, netdev

From: Benjamin LaHaise <bcrl@kvack.org>
Date: Wed, 14 Mar 2007 22:18:40 -0400

> On Mon, Mar 12, 2007 at 02:08:18PM -0700, Stephen Hemminger wrote:
> > For Eric, mark packet type and network device watermarks
> > as read mostly.
> 
> The following x86-64 bits might be interesting, as they allow you to 
> completely eliminate the memory access for run time defined constants.  
> Note that read_always writes are non-atomic, so some other form of 
> protection is necessary for readers (and rcu won't cut it).  That can be 
> fixed somewhat by specifying the alignment for the mov instruction to 
> ensure writes are atomic, but for many uses that is overkill.  This kind 
> of change can make the biggest difference for high-latency cases, like L1 
> cache misses on the Prescott P4.  I've not benched it on a P4 of late, 
> though.

That's a really cool idea.

I think for this kind of stuff, however, we should just use ELF
relocations to handle this.  Don't resolve the symbols, record
references to them, resolve them to 0 in the image link, and make the
set_foo() routine walk the relocations and fill them out with the new
value.

We already have to have code to do this for modules.

Sparc32 already does something like this with it's btfixup mechanism,
it's the same kind of idea.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
  2007-03-15  4:54     ` David Miller
@ 2007-03-15  6:28     ` Eric Dumazet
  2007-03-15  7:25       ` David Miller
  2007-03-15 15:10     ` Andi Kleen
  2 siblings, 1 reply; 23+ messages in thread
From: Eric Dumazet @ 2007-03-15  6:28 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Stephen Hemminger, David Miller, netdev

Benjamin LaHaise a écrit :
> On Mon, Mar 12, 2007 at 02:08:18PM -0700, Stephen Hemminger wrote:
>> For Eric, mark packet type and network device watermarks
>> as read mostly.
> 
> The following x86-64 bits might be interesting, as they allow you to 
> completely eliminate the memory access for run time defined constants.  
> Note that read_always writes are non-atomic, so some other form of 
> protection is necessary for readers (and rcu won't cut it).  That can be 
> fixed somewhat by specifying the alignment for the mov instruction to 
> ensure writes are atomic, but for many uses that is overkill.  This kind 
> of change can make the biggest difference for high-latency cases, like L1 
> cache misses on the Prescott P4.  I've not benched it on a P4 of late, 
> though.
> 

Very very nice idea Ben !

However netdev_nit is not a good candidate because it might change quite often 
in fact :(

Clearly kmem_cache pointers are very good candidates.

One problem with your patch is that all read_always() of pointers are going to 
use 3 bytes more of code, thus raising icache pressure.

48 b8 c3 08 e8 8c af    mov    $0x71af8ce808c3,%rax
71 00 00

instead of %rip relative addressing

48 8b 05 99 f3 09 00    mov    652185(%rip),%rax



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  6:28     ` Eric Dumazet
@ 2007-03-15  7:25       ` David Miller
  2007-03-15  7:42         ` Eric Dumazet
  2007-03-15 13:17         ` Benjamin LaHaise
  0 siblings, 2 replies; 23+ messages in thread
From: David Miller @ 2007-03-15  7:25 UTC (permalink / raw)
  To: dada1; +Cc: bcrl, shemminger, netdev

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Thu, 15 Mar 2007 07:28:35 +0100

> One problem with your patch is that all read_always() of pointers are going to 
> use 3 bytes more of code, thus raising icache pressure.
> 
> 48 b8 c3 08 e8 8c af    mov    $0x71af8ce808c3,%rax
> 71 00 00
> 
> instead of %rip relative addressing
> 
> 48 8b 05 99 f3 09 00    mov    652185(%rip),%rax

Could we obtain %rip relative addressing with the ELF
relocation approach I mentioned?

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  7:25       ` David Miller
@ 2007-03-15  7:42         ` Eric Dumazet
  2007-03-15 13:17         ` Benjamin LaHaise
  1 sibling, 0 replies; 23+ messages in thread
From: Eric Dumazet @ 2007-03-15  7:42 UTC (permalink / raw)
  To: David Miller; +Cc: bcrl, shemminger, netdev

David Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Thu, 15 Mar 2007 07:28:35 +0100
> 
>> One problem with your patch is that all read_always() of pointers are going to 
>> use 3 bytes more of code, thus raising icache pressure.
>>
>> 48 b8 c3 08 e8 8c af    mov    $0x71af8ce808c3,%rax
>> 71 00 00
>>
>> instead of %rip relative addressing
>>
>> 48 8b 05 99 f3 09 00    mov    652185(%rip),%rax
> 
> Could we obtain %rip relative addressing with the ELF
> relocation approach I mentioned?

I don't think so, because 32 bits relative to %rip is not enough to cover all 
the addresses that a 64bits kernel can use :)

%rip relative addressing works only because all the x86_64 kernel (text+ 
static/bss data) are in the 0xffffffff80000000 - fffffffffff00000  2^31 quadrant

but vmalloc() range for example  is ffffc20000000000 - ffffe1ffffffffff (45 bits)



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  7:25       ` David Miller
  2007-03-15  7:42         ` Eric Dumazet
@ 2007-03-15 13:17         ` Benjamin LaHaise
  2007-03-16 17:03           ` Stephen Hemminger
  1 sibling, 1 reply; 23+ messages in thread
From: Benjamin LaHaise @ 2007-03-15 13:17 UTC (permalink / raw)
  To: David Miller; +Cc: dada1, shemminger, netdev

On Thu, Mar 15, 2007 at 12:25:16AM -0700, David Miller wrote:
> Could we obtain %rip relative addressing with the ELF
> relocation approach I mentioned?

I think we can for some of the objects -- things like slab caches are 
good candidates if we have the initialization done at init time, which 
would actually be a very cool way of getting rid of the static slab 
creation calls.  I'll cook something better up on the slab front.

As for other variables, many can't be rip relative as they often end up 
pointing to addresses outside of the +/-2GB range we have there.

The main reason I came up with this was from looking at the various 
kprobes and notifier chain overhead in common code paths.  In many 
instances we need a single byte flag showing the feature is in use to 
jump out of the hot path.

Then there are the selinux hooks....

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <zyntrop@kvack.org>.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
  2007-03-15  4:54     ` David Miller
  2007-03-15  6:28     ` Eric Dumazet
@ 2007-03-15 15:10     ` Andi Kleen
  2 siblings, 0 replies; 23+ messages in thread
From: Andi Kleen @ 2007-03-15 15:10 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: Stephen Hemminger, David Miller, netdev

Benjamin LaHaise <bcrl@kvack.org> writes:

> On Mon, Mar 12, 2007 at 02:08:18PM -0700, Stephen Hemminger wrote:
> > For Eric, mark packet type and network device watermarks
> > as read mostly.
> 
> The following x86-64 bits might be interesting, as they allow you to 
> completely eliminate the memory access for run time defined constants.  

I like the concept, although the patch could still need a few cleanups.
But it would be useful in a couple of cases.

Disadvantage: it will likely lead to even more weird sysctls and hooks
because one of the best argument against them will be gone then @)

-Andi


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [patch 1/4] network dev read_mostly
  2007-03-15 13:17         ` Benjamin LaHaise
@ 2007-03-16 17:03           ` Stephen Hemminger
  0 siblings, 0 replies; 23+ messages in thread
From: Stephen Hemminger @ 2007-03-16 17:03 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: David Miller, dada1, netdev

On Thu, 15 Mar 2007 09:17:22 -0400
Benjamin LaHaise <bcrl@kvack.org> wrote:

> On Thu, Mar 15, 2007 at 12:25:16AM -0700, David Miller wrote:
> > Could we obtain %rip relative addressing with the ELF
> > relocation approach I mentioned?
> 
> I think we can for some of the objects -- things like slab caches are 
> good candidates if we have the initialization done at init time, which 
> would actually be a very cool way of getting rid of the static slab 
> creation calls.  I'll cook something better up on the slab front.
> 
> As for other variables, many can't be rip relative as they often end up 
> pointing to addresses outside of the +/-2GB range we have there.
> 
> The main reason I came up with this was from looking at the various 
> kprobes and notifier chain overhead in common code paths.  In many 
> instances we need a single byte flag showing the feature is in use to 
> jump out of the hot path.
> 
> Then there are the selinux hooks....
> 
> 		-ben

There is an ugliness and maintenance vs performance tradeoff here.
The described implementation leaves me gagging. Is there some way
to do the same thing with ELF sections and post build processing?


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [RFC] Get rid of netdev_nit
  2007-03-13 21:26     ` [RFC] Get rid of netdev_nit Stephen Hemminger
@ 2007-04-21  0:02       ` David Miller
  0 siblings, 0 replies; 23+ messages in thread
From: David Miller @ 2007-04-21  0:02 UTC (permalink / raw)
  To: shemminger; +Cc: dada1, netdev

From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 13 Mar 2007 14:26:35 -0700

> It isn't any faster to test a boolean global variable than do a 
> simple check for empty list.
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>

I'll apply this, thanks Stephen.

I think the history is that long ago we used to export
netdev_nit and we didn't want to export the details of
the list implementation too.

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2007-04-21  0:02 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-12 21:08 [patch 0/4] more stuff for 2.6.22 Stephen Hemminger
2007-03-12 21:08 ` [patch 1/4] network dev read_mostly Stephen Hemminger
2007-03-12 21:34   ` David Miller
2007-03-13  5:37   ` Eric Dumazet
2007-03-13 21:26     ` [RFC] Get rid of netdev_nit Stephen Hemminger
2007-04-21  0:02       ` David Miller
2007-03-15  2:18   ` [patch 1/4] network dev read_mostly Benjamin LaHaise
2007-03-15  4:54     ` David Miller
2007-03-15  6:28     ` Eric Dumazet
2007-03-15  7:25       ` David Miller
2007-03-15  7:42         ` Eric Dumazet
2007-03-15 13:17         ` Benjamin LaHaise
2007-03-16 17:03           ` Stephen Hemminger
2007-03-15 15:10     ` Andi Kleen
2007-03-12 21:08 ` [patch 2/4] net: make seq_operations const Stephen Hemminger
2007-03-12 21:34   ` David Miller
2007-03-12 21:08 ` [patch 3/4] net: show bound packet types Stephen Hemminger
2007-03-12 21:35   ` David Miller
2007-03-12 21:08 ` [patch 4/4] tcp: statistics not read_mostly Stephen Hemminger
2007-03-12 21:15   ` David Miller
2007-03-12 21:26     ` Stephen Hemminger
2007-03-12 21:33       ` David Miller
2007-03-13 20:09       ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).