Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH v2 3/5] netdev: add tracepoints to netdev layer
From: Koki Sanagi @ 2010-06-24  8:18 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch adds tracepoints to dev_queue_xmit, dev_hard_start_xmit and
netif_receive_skb. These tracepoints help you monitor a network driver's
input/output.

            sshd-4445  [001] 241367.066046: net_dev_queue: dev=eth3 skbaddr=dd6b2538 len=114
            sshd-4445  [001] 241367.066047: net_dev_xmit: dev=eth3 skbaddr=dd6b2538 len=114 rc=0
          <idle>-0     [001] 241367.067472: net_dev_receive: dev=eth3 skbaddr=f5e59000 len=52

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 include/trace/events/net.h |   83 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c             |    5 +++
 net/core/net-traces.c      |    1 +
 3 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
new file mode 100644
index 0000000..ee10970
--- /dev/null
+++ b/include/trace/events/net.h
@@ -0,0 +1,83 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM net
+
+#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_H
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(net_dev_xmit,
+
+	TP_PROTO(struct sk_buff *skb,
+		 int rc),
+
+	TP_ARGS(skb, rc),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__field(	int,		rc		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__entry->rc = rc;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
+);
+
+TRACE_EVENT(net_dev_queue,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+
+TRACE_EVENT(net_dev_receive,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned int,	len		)
+		__string(	name,		skb->dev->name	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->len = skb->len;
+		__assign_str(name, skb->dev->name);
+	),
+
+	TP_printk("dev=%s skbaddr=%p len=%u",
+		__get_str(name), __entry->skbaddr, __entry->len)
+);
+#endif /* _TRACE_NET_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/core/dev.c b/net/core/dev.c
index 5902426..4b64b21 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -130,6 +130,7 @@
 #include <linux/jhash.h>
 #include <linux/random.h>
 #include <trace/events/napi.h>
+#include <trace/events/net.h>
 #include <linux/pci.h>
 
 #include "net-sysfs.h"
@@ -1922,6 +1923,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		rc = ops->ndo_start_xmit(skb, dev);
+		trace_net_dev_xmit(skb, rc);
 		if (rc == NETDEV_TX_OK)
 			txq_trans_update(txq);
 		return rc;
@@ -1942,6 +1944,7 @@ gso:
 			skb_dst_drop(nskb);
 
 		rc = ops->ndo_start_xmit(nskb, dev);
+		trace_net_dev_xmit(nskb, rc);
 		if (unlikely(rc != NETDEV_TX_OK)) {
 			if (rc & ~NETDEV_TX_MASK)
 				goto out_kfree_gso_skb;
@@ -2156,6 +2159,7 @@ int dev_queue_xmit(struct sk_buff *skb)
 	}
 
 gso:
+	trace_net_dev_queue(skb);
 	/* Disable soft irqs for various locks below. Also
 	 * stops preemption for RCU.
 	 */
@@ -2942,6 +2946,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
+	trace_net_dev_receive(skb);
 #ifdef CONFIG_RPS
 	{
 		struct rps_dev_flow voidflow, *rflow = &voidflow;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index afa6380..7f1bb2a 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -26,6 +26,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/skb.h>
+#include <trace/events/net.h>
 #include <trace/events/napi.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);


^ permalink raw reply related

* [RFC PATCH v2 4/5] skb: add tracepoints to freeing skb
From: Koki Sanagi @ 2010-06-24  8:19 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This patch adds tracepoints to consume_skb and dev_kfree_skb_irq.
Combined with the tracepoint on dev_hard_start_xmit, they let us check how long
it takes to free transmitted packets. Using that, we can also calculate how many
packets the driver was holding at a given time. This is useful when dropped
transmitted packets are a problem.

          <idle>-0     [001] 241409.218333: consume_skb: skbaddr=dd6b2fb8
          <idle>-0     [001] 241409.490555: dev_kfree_skb_irq: skbaddr=f5e29840

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
include/trace/events/skb.h |   36 ++++++++++++++++++++++++++++++++++++
 net/core/dev.c             |    2 ++
 net/core/skbuff.c          |    1 +
 3 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 4b2be6d..6ab5b34 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -35,6 +35,42 @@ TRACE_EVENT(kfree_skb,
 		__entry->skbaddr, __entry->protocol, __entry->location)
 );
 
+TRACE_EVENT(consume_skb,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,	skbaddr	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+	),
+
+	TP_printk("skbaddr=%p",
+		__entry->skbaddr)
+);
+
+TRACE_EVENT(dev_kfree_skb_irq,
+
+	TP_PROTO(struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__field(	void *,	skbaddr	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+	),
+
+	TP_printk("skbaddr=%p",
+		__entry->skbaddr)
+);
+
 TRACE_EVENT(skb_copy_datagram_iovec,
 
 	TP_PROTO(const struct sk_buff *skb, int len),
diff --git a/net/core/dev.c b/net/core/dev.c
index 4b64b21..807b1ca 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@
 #include <linux/random.h>
 #include <trace/events/napi.h>
 #include <trace/events/net.h>
+#include <trace/events/skb.h>
 #include <linux/pci.h>
 
 #include "net-sysfs.h"
@@ -1580,6 +1581,7 @@ void dev_kfree_skb_irq(struct sk_buff *skb)
 		struct softnet_data *sd;
 		unsigned long flags;
 
+		trace_dev_kfree_skb_irq(skb);
 		local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
 		skb->next = sd->completion_queue;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 34432b4..a7b4036 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb)
 		smp_rmb();
 	else if (likely(!atomic_dec_and_test(&skb->users)))
 		return;
+	trace_consume_skb(skb);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);


^ permalink raw reply related

* [RFC PATCH v2 5/5] perf:add a script shows a process of packet
From: Koki Sanagi @ 2010-06-24  8:21 UTC (permalink / raw)
  To: netdev; +Cc: davem, scott.a.mcmillan, kaneshige.kenji, izumi.taku
In-Reply-To: <4C2312A8.9060903@jp.fujitsu.com>

This perf script shows a time chart of how packets are processed.
Patch 0/5 shows sample output.
To use it, install perf and record perf.data as follows.

#perf trace record netdev-times [script]

If you specify a script, perf gathers records until the script ends.
If not, you must press Ctrl-C to stop recording.

Then, to get a report from the recorded data,

#perf trace report netdev-times [options]

You can limit the output with options.
The options are listed below.

tx: show only the processing of tx packets
rx: show only the processing of rx packets
dev=: show only the processing associated with the specified device

In the future, I want to add a src/dst IP(v6) address filter option.
It is currently under consideration/construction.

For example, if you want to show the processing of received packets associated
with eth3,

#perf trace report netdev-times rx dev=eth3
79074.756672832sec cpu=1
irq_entry(+0.000000msec,irq=77:eth3)
         |------------softirq_raise(+0.001277msec)
irq_exit (+0.002278msec)     |
                             |
                      softirq_entry(+0.003562msec)
                             |
                             |---netif_receive_skb(+0.006279msec,len=100)
                             |            |
                             |   skb_copy_datagram_iovec(+0.038778msec, 2285:sshd)
                             |
                      napi_poll_exit(+0.017160msec, eth3)
                             |
                      softirq_exit(+0.018248msec)


This perf script helps us analyze the processing time of the transmit/receive
sequence.

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 tools/perf/scripts/python/bin/netdev-times-record |    7 +
 tools/perf/scripts/python/bin/netdev-times-report |    5 +
 tools/perf/scripts/python/netdev-times.py         |  495 +++++++++++++++++++++
 3 files changed, 507 insertions(+), 0 deletions(-)

diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record
new file mode 100644
index 0000000..1dfa8d5
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,7 @@
+#!/bin/bash
+perf record -c 1 -f -R -a -e net:net_dev_xmit -e net:net_dev_queue	\
+		-e net:net_dev_receive -e skb:consume_skb		\
+		-e skb:dev_kfree_skb_irq -e napi:napi_poll		\
+		-e irq:irq_handler_entry -e irq:irq_handler_exit	\
+		-e irq:softirq_entry -e irq:softirq_exit		\
+		-e irq:softirq_raise -e skb:skb_copy_datagram_iovec $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report
new file mode 100644
index 0000000..ecc8122
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: tx rx dev src dst
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
new file mode 100644
index 0000000..5e68be4
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,495 @@
+# Display process of packets and processed time.
+# It helps you to investigate networking.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+		 # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+receive_skb_list = []; # received packet list for matching
+		       # skb_copy_datagram_iovec
+
+queue_list = []; # list of packets which pass through dev_queue_xmit
+xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+free_list = [];  # list of packets which is freed
+
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+	return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+	if dev != 0 and hunk['dev'].find(dev) < 0:
+		return
+	print "%7s %5d %6d.%09dsec %12.6fmsec      %12.6fmsec" % \
+		(hunk['dev'], hunk['len'],
+		nsecs_secs(hunk['queue_t']),
+		nsecs_nsecs(hunk['queue_t']),
+		diff_msec(hunk['queue_t'], hunk['xmit_t']),
+		diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Display a process of received packets and interrupts associated with
+# a NET_RX softirq
+def print_receive(hunk):
+	show_hunk = 0
+	if 'irq_list' not in hunk.keys() \
+	or len(hunk['irq_list']) == 0:
+		return
+	irq_list = hunk['irq_list']
+	cpu = irq_list[0]['cpu']
+	base_t = irq_list[0]['irq_ent_t']
+	# check if this hunk should be showed
+	if dev != 0:
+		for i in range(len(irq_list)):
+			if irq_list[i]['name'].find(dev) >= 0:
+				show_hunk = 1
+				break
+	else:
+		show_hunk = 1
+	if show_hunk == 0:
+		return
+
+	print "%d.%09dsec cpu=%d" % \
+		(nsecs_secs(base_t), nsecs_nsecs(base_t), cpu)
+	for i in range(len(irq_list)):
+		print "irq_entry(+%fmsec,irq=%d:%s)" % \
+			(diff_msec(base_t, irq_list[i]['irq_ent_t']),
+			irq_list[i]['irq'], irq_list[i]['name'])
+
+		if 'sirq_raise_t' in irq_list[i].keys():
+			print "         |------------" \
+			      "softirq_raise(+%fmsec)" % \
+				diff_msec(base_t, irq_list[i]['sirq_raise_t'])
+
+		if 'irq_ext_t' in irq_list[i].keys():
+			print "irq_exit (+%fmsec)     |" % \
+				diff_msec(base_t, irq_list[i]['irq_ext_t'])
+
+		print "                             |"
+
+	if 'sirq_ent_t' not in hunk.keys():
+		print 'maybe softirq_entry is dropped'
+		return
+	print "                      " \
+		"softirq_entry(+%fmsec)\n" \
+		"                      " \
+		"       |" % \
+		diff_msec(base_t, hunk['sirq_ent_t'])
+	event_list = hunk['event_list']
+	for i in range(len(event_list)):
+		event = event_list[i]
+		if event['event_name'] == 'napi_poll':
+			print "                      " \
+			      "napi_poll_exit(+%fmsec, %s)" % \
+			(diff_msec(base_t, event['event_t']), event['dev'])
+			print "                      " \
+			      "       |"
+		elif 'comm' in event.keys():
+			print "                      " \
+				"       |---netif_receive_skb" \
+				"(+%fmsec,len=%d)\n" \
+				"                      " \
+				"       |            |\n" \
+				"                      " \
+				"       |   skb_copy_datagram_iovec" \
+				"(+%fmsec, %d:%s)\n" \
+				"                      " \
+				"       |" % \
+			(diff_msec(base_t, event['event_t']),
+			event['len'],
+			diff_msec(base_t, event['comm_t']),
+			event['pid'], event['comm'])
+		else:
+			print "                      " \
+				"       |---netif_receive_skb" \
+				"(+%fmsec,len=%d)\n" \
+				"                      " \
+				"       |" % \
+				(diff_msec(base_t, event['event_t']),
+					event['len'])
+
+	print "                      " \
+	      "softirq_exit(+%fmsec)\n" % \
+		 diff_msec(base_t, hunk['sirq_ext_t'])
+
+def trace_begin():
+	global show_tx
+	global show_rx
+	global dev
+
+	for i in range(len(sys.argv)):
+		if i == 0:
+			continue
+		arg = sys.argv[i]
+		if arg == 'tx':
+			show_tx = 1
+		elif arg =='rx':
+			show_rx = 1
+		elif arg.find('dev=',0, 4) >= 0:
+			dev = arg[4:]
+	if show_tx == 0  and show_rx == 0:
+		show_tx = 1
+		show_rx = 1
+
+def trace_end():
+	global show_tx
+	global show_rx
+	# order all events in time
+	all_event_list.sort(lambda a,b :cmp(a['time'], b['time']))
+	# process all events
+	for i in range(len(all_event_list)):
+		event = all_event_list[i]
+		event_name = event['event_name']
+		if event_name == 'irq__softirq_exit':
+			handle_irq_softirq_exit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['vec'])
+		elif event_name == 'irq__softirq_entry':
+			handle_irq_softirq_entry(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'],event['vec'])
+		elif event_name == 'irq__softirq_raise':
+			handle_irq_softirq_raise(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['vec'])
+		elif event_name == 'irq__irq_handler_entry':
+			handle_irq_handler_entry(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['irq'], event['name'])
+		elif event_name == 'irq__irq_handler_exit':
+			handle_irq_handler_exit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['irq'], event['ret'])
+		elif event_name == 'napi__napi_poll':
+			handle_napi_poll(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['napi'],
+				event['dev_name'])
+		elif event_name == 'net__net_dev_receive':
+			handle_net_dev_receive(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['name'])
+		elif event_name == 'skb__skb_copy_datagram_iovec':
+			handle_skb_copy_datagram_iovec(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'])
+		elif event_name == 'net__net_dev_queue':
+			handle_net_dev_queue(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['name'])
+		elif event_name == 'net__net_dev_xmit':
+			handle_net_dev_xmit(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'],
+				event['skblen'], event['rc'], event['name'])
+		elif event_name == 'skb__dev_kfree_skb_irq':
+			handle_dev_kfree_skb_irq(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'])
+		elif event_name == 'skb__consume_skb':
+			handle_consume_skb(event['event_name'],
+				event['context'], event['common_cpu'],
+				event['common_pid'], event['common_comm'],
+				event['time'], event['skbaddr'])
+	# display receive hunks
+	if show_rx == 1:
+		for i in range(len(receive_hunk_list)):
+			print_receive(receive_hunk_list[i])
+	# display transmit hunks
+	if show_tx == 1:
+		print "   dev    len      dev_queue_xmit|----------|" \
+			"dev_hard_start_xmit|-----|free_skb"
+		print "                         |             |" \
+			"                           |"
+		for i in range(len(free_list)):
+			print_transmit(free_list[i])
+
+def irq__softirq_exit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_exit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+	rec_data = {'sirq_ext_t':time}
+	if common_cpu in irq_dic.keys():
+		rec_data.update({'irq_list':irq_dic[common_cpu]})
+		del irq_dic[common_cpu]
+	if common_cpu in net_rx_dic.keys():
+		rec_data.update({
+		    'event_list':net_rx_dic[common_cpu]['event_list'],
+		    'sirq_ent_t':net_rx_dic[common_cpu]['sirq_ent_t']})
+		del net_rx_dic[common_cpu]
+	# merge information realted to a NET_RX softirq
+	receive_hunk_list.append(rec_data)
+
+def irq__softirq_entry(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_entry(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+		net_rx_dic[common_cpu] = {'event_list':[],
+					  'sirq_ent_t':time}
+
+def irq__softirq_raise(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'vec':vec}
+	all_event_list.append(event_data)
+
+def handle_irq_softirq_raise(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	vec):
+	if common_cpu not in irq_dic.keys() \
+	or len(irq_dic[common_cpu]) == 0:
+		return
+	irq = irq_dic[common_cpu].pop()
+	# put a time to prev irq on the same cpu
+	irq.update({'sirq_raise_t':time})
+	irq_dic[common_cpu].append(irq)
+
+def irq__irq_handler_entry(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	irq, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'irq':irq, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_irq_handler_entry(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	irq, name):
+	if common_cpu not in irq_dic.keys():
+		irq_dic[common_cpu] = []
+	irq_record = {'irq':irq,
+		      'name':name,
+		      'cpu':common_cpu,
+		      'irq_ent_t':time}
+	irq_dic[common_cpu].append(irq_record)
+
+def irq__irq_handler_exit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	irq, ret):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'irq':irq, 'ret':ret}
+	all_event_list.append(event_data)
+
+def handle_irq_handler_exit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	irq, ret):
+	if common_cpu not in irq_dic.keys():
+		return
+	irq_record = irq_dic[common_cpu].pop()
+	irq_record.update({'irq_ext_t':time})
+	# if an irq doesn't include NET_RX softirq, drop.
+	if 'sirq_raise_t' in irq_record.keys():
+		irq_dic[common_cpu].append(irq_record)
+
+def napi__napi_poll(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	napi, dev_name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'napi':napi, 'dev_name':dev_name}
+	all_event_list.append(event_data)
+
+def handle_napi_poll(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	napi, dev_name):
+	if common_cpu in net_rx_dic.keys():
+		event_list = net_rx_dic[common_cpu]['event_list']
+		rec_data = {'event_name':'napi_poll',
+			    'dev':dev_name,
+			    'event_t':time}
+		event_list.append(rec_data)
+
+def net__net_dev_receive(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr,skblen, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_receive(event_name, context, common_cpu,
+	ccommon_pid, common_comm, time,
+	skbaddr, skblen, name):
+	if common_cpu in net_rx_dic.keys():
+		rec_data = {'event_name':'netif_receive_skb',
+			    'event_t':time,
+			    'skbaddr':skbaddr,
+			    'len':skblen}
+		event_list = net_rx_dic[common_cpu]['event_list']
+		event_list.append(rec_data)
+		receive_skb_list.insert(0, rec_data)
+
+def skb__skb_copy_datagram_iovec(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen}
+	all_event_list.append(event_data)
+
+def handle_skb_copy_datagram_iovec(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen):
+	for i in range(len(receive_skb_list)):
+		rec_data = receive_skb_list[i]
+		if skbaddr == rec_data['skbaddr'] and \
+			'comm' not in rec_data.keys():
+			rec_data.update({'comm':common_comm,
+					 'pid':common_pid,
+					 'comm_t':time})
+			del receive_skb_list[i]
+			break
+
+def net__net_dev_queue(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_queue(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen, name):
+	skb = {'dev':name,
+	       'skbaddr':skbaddr,
+	       'len':skblen,
+	       'queue_t':time}
+	xmit_list.insert(0, skb)
+
+def net__net_dev_xmit(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr, skblen, rc, name):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr, 'skblen':skblen, 'rc':rc, 'name':name}
+	all_event_list.append(event_data)
+
+def handle_net_dev_xmit(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr, skblen, rc, name):
+	if rc == 0: # NETDEV_TX_OK
+		for i in range(len(xmit_list)):
+			skb = xmit_list[i]
+			if skb['skbaddr'] == skbaddr:
+				skb['xmit_t'] = time
+				queue_list.insert(0, skb)
+				del xmit_list[i]
+				break
+
+def free_skb(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	for i in range(len(queue_list)):
+		skb = queue_list[i]
+		if skb['skbaddr'] ==skbaddr:
+			skb['free_t'] = time
+			free_list.append(skb)
+			del queue_list[i]
+			break
+
+def skb__dev_kfree_skb_irq(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr}
+	all_event_list.append(event_data)
+
+def handle_dev_kfree_skb_irq(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	free_skb(event_name, context, common_cpu,
+		common_pid, common_comm, time,
+		skbaddr)
+
+def skb__consume_skb(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	skbaddr):
+	event_data = {'event_name':event_name, 'context':context,
+		'common_cpu':common_cpu, 'common_pid':common_pid,
+		'common_comm':common_comm,'time':nsecs(common_secs,
+							common_nsecs),
+		'skbaddr':skbaddr}
+	all_event_list.append(event_data)
+
+def handle_consume_skb(event_name, context, common_cpu,
+	common_pid, common_comm, time,
+	skbaddr):
+	free_skb(event_name, context, common_cpu,
+		common_pid, common_comm, time,
+		skbaddr)


^ permalink raw reply related

* Re: [RFC PATCH v7 01/19] Add a new structure for skb buffer from external.
From: Herbert Xu @ 2010-06-24 10:08 UTC (permalink / raw)
  To: Dong, Eddie
  Cc: Xin, Xiaohui, Stephen Hemminger, netdev@vger.kernel.org,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mst@redhat.com,
	mingo@elte.hu, davem@davemloft.net, jdike@linux.intel.com
In-Reply-To: <1A42CE6F5F474C41B63392A5F80372B21F58CE7F@shsmsx501.ccr.corp.intel.com>

On Wed, Jun 23, 2010 at 06:05:41PM +0800, Dong, Eddie wrote:
> 
> I mean once the frontend side driver posts the buffers to the backend driver, the backend driver will "immediately" use those buffers to compose skb or gro_frags and post them to the assigned host NIC driver as receive buffers. In that case, if the backend driver receives a packet from the NIC that requires a copy, it may be unable to find an additional free guest buffer because all of them are already used by the NIC driver. We have to reserve some guest buffers for the possible copy even if the buffer address is not identified by original skb :(

OK I see what you mean.  Can you tell me how Xiaohui's
previous patch-set deals with this problem?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH net-next-2.6] net: u64_stats_sync improvements
From: Eric Dumazet @ 2010-06-24 10:04 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

- Add a comment about interrupts:

6) If counter might be written by an interrupt, readers should block
interrupts.

- Fix a typo in sample of use.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/u64_stats_sync.h |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index d050515..201d319 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -23,6 +23,10 @@
  *    pure reads. But if they have to fetch many values, it's better to not allow
  *    preemptions/interruptions to avoid many retries.
  *
+ * 6) If counter might be written by an interrupt, readers should block interrupts.
+ *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
+ *     read partial values)
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -46,7 +50,7 @@
  *         start = u64_stats_fetch_begin(&stats->syncp);
  *         tbytes = stats->bytes64; // non atomic operation
  *         tpackets = stats->packets64; // non atomic operation
- * } while (u64_stats_fetch_retry(&stats->lock, syncp));
+ * } while (u64_stats_fetch_retry(&stats->syncp, start));
  *
  *
  * Example of use in drivers/net/loopback.c, using per_cpu containers,



^ permalink raw reply related

* [PATCH net-next-2.6] net: use this_cpu_ptr()
From: Eric Dumazet @ 2010-06-24 10:52 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

use this_cpu_ptr(p) instead of per_cpu_ptr(p, smp_processor_id())

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/flow.c     |    4 ++--
 net/ipv4/ip_input.c |    2 +-
 net/ipv4/tcp.c      |    2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/core/flow.c b/net/core/flow.c
index 1619006..8c7c91a 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -222,7 +222,7 @@ flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 	unsigned int hash;
 
 	local_bh_disable();
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 
 	fle = NULL;
 	flo = NULL;
@@ -302,7 +302,7 @@ static void flow_cache_flush_tasklet(unsigned long data)
 	LIST_HEAD(gc_list);
 	int i, deleted = 0;
 
-	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
+	fcp = this_cpu_ptr(fc->percpu);
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
 		hlist_for_each_entry_safe(fle, entry, tmp,
 					  &fcp->hash_table[i], u.hlist) {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index db47a5a..d859bcc 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -342,7 +342,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	if (unlikely(skb_dst(skb)->tclassid)) {
-		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
 		u32 idx = skb_dst(skb)->tclassid;
 		st[idx&0xFF].o_packets++;
 		st[idx&0xFF].o_bytes += skb->len;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 779d40c..b9e721c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2958,7 +2958,7 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 	spin_unlock(&tcp_md5sig_pool_lock);
 
 	if (p)
-		return *per_cpu_ptr(p, smp_processor_id());
+		return *this_cpu_ptr(p);
 
 	local_bh_enable();
 	return NULL;



^ permalink raw reply related

* [PATCH net-next-2.6 2/4] net: u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh()
From: Eric Dumazet @ 2010-06-24 10:54 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

- u64_stats_fetch_begin() and u64_stats_fetch_retry() must disable
preemption on 32bit UP.

- Add new u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh() helpers
for network usage; they disable BH on 32bit UP only.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/u64_stats_sync.h |   59 +++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index 201d319..00c1592 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -27,6 +27,9 @@
  *    (On UP, there is no seqcount_t protection, a reader allowing interrupts could
  *     read partial values)
  *
+ * 7) For softirq uses, readers can use u64_stats_fetch_begin_bh() and
+ *    u64_stats_fetch_retry_bh() helpers
+ *
  * Usage :
  *
  * Stats producer (writer) should use following template granted it already got
@@ -58,54 +61,80 @@
  */
 #include <linux/seqlock.h>
 
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 struct u64_stats_sync {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	seqcount_t	seq;
+#endif
 };
 
 static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_begin(&syncp->seq);
+#endif
 }
 
 static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_end(&syncp->seq);
+#endif
 }
 
 static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	preempt_disable();
+#endif
+	return 0;
+#endif
 }
 
 static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_retry(&syncp->seq, start);
-}
-
 #else
-struct u64_stats_sync {
-};
-
-static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
-{
-}
-
-static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
-{
+#if BITS_PER_LONG==32
+	preempt_enable();
+#endif
+	return false;
+#endif
 }
 
-static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+/*
+ * In case softirq handlers can update u64 counters, readers can use following helpers
+ * - SMP 32bit arches use seqcount protection, irq safe.
+ * - UP 32bit must disable BH.
+ * - 64bit have no problem atomically reading u64 values, irq safe.
+ */
+static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_begin(&syncp->seq);
+#else
+#if BITS_PER_LONG==32
+	local_bh_disable();
+#endif
 	return 0;
+#endif
 }
 
-static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+	return read_seqcount_retry(&syncp->seq, start);
+#else
+#if BITS_PER_LONG==32
+	local_bh_enable();
+#endif
 	return false;
-}
 #endif
+}
 
 #endif /* _LINUX_U64_STATS_SYNC_H */



^ permalink raw reply related

* [PATCH net-next-2.6 3/4] macvlan: 64 bit rx counters
From: Eric Dumazet @ 2010-06-24 10:54 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev

Use u64_stats_sync infrastructure to implement 64bit stats.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/macvlan.c      |   37 +++++++++++++++++++++--------------
 include/linux/if_macvlan.h |   19 +++++++++++------
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e096875..e6d626e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -431,29 +431,38 @@ static void macvlan_uninit(struct net_device *dev)
 	free_percpu(vlan->rx_stats);
 }
 
-static struct net_device_stats *macvlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *macvlan_dev_get_stats64(struct net_device *dev)
 {
-	struct net_device_stats *stats = &dev->stats;
+	struct rtnl_link_stats64 *stats = &dev->stats64;
 	struct macvlan_dev *vlan = netdev_priv(dev);
 
-	dev_txq_stats_fold(dev, stats);
+	dev_txq_stats_fold(dev, &dev->stats);
 
 	if (vlan->rx_stats) {
-		struct macvlan_rx_stats *p, rx = {0};
+		struct macvlan_rx_stats *p, accum = {0};
+		u64 rx_packets, rx_bytes, rx_multicast;
+		unsigned int start;
 		int i;
 
 		for_each_possible_cpu(i) {
 			p = per_cpu_ptr(vlan->rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rx_packets	= p->rx_packets;
+				rx_bytes	= p->rx_bytes;
+				rx_multicast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets	+= rx_packets;
+			accum.rx_bytes		+= rx_bytes;
+			accum.rx_multicast	+= rx_multicast;
+			/* rx_errors is an ulong, updated without syncp protection */
+			accum.rx_errors		+= p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->rx_dropped = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->rx_dropped = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
@@ -502,7 +511,7 @@ static const struct net_device_ops macvlan_netdev_ops = {
 	.ndo_change_rx_flags	= macvlan_change_rx_flags,
 	.ndo_set_mac_address	= macvlan_set_mac_address,
 	.ndo_set_multicast_list	= macvlan_set_multicast_list,
-	.ndo_get_stats		= macvlan_dev_get_stats,
+	.ndo_get_stats64	= macvlan_dev_get_stats64,
 	.ndo_validate_addr	= eth_validate_addr,
 };
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c26a0e4..e24ce6e 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -6,6 +6,7 @@
 #include <linux/netdevice.h>
 #include <linux/netlink.h>
 #include <net/netlink.h>
+#include <linux/u64_stats_sync.h>
 
 #if defined(CONFIG_MACVTAP) || defined(CONFIG_MACVTAP_MODULE)
 struct socket *macvtap_get_socket(struct file *);
@@ -27,14 +28,16 @@ struct macvtap_queue;
  *	struct macvlan_rx_stats - MACVLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct macvlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 struct macvlan_dev {
@@ -56,12 +59,14 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 {
 	struct macvlan_rx_stats *rx_stats;
 
-	rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan->rx_stats);
 	if (likely(success)) {
+		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;;
 		rx_stats->rx_bytes += len;
 		if (multicast)
-			rx_stats->multicast++;
+			rx_stats->rx_multicast++;
+		u64_stats_update_end(&rx_stats->syncp);
 	} else {
 		rx_stats->rx_errors++;
 	}



^ permalink raw reply related

* [PATCH net-next-2.6 4/4] vlan: 64 bit rx counters
From: Eric Dumazet @ 2010-06-24 10:55 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, netdev

Use u64_stats_sync infrastructure to implement 64bit rx stats.

(tx stats are addressed later)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/8021q/vlan.h      |   13 ++++++-----
 net/8021q/vlan_core.c |    7 +++---
 net/8021q/vlan_dev.c  |   46 +++++++++++++++++++++++++---------------
 3 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 6abdcac..8d9503a 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -2,6 +2,7 @@
 #define __BEN_VLAN_802_1Q_INC__
 
 #include <linux/if_vlan.h>
+#include <linux/u64_stats_sync.h>
 
 
 /**
@@ -21,14 +22,16 @@ struct vlan_priority_tci_mapping {
  *	struct vlan_rx_stats - VLAN percpu rx stats
  *	@rx_packets: number of received packets
  *	@rx_bytes: number of received bytes
- *	@multicast: number of received multicast packets
+ *	@rx_multicast: number of received multicast packets
+ *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
  */
 struct vlan_rx_stats {
-	unsigned long rx_packets;
-	unsigned long rx_bytes;
-	unsigned long multicast;
-	unsigned long rx_errors;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_multicast;
+	struct u64_stats_sync	syncp;
+	unsigned long		rx_errors;
 };
 
 /**
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 50f58f5..1b9406a 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -41,9 +41,9 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci);
 	skb->vlan_tci = 0;
 
-	rx_stats = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats,
-			       smp_processor_id());
+	rx_stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats);
 
+	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
 
@@ -51,7 +51,7 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 	case PACKET_BROADCAST:
 		break;
 	case PACKET_MULTICAST:
-		rx_stats->multicast++;
+		rx_stats->rx_multicast++;
 		break;
 	case PACKET_OTHERHOST:
 		/* Our lower layer thinks this is not local, let's make sure.
@@ -62,6 +62,7 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 			skb->pkt_type = PACKET_HOST;
 		break;
 	}
+	u64_stats_update_end(&rx_stats->syncp);
 	return 0;
 }
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 5298426..c6456cb 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -166,6 +166,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 
 	rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats,
 			       smp_processor_id());
+	u64_stats_update_begin(&rx_stats->syncp);
 	rx_stats->rx_packets++;
 	rx_stats->rx_bytes += skb->len;
 
@@ -182,7 +183,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		break;
 
 	case PACKET_MULTICAST:
-		rx_stats->multicast++;
+		rx_stats->rx_multicast++;
 		break;
 
 	case PACKET_OTHERHOST:
@@ -197,6 +198,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 	default:
 		break;
 	}
+	u64_stats_update_end(&rx_stats->syncp);
 
 	vlan_set_encap_proto(skb, vhdr);
 
@@ -801,27 +803,37 @@ static u32 vlan_ethtool_get_flags(struct net_device *dev)
 	return dev_ethtool_get_flags(vlan->real_dev);
 }
 
-static struct net_device_stats *vlan_dev_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev)
 {
-	struct net_device_stats *stats = &dev->stats;
+	struct rtnl_link_stats64 *stats = &dev->stats64;
 
-	dev_txq_stats_fold(dev, stats);
+	dev_txq_stats_fold(dev, &dev->stats);
 
 	if (vlan_dev_info(dev)->vlan_rx_stats) {
-		struct vlan_rx_stats *p, rx = {0};
+		struct vlan_rx_stats *p, accum = {0};
 		int i;
 
 		for_each_possible_cpu(i) {
+			u64 rxpackets, rxbytes, rxmulticast;
+			unsigned int start;
+
 			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats, i);
-			rx.rx_packets += p->rx_packets;
-			rx.rx_bytes   += p->rx_bytes;
-			rx.rx_errors  += p->rx_errors;
-			rx.multicast  += p->multicast;
+			do {
+				start = u64_stats_fetch_begin_bh(&p->syncp);
+				rxpackets	= p->rx_packets;
+				rxbytes		= p->rx_bytes;
+				rxmulticast	= p->rx_multicast;
+			} while (u64_stats_fetch_retry_bh(&p->syncp, start));
+			accum.rx_packets += rxpackets;
+			accum.rx_bytes   += rxbytes;
+			accum.rx_multicast += rxmulticast;
+			/* rx_errors is an ulong, not protected by syncp */
+			accum.rx_errors  += p->rx_errors;
 		}
-		stats->rx_packets = rx.rx_packets;
-		stats->rx_bytes   = rx.rx_bytes;
-		stats->rx_errors  = rx.rx_errors;
-		stats->multicast  = rx.multicast;
+		stats->rx_packets = accum.rx_packets;
+		stats->rx_bytes   = accum.rx_bytes;
+		stats->rx_errors  = accum.rx_errors;
+		stats->multicast  = accum.rx_multicast;
 	}
 	return stats;
 }
@@ -848,7 +860,7 @@ static const struct net_device_ops vlan_netdev_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -872,7 +884,7 @@ static const struct net_device_ops vlan_netdev_accel_ops = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -897,7 +909,7 @@ static const struct net_device_ops vlan_netdev_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -922,7 +934,7 @@ static const struct net_device_ops vlan_netdev_accel_ops_sq = {
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
-	.ndo_get_stats		= vlan_dev_get_stats,
+	.ndo_get_stats64	= vlan_dev_get_stats64,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,



^ permalink raw reply related

* [PATCH net-next-2.6] tcp: tso_fragment() might avoid GFP_ATOMIC
From: Eric Dumazet @ 2010-06-24 11:00 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

By passing a gfp argument to tso_fragment(), we can sometimes avoid
GFP_ATOMIC allocations.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/tcp_output.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 51d316d..25ff62e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1460,7 +1460,7 @@ int tcp_may_send_now(struct sock *sk)
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now)
+			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1470,7 +1470,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now);
 
-	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
@@ -1768,7 +1768,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    cwnd_quota);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;



^ permalink raw reply related

* [2.6.35-rc3] NFS: possible irq lock inversion dependency
From: Tetsuo Handa @ 2010-06-24 11:53 UTC (permalink / raw)
  To: linux-fsdevel, netdev

Hello.

I sometimes get the warning below when the system is about to reboot/halt.
Has this already been reported? If not, I'll try to establish steps to reproduce it.

----- Dump 1 -----

[  508.594713] nfsd: last server has exited, flushing export cache
[  509.100525] 
[  509.100529] =========================================================
[  509.102129] [ INFO: possible irq lock inversion dependency detected ]
<4>[  509.102513]                                       [<c103f69f>] sys_exit_group+0xf/0x20
<4>[  509.102513]                                       [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]  }
[  509.102513]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  509.102513]  ... acquired at:
[  509.102513]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]    [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]    [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]    [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]    [<c1052a65>] kthread+0x75/0x80
[  509.102513]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513] 
[  509.102513] 
[  509.102513] stack backtrace:
[  509.102513] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  509.102513] Call Trace:
[  509.102513]  [<c103cbe8>] ? printk+0x18/0x20
[  509.102513]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  509.102513]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  509.102513]  [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]  [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  509.102513]  [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  509.102513]  [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  509.102513]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  509.102513]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1035456>] ? complete+0x46/0x60
[  509.102513]  [<c1052a65>] kthread+0x75/0x80
[  509.102513]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c100317a>] kernel_thread_helper+0x6/0x1c
9>] _raw_spin_lock+0x39/0x70
[  509.102513]                                          [<c128ae51>] sk_clone+0xb1/0x2c0
[  509.102513]                                          [<c12c0046>] inet_csk_clone+0x16/0x90
[  509.102513]                                          [<c12d519c>] tcp_create_openreq_child+0x1c/0x460
[  509.102513]                                          [<c12d2a1f>] tcp_v4_syn_recv_sock+0x3f/0x1e0
[  509.102513]                                          [<c12d576c>] tcp_check_req+0x18c/0x3b0
[  509.102513]                                          [<c12d2c0d>] tcp_v4_hnd_req+0x4d/0x160
[  509.102513]                                          [<c12d2f39>] tcp_v4_do_rcv+0x159/0x280
[  509.102513]                                          [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  509.102513]                                          [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  509.102513]                                          [<c12b5e40>] ip_local_deliver+0x30/0x40
[  509.102513]                                          [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  509.102513]                                          [<c12b63f6>] ip_rcv+0x166/0x240
[  509.102513]                                          [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  509.102513]                                          [<c12985e8>] process_backlog+0x88/0x160
[  509.102513]                                          [<c12989b7>] net_rx_action+0x127/0x140
[  509.102513]                                          [<c1041b30>] __do_softirq+0xd0/0x130
[  509.102513]     INITIAL USE at:
[  509.102513]                                         [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  509.102513]                                         [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                         [<c1331dbe>] _raw_spin_lock_bh+0x3e/0x80
[  509.102513]                                         [<c128c4c9>] lock_sock_fast+0x29/0x90
[  509.102513]                                         [<c12dae84>] udp_destroy_sock+0x14/0x40
[  509.102513]                                         [<c128c823>] sk_common_release+0xb3/0xc0
[  509.102513]                                         [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                         [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                         [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                         [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                         [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                         [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                         [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                         [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                         [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                         [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                         [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                         [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                         [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                         [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]   }
[  509.102513]   ... key      at: [<c1cbfc50>] af_family_slock_keys+0x10/0x140
[  509.102513]   ... acquired at:
[  509.102513]    [<c1064a8b>] check_prevs_add+0xab/0x100
[  509.102513]    [<c1064e15>] validate_chain+0x305/0x5a0
[  509.102513]    [<c1066f03>] __lock_acquire+0x2b3/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]    [<c12c0356>] inet_csk_listen_stop+0x86/0x160
[  509.102513]    [<c12c2dc0>] tcp_close+0x350/0x360
[  509.102513]    [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]    [<c1286c36>] sock_release+0x66/0x80
[  509.102513]    [<c1287952>] sock_close+0x12/0x30
[  509.102513]    [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]    [<c10b70b9>] fput+0x19/0x20
[  509.102513]    [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]    [<c10b558d>] sys_close+0x6d/0x100
[  509.102513]    [<c13327e1>] syscall_call+0x7/0xb
[  509.102513] 
[  509.102513] -> (clock-AF_INET){++.?..} ops: 877 {
[  509.102513]    HARDIRQ-ON-W at:
[  509.102513]                                        [<c106625e>] mark_irqflags+0xfe/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]                                        [<c128c79f>] sk_common_release+0x2f/0xc0
[  509.102513]                                        [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                        [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                        [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                        [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                        [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                        [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                        [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                        [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                        [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                        [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                        [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                        [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                        [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                        [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]    HARDIRQ-ON-R at:
[  509.102513]                                        [<c10661ce>] mark_irqflags+0x6e/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]                                        [<c12cbea8>] tcp_rcv_synsent_state_process+0x388/0x580
[  509.102513]                                        [<c12cc547>] tcp_rcv_state_process+0x4a7/0x560
[  509.102513]                                        [<c12d2e51>] tcp_v4_do_rcv+0x71/0x280
[  509.102513]                                        [<c128b786>] __release_sock+0x66/0x150
[  509.102513]                                        [<c128c497>] release_sock+0x87/0x90
[  509.102513]                                        [<c12e1cba>] inet_stream_connect+0x5a/0x1b0
[  509.102513]                                        [<c1289448>] kernel_connect+0x18/0x30
[  509.102513]                                        [<c130acce>] xs_tcp_finish_connecting+0x4e/0x120
[  509.102513]                                        [<c130adfb>] xs_tcp_setup_socket+0x5b/0x180
[  509.102513]                                        [<c130b034>] xs_tcp_connect_worker4+0x14/0x20
[  509.102513]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]                                        [<c1052a65>] kthread+0x75/0x80
[  509.102513]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513]    IN-SOFTIRQ-R at:
[  509.102513]                                        [<c106627e>] mark_irqflags+0x11e/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c1309f81>] xs_tcp_data_ready+0x21/0x90
[  509.102513]                                        [<c12ca378>] tcp_data_queue+0x248/0x820
[  509.102513]                                        [<c12cb6ee>] tcp_rcv_established+0xae/0x4e0
[  509.102513]                                        [<c12d2fb1>] tcp_v4_do_rcv+0x1d1/0x280
[  509.102513]                                        [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  509.102513]                                        [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  509.102513]                                        [<c12b5e40>] ip_local_deliver+0x30/0x40
[  509.102513]                                        [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  509.102513]                                        [<c12b63f6>] ip_rcv+0x166/0x240
[  509.102513]                                        [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  509.102513]                                        [<c12985e8>] process_backlog+0x88/0x160
[  509.102513]                                        [<c12989b7>] net_rx_action+0x127/0x140
[  509.102513]                                        [<c1041b30>] __do_softirq+0xd0/0x130
[  509.102513]    SOFTIRQ-ON-R at:
[  509.102513]                                        [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]                                        [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]                                        [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]                                        [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]                                        [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]                                        [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]                                        [<c1052a65>] kthread+0x75/0x80
[  509.102513]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513]    INITIAL USE at:
[  509.102513]                                       [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  509.102513]                                       [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]                                       [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  509.102513]                                       [<c128c79f>] sk_common_release+0x2f/0xc0
[  509.102513]                                       [<c12db7f8>] udp_lib_close+0x8/0x10
[  509.102513]                                       [<c12e18ce>] inet_release+0xbe/0x100
[  509.102513]                                       [<c1286c36>] sock_release+0x66/0x80
[  509.102513]                                       [<c1287952>] sock_close+0x12/0x30
[  509.102513]                                       [<c10b705b>] __fput+0x1cb/0x210
[  509.102513]                                       [<c10b70b9>] fput+0x19/0x20
[  509.102513]                                       [<c10b54f3>] filp_close+0x43/0x70
[  509.102513]                                       [<c103eafb>] close_files+0xab/0x140
[  509.102513]                                       [<c103ebf9>] put_files_struct+0x29/0xf0
[  509.102513]                                       [<c103ed50>] exit_files+0x40/0x50
[  509.102513]                                       [<c103f400>] do_exit+0x100/0x2b0
[  509.102513]                                       [<c103f614>] do_group_exit+0x34/0xb0
[  509.102513]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  509.102513]                                       [<c13327e1>] syscall_call+0x7/0xb
[  509.102513]  }
[  509.102513]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  509.102513]  ... acquired at:
[  509.102513]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]    [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]    [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]    [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]    [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]    [<c1052a65>] kthread+0x75/0x80
[  509.102513]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  509.102513] 
[  509.102513] 
[  509.102513] stack backtrace:
[  509.102513] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  509.102513] Call Trace:
[  509.102513]  [<c103cbe8>] ? printk+0x18/0x20
[  509.102513]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  509.102513]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  509.102513]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  509.102513]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  509.102513]  [<c106657c>] mark_lock+0x21c/0x3c0
[  509.102513]  [<c1066242>] mark_irqflags+0xe2/0x180
[  509.102513]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  509.102513]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  509.102513]  [<c106857a>] lock_acquire+0x7a/0xa0
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  509.102513]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  509.102513]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  509.102513]  [<c12e2397>] inet_shutdown+0x97/0x110
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  509.102513]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  509.102513]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  509.102513]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  509.102513]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  509.102513]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  509.102513]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  509.102513]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  509.102513]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  509.102513]  [<c1035456>] ? complete+0x46/0x60
[  509.102513]  [<c1052a65>] kthread+0x75/0x80
[  509.102513]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c10529f0>] ? kthread+0x0/0x80
[  509.102513]  [<c100317a>] kernel_thread_helper+0x6/0x1c
[  518.099361] ACPI: Preparing to enter system sleep state S5
[  518.101223] Disabling non-boot CPUs ...
[  518.607480] lockdep: fixing up alternatives.
[  518.608334] SMP alternatives: switching to UP code
[  518.908243] Power down.
[  518.909864] acpi_power_off called




----- Dump 2 -----

[  974.096047] nfsd: last server has exited, flushing export cache
[  975.514620] 
[  975.514622] =========================================================
[  975.516172] [ INFO: possible irq lock inversion dependency detected ]
x34/0xb0
<4>[  975.517504]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517507]                                       [<c13327e1>] syscall_call+0x7/0xb
[  975.517510]  }
[  975.517511]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  975.517513]  ... acquired at:
[  975.517515]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517517]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517519]    [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517522]    [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517524]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517526]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517529]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517531]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517534]    [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517536]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517539]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517541]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517544]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517546]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517549]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517551]    [<c1052a65>] kthread+0x75/0x80
[  975.517553]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517556] 
[  975.517570] 
[  975.517571] stack backtrace:
[  975.517628] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  975.517648] Call Trace:
[  975.517692]  [<c103cbe8>] ? printk+0x18/0x20
[  975.517704]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  975.517708]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517711]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517713]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  975.517716]  [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517718]  [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517721]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517723]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  975.517726]  [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517729]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517732]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517735]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517737]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517740]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  975.517743]  [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517745]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517748]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517751]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517753]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517756]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517758]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517761]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517763]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  975.517766]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  975.517769]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517771]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517774]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517779]  [<c1035456>] ? complete+0x46/0x60
[  975.517782]  [<c1052a65>] kthread+0x75/0x80
[  975.517784]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  975.517786]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517789]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517791]  [<c100317a>] kernel_thread_helper+0x6/0x1c
9>] _raw_spin_lock+0x39/0x70
[  975.517015]                                          [<c128ae51>] sk_clone+0xb1/0x2c0
[  975.517018]                                          [<c12c0046>] inet_csk_clone+0x16/0x90
[  975.517022]                                          [<c12d519c>] tcp_create_openreq_child+0x1c/0x460
[  975.517025]                                          [<c12d2a1f>] tcp_v4_syn_recv_sock+0x3f/0x1e0
[  975.517028]                                          [<c12d576c>] tcp_check_req+0x18c/0x3b0
[  975.517031]                                          [<c12d2c0d>] tcp_v4_hnd_req+0x4d/0x160
[  975.517033]                                          [<c12d2f39>] tcp_v4_do_rcv+0x159/0x280
[  975.517036]                                          [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  975.517038]                                          [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  975.517042]                                          [<c12b5e40>] ip_local_deliver+0x30/0x40
[  975.517045]                                          [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  975.517048]                                          [<c12b63f6>] ip_rcv+0x166/0x240
[  975.517050]                                          [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  975.517054]                                          [<c12985e8>] process_backlog+0x88/0x160
[  975.517057]                                          [<c12989b7>] net_rx_action+0x127/0x140
[  975.517060]                                          [<c1041b30>] __do_softirq+0xd0/0x130
[  975.517063]     INITIAL USE at:
[  975.517065]                                         [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  975.517067]                                         [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517070]                                         [<c1331dbe>] _raw_spin_lock_bh+0x3e/0x80
[  975.517073]                                         [<c128c4c9>] lock_sock_fast+0x29/0x90
[  975.517076]                                         [<c12dae84>] udp_destroy_sock+0x14/0x40
[  975.517079]                                         [<c128c823>] sk_common_release+0xb3/0xc0
[  975.517083]                                         [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517085]                                         [<c12e18ce>] inet_release+0xbe/0x100
[  975.517088]                                         [<c1286c36>] sock_release+0x66/0x80
[  975.517091]                                         [<c1287952>] sock_close+0x12/0x30
[  975.517094]                                         [<c10b705b>] __fput+0x1cb/0x210
[  975.517097]                                         [<c10b70b9>] fput+0x19/0x20
[  975.517099]                                         [<c10b54f3>] filp_close+0x43/0x70
[  975.517102]                                         [<c103eafb>] close_files+0xab/0x140
[  975.517105]                                         [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517108]                                         [<c103ed50>] exit_files+0x40/0x50
[  975.517111]                                         [<c103f400>] do_exit+0x100/0x2b0
[  975.517114]                                         [<c103f614>] do_group_exit+0x34/0xb0
[  975.517117]                                         [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517120]                                         [<c13327e1>] syscall_call+0x7/0xb
[  975.517130]   }
[  975.517139]   ... key      at: [<c1cbfc50>] af_family_slock_keys+0x10/0x140
[  975.517180]   ... acquired at:
[  975.517187]    [<c1064a8b>] check_prevs_add+0xab/0x100
[  975.517190]    [<c1064e15>] validate_chain+0x305/0x5a0
[  975.517193]    [<c1066f03>] __lock_acquire+0x2b3/0x8e0
[  975.517195]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517198]    [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517200]    [<c12c0356>] inet_csk_listen_stop+0x86/0x160
[  975.517203]    [<c12c2dc0>] tcp_close+0x350/0x360
[  975.517205]    [<c12e18ce>] inet_release+0xbe/0x100
[  975.517208]    [<c1286c36>] sock_release+0x66/0x80
[  975.517210]    [<c1287952>] sock_close+0x12/0x30
[  975.517212]    [<c10b705b>] __fput+0x1cb/0x210
[  975.517215]    [<c10b70b9>] fput+0x19/0x20
[  975.517217]    [<c10b54f3>] filp_close+0x43/0x70
[  975.517220]    [<c10b558d>] sys_close+0x6d/0x100
[  975.517222]    [<c13327e1>] syscall_call+0x7/0xb
[  975.517230] 
[  975.517240] -> (clock-AF_INET){++.?..} ops: 879 {
[  975.517244]    HARDIRQ-ON-W at:
[  975.517246]                                        [<c106625e>] mark_irqflags+0xfe/0x180
[  975.517249]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517252]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517254]                                        [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517257]                                        [<c128c79f>] sk_common_release+0x2f/0xc0
[  975.517260]                                        [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517263]                                        [<c12e18ce>] inet_release+0xbe/0x100
[  975.517266]                                        [<c1286c36>] sock_release+0x66/0x80
[  975.517269]                                        [<c1287952>] sock_close+0x12/0x30
[  975.517271]                                        [<c10b705b>] __fput+0x1cb/0x210
[  975.517274]                                        [<c10b70b9>] fput+0x19/0x20
[  975.517277]                                        [<c10b54f3>] filp_close+0x43/0x70
[  975.517280]                                        [<c103eafb>] close_files+0xab/0x140
[  975.517283]                                        [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517286]                                        [<c103ed50>] exit_files+0x40/0x50
[  975.517289]                                        [<c103f400>] do_exit+0x100/0x2b0
[  975.517291]                                        [<c103f614>] do_group_exit+0x34/0xb0
[  975.517294]                                        [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517297]                                        [<c13327e1>] syscall_call+0x7/0xb
[  975.517300]    HARDIRQ-ON-R at:
[  975.517302]                                        [<c10661ce>] mark_irqflags+0x6e/0x180
[  975.517304]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517307]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517310]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517312]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517316]                                        [<c12cbea8>] tcp_rcv_synsent_state_process+0x388/0x580
[  975.517319]                                        [<c12cc547>] tcp_rcv_state_process+0x4a7/0x560
[  975.517322]                                        [<c12d2e51>] tcp_v4_do_rcv+0x71/0x280
[  975.517325]                                        [<c128b786>] __release_sock+0x66/0x150
[  975.517327]                                        [<c128c497>] release_sock+0x87/0x90
[  975.517330]                                        [<c12e1cba>] inet_stream_connect+0x5a/0x1b0
[  975.517333]                                        [<c1289448>] kernel_connect+0x18/0x30
[  975.517336]                                        [<c130acce>] xs_tcp_finish_connecting+0x4e/0x120
[  975.517339]                                        [<c130adfb>] xs_tcp_setup_socket+0x5b/0x180
[  975.517342]                                        [<c130b034>] xs_tcp_connect_worker4+0x14/0x20
[  975.517344]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517347]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517350]                                        [<c1052a65>] kthread+0x75/0x80
[  975.517353]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517358]    IN-SOFTIRQ-R at:
[  975.517359]                                        [<c106627e>] mark_irqflags+0x11e/0x180
[  975.517364]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517367]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517369]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517372]                                        [<c1309f81>] xs_tcp_data_ready+0x21/0x90
[  975.517375]                                        [<c12ca378>] tcp_data_queue+0x248/0x820
[  975.517378]                                        [<c12cb6ee>] tcp_rcv_established+0xae/0x4e0
[  975.517381]                                        [<c12d2fb1>] tcp_v4_do_rcv+0x1d1/0x280
[  975.517384]                                        [<c12d35d4>] tcp_v4_rcv+0x574/0xa30
[  975.517386]                                        [<c12b5c4f>] ip_local_deliver_finish+0xff/0x2c0
[  975.517389]                                        [<c12b5e40>] ip_local_deliver+0x30/0x40
[  975.517392]                                        [<c12b5f99>] ip_rcv_finish+0x149/0x440
[  975.517395]                                        [<c12b63f6>] ip_rcv+0x166/0x240
[  975.517398]                                        [<c1297b0d>] __netif_receive_skb+0x1cd/0x280
[  975.517401]                                        [<c12985e8>] process_backlog+0x88/0x160
[  975.517404]                                        [<c12989b7>] net_rx_action+0x127/0x140
[  975.517407]                                        [<c1041b30>] __do_softirq+0xd0/0x130
[  975.517411]    SOFTIRQ-ON-R at:
[  975.517412]                                        [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517415]                                        [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517418]                                        [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517421]                                        [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517423]                                        [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517427]                                        [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517429]                                        [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517432]                                        [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517435]                                        [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517438]                                        [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517442]                                        [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517445]                                        [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517448]                                        [<c1052a65>] kthread+0x75/0x80
[  975.517450]                                        [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517453]    INITIAL USE at:
[  975.517455]                                       [<c1066e16>] __lock_acquire+0x1c6/0x8e0
[  975.517457]                                       [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517460]                                       [<c13323fe>] _raw_write_lock_bh+0x3e/0x80
[  975.517463]                                       [<c128c79f>] sk_common_release+0x2f/0xc0
[  975.517466]                                       [<c12db7f8>] udp_lib_close+0x8/0x10
[  975.517469]                                       [<c12e18ce>] inet_release+0xbe/0x100
[  975.517471]                                       [<c1286c36>] sock_release+0x66/0x80
[  975.517474]                                       [<c1287952>] sock_close+0x12/0x30
[  975.517477]                                       [<c10b705b>] __fput+0x1cb/0x210
[  975.517480]                                       [<c10b70b9>] fput+0x19/0x20
[  975.517483]                                       [<c10b54f3>] filp_close+0x43/0x70
[  975.517485]                                       [<c103eafb>] close_files+0xab/0x140
[  975.517488]                                       [<c103ebf9>] put_files_struct+0x29/0xf0
[  975.517495]                                       [<c103ed50>] exit_files+0x40/0x50
[  975.517498]                                       [<c103f400>] do_exit+0x100/0x2b0
[  975.517501]                                       [<c103f614>] do_group_exit+0x34/0xb0
[  975.517504]                                       [<c103f69f>] sys_exit_group+0xf/0x20
[  975.517507]                                       [<c13327e1>] syscall_call+0x7/0xb
[  975.517510]  }
[  975.517511]  ... key      at: [<c1cbfd90>] af_callback_keys+0x10/0x130
[  975.517513]  ... acquired at:
[  975.517515]    [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517517]    [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517519]    [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517522]    [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517524]    [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517526]    [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517529]    [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517531]    [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517534]    [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517536]    [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517539]    [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517541]    [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517544]    [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517546]    [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517549]    [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517551]    [<c1052a65>] kthread+0x75/0x80
[  975.517553]    [<c100317a>] kernel_thread_helper+0x6/0x1c
[  975.517556] 
[  975.517570] 
[  975.517571] stack backtrace:
[  975.517628] Pid: 337, comm: rpciod/1 Not tainted 2.6.35-rc3-ccs #6
[  975.517648] Call Trace:
[  975.517692]  [<c103cbe8>] ? printk+0x18/0x20
[  975.517704]  [<c1065558>] print_irq_inversion_bug+0x108/0x120
[  975.517708]  [<c10656c6>] check_usage_backwards+0x76/0xd0
[  975.517711]  [<c10658d9>] mark_lock_irq+0x99/0x240
[  975.517713]  [<c1065650>] ? check_usage_backwards+0x0/0xd0
[  975.517716]  [<c106657c>] mark_lock+0x21c/0x3c0
[  975.517718]  [<c1066242>] mark_irqflags+0xe2/0x180
[  975.517721]  [<c1066fdd>] __lock_acquire+0x38d/0x8e0
[  975.517723]  [<c10682da>] ? __lock_is_held+0x3a/0x60
[  975.517726]  [<c106857a>] lock_acquire+0x7a/0xa0
[  975.517729]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517732]  [<c1331f69>] _raw_read_lock+0x39/0x70
[  975.517735]  [<c130a0ae>] ? xs_tcp_state_change+0x1e/0x1c0
[  975.517737]  [<c130a0ae>] xs_tcp_state_change+0x1e/0x1c0
[  975.517740]  [<c12cf3bf>] ? tcp_send_fin+0x4f/0xc0
[  975.517743]  [<c12e2397>] inet_shutdown+0x97/0x110
[  975.517745]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517748]  [<c1289649>] kernel_sock_shutdown+0x9/0x10
[  975.517751]  [<c13093a7>] xs_tcp_shutdown+0x17/0x20
[  975.517753]  [<c13096a7>] xs_tcp_close+0x27/0x30
[  975.517756]  [<c130792d>] xprt_autoclose+0x1d/0x50
[  975.517758]  [<c104edc0>] run_workqueue+0xe0/0x1d0
[  975.517761]  [<c104ed5e>] ? run_workqueue+0x7e/0x1d0
[  975.517763]  [<c1307910>] ? xprt_autoclose+0x0/0x50
[  975.517766]  [<c1052e8a>] ? prepare_to_wait+0x3a/0x60
[  975.517769]  [<c104ef4b>] worker_thread+0x9b/0xd0
[  975.517771]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517774]  [<c1053000>] ? autoremove_wake_function+0x0/0x50
[  975.517779]  [<c1035456>] ? complete+0x46/0x60
[  975.517782]  [<c1052a65>] kthread+0x75/0x80
[  975.517784]  [<c104eeb0>] ? worker_thread+0x0/0xd0
[  975.517786]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517789]  [<c10529f0>] ? kthread+0x0/0x80
[  975.517791]  [<c100317a>] kernel_thread_helper+0x6/0x1c
[  984.626828] Restarting system.
[  984.627458] machine restart

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Steffen Klassert @ 2010-06-24 12:05 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <1277337341.26161.18.camel@localhost>

Hi.

On Thu, Jun 24, 2010 at 12:55:41AM +0100, Ben Hutchings wrote:
> This avoids scheduling in atomic context and also means that IRQs
> will only be deferred for relatively short periods of time.
> 
> Previously discussed in:
> http://article.gmane.org/gmane.linux.network/155024
> 
> Reported-by: Arne Nordmark <nordmark@mech.kth.se>
> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
> Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]
> ---
>  drivers/net/3c59x.c |   66 ++++++++++++++++++++++++++++++---------------------
>  1 files changed, 39 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
> index beddef9..f4a3fb1 100644
> --- a/drivers/net/3c59x.c
> +++ b/drivers/net/3c59x.c
> @@ -644,9 +644,15 @@ struct vortex_private {
>  	u16 deferred;						/* Resend these interrupts when we
>  										 * bale from the ISR */
>  	u16 io_size;						/* Size of PCI region (for release_region) */
> -	spinlock_t lock;					/* Serialise access to device & its vortex_private */
> -	struct mii_if_info mii;				/* MII lib hooks/info */
> -	int window;					/* Register window */
> +
> +	/* Serialises access to hardware other than MII and variables below.
> +	 * The lock hierarchy is rtnl_lock > lock > mii_lock > window_lock. */
> +	spinlock_t lock;
> +
> +	spinlock_t mii_lock;		/* Serialises access to MII */
> +	struct mii_if_info mii;		/* MII lib hooks/info */
> +	spinlock_t window_lock;		/* Serialises access to windowed regs */

You should initialize the new locks properly with spin_lock_init().

> +	int window;			/* Register window */
>  };
>  
>  static void window_set(struct vortex_private *vp, int window)
> @@ -661,15 +667,23 @@ static void window_set(struct vortex_private *vp, int window)
>  static u ## size							\
>  window_read ## size(struct vortex_private *vp, int window, int addr)	\
>  {									\
> +	unsigned long flags;						\
> +	u ## size ret;							\
> +	spin_lock_irqsave(&vp->window_lock, flags);			\
>  	window_set(vp, window);						\
> -	return ioread ## size(vp->ioaddr + addr);			\
> +	ret = ioread ## size(vp->ioaddr + addr);			\
> +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
> +	return ret;							\
>  }									\
>  static void								\
>  window_write ## size(struct vortex_private *vp, u ## size value,	\
>  		     int window, int addr)				\
>  {									\
> +	unsigned long flags;						\
> +	spin_lock_irqsave(&vp->window_lock, flags);			\
>  	window_set(vp, window);						\
>  	iowrite ## size(value, vp->ioaddr + addr);			\
> +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
>  }

This adds a lot of calls to spin_lock_irqsave/spin_unlock_irqrestore to many
places where this is not necessary at all. For example during device probe and
device open, window_read/window_write are called multiple times, each time
disabling the interrupts. I'd suggest having unlocked, locked and irqsave
versions of window_read/window_write and using them in appropriate places.

>  DEFINE_WINDOW_IO(8)
>  DEFINE_WINDOW_IO(16)
> @@ -1784,7 +1798,6 @@ vortex_timer(unsigned long data)
>  		pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
>  	}
>  
> -	disable_irq_lockdep(dev->irq);
>  	media_status = window_read16(vp, 4, Wn4_Media);
>  	switch (dev->if_port) {
>  	case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
> @@ -1805,10 +1818,7 @@ vortex_timer(unsigned long data)
>  	case XCVR_MII: case XCVR_NWAY:
>  		{
>  			ok = 1;
> -			/* Interrupts are already disabled */
> -			spin_lock(&vp->lock);
>  			vortex_check_media(dev, 0);
> -			spin_unlock(&vp->lock);
>  		}
>  		break;
>  	  default:					/* Other media types handled by Tx timeouts. */
> @@ -1827,6 +1837,8 @@ vortex_timer(unsigned long data)
>  	if (!ok) {
>  		unsigned int config;
>  
> +		spin_lock_irq(&vp->lock);

This can still happen every 5 seconds if the NIC has no link beat and
medialock is not set. So what about deferring this locked codepath to
a workqueue, or moving the whole vortex_timer to a delayed workqueue?
In this case we don't need to disable all the interrupts on the cpu, we
could still use disable_irq then.

The rest looks quite good to me.

Thanks,

Steffen

^ permalink raw reply

* Re: Question about xfrm by MARK feature
From: jamal @ 2010-06-24 12:04 UTC (permalink / raw)
  To: Gerd v. Egidy; +Cc: timo.teras, kaber, herbert, netdev
In-Reply-To: <201006231803.17261.lists@egidy.de>

Hi Gerd,

On Wed, 2010-06-23 at 18:03 +0200, Gerd v. Egidy wrote:
> Hi Jamal,
> 
> while looking through the 2.6.34 changelog I found the xfrm by MARK feature 
> you developed in february. I'm currently working on NAT for ipsec connections 
> and thought your feature might help me.
> 
> For example I have 2 different remote networks with the same ip network each 
> and both of them have a tunnel to the same local network. 

It seems "Same IP network" means that two remote locations will have
exactly the same IP address? This is hard of course - but nat may do it.
There's also the nat zones feature that Patrick introduced a while back
that may help you.

> I map their IPs to 
> something different so I can distinguish them in the local network. But after 
> the nat the xfrm code sees two tunnels with exactly the same values. So this 
> can't work.
> 

Can you look at the incoming encrypted packet headers and tell if they
are from different remotes? If not, are different remotes coming in via
a different network device? If yes, you can install a tc rule to mark
them as they come in before decryption and that mark should stay with
them even after they get decrypted.

> But if I understood your feature correctly, I can now mark the packets (e.g. 
> in iptables with ... -j MARK --set-mark 1) and have xfrm select the correct 
> ipsec tunnel via the mark. Correct?
> 
> But does your feature also set the mark on packets decrypted by xfrm? I need 
> some way to find out from which tunnel the packet came to correctly treat it. 
> 

Refer to above and also to policy routing.

> Do you know if any of the ipsec solutions for linux (e.g. strongswan, 
> openswan, racoon) already have support for this feature or are developing on 
> it?

AFAIK, only iproute2 can use marks. I believe the ike daemons can be
made to use reqid (as Herbert mentioned) but I am not sure that is
sufficient for what you want.

cheers,
jamal


^ permalink raw reply

* Re: [PATCH 31/40] trace syscalls: Convert various generic compat syscalls
From: Michal Marek @ 2010-06-24 12:05 UTC (permalink / raw)
  To: linux-mm; +Cc: linux-kernel, linuxppc-dev, linux-fsdevel, kexec, netdev
In-Reply-To: <4C21E3F8.9000405@linux.intel.com>

On 23.6.2010 12:37, Andi Kleen wrote:
> It also has maintenance costs, e.g. I doubt ctags and cscope
> will be able to deal with these kinds of macros, so it has a
> high cost for everyone using these tools.

FWIW, patch 16/40 of this series teaches 'make tags' to recognize these
macros: http://permalink.gmane.org/gmane.linux.kernel/1002103

Michal

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 31/40] trace syscalls: Convert various generic compat syscalls
From: Michal Marek @ 2010-06-24 12:05 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-mm, linux-kernel, linuxppc-dev, kexec, netdev, linux-mm,
	linux-kernel, linux-fsdevel, kexec, netdev, linux-kernel,
	linuxppc-dev, linux-fsdevel, kexec, netdev
In-Reply-To: <4C21E3F8.9000405@linux.intel.com>

On 23.6.2010 12:37, Andi Kleen wrote:
> It also has maintenance costs, e.g. I doubt ctags and cscope
> will be able to deal with these kinds of macros, so it has a
> high cost for everyone using these tools.

FWIW, patch 16/40 of this series teaches 'make tags' to recognize these
macros: http://permalink.gmane.org/gmane.linux.kernel/1002103

Michal


^ permalink raw reply

* Re: [PATCH 31/40] trace syscalls: Convert various generic compat syscalls
From: Michal Marek @ 2010-06-24 12:05 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: linuxppc-dev, netdev, kexec, linux-kernel, linux-mm,
	linux-fsdevel
In-Reply-To: <4C21E3F8.9000405@linux.intel.com>

On 23.6.2010 12:37, Andi Kleen wrote:
> It also has maintenance costs, e.g. I doubt ctags and cscope
> will be able to deal with these kinds of macros, so it has a
> high cost for everyone using these tools.

FWIW, patch 16/40 of this series teaches 'make tags' to recognize these
macros: http://permalink.gmane.org/gmane.linux.kernel/1002103

Michal

^ permalink raw reply

* Re: [PATCH 31/40] trace syscalls: Convert various generic compat syscalls
From: Michal Marek @ 2010-06-24 12:05 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-mm, linuxppc-dev, linux-fsdevel, kexec, netdev,
	linux-kernel, linuxppc-dev, linux-fsdevel, kexec, netdev
In-Reply-To: <4C21E3F8.9000405@linux.intel.com>

On 23.6.2010 12:37, Andi Kleen wrote:
> It also has maintenance costs, e.g. I doubt ctags and cscope
> will be able to deal with these kinds of macros, so it has a
> high cost for everyone using these tools.

FWIW, patch 16/40 of this series teaches 'make tags' to recognize these
macros: http://permalink.gmane.org/gmane.linux.kernel/1002103

Michal

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Ben Hutchings @ 2010-06-24 12:57 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: David Miller, netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <20100624120517.GI5570@secunet.com>

[-- Attachment #1: Type: text/plain, Size: 5075 bytes --]

On Thu, 2010-06-24 at 14:05 +0200, Steffen Klassert wrote:
> Hi.
> 
> On Thu, Jun 24, 2010 at 12:55:41AM +0100, Ben Hutchings wrote:
> > This avoids scheduling in atomic context and also means that IRQs
> > will only be deferred for relatively short periods of time.
> > 
> > Previously discussed in:
> > http://article.gmane.org/gmane.linux.network/155024
> > 
> > Reported-by: Arne Nordmark <nordmark@mech.kth.se>
> > Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
> > Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]
> > ---
> >  drivers/net/3c59x.c |   66 ++++++++++++++++++++++++++++++---------------------
> >  1 files changed, 39 insertions(+), 27 deletions(-)
> > 
> > diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
> > index beddef9..f4a3fb1 100644
> > --- a/drivers/net/3c59x.c
> > +++ b/drivers/net/3c59x.c
> > @@ -644,9 +644,15 @@ struct vortex_private {
> >  	u16 deferred;						/* Resend these interrupts when we
> >  										 * bale from the ISR */
> >  	u16 io_size;						/* Size of PCI region (for release_region) */
> > -	spinlock_t lock;					/* Serialise access to device & its vortex_private */
> > -	struct mii_if_info mii;				/* MII lib hooks/info */
> > -	int window;					/* Register window */
> > +
> > +	/* Serialises access to hardware other than MII and variables below.
> > +	 * The lock hierarchy is rtnl_lock > lock > mii_lock > window_lock. */
> > +	spinlock_t lock;
> > +
> > +	spinlock_t mii_lock;		/* Serialises access to MII */
> > +	struct mii_if_info mii;		/* MII lib hooks/info */
> > +	spinlock_t window_lock;		/* Serialises access to windowed regs */
> 
> You should initialize the new locks properly with spin_lock_init().

Oops, yes, obviously.

> > +	int window;			/* Register window */
> >  };
> >  
> >  static void window_set(struct vortex_private *vp, int window)
> > @@ -661,15 +667,23 @@ static void window_set(struct vortex_private *vp, int window)
> >  static u ## size							\
> >  window_read ## size(struct vortex_private *vp, int window, int addr)	\
> >  {									\
> > +	unsigned long flags;						\
> > +	u ## size ret;							\
> > +	spin_lock_irqsave(&vp->window_lock, flags);			\
> >  	window_set(vp, window);						\
> > -	return ioread ## size(vp->ioaddr + addr);			\
> > +	ret = ioread ## size(vp->ioaddr + addr);			\
> > +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
> > +	return ret;							\
> >  }									\
> >  static void								\
> >  window_write ## size(struct vortex_private *vp, u ## size value,	\
> >  		     int window, int addr)				\
> >  {									\
> > +	unsigned long flags;						\
> > +	spin_lock_irqsave(&vp->window_lock, flags);			\
> >  	window_set(vp, window);						\
> >  	iowrite ## size(value, vp->ioaddr + addr);			\
> > +	spin_unlock_irqrestore(&vp->window_lock, flags);		\
> >  }
> 
> This adds a lot of calls to spin_lock_irqsave/spin_unlock_irqrestore to many
> places where this is not necessary at all. For example during device probe and
> device open, window_read/window_write are called multiple times, each time
> disabling the interrupts. I'd suggest to have unlocked, locked and irqsave
> versions of window_read/window_write and use them in appropriate places.

So what?  These are not speed-critical.  The fast-path functions do
acquire the lock just once.

> >  DEFINE_WINDOW_IO(8)
> >  DEFINE_WINDOW_IO(16)
> > @@ -1784,7 +1798,6 @@ vortex_timer(unsigned long data)
> >  		pr_debug("dev->watchdog_timeo=%d\n", dev->watchdog_timeo);
> >  	}
> >  
> > -	disable_irq_lockdep(dev->irq);
> >  	media_status = window_read16(vp, 4, Wn4_Media);
> >  	switch (dev->if_port) {
> >  	case XCVR_10baseT:  case XCVR_100baseTx:  case XCVR_100baseFx:
> > @@ -1805,10 +1818,7 @@ vortex_timer(unsigned long data)
> >  	case XCVR_MII: case XCVR_NWAY:
> >  		{
> >  			ok = 1;
> > -			/* Interrupts are already disabled */
> > -			spin_lock(&vp->lock);
> >  			vortex_check_media(dev, 0);
> > -			spin_unlock(&vp->lock);
> >  		}
> >  		break;
> >  	  default:					/* Other media types handled by Tx timeouts. */
> > @@ -1827,6 +1837,8 @@ vortex_timer(unsigned long data)
> >  	if (!ok) {
> >  		unsigned int config;
> >  
> > +		spin_lock_irq(&vp->lock);
> 
> This can still happen every 5 seconds if the NIC has no link beat and
> > medialock is not set. So what about deferring this locked codepath to
> > a workqueue, or moving the whole vortex_timer to a delayed workqueue?
> > In this case we don't need to disable all the interrupts on the cpu, we
> could still use disable_irq then.

This locked section is now very short - 5 or 6 register read/writes and
no delays.  We might even be able to get away without locking here as
the only software state this accesses is dev->if_port and I don't think
it can race with anything except SIOCGIFMAP (which seems harmless).

Ben.

> The rest looks quite good to me.
> 
> Thanks,
> 
> Steffen
> 

-- 
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Steffen Klassert @ 2010-06-24 14:00 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, netdev, Chase Douglas, Arne Nordmark
In-Reply-To: <1277384239.26161.162.camel@localhost>

On Thu, Jun 24, 2010 at 01:57:19PM +0100, Ben Hutchings wrote:
> > 
> > This adds a lot of calls to spin_lock_irqsave/spin_unlock_irqrestore to many
> > places where this is not necessary at all. For example during device probe and
> > device open, window_read/window_write are called multiple times, each time
> > disabling the interrupts. I'd suggest to have unlocked, locked and irqsave
> > versions of window_read/window_write and use them in appropriate places.
> 
> So what?  These are not speed-critical.  The fast-path functions do
> acquire the lock just once.
> 

The point is that we should not disable the interrupts if we don't need to
do so. It is not speed critical for the 3c59x driver but disabling the
interrupts should be avoided whenever possible. For example during device
probe and device open we can't race against an interrupt handler because
the device is not yet running.

An example from vortex_probe1() is:

for (i = 0; i < 6; i++)
	window_write8(vp, dev->dev_addr[i], 2, i);

which expands to something like:

for (i = 0; i < 6; i++) {
	unsigned long flags;
	spin_lock_irqsave(&vp->window_lock, flags);
	window_set(vp, window);
	iowrite8(dev->dev_addr[i], vp->ioaddr  + i);
	spin_unlock_irqrestore(&vp->window_lock, flags);
}

which is quite odd in a codepath that could simply do:

for (i = 0; i < 6; i++) {
	window_set(vp, window);
	iowrite8(dev->dev_addr[i], vp->ioaddr + i);
}

> 
> This locked section is now very short - 5 or 6 register read/writes and
> no delays.  We might even be able to get away without locking here as
> the only software state this accesses is dev->if_port and I don't think
> it can race with anything except SIOCGIFMAP (which seems harmless).
> 

Best would be, if we don't need to disable the interrupts on this cpu
at all. But then we probably need to disable the interrupt line with
disable_irq. That's why I suggested to move the timer to thread context.

Steffen

^ permalink raw reply

* [PATCH] vxge: fix memory leak in vxge_alloc_msix() error path
From: Michal Schmidt @ 2010-06-24 14:13 UTC (permalink / raw)
  To: Ramkrishna Vepa; +Cc: netdev

When pci_enable_msix() returned ret<0, entries and vxge_entries were leaked.
While at it, use the centralized exit idiom in the function.

Not tested. It compiles OK.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
---
 drivers/net/vxge/vxge-main.c |   29 ++++++++++++++++++++---------
 1 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c
index 45c5dc2..8b9e73b 100644
--- a/drivers/net/vxge/vxge-main.c
+++ b/drivers/net/vxge/vxge-main.c
@@ -2262,7 +2262,8 @@ start:
 		vxge_debug_init(VXGE_ERR,
 			"%s: memory allocation failed",
 			VXGE_DRIVER_NAME);
-		return  -ENOMEM;
+		ret = -ENOMEM;
+		goto alloc_entries_failed;
 	}
 
 	vdev->vxge_entries =
@@ -2271,8 +2272,8 @@ start:
 	if (!vdev->vxge_entries) {
 		vxge_debug_init(VXGE_ERR, "%s: memory allocation failed",
 			VXGE_DRIVER_NAME);
-		kfree(vdev->entries);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto alloc_vxge_entries_failed;
 	}
 
 	for (i = 0, j = 0; i < vdev->no_of_vpath; i++) {
@@ -2303,22 +2304,32 @@ start:
 		vxge_debug_init(VXGE_ERR,
 			"%s: MSI-X enable failed for %d vectors, ret: %d",
 			VXGE_DRIVER_NAME, vdev->intr_cnt, ret);
+		if ((max_config_vpath != VXGE_USE_DEFAULT) || (ret < 3)) {
+			ret = -ENODEV;
+			goto enable_msix_failed;
+		}
+
 		kfree(vdev->entries);
 		kfree(vdev->vxge_entries);
 		vdev->entries = NULL;
 		vdev->vxge_entries = NULL;
-
-		if ((max_config_vpath != VXGE_USE_DEFAULT) || (ret < 3))
-			return -ENODEV;
 		/* Try with less no of vector by reducing no of vpaths count */
 		temp = (ret - 1)/2;
 		vxge_close_vpaths(vdev, temp);
 		vdev->no_of_vpath = temp;
 		goto start;
-	} else if (ret < 0)
-		return -ENODEV;
-
+	} else if (ret < 0) {
+		ret = -ENODEV;
+		goto enable_msix_failed;
+	}
 	return 0;
+
+enable_msix_failed:
+	kfree(vdev->vxge_entries);
+alloc_vxge_entries_failed:
+	kfree(vdev->entries);
+alloc_entries_failed:
+	return ret;
 }
 
 static int vxge_enable_msix(struct vxgedev *vdev)
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH net-next-2.6 3/4] macvlan: 64 bit rx counters
From: Patrick McHardy @ 2010-06-24 14:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1277376861.2816.284.camel@edumazet-laptop>

Eric Dumazet wrote:
> Use u64_stats_sync infrastructure to implement 64bit stats.
>
>   

Looks good to me, thanks, I actually wanted to do this myself yesterday :)

Acked-by: Patrick McHardy <kaber@trash.net>


^ permalink raw reply

* Re: [PATCH net-next-2.6 4/4] vlan: 64 bit rx counters
From: Patrick McHardy @ 2010-06-24 14:57 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev
In-Reply-To: <1277376906.2816.287.camel@edumazet-laptop>

Eric Dumazet wrote:
> Use u64_stats_sync infrastructure to implement 64bit rx stats.
>
> (tx stats are addressed later)

Acked-by: Patrick McHardy <kaber@trash.net>

^ permalink raw reply

* Re: [PATCH net-next-2.6 5/5] sfc: Record hardware RX hash on each skb where possible
From: Ben Hutchings @ 2010-06-24 15:18 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1277328688.2101.12.camel@achroite.uk.solarflarecom.com>

David,

Unfortunately, this version of the patch can hit a hardware bug that
results in bogus hashes.  Let me know whether you've applied it, and
I'll send an incremental or a replacement patch.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH net-next-2.6] netfilter: allow nf_tproxy_core module to be removed
From: Patrick McHardy @ 2010-06-24 15:29 UTC (permalink / raw)
  To: David Miller; +Cc: fw, jpirko, netdev, Balazs Scheidler, KOVACS Krisztian
In-Reply-To: <20100623.115558.189705237.davem@davemloft.net>

David Miller wrote:
> From: Florian Westphal <fw@strlen.de>
> Date: Wed, 23 Jun 2010 20:46:11 +0200
>
>   
>> tproxy assigns skb->destructor, what prevents module unload while such skbs may
>> still be around?
>>     
>
> The only reference to nf_tproxy_core.ko is for the symbol, "nf_tproxy_assign_sock".
> xt_TPROXY.c, which references this symbol, thus creates a symbol dependency on this
> module, so xt_TPROXY.o needs to unload before nf_tproxy_core.ko can unload, and
> xt_TPROXY.o has its own manner for handling module references properly.
>   

I don't see anything waiting for skbs in flight using the tproxy
destructor in either xt_TPROXY or nf_tproxy_core though, so I think
Florian is correct.


^ permalink raw reply

* [RFC net-next-2.6] snmp: ipstats_mib becomes u64 for all arches
From: Eric Dumazet @ 2010-06-24 16:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


David,

I will respin this patch after net-next-2.6 tree stabilizes a bit.

(It needs u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh(), and
probably the SNMP fix I sent earlier, currently in net-2.6 only)

Thanks

[RFC net-next-2.6] snmp: ipstats_mib becomes u64 for all arches

/proc/net/snmp and /proc/net/netstat expose SNMP counters.

Width of these counters is either 32 or 64 bits, depending on the size
of "unsigned long" in kernel.

This means user programs parsing these files must already be prepared to
deal with 64bit values, regardless of the user program being 32 or 64 bit.

This patch introduces 64bit snmp values for IPSTAT mib, where some
counters can wrap pretty fast if they are 32bit wide.

This uses u64_stats_sync infrastructure.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/ip.h    |   20 ++++++--
 include/net/ipv6.h  |   12 ++---
 include/net/snmp.h  |   98 ++++++++++++++++++++++++++++++++++++++----
 net/ipv4/af_inet.c  |   33 ++++++++++++++
 net/ipv4/proc.c     |   12 ++---
 net/ipv6/addrconf.c |   18 +++++++
 net/ipv6/proc.c     |   13 ++++-
 7 files changed, 176 insertions(+), 30 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 3b524df..890f972 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -165,12 +165,12 @@ struct ipv4_config {
 };
 
 extern struct ipv4_config ipv4_config;
-#define IP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.ip_statistics, field)
-#define IP_INC_STATS_BH(net, field)	SNMP_INC_STATS_BH((net)->mib.ip_statistics, field)
-#define IP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.ip_statistics, field, val)
-#define IP_ADD_STATS_BH(net, field, val) SNMP_ADD_STATS_BH((net)->mib.ip_statistics, field, val)
-#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS((net)->mib.ip_statistics, field, val)
-#define IP_UPD_PO_STATS_BH(net, field, val) SNMP_UPD_PO_STATS_BH((net)->mib.ip_statistics, field, val)
+#define IP_INC_STATS(net, field)	SNMP_INC_STATS64((net)->mib.ip_statistics, field)
+#define IP_INC_STATS_BH(net, field)	SNMP_INC_STATS64_BH((net)->mib.ip_statistics, field)
+#define IP_ADD_STATS(net, field, val)	SNMP_ADD_STATS64((net)->mib.ip_statistics, field, val)
+#define IP_ADD_STATS_BH(net, field, val) SNMP_ADD_STATS64_BH((net)->mib.ip_statistics, field, val)
+#define IP_UPD_PO_STATS(net, field, val) SNMP_UPD_PO_STATS64((net)->mib.ip_statistics, field, val)
+#define IP_UPD_PO_STATS_BH(net, field, val) SNMP_UPD_PO_STATS64_BH((net)->mib.ip_statistics, field, val)
 #define NET_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.net_statistics, field)
 #define NET_INC_STATS_BH(net, field)	SNMP_INC_STATS_BH((net)->mib.net_statistics, field)
 #define NET_INC_STATS_USER(net, field) 	SNMP_INC_STATS_USER((net)->mib.net_statistics, field)
@@ -178,6 +178,14 @@ extern struct ipv4_config ipv4_config;
 #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)
 
 extern unsigned long snmp_fold_field(void __percpu *mib[], int offt);
+#if BITS_PER_LONG==32
+extern u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off);
+#else
+static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off)
+{
+	return snmp_fold_field(mib, offt);
+}
+#endif
 extern int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align);
 extern void snmp_mib_free(void __percpu *ptr[2]);
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f5808d5..1f84124 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -136,17 +136,17 @@ extern struct ctl_path net_ipv6_ctl_path[];
 /* MIBs */
 
 #define IP6_INC_STATS(net, idev,field)		\
-		_DEVINC(net, ipv6, , idev, field)
+		_DEVINC(net, ipv6, 64, idev, field)
 #define IP6_INC_STATS_BH(net, idev,field)	\
-		_DEVINC(net, ipv6, _BH, idev, field)
+		_DEVINC(net, ipv6, 64_BH, idev, field)
 #define IP6_ADD_STATS(net, idev,field,val)	\
-		_DEVADD(net, ipv6, , idev, field, val)
+		_DEVADD(net, ipv6, 64, idev, field, val)
 #define IP6_ADD_STATS_BH(net, idev,field,val)	\
-		_DEVADD(net, ipv6, _BH, idev, field, val)
+		_DEVADD(net, ipv6, 64_BH, idev, field, val)
 #define IP6_UPD_PO_STATS(net, idev,field,val)   \
-		_DEVUPD(net, ipv6, , idev, field, val)
+		_DEVUPD(net, ipv6, 64, idev, field, val)
 #define IP6_UPD_PO_STATS_BH(net, idev,field,val)   \
-		_DEVUPD(net, ipv6, _BH, idev, field, val)
+		_DEVUPD(net, ipv6, 64_BH, idev, field, val)
 #define ICMP6_INC_STATS(net, idev, field)	\
 		_DEVINC(net, icmpv6, , idev, field)
 #define ICMP6_INC_STATS_BH(net, idev, field)	\
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 92456f1..af7e9dc 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -47,15 +47,15 @@ struct snmp_mib {
 }
 
 /*
- * We use all unsigned longs. Linux will soon be so reliable that even 
- * these will rapidly get too small 8-). Seriously consider the IpInReceives 
- * count on the 20Gb/s + networks people expect in a few years time!
+ * We use unsigned longs for most mibs but u64 for ipstats.
  */
+#include <linux/u64_stats_sync.h>
 
 /* IPstats */
 #define IPSTATS_MIB_MAX	__IPSTATS_MIB_MAX
 struct ipstats_mib {
-	unsigned long	mibs[IPSTATS_MIB_MAX];
+	u64		mibs[IPSTATS_MIB_MAX];
+	struct u64_stats_sync syncp;
 };
 
 /* ICMP */
@@ -122,19 +122,31 @@ struct linux_xfrm_mib {
 #define SNMP_STAT_USRPTR(name)	(name[1])
 
 #define SNMP_INC_STATS_BH(mib, field)	\
-			__this_cpu_inc(mib[0]->mibs[field])
+	do {									\
+		BUILD_BUG_ON(sizeof(mib[0]->mibs[field]) > sizeof(long));	\
+		__this_cpu_inc(mib[0]->mibs[field]);				\
+	} while (0)
 #define SNMP_INC_STATS_USER(mib, field)	\
-			this_cpu_inc(mib[1]->mibs[field])
+	do {									\
+		BUILD_BUG_ON(sizeof(mib[1]->mibs[field]) > sizeof(long));	\
+		this_cpu_inc(mib[1]->mibs[field]);				\
+	} while (0)
 #define SNMP_INC_STATS(mib, field)	\
-			this_cpu_inc(mib[!in_softirq()]->mibs[field])
+	do {									\
+		BUILD_BUG_ON(sizeof(mib[0]->mibs[field]) > sizeof(long));	\
+		this_cpu_inc(mib[!in_softirq()]->mibs[field]);			\
+	} while (0)
 #define SNMP_DEC_STATS(mib, field)	\
-			this_cpu_dec(mib[!in_softirq()]->mibs[field])
+	do {									\
+		BUILD_BUG_ON(sizeof(mib[0]->mibs[field]) > sizeof(long));	\
+		this_cpu_dec(mib[!in_softirq()]->mibs[field]);			\
+	} while (0)
 #define SNMP_ADD_STATS_BH(mib, field, addend)	\
 			__this_cpu_add(mib[0]->mibs[field], addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend)	\
 			this_cpu_add(mib[1]->mibs[field], addend)
 #define SNMP_ADD_STATS(mib, field, addend)	\
-			this_cpu_add(mib[0]->mibs[field], addend)
+			this_cpu_add(mib[!in_softirq()]->mibs[field], addend)
 /*
  * Use "__typeof__(*mib[0]) *ptr" instead of "__typeof__(mib[0]) ptr"
  * to make @ptr a non-percpu pointer.
@@ -144,6 +156,7 @@ struct linux_xfrm_mib {
 		__typeof__(*mib[0]) *ptr; \
 		preempt_disable(); \
 		ptr = this_cpu_ptr((mib)[!in_softirq()]); \
+		BUILD_BUG_ON(sizeof(mib[0]->mibs[0]) > sizeof(long)); \
 		ptr->mibs[basefield##PKTS]++; \
 		ptr->mibs[basefield##OCTETS] += addend;\
 		preempt_enable(); \
@@ -152,7 +165,74 @@ struct linux_xfrm_mib {
 	do { \
 		__typeof__(*mib[0]) *ptr = \
 			__this_cpu_ptr((mib)[!in_softirq()]); \
+		BUILD_BUG_ON(sizeof(mib[0]->mibs[0]) > sizeof(long)); \
 		ptr->mibs[basefield##PKTS]++; \
 		ptr->mibs[basefield##OCTETS] += addend;\
 	} while (0)
+
+
+#if BITS_PER_LONG==32
+
+#define SNMP_ADD_STATS64_BH(mib, field, addend) 			\
+	do {								\
+		__typeof__(*mib[0]) *ptr = __this_cpu_ptr((mib)[0]);	\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->mibs[field] += addend;				\
+		u64_stats_update_end(&ptr->syncp);			\
+	} while (0)
+#define SNMP_ADD_STATS64_USER(mib, field, addend) 			\
+	do {								\
+		__typeof__(*mib[0]) *ptr;				\
+		preempt_disable();					\
+		ptr = __this_cpu_ptr((mib)[1]);				\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->mibs[field] += addend;				\
+		u64_stats_update_end(&ptr->syncp);			\
+		preempt_enable();					\
+	} while (0)
+#define SNMP_ADD_STATS64(mib, field, addend)				\
+	do {								\
+		__typeof__(*mib[0]) *ptr;				\
+		preempt_disable();					\
+		ptr = __this_cpu_ptr((mib)[!in_softirq()]);		\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->mibs[field] += addend;				\
+		u64_stats_update_end(&ptr->syncp);			\
+		preempt_enable();					\
+	} while (0)
+#define SNMP_INC_STATS64_BH(mib, field) SNMP_ADD_STATS64_BH(mib, field, 1)
+#define SNMP_INC_STATS64_USER(mib, field) SNMP_ADD_STATS64_USER(mib, field, 1)
+#define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1)
+#define SNMP_UPD_PO_STATS64(mib, basefield, addend)			\
+	do {								\
+		__typeof__(*mib[0]) *ptr;				\
+		preempt_disable();					\
+		ptr = __this_cpu_ptr((mib)[!in_softirq()]);		\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->mibs[basefield##PKTS]++;				\
+		ptr->mibs[basefield##OCTETS] += addend;			\
+		u64_stats_update_end(&ptr->syncp);			\
+		preempt_enable();					\
+	} while (0)
+#define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend)			\
+	do {								\
+		__typeof__(*mib[0]) *ptr;				\
+		ptr = __this_cpu_ptr((mib)[!in_softirq()]);		\
+		u64_stats_update_begin(&ptr->syncp);			\
+		ptr->mibs[basefield##PKTS]++;				\
+		ptr->mibs[basefield##OCTETS] += addend;			\
+		u64_stats_update_end(&ptr->syncp);			\
+	} while (0)
+#else
+#define SNMP_INC_STATS64_BH(mib, field)		SNMP_INC_STATS_BH(mib, field)
+#define SNMP_INC_STATS64_USER(mib, field)	SNMP_INC_STATS_USER(mib, field)
+#define SNMP_INC_STATS64(mib, field)		SNMP_INC_STATS(mib, field)
+#define SNMP_DEC_STATS64(mib, field)		SNMP_DEC_STATS(mib, field)
+#define SNMP_ADD_STATS64_BH(mib, field, addend) SNMP_ADD_STATS_BH(mib, field, addend)
+#define SNMP_ADD_STATS64_USER(mib, field, addend) SNMP_ADD_STATS_USER(mib, field, addend)
+#define SNMP_ADD_STATS64(mib, field, addend)	SNMP_ADD_STATS(mib, field, addend)
+#define SNMP_UPD_PO_STATS64(mib, basefield, addend) SNMP_UPD_PO_STATS(mib, basefield, addend)
+#define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) SNMP_UPD_PO_STATS_BH(mib, basefield, addend)
+#endif
+
 #endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 640db9b..aeb178e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1427,6 +1427,39 @@ unsigned long snmp_fold_field(void __percpu *mib[], int offt)
 }
 EXPORT_SYMBOL_GPL(snmp_fold_field);
 
+#if BITS_PER_LONG==32
+
+u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+{
+	u64 res = 0;
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct u64_stats_sync *syncp;
+		u64 v0, v1;
+		unsigned int start;
+
+		/* first mib used by softirq context */
+		syncp = (struct u64_stats_sync *)(per_cpu_ptr(mib[0], i) + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin_bh(syncp);
+			v0 = *(((u64 *) per_cpu_ptr(mib[0], i)) + offt);
+		} while (u64_stats_fetch_retry_bh(syncp, start));
+
+		/* second mib used in USER context */
+		syncp = (struct u64_stats_sync *)(per_cpu_ptr(mib[1], i) + syncp_offset);
+		do {
+			start = u64_stats_fetch_begin(syncp);
+			v1 = *(((u64 *) per_cpu_ptr(mib[1], i)) + offt);
+		} while (u64_stats_fetch_retry(syncp, start));
+
+		res += v0 + v1;
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(snmp_fold_field64);
+#endif
+
 int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
 {
 	BUG_ON(ptr == NULL);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index e320ca6..d41cf26 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -344,9 +344,9 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
 		   sysctl_ip_default_ttl);
 
 	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
-					   snmp4_ipstats_list[i].entry));
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					   snmp4_ipstats_list[i].entry, offsetof(struct ipstats_mib, syncp)));
 
 	icmp_put(seq);	/* RFC 2011 compatibility */
 	icmpmsg_put(seq);
@@ -432,9 +432,9 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
 
 	seq_puts(seq, "\nIpExt:");
 	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
-		seq_printf(seq, " %lu",
-			   snmp_fold_field((void __percpu **)net->mib.ip_statistics,
-					   snmp4_ipextstats_list[i].entry));
+		seq_printf(seq, " %llu",
+			   snmp_fold_field64((void __percpu **)net->mib.ip_statistics,
+					   snmp4_ipextstats_list[i].entry, offsetof(struct ipstats_mib, syncp)));
 
 	seq_putc(seq, '\n');
 	return 0;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c20a7c2..56165ae 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3858,12 +3858,28 @@ static inline void __snmp6_fill_stats(u64 *stats, void __percpu **mib,
 	memset(&stats[items], 0, pad);
 }
 
+static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
+				      int items, int bytes, size_t syncpoff)
+{
+	int i;
+	int pad = bytes - sizeof(u64) * items;
+	BUG_ON(pad < 0);
+
+	/* Use put_unaligned() because stats may not be aligned for u64. */
+	put_unaligned(items, &stats[0]);
+	for (i = 1; i < items; i++)
+		put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+
+	memset(&stats[items], 0, pad);
+}
+
 static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
 			     int bytes)
 {
 	switch (attrtype) {
 	case IFLA_INET6_STATS:
-		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.ipv6, IPSTATS_MIB_MAX, bytes);
+		__snmp6_fill_stats64(stats, (void __percpu **)idev->stats.ipv6,
+				     IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp));
 		break;
 	case IFLA_INET6_ICMP6STATS:
 		__snmp6_fill_stats(stats, (void __percpu **)idev->stats.icmpv6, ICMP6_MIB_MAX, bytes);
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 566798d..acc1960 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -179,12 +179,21 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu **mib,
 			   snmp_fold_field(mib, itemlist[i].entry));
 }
 
+static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu **mib,
+				const struct snmp_mib *itemlist, size_t syncpoff)
+{
+	int i;
+	for (i=0; itemlist[i].name; i++)
+		seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name,
+			   snmp_fold_field64(mib, itemlist[i].entry, syncpoff));
+}
+
 static int snmp6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = (struct net *)seq->private;
 
-	snmp6_seq_show_item(seq, (void __percpu **)net->mib.ipv6_statistics,
-			    snmp6_ipstats_list);
+	snmp6_seq_show_item64(seq, (void __percpu **)net->mib.ipv6_statistics,
+			    snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
 	snmp6_seq_show_item(seq, (void __percpu **)net->mib.icmpv6_statistics,
 			    snmp6_icmp6_list);
 	snmp6_seq_show_icmpv6msg(seq,



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox