Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Tejun Heo @ 2010-07-28 12:00 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <20100728104858.GB30643@redhat.com>

Hello,

On 07/28/2010 12:48 PM, Michael S. Tsirkin wrote:
> I'm unsure how flush_work operates under these conditions.  E.g. in
> workqueue.c, this seems to work by keeping a pointer to current
> workqueue in the work.  But what prevents us from destroying the
> workqueue when work might not be running?

In cmwq, work points to the gcwq it was on, which keeps track of all
the works in progress, so flushing work which is on a destroyed
workqueue should be fine, but in the original implementation, it would
end up accessing freed memory.

> Is this currently broken if you use multiple workqueues
> for the same work? If yes, I propose we do as I did,
> making flush_work get worker pointer, and only flushing
> on that worker.

The original semantics of workqueue is that flush_work() guarantees
that the work has finished executing on the workqueue it was last
queued on.  Adding @worker to flush_work() is okay, I think.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH RHEL6 RESEND] kernel performance optimization with CONFIG_DEBUG_RODATA
From: Xiaotian Feng @ 2010-07-28 11:03 UTC (permalink / raw)
  Cc: Xiaotian Feng, linux-mm, linux-nfs, netdev, cl, a.p.zijlstra,
	lwang, penberg, davem
In-Reply-To: <20100728110043.27677.13908.sendpatchset@dhcp-65-180.nay.redhat.com>

Oops, the script got something wrong, sorry for the noise... please ignore the
mail ...

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH RHEL6 RESEND] kernel performance optimization with CONFIG_DEBUG_RODATA
From: Xiaotian Feng @ 2010-07-28 11:00 UTC (permalink / raw)
  To: linux-mm, linux-nfs, netdev
  Cc: cl, a.p.zijlstra, Xiaotian Feng, lwang, penberg, davem

backport of following commits to improve x86_64 kernel performance with
CONFIG_DEBUG_RODATA:

straightforward backport of:
commit b9af7c0d (x86-64: preserve large page mapping for 1st 2MB kernel txt with CONFIG_DEBUG_RODATA)
commit 74e08179 (x86-64: align RODATA kernel section to 2MB with CONFIG_DEBUG_RODATA)
commit d6cc1c3a (x86-64: add comment for RODATA large page retainment)

Resolves bz557364

We still have CONFIG_DEBUG_RODATA set for regular rhel6 kernel, so this fix is still needed.

There's no kabi breakage with latest rhel6 code (don't know why...)
Brew build is available at:
https://brewweb.devel.redhat.com/taskinfo?taskID=2627856

Test has been done for RHTS kernel tier1:
http://rhts.redhat.com/cgi-bin/rhts/jobs.cgi?id=168225

No regressions are introduced by this patch. Reviews and comments are welcome.
---
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
 #define _ASM_X86_SECTIONS_H
 
 #include <asm-generic/sections.h>
+#include <asm/uaccess.h>
 
 extern char __brk_base[], __brk_limit[];
+extern struct exception_table_entry __stop___ex_table[];
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+extern char __end_rodata_hpage_align[];
+#endif
 
 #endif	/* _ASM_X86_SECTIONS_H */
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
 	.quad	x86_64_start_kernel
 	ENTRY(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
-	__FINITDATA
 
 	ENTRY(stack_start)
 	.quad  init_thread_union+THREAD_SIZE-8
 	.word  0
+	__FINITDATA
 
 bad_address:
 	jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
 	i = i + 1 ;					\
 	.endr
 
+	.data
 	/*
 	 * This default setting generates an ident mapping at address 0x100000
 	 * and a mapping for the kernel that precisely maps virtual address
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+/*
+ * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
+ * we retain large page mappings for boundaries spanning kernel text, rodata
+ * and data sections.
+ *
+ * However, kernel identity mappings will have different RWX permissions
+ * to the pages mapping to text and to the pages padding (which are freed) the
+ * text section. Hence kernel identity mappings will be broken to smaller
+ * pages. For 64-bit, kernel text and kernel identity mappings are different,
+ * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
+ * as well as retain 2MB large page mappings for kernel text.
+ */
+#define X64_ALIGN_DEBUG_RODATA_BEGIN   . = ALIGN(HPAGE_SIZE);
+
+#define X64_ALIGN_DEBUG_RODATA_END				\
+		. = ALIGN(HPAGE_SIZE);				\
+		__end_rodata_hpage_align = .;
+
+#else
+
+#define X64_ALIGN_DEBUG_RODATA_BEGIN
+#define X64_ALIGN_DEBUG_RODATA_END
+
+#endif
+
 PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
 	data PT_LOAD FLAGS(7);          /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
 
 	EXCEPTION_TABLE(16) :text = 0x9090
 
+	X64_ALIGN_DEBUG_RODATA_BEGIN
 	RO_DATA(PAGE_SIZE)
+	X64_ALIGN_DEBUG_RODATA_END
 
 	/* Data */
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -761,7 +761,7 @@ static int kernel_set_to_readonly;
 
 void set_kernel_text_rw(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
+	unsigned long start = PFN_ALIGN(_text);
 	unsigned long end = PFN_ALIGN(__start_rodata);
 
 	if (!kernel_set_to_readonly)
@@ -775,7 +775,7 @@ void set_kernel_text_rw(void)
 
 void set_kernel_text_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
+	unsigned long start = PFN_ALIGN(_text);
 	unsigned long end = PFN_ALIGN(__start_rodata);
 
 	if (!kernel_set_to_readonly)
@@ -789,9 +789,13 @@ void set_kernel_text_ro(void)
 
 void mark_rodata_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
+	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
+	unsigned long data_start = (unsigned long) &_sdata;
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -814,6 +818,14 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: again\n");
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
+
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(text_end)),
+			(unsigned long)
+				 page_address(virt_to_page(rodata_start)));
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(rodata_end)),
+			(unsigned long) page_address(virt_to_page(data_start)));
 }
 
 #endif
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,20 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+	/*
+	 * Kernel text mappings for the large page aligned .rodata section
+	 * will be read-only. For the kernel identity mappings covering
+	 * the holes caused by this alignment can be anything.
+	 *
+	 * This will preserve the large page mappings for kernel text/data
+	 * at no extra cost.
+	 */
+	if (within(address, (unsigned long)_text,
+		   (unsigned long)__end_rodata_hpage_align))
+		pgprot_val(forbidden) |= _PAGE_RW;
+#endif
+
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 
 	return prot;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Michael S. Tsirkin @ 2010-07-28 10:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <4C4FE0CF.3070506@kernel.org>

On Wed, Jul 28, 2010 at 09:48:31AM +0200, Tejun Heo wrote:
> On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
> >> Thinking a bit more about it, it kind of sucks that queueing to
> >> another worker from worker->func() breaks flush.  Maybe the right
> >> thing to do there is using atomic_t for done_seq?
> > 
> > I don't believe it will help: we might have:
> > 
> > worker1 runs work
> > work requeues itself queued index = 1
> > worker1 reads queued index = 1
> > worker2 runs work
> > work requeues itself queued index = 2
> > worker2 runs work
> > worker2 reads queued index = 2
> > worker2 writes done index = 2
> > worker1 writes done index = 1
> > 
> > As you see, done index got moved back.
> 
> Yeah, I think the flushing logic should be moved to the worker.
> Are you interested in doing it w/ your change?
> 
> Thanks.

I'm unsure how flush_work operates under these conditions.  E.g. in
workqueue.c, this seems to work by keeping a pointer to current
workqueue in the work.  But what prevents us from destroying the
workqueue when work might not be running?

Is this currently broken if you use multiple workqueues
for the same work? If yes, I propose we do as I did,
making flush_work get worker pointer, and only flushing
on that worker.

> -- 
> tejun

^ permalink raw reply

* RE: [Uclinux-dist-devel] [PATCH net-next] drivers/net/bfin_mac.c: Use  pr_fmt, netdev_<level>
From: Hennerich, Michael @ 2010-07-28 10:41 UTC (permalink / raw)
  To: Joe Perches, Mike Frysinger; +Cc: uclinux-dist-devel, LKML, netdev
In-Reply-To: <1280311240.24054.103.camel@Joe-Laptop.home>

Joe Perches wrote on 2010-07-28:
> On Wed, 2010-07-28 at 03:50 -0400, Mike Frysinger wrote:
>> On Tue, Jul 27, 2010 at 15:22, Joe Perches wrote:
>>> $ ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
>>> total: 2 errors, 25 warnings, 1723 lines checked $
>>> ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
>>> total: 0 errors, 0 warnings, 1743 lines checked
>> i dislike the mixing of whitespace and useful changes if they were
>> split, and they worked, then i wouldnt have a problem with them
>
> Is that a nak or a dislike?

I think what Mike meant was - two patches would be better.
I'm currently cloning the tree this patch applies to.
I'll ACK once I verified the patch.

>
> Also, is Michael Hennerich actually the maintainer for bfin_mac?
> He had been  listed as MAINTAINER for over a year but he hasn't ever
> written or acked a patch for this file.
>
> commit making Michael Hennerich bfin_mac MAINTAINER $ git log -1
> 6c83429a commit 6c83429a1c32c914dfb81939cc2ddece97e48294
> Author: Mike Frysinger <vapier@gentoo.org>
> Date:   Sun May 24 02:13:15 2009 -0400
>
>     MAINTAINERS: update Blackfin items
>
>     With Bryan Wu having moved on to another job, push the slack onto
>     some other ADI lackeys.
>
>     Signed-off-by: Mike Frysinger <vapier@gentoo.org>
> $ ./scripts/get_maintainer.pl -f drivers/net/bfin_mac.c \
>       --rolestats --git-since=5-years-ago --nol Michael Hennerich
> <michael.hennerich@analog.com> (supporter:BLACKFIN EMAC DRIVER) "David
> S. Miller" <davem@davemloft.net> (commit_signer:41/65=63%) Jeff Garzik
> <jgarzik@redhat.com> (commit_signer:31/65=48%) Bryan Wu
> <cooloney@kernel.org> (commit_signer:26/65=40%) Mike Frysinger
> <vapier@gentoo.org> (commit_signer:20/65=31%) Sonic Zhang
> <sonic.zhang@analog.com> (commit_signer:8/65=12%)
>
>

Greetings,
Michael

Analog Devices GmbH      Wilhelm-Wagenfeld-Str. 6      80807 Muenchen
Sitz der Gesellschaft Muenchen, Registergericht Muenchen HRB 4036 Geschaeftsfuehrer Thomas Wessel, William A. Martin, Margaret Seif



^ permalink raw reply

* Re: [Uclinux-dist-devel] [PATCH net-next] drivers/net/bfin_mac.c: Use  pr_fmt, netdev_<level>
From: Joe Perches @ 2010-07-28 10:00 UTC (permalink / raw)
  To: Mike Frysinger; +Cc: Michael Hennerich, uclinux-dist-devel, LKML, netdev
In-Reply-To: <AANLkTi=p=-ovBaqyqBdEW=W4E1ArWsiyUBwy37GtXOyu@mail.gmail.com>

On Wed, 2010-07-28 at 03:50 -0400, Mike Frysinger wrote:
> On Tue, Jul 27, 2010 at 15:22, Joe Perches wrote:
> > $ ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
> > total: 2 errors, 25 warnings, 1723 lines checked
> > $ ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
> > total: 0 errors, 0 warnings, 1743 lines checked
> i dislike the mixing of whitespace and useful changes
> if they were split, and they worked, then i wouldnt have a problem with them

Is that a nak or a dislike?

Also, is Michael Hennerich actually the maintainer for bfin_mac?
He had been  listed as MAINTAINER for over a year but he hasn't
ever written or acked a patch for this file.

commit making Michael Hennerich bfin_mac MAINTAINER
$ git log -1 6c83429a
commit 6c83429a1c32c914dfb81939cc2ddece97e48294
Author: Mike Frysinger <vapier@gentoo.org>
Date:   Sun May 24 02:13:15 2009 -0400

    MAINTAINERS: update Blackfin items
    
    With Bryan Wu having moved on to another job, push the slack onto some
    other ADI lackeys.
    
    Signed-off-by: Mike Frysinger <vapier@gentoo.org>

$ ./scripts/get_maintainer.pl -f drivers/net/bfin_mac.c \
	--rolestats --git-since=5-years-ago --nol
Michael Hennerich <michael.hennerich@analog.com> (supporter:BLACKFIN EMAC DRIVER)
"David S. Miller" <davem@davemloft.net> (commit_signer:41/65=63%)
Jeff Garzik <jgarzik@redhat.com> (commit_signer:31/65=48%)
Bryan Wu <cooloney@kernel.org> (commit_signer:26/65=40%)
Mike Frysinger <vapier@gentoo.org> (commit_signer:20/65=31%)
Sonic Zhang <sonic.zhang@analog.com> (commit_signer:8/65=12%)




^ permalink raw reply

* [BUG] bridge leaks 3 references on lo per up&down
From: David Lamparter @ 2010-07-28  9:58 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, bridge
In-Reply-To: <20100728074428.GB13413@jupiter.n2.diac24.net>

On Wed, Jul 28, 2010 at 09:44:28AM +0200, David Lamparter wrote:
> On Tue, Jul 27, 2010 at 10:02:49AM -0700, Stephen Hemminger wrote:
> > David Lamparter <equinox@diac24.net> wrote:
> > > unregister_netdevice: waiting for lo to become free. Usage count = 4
> > > 
> > > unshare -n -- /bin/bash -c 'ip link add link lo veth0 type veth peer
> > > name veth3 netns 1; brctl addbr br0; ip l s br0 up; ip a l; sleep 8'

When I do

unshare -n -- /bin/bash -c 'ip link add link lo veth0 type veth peer
	name veth3 netns 1; brctl addbr br0; \
	ip l s br0 up; sleep 8; ip l s br0 down; sleep 1;
	ip l s br0 up; sleep 8;'

I get "usage count = 7" on lo; when I add another up & down, I get
"usage count = 10". 4 times makes it a merry 13 refs...


-David


^ permalink raw reply

* Re: [PATCH] vlan: fix u64_stats
From: Changli Gao @ 2010-07-28  8:41 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netdev, Changli Gao
In-Reply-To: <1280293833-8723-1-git-send-email-xiaosuo@gmail.com>

On Wed, Jul 28, 2010 at 1:10 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> rx_stats->rx_errors should be under the protection of rx_stats->syncp.
>

I checked the code again. rx_errors is unsigned long, so need no
protection. Sorry for the noise.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* [PATCH] vlan: fix u64_stats
From: Changli Gao @ 2010-07-28  5:10 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David S. Miller, netdev, Changli Gao

rx_stats->rx_errors should be under the protection of rx_stats->syncp.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
 net/8021q/vlan_dev.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 3d59c9b..d70d31a 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -211,7 +211,6 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		default:
 			break;
 		}
-		u64_stats_update_end(&rx_stats->syncp);
 	}
 
 	skb_pull_rcsum(skb, VLAN_HLEN);
@@ -221,8 +220,10 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		skb = vlan_check_reorder_header(skb);
 		if (!skb) {
 			rx_stats->rx_errors++;
+			u64_stats_update_end(&rx_stats->syncp);
 			goto err_unlock;
 		}
+		u64_stats_update_end(&rx_stats->syncp);
 	}
 
 	netif_rx(skb);

^ permalink raw reply related

* noqueue on bonding devices
From: Simon Horman @ 2010-07-28  8:32 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev

Hi Jay, Hi All,

I would just like to wonder out loud if it is intentional that bonding
devices default to noqueue, whereas for instance ethernet devices
default to a pfifo_fast with qlen 1000.

The reason that I ask, is that when setting up some bandwidth
control using tc I encountered some strange behaviour which
I eventually tracked down to the queue-length of the qdiscs being 1p -
inherited from noqueue, as opposed to 1000p which would occur
on an ethernet device.

It's trivial to work around, by either altering the txqueuelen on
the bonding device before adding the qdisc or by manually setting
the qlen of the qdisc. But it did take us a while to determine the
cause of the problem we were seeing. And as it seems inconsistent
I'm interested to know why this is the case.

On an unrelated note, MAINTAINERS lists bonding-devel@lists.sourceforge.net
but the (recent) archives seem to be entirely spam.  Is the MAINTAINERS
file correct?

^ permalink raw reply

* Re: can: expected receive behavior broken
From: Matthias Fuchs @ 2010-07-28  8:23 UTC (permalink / raw)
  To: Wolfgang Grandegger
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w, Linux Netdev List
In-Reply-To: <4C4FE7AC.4010806-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>

plx_pci/sja1000 + esd_usb2

Should be easy to reproduce.

Matthias

On Wednesday 28 July 2010 10:17, Wolfgang Grandegger wrote:
> On 07/28/2010 09:56 AM, Matthias Fuchs wrote:
> > Hi,
> > 
> > I just noticed that the receive behavior of CAN sockets is broken
> > in current net-next-2.6.
> > I wrote some simple code that receives messages and echos them back to
> > the bus. When I now trigger one single message on the bus, I get
> > this message received and echoed back in an endless loop.
> > 
> > I do not touch the sockopts CAN_RAW_LOOPBACK or CAN_RAW_RECV_OWN_MSGS in my code.
> > Only (!) setting CAN_RAW_LOOPBACK to 0 helps at the moment. But this behavior
> > actually has nothing to do with LOOPBACK but more with RECV_OWN_MSGS.
> 
> Sounds weird! What driver are you using?
> 
> Wolfgang.
> 
> 
> 

^ permalink raw reply

* Re: can: expected receive behavior broken
From: Wolfgang Grandegger @ 2010-07-28  8:17 UTC (permalink / raw)
  To: Matthias Fuchs; +Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w, Linux Netdev List
In-Reply-To: <201007280956.38957.matthias.fuchs-iOnpLzIbIdM@public.gmane.org>

On 07/28/2010 09:56 AM, Matthias Fuchs wrote:
> Hi,
> 
> I just noticed that the receive behavior of CAN sockets is broken
> in current net-next-2.6.
> I wrote some simple code that receives messages and echos them back to
> the bus. When I now trigger one single message on the bus, I get
> this message received and echoed back in an endless loop.
> 
> I do not touch the sockopts CAN_RAW_LOOPBACK or CAN_RAW_RECV_OWN_MSGS in my code.
> Only (!) setting CAN_RAW_LOOPBACK to 0 helps at the moment. But this behavior
> actually has nothing to do with LOOPBACK but more with RECV_OWN_MSGS.

Sounds weird! What driver are you using?

Wolfgang.

^ permalink raw reply

* local_bh_enable_ip warning with conntrack/forcedeth
From: Johannes Berg @ 2010-07-28  8:12 UTC (permalink / raw)
  To: netdev

This is a bit weird ... forcedeth uses dev_kfree_skb_any(), which gets
to dev_kfree_skb_irq(), but then why consume_skb() which can't be called
in that context?

Or is the destructor check in dev_kfree_skb_irq() wrong?

[  446.375065] WARNING: at /home/johannes/sys/wireless-testing/kernel/softirq.c:143 local_bh_enable_ip+0xba/0x110()
[  446.375069] Hardware name: MacBook5,1
[  446.375172] Pid: 0, comm: swapper Not tainted 2.6.35-rc6-wl-48390-gb63b887-dirty #176
[  446.375175] Call Trace:
[  446.375178]  <IRQ>  [<ffffffff81046b5f>] warn_slowpath_common+0x7f/0xc0
[  446.375197]  [<ffffffff81046bba>] warn_slowpath_null+0x1a/0x20
[  446.375201]  [<ffffffff8104eb0a>] local_bh_enable_ip+0xba/0x110
[  446.375207]  [<ffffffff8149072e>] _raw_spin_unlock_bh+0x3e/0x50
[  446.375214]  [<ffffffffa08c249c>] destroy_conntrack+0xfc/0x150 [nf_conntrack]
[  446.375227]  [<ffffffff813db358>] nf_conntrack_destroy+0x68/0x100
[  446.375236]  [<ffffffff813a71b5>] skb_release_head_state+0xe5/0x120
[  446.375240]  [<ffffffff813a6de6>] __kfree_skb+0x16/0xa0
[  446.375244]  [<ffffffff813a6e8e>] consume_skb+0x1e/0x40
[  446.375249]  [<ffffffff813b4405>] dev_kfree_skb_irq+0xa5/0xb0
[  446.375258]  [<ffffffff813b443c>] dev_kfree_skb_any+0x2c/0x50
[  446.375262]  [<ffffffff81364be9>] nv_tx_done_optimized+0x59/0x220
[  446.375271]  [<ffffffff81367ffa>] nv_napi_poll+0x6a/0x330
[  446.375276]  [<ffffffff813b228b>] net_rx_action+0x12b/0x300
[  446.375280]  [<ffffffff8104ee34>] __do_softirq+0x114/0x3d0
[  446.375290]  [<ffffffff8100360c>] call_softirq+0x1c/0x50
[  446.375294]  [<ffffffff8100500d>] do_softirq+0x7d/0xb0
[  446.375298]  [<ffffffff8104e8d5>] irq_exit+0x95/0xa0
[  446.375302]  [<ffffffff8100450c>] do_IRQ+0x7c/0xf0
[  446.375306]  [<ffffffff81490d13>] ret_from_intr+0x0/0xf




^ permalink raw reply

* Re: [PATCH] Driver-core: Fix bluetooth network device rename  regression
From: Eric W. Biederman @ 2010-07-28  7:57 UTC (permalink / raw)
  To: Kay Sievers
  Cc: Greg KH, Greg KH, Johannes Berg, Andrew Morton, Rafael J. Wysocki,
	Maciej W. Rozycki, netdev
In-Reply-To: <AANLkTikoLQMH5LZCgKmQivTrtvtCf8oksk-HwBQQ+Tte@mail.gmail.com>

Kay Sievers <kay.sievers@vrfy.org> writes:

> Yeah, but most of these things we should have fixed over the last
> years. There is no single WAIT_FOR instruction left in udev rules. :)

Last time I looked there were quite a few attributes that were still
getting created late.  I would not be surprised if the common case
works fine, but I know of at least one and I think a couple of weird
cases that still have to do unpleasant things.

Still that is a project for another time.


>> At the subsystem level bus devices look better.
>> At the individual device level bus devices stacked on bus devices
>> appear to be a namespace disaster.
>
> They are usually created by the same code, in many cases by the same
> drivers, and have not been a real problem so far. As you said, network
> devices are special here, because of the ability to rename them from
> userspace.
>
> At some time in the future, when buses and classes are merged, I
> expect stuff can just set a flag to have a 'glue dir' created or not.
>
> For now 'glue dirs' are limited to be created between a bus and a
> class device. It could possibly be extended to be created between
> classes of different types to handle issues like this.

Sounds like a plan.  And now I'm off on vacation.

Have a good one.

Eric

^ permalink raw reply

* can: expected receive behavior broken
From: Matthias Fuchs @ 2010-07-28  7:56 UTC (permalink / raw)
  To: Linux Netdev List; +Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w

Hi,

I just noticed that the receive behavior of CAN sockets is broken
in current net-next-2.6.
I wrote some simple code that receives messages and echos them back to
the bus. When I now trigger one single message on the bus, I get
this message received and echoed back in an endless loop.

I do not touch the sockopts CAN_RAW_LOOPBACK or CAN_RAW_RECV_OWN_MSGS in my code.
Only (!) setting CAN_RAW_LOOPBACK to 0 helps at the moment. But this behavior
actually has nothing to do with LOOPBACK but more with RECV_OWN_MSGS.

Matthias

^ permalink raw reply

* Re: [Uclinux-dist-devel] [PATCH net-next] drivers/net/bfin_mac.c: Use pr_fmt, netdev_<level>
From: Mike Frysinger @ 2010-07-28  7:50 UTC (permalink / raw)
  To: Joe Perches; +Cc: Michael Hennerich, uclinux-dist-devel, LKML, netdev
In-Reply-To: <1280258531.24054.10.camel@Joe-Laptop.home>

On Tue, Jul 27, 2010 at 15:22, Joe Perches wrote:
> And some assorted neatening for checkpatch:
>
>        80 column reformatting (mostly comments)
>        argument alignment
>        couple of spelling/grammar typos corrected
>
> Added bfin_alloc_skb to centralize allocation/dcache invalidation
> Added get_mac_addr for symmetry
>
> $ ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
> total: 2 errors, 25 warnings, 1723 lines checked
> $ ./scripts/checkpatch.pl -f drivers/net/bfin_mac.c | grep "^total:"
> total: 0 errors, 0 warnings, 1743 lines checked
>
> Uncompiled, untested.

i dislike the mixing of whitespace and useful changes

if they were split, and they worked, then i wouldnt have a problem with them
-mike

^ permalink raw reply

* Re: [PATCH UPDATED 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Tejun Heo @ 2010-07-28  7:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <20100727191911.GA16350@redhat.com>

On 07/27/2010 09:19 PM, Michael S. Tsirkin wrote:
>> Thinking a bit more about it, it kind of sucks that queueing to
>> another worker from worker->func() breaks flush.  Maybe the right
>> thing to do there is using atomic_t for done_seq?
> 
> I don't believe it will help: we might have:
> 
> worker1 runs work
> work requeues itself queued index = 1
> worker1 reads queued index = 1
> worker2 runs work
> work requeues itself queued index = 2
> worker2 runs work
> worker2 reads queued index = 2
> worker2 writes done index = 2
> worker1 writes done index = 1
> 
> As you see, done index got moved back.

Yeah, I think the flushing logic should be moved to the worker.  Are
you interested in doing it w/ your change?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: BUG: net-next netns teardown bridge bug "waiting for lo to become free."
From: David Lamparter @ 2010-07-28  7:44 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20100727100249.51ffc893@nehalam>

On Tue, Jul 27, 2010 at 10:02:49AM -0700, Stephen Hemminger wrote:
> David Lamparter <equinox@diac24.net> wrote:
> > unregister_netdevice: waiting for lo to become free. Usage count = 4
> > 
> > unshare -n -- /bin/bash -c 'ip link add link lo veth0 type veth peer
> > name veth3 netns 1; brctl addbr br0; ip l s br0 up; ip a l; sleep 8'
> > 
> > (It happens when the netns is going down.)
> > 
> > Without the "ip l s br0 up" it does not happen;
[...]
> > I'm looking into this, but i'm not versed in Linux kernel code, so my
> > luck in fixing this might be limited. Some reference taken in bridge
> > "up"? Should the bridge be set "down" on netns teardown?
> 
> Loopback device is special and is referenced by routes etc.
> You need to manually remove bridge, and all routes, neighbors and cache
> before removing namespace.

That doesn't do. I have namespaces which are only network namespaces and
they have only one or two processes like openvpn. openvpn can have bugs,
so it might just terminate. If it is the last process in the namespace,
the namespace needs to go, and correctly. Even if I had some "master
daemon" to tear down the namespace at the "end", who tells me that that
will work perfectly and not die? It shouldn't be possible to break the
network stack by just killing the wrong process...

I'll try reading the code today, maybe I can come up with the right
dereferences in the right places.

-David

^ permalink raw reply

* Re: br_forward.c - rcu dereference warning
From: Johannes Berg @ 2010-07-28  7:33 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: paulmck, netdev
In-Reply-To: <20100727134223.5bb0be30@nehalam>

On Tue, 2010-07-27 at 13:42 -0700, Stephen Hemminger wrote:

> > 
> > Did you want me to test the patch?
> 
> Yes please, I can make sure it works, but not that it gets rid
> of your error

Yes, it fixed it, thanks.

johannes



^ permalink raw reply

* RE: nfs client hang
From: Andy Chittenden @ 2010-07-28  7:24 UTC (permalink / raw)
  To: Andy Chittenden, Chuck Lever, Eric Dumazet
  Cc: Linux Kernel Mailing List (linux-kernel@vger.kernel.org),
	Trond Myklebust, netdev, Linux NFS Mailing List
In-Reply-To: <4C4F174C.2000308@oracle.com>

resending as it seems to have been corrupted on LKML!

> The RPC client marks the socket closed. and the linger timeout is 
> cancelled.  At this point, sk_shutdown should be set to zero, correct? 
> I don't see an xs_error_report() call here, which would confirm that the 
> socket took a trip through tcp_disconnect().

From my reading of tcp_disconnect(), it calls sk->sk_error_report(sk) unconditionally so as there's no xs_error_report(), that surely means the exact opposite: tcp_disconnect() wasn't called. If it's not called, sk_shutdown is not cleared. And my revised tracing confirmed that it was set to 
SEND_SHUTDOWN.

-- 
Andy, BlueArc Engineering

^ permalink raw reply

* RE: nfs client hang
From: Andy Chittenden @ 2010-07-28  7:08 UTC (permalink / raw)
  To: Chuck Lever, Eric Dumazet
  Cc: Linux Kernel Mailing List (linux-kernel@vger.kernel.org),
	Trond Myklebust, netdev, Linux NFS Mailing List
In-Reply-To: <4C4F174C.2000308@oracle.com>

> I don't see an xs_error_report() call here, which would confirm that the socket took a trip through tcp_disconnect().

From my reading of tcp_disconnect(), it calls sk->sk_error_report(sk) unconditionally so as there's no xs_error_report(), that surely means the exact opposite: tcp_disconnect() wasn't called. If it's not called, sk_shutdown is not cleared. And my revised tracing confirmed that it was set to 
SEND_SHUTDOWN.

-- 
Andy, BlueArc Engineering


-----Original Message-----
From: Chuck Lever [mailto:chuck.lever@oracle.com] 
Sent: 27 July 2010 18:29
To: Eric Dumazet
Cc: Andy Chittenden; Linux Kernel Mailing List (linux-kernel@vger.kernel.org); Trond Myklebust; netdev; Linux NFS Mailing List
Subject: Re: nfs client hang

Add CC: linux-nfs@vger.kernel.org

On 07/27/10 08:21 AM, Eric Dumazet wrote:
> Le mardi 27 juillet 2010 à 11:53 +0100, Andy Chittenden a écrit :
>>>>> IE the client starts a connection and then closes it again without sending data.
>>>> Once this happens, here's some rpcdebug info for the rpc module using 2.6.34.1 kernel:
>>>>
>>>> ... lots of the following nfsv3 WRITE requests:
>>>> [ 7670.026741] 57793 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026759] 57794 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026778] 57795 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026797] 57796 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026815] 57797 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026834] 57798 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026853] 57799 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026871] 57800 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026890] 57801 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7670.026909] 57802 0001    -11 ffff88012e32b000   (null)        0 ffffffffa03beb10 nfsv3 WRITE a:call_reserveresult q:xprt_backlog
>>>> [ 7680.520042] RPC:       worker connecting xprt ffff88013e62d800 via tcp to 10.1.6.102 (port 2049)
>>>> [ 7680.520066] RPC:       ffff88013e62d800 connect status 99 connected 0 sock state 7
>>>> [ 7680.520074] RPC: 33550 __rpc_wake_up_task (now 4296812426)
>>>> [ 7680.520079] RPC: 33550 disabling timer
>>>> [ 7680.520084] RPC: 33550 removed from queue ffff88013e62db20 "xprt_pending"
>>>> [ 7680.520089] RPC:       __rpc_wake_up_task done
>>>> [ 7680.520094] RPC: 33550 __rpc_execute flags=0x1
>>>> [ 7680.520098] RPC: 33550 xprt_connect_status: retrying
>>>> [ 7680.520103] RPC: 33550 call_connect_status (status -11)
>>>> [ 7680.520108] RPC: 33550 call_transmit (status 0)
>>>> [ 7680.520112] RPC: 33550 xprt_prepare_transmit
>>>> [ 7680.520118] RPC: 33550 rpc_xdr_encode (status 0)
>>>> [ 7680.520123] RPC: 33550 marshaling UNIX cred ffff88012e002300
>>>> [ 7680.520130] RPC: 33550 using AUTH_UNIX cred ffff88012e002300 to wrap rpc data
>>>> [ 7680.520136] RPC: 33550 xprt_transmit(32920)
>>>> [ 7680.520145] RPC:       xs_tcp_send_request(32920) = -32
>>>> [ 7680.520151] RPC:       xs_tcp_state_change client ffff88013e62d800...
>>>> [ 7680.520156] RPC:       state 7 conn 0 dead 0 zapped 1
>>
>>> I changed that debug to output sk_shutdown too. That has a value of 2
>>> (IE SEND_SHUTDOWN). Looking at tcp_sendmsg(), I see this:
>>
>>>           err = -EPIPE;
>>>           if (sk->sk_err || (sk->sk_shutdown&  SEND_SHUTDOWN))
>>>                   goto out_err;
>>
>>> which correlates with the trace "xs_tcp_send_request(32920) = -32". So,
>>> this looks like a problem in the sockets/tcp layer. The rpc layer issues
>>> a shutdown and then reconnects using the same socket. So either
>>> sk_shutdown needs zeroing once the shutdown completes or should be
>>> zeroed on subsequent connect. The latter sounds safer.

>> This patch for 2.6.34.1 fixes the issue:
>>
>> --- /home/company/software/src/linux-2.6.34.1/net/ipv4/tcp_output.c     2010-07-27 08:46:46.917000000 +0100
>> +++ net/ipv4/tcp_output.c       2010-07-27 09:19:16.000000000 +0100
>> @@ -2522,6 +2522,13 @@
>>          struct tcp_sock *tp = tcp_sk(sk);
>>          __u8 rcv_wscale;
>>
>> +       /* clear down any previous shutdown attempts so that
>> +        * reconnects on a socket that's been shutdown leave the
>> +        * socket in a usable state (otherwise tcp_sendmsg() returns
>> +        * -EPIPE).
>> +        */
>> +       sk->sk_shutdown = 0;
>> +
>>          /* We'll fix this up when we get a response from the other end.
>>           * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
>>           */
>>
>> As I mentioned in my first message, we first saw this issue in 2.6.32 as supplied by debian (linux-image-2.6.32-5-amd64 Version: 2.6.32-17). It looks like the same patch would fix the problem there too.
>>
>
> CC netdev
>
> This reminds me a similar problem we had in the past, fixed with commit
> 1fdf475a (tcp: tcp_disconnect() should clear window_clamp)
>
> But tcp_disconnect() already clears sk->sk_shutdown
>
> If NFS calls tcp_disconnect(), then shutdown(), there is a problem.

If tcp_disconnect() was called at some point, I would expect to see a 
message from xs_error_report() in the debugging output.  Perhaps 
tcp_disconnect() is not being invoked at all?

> Maybe xs_tcp_shutdown() should make some sanity tests ?
>
> Following sequence is legal, and your patch might break it.
>
> fd = socket(...);
> shutdown(fd, SHUT_WR);
> ...
> connect(fd, ...);


I looked closely at some of Andy's debugging output from the 
linux-kernel mailing list archive.  I basically agree that the network 
layer is returning -EPIPE from tcp_sendmsg(), which the RPC client logic 
does not expect. But it's not clear to me how it gets into this state.

> [ 7728.520042] RPC:       worker connecting xprt ffff88013e62d800 via tcp to 10.1.6.102 (port 2049)
 > [ 7728.520093] RPC:       ffff88013e62d800 connect status 115 connected 0 sock state 2

"sock state 2" => sk->sk_state == TCP_SYN_SENT

 > [ 7728.520884] RPC:       xs_tcp_state_change client ffff88013e62d800...
 > [ 7728.520889] RPC:       state 1 conn 0 dead 0 zapped 1

"state 1" => sk->sk_state == TCP_ESTABLISHED

RPC client wakes up this RPC task now that the connection is established.

> [ 7728.520896] RPC: 33550 __rpc_wake_up_task (now 4296824426)
> [ 7728.520900] RPC: 33550 disabling timer
> [ 7728.520906] RPC: 33550 removed from queue ffff88013e62db20 "xprt_pending"
> [ 7728.520912] RPC:       __rpc_wake_up_task done
> [ 7728.520932] RPC: 33550 __rpc_execute flags=0x1
> [ 7728.520937] RPC: 33550 xprt_connect_status: retrying
> [ 7728.520942] RPC: 33550 call_connect_status (status -11)

The awoken RPC task's status is -EAGAIN, which prevents a reconnection 
attempt.

> [ 7728.520947] RPC: 33550 call_transmit (status 0)
> [ 7728.520951] RPC: 33550 xprt_prepare_transmit
> [ 7728.520957] RPC: 33550 rpc_xdr_encode (status 0)
> [ 7728.520962] RPC: 33550 marshaling UNIX cred ffff88012e002300
> [ 7728.520969] RPC: 33550 using AUTH_UNIX cred ffff88012e002300 to wrap rpc data
> [ 7728.520976] RPC: 33550 xprt_transmit(32920)

RPC client encodes the request and attempts to send it.

> [ 7728.520984] RPC:       xs_tcp_send_request(32920) = -32

Network layer says -EPIPE, for some reason.  RPC client calls 
kernel_sock_shutdown(SHUT_WR).

> [ 7728.520997] RPC:       xs_tcp_state_change client ffff88013e62d800...
> [ 7728.521007] RPC:       state 4 conn 1 dead 0 zapped 1

"state 4" => sk->sk_state == TCP_FIN_WAIT1

The RPC client sets up a linger timeout.

> [ 7728.521013] RPC: 33550 call_status (status -32)
> [ 7728.521018] RPC: 33550 call_bind (status 0)
> [ 7728.521023] RPC: 33550 call_connect xprt ffff88013e62d800 is not connected
> [ 7728.521028] RPC: 33550 xprt_connect xprt ffff88013e62d800 is not connected
> [ 7728.521035] RPC: 33550 sleep_on(queue "xprt_pending" time 4296824426)
> [ 7728.521040] RPC: 33550 added to queue ffff88013e62db20 "xprt_pending"
> [ 7728.521045] RPC: 33550 setting alarm for 60000 ms

RPC client puts this RPC task to sleep.

> [ 7728.521439] RPC:       xs_tcp_state_change client ffff88013e62d800...
> [ 7728.521444] RPC:       state 5 conn 0 dead 0 zapped 1

"state 5" => sk->sk_state == TCP_FIN_WAIT2

> [ 7728.521602] RPC:       xs_tcp_state_change client ffff88013e62d800...
> [ 7728.521608] RPC:       state 7 conn 0 dead 0 zapped 1
> [ 7728.521612] RPC:       disconnected transport ffff88013e62d800

"state 7" => sk->sk_state == TCP_CLOSE

The RPC client marks the socket closed, and the linger timeout is 
cancelled.  At this point, sk_shutdown should be set to zero, correct? 
I don't see an xs_error_report() call here, which would confirm that the 
socket took a trip through tcp_disconnect().

> [ 7728.521617] RPC: 33550 __rpc_wake_up_task (now 4296824426)
> [ 7728.521621] RPC: 33550 disabling timer
> [ 7728.521626] RPC: 33550 removed from queue ffff88013e62db20 "xprt_pending"
> [ 7728.521631] RPC:       __rpc_wake_up_task done

RPC client wakes up the RPC task.  Meanwhile...

> [ 7728.521636] RPC:       xs_tcp_state_change client ffff88013e62d800...
> [ 7728.521641] RPC:       state 7 conn 0 dead 0 zapped 1
> [ 7728.521645] RPC:       disconnected transport ffff88013e62d800
> [ 7728.521649] RPC:       xs_tcp_data_ready...

... network layer calls closed socket's data_ready method... while the 
awoken RPC task gets underway.

> [ 7728.521662] RPC: 33550 __rpc_execute flags=0x1
> [ 7728.521666] RPC: 33550 xprt_connect_status: retrying
> [ 7728.521671] RPC: 33550 call_connect_status (status -11)

The awoken RPC task's status is -EAGAIN, which prevents a reconnection 
attempt, even though there is no established connection.

RPC client barrels on to send the request again.

> [ 7728.521675] RPC: 33550 call_transmit (status 0)
> [ 7728.521679] RPC: 33550 xprt_prepare_transmit
> [ 7728.521683] RPC: 33550 rpc_xdr_encode (status 0)
> [ 7728.521688] RPC: 33550 marshaling UNIX cred ffff88012e002300
> [ 7728.521694] RPC: 33550 using AUTH_UNIX cred ffff88012e002300 to wrap rpc data
> [ 7728.521699] RPC: 33550 xprt_transmit(32920)
> [ 7728.521704] RPC:       xs_tcp_send_request(32920) = -32

RPC client attempts to send again, gets -EPIPE, and calls 
kernel_sock_shutdown(SHUT_WR).  If there is no connection established, 
the RPC client expects -ENOTCONN, in which case it will attempt to 
reconnect here.

> [ 7728.521709] RPC:       xs_tcp_state_change client ffff88013e62d800...
> [ 7728.521714] RPC:       state 7 conn 0 dead 0 zapped 1
> [ 7728.521718] RPC:       disconnected transport ffff88013e62d800

"state 7" => sk->sk_state == TCP_CLOSE

Following this, the RPC client attempts to retransmit the request 
repeatedly, but the socket remains in the TCP_CLOSE state.

^ permalink raw reply

* RE: [REGRESSION] e1000e stopped working [MANUALLY BISECTED]
From: Maxim Levitsky @ 2010-07-28  7:04 UTC (permalink / raw)
  To: Tantilov, Emil S
  Cc: Kirsher, Jeffrey T, netdev@vger.kernel.org, Allan, Bruce W,
	Pieper, Jeffrey E
In-Reply-To: <1280103959.2589.2.camel@localhost.localdomain>

On Mon, 2010-07-26 at 03:25 +0300, Maxim Levitsky wrote: 
> On Sat, 2010-07-17 at 16:54 +0300, Maxim Levitsky wrote:
> > On Fri, 2010-07-16 at 17:23 -0600, Tantilov, Emil S wrote:
> > > Maxim Levitsky wrote:
> > > > On Thu, 2010-07-15 at 22:09 +0300, Maxim Levitsky wrote:
> > > >> On Thu, 2010-07-15 at 13:02 -0600, Tantilov, Emil S wrote:
> > > >>> Maxim Levitsky wrote:
> > > >>>> On Thu, 2010-07-15 at 02:33 +0300, Maxim Levitsky wrote:
> > > >>>>> On Wed, 2010-07-14 at 16:56 -0600, Tantilov, Emil S wrote:
> > > >>>>>> Maxim Levitsky wrote:
> > > >>>>>>> On Mon, 2010-07-12 at 15:23 -0600, Tantilov, Emil S wrote:
> > > >>>>>>>> Maxim Levitsky wrote:
> > > >>>>>>>>> On Mon, 2010-07-05 at 12:58 +0300, Maxim Levitsky wrote:
> > > >>>>>>>>>> On Mon, 2010-07-05 at 01:13 -0700, Jeff Kirsher wrote:
> > > >>>>>>>>>>> On Sun, Jul 4, 2010 at 15:48, Maxim Levitsky
> > > >>>>>>>>>>> <maximlevitsky@gmail.com> wrote:
> > > >>>>>>>>>>>> Did few guesses, and now I see that reverting the below
> > > >>>>>>>>>>>> commit fixes the problem. 
> > > >>>>>>>>>>>> 
> > > >>>>>>>>>>>> "e1000e: Fix/cleanup PHY reset code for ICHx/PCHx"
> > > >>>>>>>>>>>> e98cac447cc1cc418dff1d610a5c79c4f2bdec7f.
> > > >>>>>>>>>>>> 
> > > >>>>>>>>>>>> 
> > > >>>>>>>>>>>> Best regards,
> > > >>>>>>>>>>>>        Maxim Levitsky
> > > >>>>>>>>>>>> 
> > > >>>>>>>>>>>> --
> > > >>>>>>>>>>> 
> > > >>>>>>>>>>> Can you give us till Tuesday to respond?  I know that there
> > > >>>>>>>>>>> are some additional e1000e patches in my queue, which may
> > > >>>>>>>>>>> resolve the issue, but this weekend the power is down to do
> > > >>>>>>>>>>> some infrastructure upgrades which prevents us from doing
> > > >>>>>>>>>>> any investigation/debugging until Tuesday.
> > > >>>>>>>>>>> 
> > > >>>>>>>>>> 
> > > >>>>>>>>>> Sure.
> > > >>>>>>>>>> 
> > > >>>>>>>>>> Best regards,
> > > >>>>>>>>>> 	Maxim Levitsky
> > > >>>>>>>>>> 
> > > >>>>>>>>> 
> > > >>>>>>>>> Updates?
> > > >>>>>>>> 
> > > >>>>>>>> We are working on reproducing the issue. So far we have not
> > > >>>>>>>> seen the problem when testing with net-next.
> > > >>>>>>>> 
> > > >>>>>>>> I asked in previous email about some additional info from
> > > >>>>>>>> ethtool (-d, -e, -S) and kernel config. That would help us to
> > > >>>>>>>> narrow it down. 
> > > >>>>>>>> 
> > > >>>>>>>> Thanks,
> > > >>>>>>>> Emil
> > > >>>>>>> I did send -e and -d output.
> > > >>>>>> 
> > > >>>>>> Sorry, looks like I lost the email with the attachements.
> > > >>>>>> 
> > > >>>>>> Could you provide the output of dmesg after the failure occurs?
> > > >>>>>> 
> > > >>>>>>> Since you probably want -S output during failure, I need to
> > > >>>>>>> recompile kernel for that. I will do that soon.
> > > >>>>>>> 
> > > >>>>>>> 
> > > >>>>>>> One question, in two weeks I hope 2.6.35 won't be released?
> > > >>>>>>> If so, I will have enough free time then to narrow down this
> > > >>>>>>> issue. 
> > > >>>>>>> 
> > > >>>>>>> Other solution, is to revert this commit.
> > > >>>>>>> (I have never seen this problem with it reverted).
> > > >>>>>> 
> > > >>>>>> We have been running reboot tests on 2 separate systems with
> > > >>>>>> recent net-next kernels using your config and so far no luck in
> > > >>>>>> reproducing this issue. 
> > > >>>>>> 
> > > >>>>>> What is the make model of your system (or MB)?
> > > >>>>> 
> > > >>>>> the motherboard is Intel DG965RY.
> > > >>>>> 
> > > >>>>> However, I am using vanilla kernel.
> > > >>>>> net-next might contain further fixes.
> > > >>>>> 
> > > >>>>> I see if net-next works here.
> > > >>>> 
> > > >>>> Yep, net-next works here.
> > > >>>> 
> > > >>>> 
> > > >>>> I have the problem on vanilla kernel.
> > > >>>> Last revision of it, I tested is 2.6.35-rc4 exactly
> > > >>>> (815c4163b6c8ebf8152f42b0a5fd015cfdcedc78)
> > > >>>> 
> > > >>>> 
> > > >>>> Maybe vanilla git master works, I test it too soon.
> > > >>> 
> > > >>> Thanks for the information! Good to know that this issue does not
> > > >>> exist in the latest branch. 
> > > >>> 
> > > >>> Have you by any chance tested a stable branch (2.6.34.x)?
> > > >> 
> > > >> I only did test plain 2.6.34 (v2.6.34)
> > > > And forgot to add, that it did work.
> > > > 
> > > >> 
> > > >> Also I repeat that revert of e98cac447cc1cc418dff1d610a5c79c4f2bdec7f
> > > >> (e1000e: Fix/cleanup PHY reset code for ICHx/PCHx) fixes the bug on
> > > >> vanilla kernel. 
> > > >> 
> > > >> Also I just pulled latest vanilla git, and I according to diffstat I
> > > >> see no changes in e1000e, so its likely that bug remains there.
> > > >> I will test that soon.
> > > > Tested, broken as expected.
> > > 
> > > That makes sense. Unfortunately we are still not able to reproduce even on recent pull from Linus tree.
> > > 
> > > If you want - you can look at the patches for e1000e in net-next and start applying those to your tree until the issue is resolved.
> > > 
> > That exactly what I will do soon.
> > 
> > 
> > Also I can narrow down the problem by reverting the commit partially.
> > 
> > After one week, I will have enough free time to do all the thing like
> > above. Now I have none.
> > 
> > 
> > > I will keep trying it here, but none of the systems we have exhibit the issue you described, so the bug could be exposed by something in your system/config.
> > I also think so. Otherwise, we would see more bug-reports.
> > 
> > You probably don't need to try anymore and reproduce that issue, because
> > of that.
> > 
> 
> 
> This commit, present in net-next, solves the problem:
> 
> commit 1286950690f0f82ffa504e1e149ee3fdb4c51478
> Author: Bruce Allan <bruce.w.allan@intel.com>
> Date:   Mon Jul 26 03:19:38 2010 +0300
> 
>     e1000e: cleanup e1000_sw_lcd_config_ich8lan()
>     
>     Do not acquire and release the PHY unnecessarily for parts that return
>     from this workaround without actually accessing the PHY registers.
>     
>     Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
>     Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
>     Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> 
> 
> 
> Also, the above patch is part of a whole series of patches with scary descriptions (that is, these fix bugs).
> If I were you I would send them to Linus for 2.6.35 inclusion too.
> 
> Best regards,
> 	Maxim Levitsky
> 
> 
> 
ping



^ permalink raw reply

* RE: [net-next 1/3] stmmac: remove the STMMAC_DUAL_MAC option
From: Peppe CAVALLARO @ 2010-07-28  6:55 UTC (permalink / raw)
  To: netdev@vger.kernel.org, davem@davemloft.net
In-Reply-To: <1280225387-26240-1-git-send-email-peppe.cavallaro@st.com>

> -----Original Message-----
> From: Giuseppe CAVALLARO [mailto:peppe.cavallaro@st.com]
> Sent: Tuesday, July 27, 2010 12:10 PM
> To: netdev@vger.kernel.org
> Cc: Peppe CAVALLARO
> Subject: [net-next 1/3] stmmac: remove the STMMAC_DUAL_MAC option
> 
> The STMMAC_DUAL_MAC is now removed from the driver's Kconfig.
> It will be available from a specific STM boards Kconfig.
> 
> Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>


> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Wednesday, July 28, 2010 5:45 AM
> To: Peppe CAVALLARO
> Cc: netdev@vger.kernel.org
> Subject: Re: [net-next 2/3] stmmac: fix timer setup when use dual mac
> Kconfig

[snip]
 
> This is not how we do things.
> 
> All of the options that influence the driver should be right next
> to the main driver option.
> 
> What the platform SOC Kconfig's can do is 'select' those option.
> 
> But even better is to get rid of all of these feature Kconfig options,
> and communicate the capability in the platform_device probe
> information or similar.

Hi David,

Indeed, this option doesn't impact the driver itself. Some STM SoCs
(7105,7106, 7108 ...) have two GMAC cores integrated in the same SoC.
So the meaning of this option was to turn-on the second device, only.
This option is also used within our board's setup files.
In the future, it could also generate some misunderstanding on other
platforms (ARM based) where there is no second GMAC device.
For this reason, I've removed it from the driver's Kconfig.

Welcome advice.

Thanks for the feedback.

Regards,
Peppe

> ---
>  drivers/net/stmmac/Kconfig |    9 ---------
>  1 files changed, 0 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/stmmac/Kconfig b/drivers/net/stmmac/Kconfig
> index eb63d44..2513555 100644
> --- a/drivers/net/stmmac/Kconfig
> +++ b/drivers/net/stmmac/Kconfig
> @@ -20,15 +20,6 @@ config STMMAC_DA
>  	  By default, the DMA arbitration scheme is based on Round-robin
>  	  (rx:tx priority is 1:1).
> 
> -config STMMAC_DUAL_MAC
> -	bool "STMMAC: dual mac support (EXPERIMENTAL)"
> -	default n
> -        depends on EXPERIMENTAL && STMMAC_ETH && !STMMAC_TIMER
> -	help
> -	  Some ST SoCs (for example the stx7141 and stx7200c2) have two
> -	  Ethernet Controllers. This option turns on the second Ethernet
> -	  device on this kind of platforms.
> -
>  config STMMAC_TIMER
>  	bool "STMMAC Timer optimisation"
>  	default n
> --
> 1.5.5.6


^ permalink raw reply

* Re: br_forward.c - rcu dereference warning
From: Johannes Berg @ 2010-07-28  6:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: paulmck, netdev
In-Reply-To: <20100727134223.5bb0be30@nehalam>

On Tue, 2010-07-27 at 13:42 -0700, Stephen Hemminger wrote:

> > Did you want me to test the patch?
> 
> Yes please, I can make sure it works, but not that it gets rid
> of your error

Testing now, but it didn't quite apply cleanly on 2.6.35-rc.

johannes


^ permalink raw reply

* [RFC PATCH v4 5/5] perf:add a script shows a process of packet
From: Koki Sanagi @ 2010-07-28  6:35 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, davem, kaneshige.kenji, izumi.taku, kosaki.motohiro,
	nhorman, laijs, scott.a.mcmillan, rostedt, eric.dumazet, fweisbec,
	mathieu.desnoyers
In-Reply-To: <4C4FCDA1.3000803@jp.fujitsu.com>

Add a perf script which shows a process of packets and processed time.
It helps us to investigate networking or network device.

If you want to use it, install perf and record perf.data like following.

#perf trace record netdev-times [script]

If you set script, perf gathers records until it ends.
If not, you must Ctrl-C to stop recording.

And if you want a report from record,

#perf trace report netdev-times [options]

If you use some options, you can limit an output.
Option is below.

tx: show only process of tx packets
rx: show only process of rx packets
dev=: show a process specified with this option
debug: work with debug mode. It shows buffer status.

For example, if you want to show a process of received packets associated
with eth4,

#perf trace report netdev-times rx dev=eth4
106133.171439sec cpu=0
  irq_entry(+0.000msec irq=24:eth4)
         |
  softirq_entry(+0.006msec)
         |
         |---netif_receive_skb(+0.010msec skb=f2d15900 len=100)
         |            |
         |      skb_copy_datagram_iovec(+0.039msec 10291::10291)
         |
  napi_poll_exit(+0.022msec eth4)

This perf script helps us to analyze a process time of transmit/receive
sequence.

Signed-off-by: Koki Sanagi <sanagi.koki@jp.fujitsu.com>
---
 tools/perf/scripts/python/bin/netdev-times-record |    8 +
 tools/perf/scripts/python/bin/netdev-times-report |    5 +
 tools/perf/scripts/python/netdev-times.py         |  464 +++++++++++++++++++++
 3 files changed, 477 insertions(+), 0 deletions(-)

diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record
new file mode 100644
index 0000000..2b59511
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+perf record -c 1 -f -R -a -e net:net_dev_xmit -e net:net_dev_queue	\
+		-e net:netif_receive_skb -e net:netif_rx		\
+		-e skb:consume_skb -e skb:kfree_skb			\
+		-e skb:skb_copy_datagram_iovec -e napi:napi_poll	\
+		-e irq:irq_handler_entry -e irq:irq_handler_exit	\
+		-e irq:softirq_entry -e irq:softirq_exit		\
+		-e irq:softirq_raise $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report
new file mode 100644
index 0000000..c3d0a63
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display a process of packet and processing time
+# args: [tx] [rx] [dev=] [debug]
+
+perf trace -s ~/libexec/perf-core/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/netdev-times.py b/tools/perf/scripts/python/netdev-times.py
new file mode 100644
index 0000000..9aa0a32
--- /dev/null
+++ b/tools/perf/scripts/python/netdev-times.py
@@ -0,0 +1,464 @@
+# Display a process of packets and processed time.
+# It helps us to investigate networking or network device.
+#
+# options
+# tx: show only tx chart
+# rx: show only rx chart
+# dev=: show only thing related to specified device
+# debug: work with debug mode. It shows buffer status.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+all_event_list = []; # insert all tracepoint event related with this script
+irq_dic = {}; # key is cpu and value is a list which stacks irqs
+              # which raise NET_RX softirq
+net_rx_dic = {}; # key is cpu and value include time of NET_RX softirq-entry
+		 # and a list which stacks receive
+receive_hunk_list = []; # a list which include a sequence of receive events
+rx_skb_list = []; # received packet list for matching
+		       # skb_copy_datagram_iovec
+
+buffer_budget = 65536; # the budget of rx_skb_list, tx_queue_list and
+		       # tx_xmit_list
+of_count_rx_skb_list = 0; # overflow count
+
+tx_queue_list = []; # list of packets which pass through dev_queue_xmit
+of_count_tx_queue_list = 0; # overflow count
+
+tx_xmit_list = [];  # list of packets which pass through dev_hard_start_xmit
+of_count_tx_xmit_list = 0; # overflow count
+
+tx_free_list = [];  # list of packets which is freed
+
+# options
+show_tx = 0;
+show_rx = 0;
+dev = 0; # store a name of device specified by option "dev="
+debug = 0;
+
+# indices of event_info tuple
+EINFO_IDX_NAME=   0
+EINFO_IDX_CONTEXT=1
+EINFO_IDX_CPU=    2
+EINFO_IDX_TIME=   3
+EINFO_IDX_PID=    4
+EINFO_IDX_COMM=   5
+
+# Calculate a time interval(msec) from src(nsec) to dst(nsec)
+def diff_msec(src, dst):
+	return (dst - src) / 1000000.0
+
+# Display a process of transmitting a packet
+def print_transmit(hunk):
+	if dev != 0 and hunk['dev'].find(dev) < 0:
+		return
+	print "%7s %5d %6d.%06dsec %12.3fmsec      %12.3fmsec" % \
+		(hunk['dev'], hunk['len'],
+		nsecs_secs(hunk['queue_t']),
+		nsecs_nsecs(hunk['queue_t'])/1000,
+		diff_msec(hunk['queue_t'], hunk['xmit_t']),
+		diff_msec(hunk['xmit_t'], hunk['free_t']))
+
+# Format for displaying rx packet processing
+PF_IRQ_ENTRY= "  irq_entry(+%.3fmsec irq=%d:%s)"
+PF_SOFT_ENTRY="  softirq_entry(+%.3fmsec)"
+PF_NAPI_POLL= "  napi_poll_exit(+%.3fmsec %s)"
+PF_JOINT=     "         |"
+PF_WJOINT=    "         |            |"
+PF_NET_RECV=  "         |---netif_receive_skb(+%.3fmsec skb=%x len=%d)"
+PF_NET_RX=    "         |---netif_rx(+%.3fmsec skb=%x)"
+PF_CPY_DGRAM= "         |      skb_copy_datagram_iovec(+%.3fmsec %d:%s)"
+PF_KFREE_SKB= "         |      kfree_skb(+%.3fmsec location=%x)"
+PF_CONS_SKB=  "         |      consume_skb(+%.3fmsec)"
+
+# Display a process of received packets and interrupts associated with
+# a NET_RX softirq
+def print_receive(hunk):
+	show_hunk = 0
+	irq_list = hunk['irq_list']
+	cpu = irq_list[0]['cpu']
+	base_t = irq_list[0]['irq_ent_t']
+	# check if this hunk should be showed
+	if dev != 0:
+		for i in range(len(irq_list)):
+			if irq_list[i]['name'].find(dev) >= 0:
+				show_hunk = 1
+				break
+	else:
+		show_hunk = 1
+	if show_hunk == 0:
+		return
+
+	print "%d.%06dsec cpu=%d" % \
+		(nsecs_secs(base_t), nsecs_nsecs(base_t)/1000, cpu)
+	for i in range(len(irq_list)):
+		print PF_IRQ_ENTRY % \
+			(diff_msec(base_t, irq_list[i]['irq_ent_t']),
+			irq_list[i]['irq'], irq_list[i]['name'])
+		print PF_JOINT
+		irq_event_list = irq_list[i]['event_list']
+		for j in range(len(irq_event_list)):
+			irq_event = irq_event_list[j]
+			if irq_event['event'] == 'netif_rx':
+				print PF_NET_RX % \
+					(diff_msec(base_t, irq_event['time']),
+					irq_event['skbaddr'])
+				print PF_JOINT
+	print PF_SOFT_ENTRY % \
+		diff_msec(base_t, hunk['sirq_ent_t'])
+	print PF_JOINT
+	event_list = hunk['event_list']
+	for i in range(len(event_list)):
+		event = event_list[i]
+		if event['event_name'] == 'napi_poll':
+			print PF_NAPI_POLL % \
+			    (diff_msec(base_t, event['event_t']), event['dev'])
+			if i == len(event_list) - 1:
+				print ""
+			else:
+				print PF_JOINT
+		else:
+			print PF_NET_RECV % \
+			    (diff_msec(base_t, event['event_t']), event['skbaddr'],
+				event['len'])
+			if 'comm' in event.keys():
+				print PF_WJOINT
+				print PF_CPY_DGRAM % \
+					(diff_msec(base_t, event['comm_t']),
+					event['pid'], event['comm'])
+			elif 'handle' in event.keys():
+				print PF_WJOINT
+				if event['handle'] == "kfree_skb":
+					print PF_KFREE_SKB % \
+						(diff_msec(base_t,
+						event['comm_t']),
+						event['location'])
+				elif event['handle'] == "consume_skb":
+					print PF_CONS_SKB % \
+						diff_msec(base_t,
+							event['comm_t'])
+			print PF_JOINT
+
+def trace_begin():
+	global show_tx
+	global show_rx
+	global dev
+	global debug
+
+	for i in range(len(sys.argv)):
+		if i == 0:
+			continue
+		arg = sys.argv[i]
+		if arg == 'tx':
+			show_tx = 1
+		elif arg =='rx':
+			show_rx = 1
+		elif arg.find('dev=',0, 4) >= 0:
+			dev = arg[4:]
+		elif arg == 'debug':
+			debug = 1
+	if show_tx == 0  and show_rx == 0:
+		show_tx = 1
+		show_rx = 1
+
+def trace_end():
+	# order all events in time
+	all_event_list.sort(lambda a,b :cmp(a[EINFO_IDX_TIME],
+					    b[EINFO_IDX_TIME]))
+	# process all events
+	for i in range(len(all_event_list)):
+		event_info = all_event_list[i]
+		name = event_info[EINFO_IDX_NAME]
+		if name == 'irq__softirq_exit':
+			handle_irq_softirq_exit(event_info)
+		elif name == 'irq__softirq_entry':
+			handle_irq_softirq_entry(event_info)
+		elif name == 'irq__softirq_raise':
+			handle_irq_softirq_raise(event_info)
+		elif name == 'irq__irq_handler_entry':
+			handle_irq_handler_entry(event_info)
+		elif name == 'irq__irq_handler_exit':
+			handle_irq_handler_exit(event_info)
+		elif name == 'napi__napi_poll':
+			handle_napi_poll(event_info)
+		elif name == 'net__netif_receive_skb':
+			handle_netif_receive_skb(event_info)
+		elif name == 'net__netif_rx':
+			handle_netif_rx(event_info)
+		elif name == 'skb__skb_copy_datagram_iovec':
+			handle_skb_copy_datagram_iovec(event_info)
+		elif name == 'net__net_dev_queue':
+			handle_net_dev_queue(event_info)
+		elif name == 'net__net_dev_xmit':
+			handle_net_dev_xmit(event_info)
+		elif name == 'skb__kfree_skb':
+			handle_kfree_skb(event_info)
+		elif name == 'skb__consume_skb':
+			handle_consume_skb(event_info)
+	# display receive hunks
+	if show_rx:
+		for i in range(len(receive_hunk_list)):
+			print_receive(receive_hunk_list[i])
+	# display transmit hunks
+	if show_tx:
+		print "   dev    len      Qdisc        " \
+			"       netdevice             free"
+		for i in range(len(tx_free_list)):
+			print_transmit(tx_free_list[i])
+	if debug:
+		print "debug buffer status"
+		print "----------------------------"
+		print "xmit Qdisc:remain:%d overflow:%d" % \
+			(len(tx_queue_list), of_count_tx_queue_list)
+		print "xmit netdevice:remain:%d overflow:%d" % \
+			(len(tx_xmit_list), of_count_tx_xmit_list)
+		print "receive:remain:%d overflow:%d" % \
+			(len(rx_skb_list), of_count_rx_skb_list)
+
+# called from perf, when it finds a corresponding event
+def irq__softirq_entry(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__softirq_exit(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__softirq_raise(name, context, cpu, sec, nsec, pid, comm, vec):
+	if symbol_str("irq__softirq_entry", "vec", vec) != "NET_RX":
+		return
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, vec)
+	all_event_list.append(event_info)
+
+def irq__irq_handler_entry(name, context, cpu, sec, nsec, pid, comm,
+			irq, irq_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			irq, irq_name)
+	all_event_list.append(event_info)
+
+def irq__irq_handler_exit(name, context, cpu, sec, nsec, pid, comm, irq, ret):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm, irq, ret)
+	all_event_list.append(event_info)
+
+def napi__napi_poll(name, context, cpu, sec, nsec, pid, comm, napi, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			napi, dev_name)
+	all_event_list.append(event_info)
+
+def net__netif_receive_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+			skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__netif_rx(name, context, cpu, sec, nsec, pid, comm, skbaddr,
+			skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__net_dev_queue(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, skblen, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, dev_name)
+	all_event_list.append(event_info)
+
+def net__net_dev_xmit(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, skblen, rc, dev_name):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen, rc ,dev_name)
+	all_event_list.append(event_info)
+
+def skb__kfree_skb(name, context, cpu, sec, nsec, pid, comm,
+			skbaddr, protocol, location):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, protocol, location)
+	all_event_list.append(event_info)
+
+def skb__consume_skb(name, context, cpu, sec, nsec, pid, comm, skbaddr):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr)
+	all_event_list.append(event_info)
+
+def skb__skb_copy_datagram_iovec(name, context, cpu, sec, nsec, pid, comm,
+	skbaddr, skblen):
+	event_info = (name, context, cpu, nsecs(sec, nsec), pid, comm,
+			skbaddr, skblen)
+	all_event_list.append(event_info)
+
+def handle_irq_handler_entry(event_info):
+	(name, context, cpu, time, pid, comm, irq, irq_name) = event_info
+	if cpu not in irq_dic.keys():
+		irq_dic[cpu] = []
+	irq_record = {'irq':irq, 'name':irq_name, 'cpu':cpu, 'irq_ent_t':time}
+	irq_dic[cpu].append(irq_record)
+
+def handle_irq_handler_exit(event_info):
+	(name, context, cpu, time, pid, comm, irq, ret) = event_info
+	if cpu not in irq_dic.keys():
+		return
+	irq_record = irq_dic[cpu].pop()
+	if irq != irq_record['irq']:
+		return
+	irq_record.update({'irq_ext_t':time})
+	# if an irq doesn't include NET_RX softirq, drop.
+	if 'event_list' in irq_record.keys():
+		irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_raise(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	if cpu not in irq_dic.keys() \
+	or len(irq_dic[cpu]) == 0:
+		return
+	irq_record = irq_dic[cpu].pop()
+	if 'event_list' in irq_record.keys():
+		irq_event_list = irq_record['event_list']
+	else:
+		irq_event_list = []
+	irq_event_list.append({'time':time, 'event':'sirq_raise'})
+	irq_record.update({'event_list':irq_event_list})
+	irq_dic[cpu].append(irq_record)
+
+def handle_irq_softirq_entry(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	net_rx_dic[cpu] = {'sirq_ent_t':time, 'event_list':[]}
+
+def handle_irq_softirq_exit(event_info):
+	(name, context, cpu, time, pid, comm, vec) = event_info
+	irq_list = []
+	event_list = 0
+	if cpu in irq_dic.keys():
+		irq_list = irq_dic[cpu]
+		del irq_dic[cpu]
+	if cpu in net_rx_dic.keys():
+		sirq_ent_t = net_rx_dic[cpu]['sirq_ent_t']
+		event_list = net_rx_dic[cpu]['event_list']
+		del net_rx_dic[cpu]
+	if irq_list == [] or event_list == 0:
+		return
+	rec_data = {'sirq_ent_t':sirq_ent_t, 'sirq_ext_t':time,
+		    'irq_list':irq_list, 'event_list':event_list}
+	# merge information realted to a NET_RX softirq
+	receive_hunk_list.append(rec_data)
+
+def handle_napi_poll(event_info):
+	(name, context, cpu, time, pid, comm, napi, dev_name) = event_info
+	if cpu in net_rx_dic.keys():
+		event_list = net_rx_dic[cpu]['event_list']
+		rec_data = {'event_name':'napi_poll',
+				'dev':dev_name, 'event_t':time}
+		event_list.append(rec_data)
+
+def handle_netif_rx(event_info):
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	if cpu not in irq_dic.keys() \
+	or len(irq_dic[cpu]) == 0:
+		return
+	irq_record = irq_dic[cpu].pop()
+	if 'event_list' in irq_record.keys():
+		irq_event_list = irq_record['event_list']
+	else:
+		irq_event_list = []
+	irq_event_list.append({'time':time, 'event':'netif_rx',
+		'skbaddr':skbaddr, 'skblen':skblen, 'dev_name':dev_name})
+	irq_record.update({'event_list':irq_event_list})
+	irq_dic[cpu].append(irq_record)
+
+def handle_netif_receive_skb(event_info):
+	global of_count_rx_skb_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	if cpu in net_rx_dic.keys():
+		rec_data = {'event_name':'netif_receive_skb',
+			    'event_t':time, 'skbaddr':skbaddr, 'len':skblen}
+		event_list = net_rx_dic[cpu]['event_list']
+		event_list.append(rec_data)
+		rx_skb_list.insert(0, rec_data)
+		if len(rx_skb_list) > buffer_budget:
+			rx_skb_list.pop()
+			of_count_rx_skb_list += 1
+
+def handle_net_dev_queue(event_info):
+	global of_count_tx_queue_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, dev_name) = event_info
+	skb = {'dev':dev_name, 'skbaddr':skbaddr, 'len':skblen, 'queue_t':time}
+	tx_queue_list.insert(0, skb)
+	if len(tx_queue_list) > buffer_budget:
+		tx_queue_list.pop()
+		of_count_tx_queue_list += 1
+
+def handle_net_dev_xmit(event_info):
+	global of_count_tx_xmit_list
+
+	(name, context, cpu, time, pid, comm,
+		skbaddr, skblen, rc, dev_name) = event_info
+	if rc == 0: # NETDEV_TX_OK
+		for i in range(len(tx_queue_list)):
+			skb = tx_queue_list[i]
+			if skb['skbaddr'] == skbaddr:
+				skb['xmit_t'] = time
+				tx_xmit_list.insert(0, skb)
+				del tx_queue_list[i]
+				if len(tx_xmit_list) > buffer_budget:
+					tx_xmit_list.pop()
+					of_count_tx_xmit_list += 1
+				return
+
+def handle_kfree_skb(event_info):
+	(name, context, cpu, time, pid, comm,
+		skbaddr, protocol, location) = event_info
+	for i in range(len(tx_queue_list)):
+		skb = tx_queue_list[i]
+		if skb['skbaddr'] == skbaddr:
+			del tx_queue_list[i]
+			return
+	for i in range(len(tx_xmit_list)):
+		skb = tx_xmit_list[i]
+		if skb['skbaddr'] == skbaddr:
+			skb['free_t'] = time
+			tx_free_list.append(skb)
+			del tx_xmit_list[i]
+			return
+	for i in range(len(rx_skb_list)):
+		rec_data = rx_skb_list[i]
+		if rec_data['skbaddr'] == skbaddr:
+			rec_data.update({'handle':"kfree_skb",
+					'comm':comm, 'pid':pid, 'comm_t':time})
+			del rx_skb_list[i]
+			return
+
+def handle_consume_skb(event_info):
+	(name, context, cpu, time, pid, comm, skbaddr) = event_info
+	for i in range(len(tx_xmit_list)):
+		skb = tx_xmit_list[i]
+		if skb['skbaddr'] == skbaddr:
+			skb['free_t'] = time
+			tx_free_list.append(skb)
+			del tx_xmit_list[i]
+			return
+
+def handle_skb_copy_datagram_iovec(event_info):
+	(name, context, cpu, time, pid, comm, skbaddr, skblen) = event_info
+	for i in range(len(rx_skb_list)):
+		rec_data = rx_skb_list[i]
+		if skbaddr == rec_data['skbaddr']:
+			rec_data.update({'handle':"skb_copy_datagram_iovec",
+					'comm':comm, 'pid':pid, 'comm_t':time})
+			del rx_skb_list[i]
+			return

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox