Netdev List
 help / color / mirror / Atom feed
* Re: Top 10 kernel oopses for the week ending January 5th, 2008
From: Al Viro @ 2008-01-10  5:53 UTC (permalink / raw)
  To: Neil Brown
  Cc: Linus Torvalds, Kevin Winchester, J. Bruce Fields,
	Arjan van de Ven, Linux Kernel Mailing List, Andrew Morton,
	NetDev, gregkh
In-Reply-To: <18309.39804.31974.851666@notabene.brown>

On Thu, Jan 10, 2008 at 03:13:48PM +1100, Neil Brown wrote:
> > What guarantees that it doesn't happen before we get to callback?  AFAICS,
> > nothing whatsoever...
> 
> Yes, that's bad isn't it :-)
> 
> I think I should be using sysfs_schedule_callback here.  That makes the 
> required 'get' and 'put' calls.... but it can fail with -ENOMEM.  I
> wonder what I do if -ENOMEM???  Maybe I'll just continue to roll my
> one :-( 

How about this instead (completely untested)

	* split failure exits
	* switch to kick_rdev_from_array()
	* fold unbind_rdev_from_array() into it (no other callers anymore)
	* take export_rdev() into failure case in bind_rdev_to_array()
	* in kick_rdev_from_array() do what export_rdev() does sans
kobject_put() and do that before schedule_work().  Take kobject_put() into
delayed_delete().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cef9ebd..116cc5a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1344,6 +1344,39 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 
 static LIST_HEAD(pending_raid_disks);
 
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+	struct block_device *bdev = rdev->bdev;
+	rdev->bdev = NULL;
+	if (!bdev)
+		MD_BUG();
+	bd_release(bdev);
+	blkdev_put(bdev);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void __export_rdev(mdk_rdev_t * rdev)
+{
+	char b[BDEVNAME_SIZE];
+	printk(KERN_INFO "md: export_rdev(%s)\n",
+		bdevname(rdev->bdev,b));
+	if (rdev->mddev)
+		MD_BUG();
+	free_disk_sb(rdev);
+	list_del_init(&rdev->same_set);
+#ifndef MODULE
+	md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+	unlock_rdev(rdev);
+}
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+	__export_rdev(rdev);
+	kobject_put(&rdev->kobj);
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
@@ -1353,7 +1386,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
 	if (rdev->mddev) {
 		MD_BUG();
-		return -EINVAL;
+		err = -EINVAL;
+		goto out;
 	}
 	/* make sure rdev->size exceeds mddev->size */
 	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
@@ -1362,8 +1396,10 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			 * If mddev->level <= 0, then we don't care
 			 * about aligning sizes (e.g. linear)
 			 */
-			if (mddev->level > 0)
-				return -ENOSPC;
+			if (mddev->level > 0) {
+				err = -ENOSPC;
+				goto out;
+			}
 		} else
 			mddev->size = rdev->size;
 	}
@@ -1379,12 +1415,16 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			choice++;
 		rdev->desc_nr = choice;
 	} else {
-		if (find_rdev_nr(mddev, rdev->desc_nr))
-			return -EBUSY;
+		if (find_rdev_nr(mddev, rdev->desc_nr)) {
+			err = -EBUSY;
+			goto out;
+		}
 	}
 	bdevname(rdev->bdev,b);
-	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
-		return -ENOMEM;
+	if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) {
+		err = -ENOMEM;
+		goto out;
+	}
 	while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
 		*s = '!';
 			
@@ -1407,9 +1447,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
 	return 0;
 
- fail:
+fail:
 	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
 	       b, mdname(mddev));
+out:
+	export_rdev(rdev);
 	return err;
 }
 
@@ -1417,19 +1459,22 @@ static void delayed_delete(struct work_struct *ws)
 {
 	mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
 	kobject_del(&rdev->kobj);
+	kobject_put(&rdev->kobj);
 }
 
-static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
 {
 	char b[BDEVNAME_SIZE];
 	if (!rdev->mddev) {
 		MD_BUG();
+		export_rdev(rdev);
 		return;
 	}
 	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
 	list_del_init(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
+	__export_rdev(rdev);
 	sysfs_remove_link(&rdev->kobj, "block");
 
 	/* We need to delay this, otherwise we can deadlock when
@@ -1467,40 +1512,6 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
 	return err;
 }
 
-static void unlock_rdev(mdk_rdev_t *rdev)
-{
-	struct block_device *bdev = rdev->bdev;
-	rdev->bdev = NULL;
-	if (!bdev)
-		MD_BUG();
-	bd_release(bdev);
-	blkdev_put(bdev);
-}
-
-void md_autodetect_dev(dev_t dev);
-
-static void export_rdev(mdk_rdev_t * rdev)
-{
-	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: export_rdev(%s)\n",
-		bdevname(rdev->bdev,b));
-	if (rdev->mddev)
-		MD_BUG();
-	free_disk_sb(rdev);
-	list_del_init(&rdev->same_set);
-#ifndef MODULE
-	md_autodetect_dev(rdev->bdev->bd_dev);
-#endif
-	unlock_rdev(rdev);
-	kobject_put(&rdev->kobj);
-}
-
-static void kick_rdev_from_array(mdk_rdev_t * rdev)
-{
-	unbind_rdev_from_array(rdev);
-	export_rdev(rdev);
-}
-
 static void export_array(mddev_t *mddev)
 {
 	struct list_head *tmp;
@@ -2576,8 +2587,10 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
 						       mdk_rdev_t, same_set);
 			err = super_types[mddev->major_version]
 				.load_super(rdev, rdev0, mddev->minor_version);
-			if (err < 0)
-				goto out;
+			if (err < 0) {
+				export_rdev(rdev);
+				return err;
+			}
 		}
 	} else
 		rdev = md_import_device(dev, -1, -1);
@@ -2585,9 +2598,6 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len)
 	if (IS_ERR(rdev))
 		return PTR_ERR(rdev);
 	err = bind_rdev_to_array(rdev, mddev);
- out:
-	if (err)
-		export_rdev(rdev);
 	return err ? err : len;
 }
 
@@ -3637,8 +3647,7 @@ static void autorun_devices(int part)
 			printk(KERN_INFO "md: created %s\n", mdname(mddev));
 			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
 				list_del_init(&rdev->same_set);
-				if (bind_rdev_to_array(rdev, mddev))
-					export_rdev(rdev);
+				bind_rdev_to_array(rdev, mddev);
 			}
 			autorun_array(mddev);
 			mddev_unlock(mddev);
@@ -3807,7 +3816,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		return -EOVERFLOW;
 
 	if (!mddev->raid_disks) {
-		int err;
 		/* expecting a device which has a superblock */
 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
 		if (IS_ERR(rdev)) {
@@ -3830,10 +3838,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 				return -EINVAL;
 			}
 		}
-		err = bind_rdev_to_array(rdev, mddev);
-		if (err)
-			export_rdev(rdev);
-		return err;
+		return bind_rdev_to_array(rdev, mddev);
 	}
 
 	/*
@@ -3887,10 +3892,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 				validate_super(mddev, rdev);
 			err = mddev->pers->hot_add_disk(mddev, rdev);
 			if (err)
-				unbind_rdev_from_array(rdev);
+				kick_rdev_from_array(rdev);
 		}
-		if (err)
-			export_rdev(rdev);
 
 		md_update_sb(mddev, 1);
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3908,7 +3911,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 	}
 
 	if (!(info->state & (1<<MD_DISK_FAULTY))) {
-		int err;
 		rdev = md_import_device (dev, -1, 0);
 		if (IS_ERR(rdev)) {
 			printk(KERN_WARNING 
@@ -3938,11 +3940,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
 		rdev->size = calc_dev_size(rdev, mddev->chunk_size);
 
-		err = bind_rdev_to_array(rdev, mddev);
-		if (err) {
-			export_rdev(rdev);
-			return err;
-		}
+		return bind_rdev_to_array(rdev, mddev);
 	}
 
 	return 0;
@@ -4018,15 +4016,15 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 		printk(KERN_WARNING 
 			"md: can not hot-add faulty %s disk to %s!\n",
 			bdevname(rdev->bdev,b), mdname(mddev));
-		err = -EINVAL;
-		goto abort_export;
+		export_rdev(rdev);
+		return -EINVAL;
 	}
 	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
 	err = bind_rdev_to_array(rdev, mddev);
 	if (err)
-		goto abort_export;
+		return err;
 
 	/*
 	 * The rest should better be atomic, we can have disk failures
@@ -4036,8 +4034,8 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	if (rdev->desc_nr == mddev->max_disks) {
 		printk(KERN_WARNING "%s: can not hot-add to full array!\n",
 			mdname(mddev));
-		err = -EBUSY;
-		goto abort_unbind_export;
+		kick_rdev_from_array(rdev);
+		return -EBUSY;
 	}
 
 	rdev->raid_disk = -1;
@@ -4052,13 +4050,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	md_wakeup_thread(mddev->thread);
 	md_new_event(mddev);
 	return 0;
-
-abort_unbind_export:
-	unbind_rdev_from_array(rdev);

^ permalink raw reply related

* Re: [Bugme-new] [Bug 9721] New: wake on lan fails with sky2 module
From: Stephen Hemminger @ 2008-01-10  4:52 UTC (permalink / raw)
  To: Andrew Morton, supersud501; +Cc: netdev, linux-acpi
In-Reply-To: <20080109160300.304ec687.akpm@linux-foundation.org>

On Wed, 9 Jan 2008 16:03:00 -0800
Andrew Morton <akpm@linux-foundation.org> wrote:

> 
> (switched to email.  Please respond via emailed reply-to-all, not via the
> bugzilla web interface).
> 
> On Wed,  9 Jan 2008 13:05:34 -0800 (PST)
> bugme-daemon@bugzilla.kernel.org wrote:
> 
> > http://bugzilla.kernel.org/show_bug.cgi?id=9721
> > 
> >            Summary: wake on lan fails with sky2 module
> >            Product: ACPI
> >            Version: 2.5
> >      KernelVersion: 2.6.24-rc7
> >           Platform: All
> >         OS/Version: Linux
> >               Tree: Mainline
> >             Status: NEW
> >           Severity: normal
> >           Priority: P1
> >          Component: Power-Sleep-Wake
> >         AssignedTo: acpi_power-sleep-wake@kernel-bugs.osdl.org
> >         ReportedBy: supersud501@yahoo.de
> 
> This post-2.6.23 regression was assigned to ACPI but is quite possibly a
> net driver problem?
> 
> > 
> > Latest working kernel version: 2.6.23.12
> > Earliest failing kernel version: 2.6.24-rc6 (not tested earlier kernel,
> > 2.6.24-rc7 still failing)
> > Distribution: Ubuntu 8.04 (but Kernel build from Kernel.org and system modifiet
> > to make wake on lan work, i.e. network cards are not shutted down on poweroff)
> > Hardware Environment: Marvell Technology Group Ltd. 88E8053 PCI-E Gigabit
> > Ethernet Controller (rev 20) onboard Asus P5W DH motherboard, uses module SKY2
> > Software Environment:
> > Problem Description:
> > 
> > When enabling wake on lan with: 'ethtool -s eth0 wol' i get the following
> > status:
> > 
> > 21:56:29 ~ # sudo ethtool eth0
> > Settings for eth0:
> >         Supported ports: [ TP ]
> >         Supported link modes:   10baseT/Half 10baseT/Full 
> >                                 100baseT/Half 100baseT/Full 
> >                                 1000baseT/Half 1000baseT/Full 
> >         Supports auto-negotiation: Yes
> >         Advertised link modes:  10baseT/Half 10baseT/Full 
> >                                 100baseT/Half 100baseT/Full 
> >                                 1000baseT/Half 1000baseT/Full 
> >         Advertised auto-negotiation: Yes
> >         Speed: 100Mb/s
> >         Duplex: Full
> >         Port: Twisted Pair
> >         PHYAD: 0
> >         Transceiver: internal
> >         Auto-negotiation: on
> >         Supports Wake-on: pg
> >         Wake-on: g    <---- wol enabled
> >         Current message level: 0x000000ff (255)
> >         Link detected: yes
> > 
> > but after shutting down the pc doesn't wake up when magic packet is sent.
> > 
> > the status lights of the network card are still on (so the card seems to be
> > online).
> > 
> > same system with only changed kernel to 2.6.23.12 and same procedure like
> > above: wake on lan works.
> > 
> > Steps to reproduce: enable wol on your network card using SKY2 module and it
> > doesn't work too?
> > 
> > if you need more information, just tell me, it's my first bug report.
> > regards
> > 


Wake from power off works on 2.6.24-rc7 for me.
Wake from suspend doesn't because Network Manager, HAL, or some other
user space tool gets confused.

I just rechecked it with Fujitsu Lifebook, which has sky2 (88E8055).
There many variations of this chip, and it maybe chip specific problem
or ACPI/BIOS issues.  If you don't enable Wake on Lan in BIOS, the
driver can't do it for you. Also, check how you are shutting down.

Also since the device has to restart the PHY, it could be a switch
issue if you have some fancy pants switch doing intrusion detection
or something, but I doubt that.

Is it a clean or fast shutdown, most distributions mark network
devices as down on shutdown, but if the distribution does something 
stupid like remove the driver module, then the driver is unable to setup Wake On Lan.
The wake on lan setup is done in one place in the driver, add
a printk to see if it is ever called.


-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply

* Re: [Bugme-new]  [Bug 9719] New: when a system is configured as a bridge, and at the same time configured to have multipath weighted route, with one leg goes thru NAT and another without NAT, the nat path will intermittently get packets leaking out using internal IP without being SNAT-ted
From: Ming-Ching Tiew @ 2008-01-10 17:25 UTC (permalink / raw)
  To: Ming-Ching Tiew; +Cc: Andrew Morton, bugme-daemon, netdev
In-Reply-To: <47864892.9010605@redtone.com>

Ming-Ching Tiew wrote:
> Ming-Ching Tiew wrote:
>>
>> What I meant was that it failed on both the kernel versions I tested. 
>> I am afraid it  is a problem which exists all a long. Perhaps it has 
>> been broken quite sometime already. I need to go back to try some 
>> older kernel version and see if I could repeat the problem.
>
> OK based on the I repeat the problem, so far I could not find such 
> misbehaviour on kernel 2.6.18. I will do more tests to make it more 
> conclusive.
>

Sorry for jumping the gun. Kernel 2.6.18 has the same problem too.
I think from now on, I will refrain from early posting until conclusive 
results.




^ permalink raw reply

* Re: Top 10 kernel oopses for the week ending January 5th, 2008
From: Neil Brown @ 2008-01-10  4:13 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Kevin Winchester, J. Bruce Fields,
	Arjan van de Ven, Linux Kernel Mailing List, Andrew Morton,
	NetDev, gregkh
In-Reply-To: <20080108055917.GZ27894@ZenIV.linux.org.uk>

On Tuesday January 8, viro@ZenIV.linux.org.uk wrote:
> 
> FWIW, I'm going to go through Arjan's collection and post blow-by-blow
> analysis of some of those suckers.  Tonight, probably...
> 
> Let's take e.g. http://www.kerneloops.org/raw.php?rawid=2618

Thanks for that analysis.
...
> 
> Humm...  So we have kobj->parent containing crap.  What about the caller?
> It's from drivers/md/md.c:
> static void delayed_delete(struct work_struct *ws)

This is a good argument for sticking "md_" at the from of all my
function names, even if they are static.  I'm fairly sure I looked at
that trace:

> Call Trace:
>  [<ffffffff803b37e9>] kobject_put+0x19/0x20
>  [<ffffffff803b389b>] kobject_del+0x2b/0x40
>  [<ffffffff804d7d50>] delayed_delete+0x0/0xb0
>  [<ffffffff804d7db9>] delayed_delete+0x69/0xb0
>  [<ffffffff80249775>] run_workqueue+0x175/0x210
>  [<ffffffff8024a411>] worker_thread+0x71/0xb0
>  [<ffffffff8024d9e0>] autoremove_wake_function+0x0/0x40
>  [<ffffffff8024a3a0>] worker_thread+0x0/0xb0
>  [<ffffffff8024d5fd>] kthread+0x4d/0x80
>  [<ffffffff8020c4b8>] child_rip+0xa/0x12
>  [<ffffffff8020bbcf>] restore_args+0x0/0x30
>  [<ffffffff8024d5b0>] kthread+0x0/0x80
>  [<ffffffff8020c4ae>] child_rip+0x0/0x12

but as it doesn't mention 'md' or 'nfs' I moved on.  My bad.

> 
> What guarantees that it doesn't happen before we get to callback?  AFAICS,
> nothing whatsoever...

Yes, that's bad isn't it :-)

I think I should be using sysfs_schedule_callback here.  That makes the 
required 'get' and 'put' calls.... but it can fail with -ENOMEM.  I
wonder what I do if -ENOMEM???  Maybe I'll just continue to roll my
one :-( 

Thanks,
NeilBrown


^ permalink raw reply

* Re: [Bugme-new]  [Bug 9719] New: when a system is configured as a bridge, and at the same time configured to have multipath weighted route, with one leg goes thru NAT and another without NAT, the nat path will intermittently get packets leaking out using internal IP without being SNAT-ted
From: Ming-Ching Tiew @ 2008-01-10 16:32 UTC (permalink / raw)
  To: Ming-Ching Tiew; +Cc: Andrew Morton, bugme-daemon, netdev
In-Reply-To: <4786187E.3090000@redtone.com>

Ming-Ching Tiew wrote:
>
> What I meant was that it failed on both the kernel versions I tested. 
> I am afraid it  is a problem which exists all a long. Perhaps it has 
> been broken quite sometime already. I need to go back to try some 
> older kernel version and see if I could repeat the problem.

OK based on the I repeat the problem, so far I could not find such 
misbehaviour on kernel 2.6.18. I will do more tests to make it more 
conclusive.





^ permalink raw reply

* Re: AF_UNIX MSG_PEEK bug?
From: Stephen Hemminger @ 2008-01-10  3:05 UTC (permalink / raw)
  To: Brent Casavant; +Cc: netdev
In-Reply-To: <alpine.BSF.1.00.0801092031290.35527@pkunk.americas.sgi.com>


> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 060bba4..2ffdf5b 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -50,6 +50,9 @@
>   *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
>   *	     				the core infrastructure is doing that
>   *	     				for all net proto families now (2.5.69+)
> + *		Brent Casavant	:	SOCK_STREAM MSG_PEEK should peek
> + *					far enough ahead to satisfy the request
> + *					rather than stop after the first skb.
>   *
>   *

The kernel doesn't use comments as a changlog anymore, that was so last century.


-- 
Stephen Hemminger <stephen.hemminger@vyatta.com>

^ permalink raw reply

* Re: AF_UNIX MSG_PEEK bug?
From: Brent Casavant @ 2008-01-10  2:50 UTC (permalink / raw)
  To: Herbert Xu; +Cc: penguin-kernel, netdev, davem, linux-kernel
In-Reply-To: <E1JCkrp-0000vX-00@gondolin.me.apana.org.au>

On Thu, 10 Jan 2008, Herbert Xu wrote:

> Having said that, I do agree that having TCP and AF_UNIX behave
> in the same way is a plus.  However, if you really want this to
> happen it would help if you had attached a patch :)

The following patch appears to fix the problem.  However, I would
really appreciate if someone more familiar with UNIX credential
passing could review what I did here and provide some feedback.
It wasn't at all clear to me why the MSG_PEEK and !MSG_PEEK code
paths were taking different actions regarding credentials, so I
unified them to behave the same way as the !MSG_PEEK code path.

I haven't tested credential passing under this new code yet, but
hope to do so tomorrow.  Again, a review with an eye towards
that area would be most appreciated.

The various test cases I had which tripped this bug now pass without
incident.  Also, if netstat is to be believed, quite a few programs
which utilize AF_UNIX sockets (e.g. hald, dbus, acpid) are running
very contentedly on the test system, so at least I didn't severely
break anything, possible UNIX credential issues notwithstanding.

Note: I'm not proposing this patch be integrated yet -- I'm throwing
it out here as a starting point.

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 060bba4..2ffdf5b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -50,6 +50,9 @@
  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
  *	     				the core infrastructure is doing that
  *	     				for all net proto families now (2.5.69+)
+ *		Brent Casavant	:	SOCK_STREAM MSG_PEEK should peek
+ *					far enough ahead to satisfy the request
+ *					rather than stop after the first skb.
  *
  *
  * Known differences from reference BSD that was tested:
@@ -1750,6 +1753,8 @@ static int unix_stream_recvmsg(struct ki
 	int target;
 	int err = 0;
 	long timeo;
+	struct sk_buff *skb;
+	struct sk_buff_head peek_stack;
 
 	err = -EINVAL;
 	if (sk->sk_state != TCP_ESTABLISHED)
@@ -1759,6 +1764,9 @@ static int unix_stream_recvmsg(struct ki
 	if (flags&MSG_OOB)
 		goto out;
 
+	if (flags & MSG_PEEK)
+		skb_queue_head_init(&peek_stack);
+
 	target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
 	timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
 
@@ -1778,7 +1786,6 @@ static int unix_stream_recvmsg(struct ki
 	do
 	{
 		int chunk;
-		struct sk_buff *skb;
 
 		unix_state_lock(sk);
 		skb = skb_dequeue(&sk->sk_receive_queue);
@@ -1845,39 +1852,31 @@ static int unix_stream_recvmsg(struct ki
 		copied += chunk;
 		size -= chunk;
 
-		/* Mark read part of skb as used */
-		if (!(flags & MSG_PEEK))
-		{
-			skb_pull(skb, chunk);
-
-			if (UNIXCB(skb).fp)
-				unix_detach_fds(siocb->scm, skb);
+		/* Credential passing */
+		if (UNIXCB(skb).fp)
+			unix_detach_fds(siocb->scm, skb);
 
-			/* put the skb back if we didn't use it up.. */
+		if (!(flags & MSG_PEEK)) {
+			/* Mark read part of skb as used */
+			skb_pull(skb, chunk);
+			/* Return unused portion or free skb */
 			if (skb->len)
-			{
 				skb_queue_head(&sk->sk_receive_queue, skb);
-				break;
-			}
-
-			kfree_skb(skb);
-
-			if (siocb->scm->fp)
-				break;
-		}
-		else
-		{
-			/* It is questionable, see note in unix_dgram_recvmsg.
-			 */
-			if (UNIXCB(skb).fp)
-				siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+			else
+				kfree_skb(skb);
+		} else
+			__skb_queue_head(&peek_stack, skb);
 
-			/* put message back and return */
-			skb_queue_head(&sk->sk_receive_queue, skb);
+		/* Stop early when passed credentials are encountered */
+		if (siocb->scm->fp)
 			break;
-		}
 	} while (size);
 
+	/* Push all peeked skbs back onto receive queue */
+	if (flags & MSG_PEEK)
+		while ((skb = __skb_dequeue(&peek_stack)))
+			skb_queue_head(&sk->sk_receive_queue, skb);
+
 	mutex_unlock(&u->readlock);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:

^ permalink raw reply related

* Re: AF_UNIX MSG_PEEK bug?
From: Brent Casavant @ 2008-01-10  1:19 UTC (permalink / raw)
  To: Herbert Xu; +Cc: penguin-kernel, netdev, davem, linux-kernel
In-Reply-To: <E1JCkrp-0000vX-00@gondolin.me.apana.org.au>

On Thu, 10 Jan 2008, Herbert Xu wrote:

> The POSIX text for MSG_WAITALL actually says that when used in
> conjunction with MSG_PEEK it may not return the full data.

That's fine.  The problem is that the peek operation returns less
data than requested even when sufficient data is available on the
receive queue.

> However, if you really want this to
> happen it would help if you had attached a patch :)

A patch is definitely in progress.  I'm a little confused as
to the difference between unix_detach_fds() and scm_fp_dup()
in the MSG_PEEK versus !MSG_PEEK paths in unix_stream_recvmsg(),
however once I figure that out a patch should be forthcoming.

Brent

-- 
Brent Casavant                          All music is folk music.  I ain't
bcasavan@sgi.com                        never heard a horse sing a song.
Silicon Graphics, Inc.                    -- Louis Armstrong

^ permalink raw reply

* Re: AF_UNIX MSG_PEEK bug?
From: Herbert Xu @ 2008-01-10  0:01 UTC (permalink / raw)
  To: Brent Casavant; +Cc: penguin-kernel, netdev, davem, linux-kernel
In-Reply-To: <alpine.BSF.1.00.0801091201180.35527@pkunk.americas.sgi.com>

Brent Casavant <bcasavan@sgi.com> wrote:
> 
>> Did you try MSG_WAITALL flag? See "man 2 recv".
>> A TCP socket handles data in bytes.
>> You cannot complain if the amount received by recv() is smaller than expected
>> unless you use MSG_WAITALL flag.
> 
> Yes. It made no difference, as noted in the comments in the
> provided test program.

The POSIX text for MSG_WAITALL actually says that when used in
conjunction with MSG_PEEK it may not return the full data.

Having said that, I do agree that having TCP and AF_UNIX behave
in the same way is a plus.  However, if you really want this to
happen it would help if you had attached a patch :)

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH 0/3] bonding: 3 fixes for 2.6.24
From: Herbert Xu @ 2008-01-10  0:58 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Andy Gospodarek, Krzysztof Oledzki, netdev, Jeff Garzik,
	David Miller
In-Reply-To: <7603.1199920750@death>

On Wed, Jan 09, 2008 at 03:19:10PM -0800, Jay Vosburgh wrote:
>
> >No that's not the point.  The point is to move the majority of the code
> >into process context so that you can take the RTNL.  Once you have taken
> >the RTNL you can disable BH all you want and I don't care one bit.
> 
> 	I'm not sure how we could move more code into a process context;
> much of the bonding driver is at the mercy of its callers, as in this
> case.  The monitoring stuff and enslave / deslave is all in a process
> context now (workqueue).  The transmit processing functions, for
> example, can't be assumed to be in any particular context as they're
> called by dev_queue_xmit.

No I'm not calling for you to move any more code into process context.
I was replying to the comment that changing the read_lock calls in
process context to read_lock_bh somehow undoes the benefit of moving
softirq code into process context.  It does not since the point of the
move is to be able to take the RTNL, which you can still do as long as
you do it before you disable BH.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: 2.6.24-rc6-mm1
From: FUJITA Tomonori @ 2008-01-10  0:54 UTC (permalink / raw)
  To: jarkao2
  Cc: fujita.tomonori, mingo, akpm, just.for.lkml, tomof, herbert,
	linux-kernel, neilb, bfields, netdev, tom
In-Reply-To: <20080109090442.GA1746@ff.dom.local>

On Wed, 9 Jan 2008 10:04:42 +0100
Jarek Poplawski <jarkao2@gmail.com> wrote:

> On Wed, Jan 09, 2008 at 08:57:53AM +0900, FUJITA Tomonori wrote:
> ...
> > diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
> > new file mode 100644
> > index 0000000..495575a
> > --- /dev/null
> > +++ b/lib/iommu-helper.c
> > @@ -0,0 +1,80 @@
> > +/*
> > + * IOMMU helper functions for the free area management
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/bitops.h>
> > +
> > +static unsigned long find_next_zero_area(unsigned long *map,
> > +					 unsigned long size,
> > +					 unsigned long start,
> > +					 unsigned int nr,
> > +					 unsigned long align_mask)
> > +{
> > +	unsigned long index, end, i;
> > +again:
> > +	index = find_next_zero_bit(map, size, start);
> > +
> > +	/* Align allocation */
> > +	index = (index + align_mask) & ~align_mask;
> > +
> > +	end = index + nr;
> > +	if (end >= size)
> > +		return -1;
> 
> This '>=' looks doubtful to me, e.g.:
> map points to 0s only,  size = 64, nr = 64,
> we get: index = 0; end = 64;
> and: return -1 ?!

You are right. I did it only because I didn't want to change the
original code (iommu_range_alloc in arch/powerpc/kernel/iommu.c). I
thought that there might be a mysterious reason for it so I let it
alone since it's tiny loss.

Thanks,

^ permalink raw reply

* Re: [Bugme-new]  [Bug 9719] New: when a system is configured as a bridge, and at the same time configured to have multipath weighted route, with one leg goes thru NAT and another without NAT, the nat path will intermittently get packets leaking out using internal IP without being SNAT-ted
From: Ming-Ching Tiew @ 2008-01-10 13:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bugme-daemon, netdev
In-Reply-To: <20080109152813.83fb8168.akpm@linux-foundation.org>

Andrew Morton wrote:
> (switched to email.  Please respond via emailed reply-to-all, not via the
> bugzilla web interface).
>
> On Wed,  9 Jan 2008 11:55:50 -0800 (PST)
> bugme-daemon@bugzilla.kernel.org wrote:
>
>   
>> http://bugzilla.kernel.org/show_bug.cgi?id=9719
>>
>>            Summary: when a system is configured as a bridge, and at the same
>>                     time configured to have multipath weighted route, with
>>                     one leg goes thru NAT and another without NAT, the nat
>>                     path will intermittently get packets leaking out using
>>                     internal IP without being SNAT-ted
>>            Product: Networking
>>            Version: 2.5
>>      KernelVersion: 2.6.22.15 and 2.6.23
>>           Platform: All
>>         OS/Version: Linux
>>               Tree: Mainline
>>             Status: NEW
>>           Severity: normal
>>           Priority: P1
>>          Component: Netfilter/Iptables
>>         AssignedTo: networking_netfilter-iptables@kernel-bugs.osdl.org
>>         ReportedBy: mingching.tiew@redtone.com
>>
>>
>> Latest working kernel version: 2.6.23
>> Earliest failing kernel version: 2.6.22.15
>>     
>
> This doesn't make sense.  What we're trying to ask here (and we've been
> unable to find a pair of questions which 100% of reporters can successfully
> answer) is whether this is a regression, and in which kernel release did we
> regress?
>
> In other words: did we break it, and if so, when did we break it?
>   

Sorry for the confusion and for such a lousy first time bug reporter.

I realized that mistake immediately after I posted it on the web 
interface. However, the web interface does not seem to allow me to 
correct that.

What I meant was that it failed on both the kernel versions I tested. I 
am afraid it  is a problem which exists all a long. Perhaps it has been 
broken quite sometime already. I need to go back to try some older 
kernel version and see if I could repeat the problem.

Regards.


^ permalink raw reply

* Re: Linux IPv6 DAD not full conform to RFC 4862 ?
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-01-10  0:09 UTC (permalink / raw)
  To: davem; +Cc: kkeil, netdev, yoshfuji
In-Reply-To: <20080109.155544.136768603.davem@davemloft.net>

In article <20080109.155544.136768603.davem@davemloft.net> (at Wed, 09 Jan 2008 15:55:44 -0800 (PST)), David Miller <davem@davemloft.net> says:

> Because of the above, the existing behavior must still stay the
> default.  I hope this is your plan.
> 
> By default Linux will not implement this SHOULD, it's a security
> issue.

Yes so far, though we may have more things to consider.

--yoshfuji

^ permalink raw reply

* Re: [Bugme-new] [Bug 9721] New: wake on lan fails with sky2 module
From: Andrew Morton @ 2008-01-10  0:03 UTC (permalink / raw)
  To: netdev, linux-acpi; +Cc: bugme-daemon, supersud501
In-Reply-To: <bug-9721-10286@http.bugzilla.kernel.org/>


(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).

On Wed,  9 Jan 2008 13:05:34 -0800 (PST)
bugme-daemon@bugzilla.kernel.org wrote:

> http://bugzilla.kernel.org/show_bug.cgi?id=9721
> 
>            Summary: wake on lan fails with sky2 module
>            Product: ACPI
>            Version: 2.5
>      KernelVersion: 2.6.24-rc7
>           Platform: All
>         OS/Version: Linux
>               Tree: Mainline
>             Status: NEW
>           Severity: normal
>           Priority: P1
>          Component: Power-Sleep-Wake
>         AssignedTo: acpi_power-sleep-wake@kernel-bugs.osdl.org
>         ReportedBy: supersud501@yahoo.de

This post-2.6.23 regression was assigned to ACPI but is quite possibly a
net driver problem?

> 
> Latest working kernel version: 2.6.23.12
> Earliest failing kernel version: 2.6.24-rc6 (not tested earlier kernel,
> 2.6.24-rc7 still failing)
> Distribution: Ubuntu 8.04 (but Kernel build from Kernel.org and system modifiet
> to make wake on lan work, i.e. network cards are not shutted down on poweroff)
> Hardware Environment: Marvell Technology Group Ltd. 88E8053 PCI-E Gigabit
> Ethernet Controller (rev 20) onboard Asus P5W DH motherboard, uses module SKY2
> Software Environment:
> Problem Description:
> 
> When enabling wake on lan with: 'ethtool -s eth0 wol' i get the following
> status:
> 
> 21:56:29 ~ # sudo ethtool eth0
> Settings for eth0:
>         Supported ports: [ TP ]
>         Supported link modes:   10baseT/Half 10baseT/Full 
>                                 100baseT/Half 100baseT/Full 
>                                 1000baseT/Half 1000baseT/Full 
>         Supports auto-negotiation: Yes
>         Advertised link modes:  10baseT/Half 10baseT/Full 
>                                 100baseT/Half 100baseT/Full 
>                                 1000baseT/Half 1000baseT/Full 
>         Advertised auto-negotiation: Yes
>         Speed: 100Mb/s
>         Duplex: Full
>         Port: Twisted Pair
>         PHYAD: 0
>         Transceiver: internal
>         Auto-negotiation: on
>         Supports Wake-on: pg
>         Wake-on: g    <---- wol enabled
>         Current message level: 0x000000ff (255)
>         Link detected: yes
> 
> but after shutting down the pc doesn't wake up when magic packet is sent.
> 
> the status lights of the network card are still on (so the card seems to be
> online).
> 
> same system with only changed kernel to 2.6.23.12 and same procedure like
> above: wake on lan works.
> 
> Steps to reproduce: enable wol on your network card using SKY2 module and it
> doesn't work too?
> 
> if you need more information, just tell me, it's my first bug report.
> regards
> 
> 
> -- 
> Configure bugmail: http://bugzilla.kernel.org/userprefs.cgi?tab=email
> ------- You are receiving this mail because: -------
> You are on the CC list for the bug, or are watching someone who is.

^ permalink raw reply

* Re: [PATCH 1/3] drivers/net/ipg.c: Fix skbuff leak
From: David Miller @ 2008-01-09 23:56 UTC (permalink / raw)
  To: romieu; +Cc: linux, akpm, netdev
In-Reply-To: <20080109233458.GB5673@electric-eye.fr.zoreil.com>

From: Francois Romieu <romieu@fr.zoreil.com>
Date: Thu, 10 Jan 2008 00:34:58 +0100

> David Miller <davem@davemloft.net> :
> [...]
> > all resolved.  All of that code cleanup stuff needs to wait
> > until later, let's fix bugs before adding new ones. :-)
> 
> Yes.
> 
> I should be able to test your r8169 NAPI changes tomorrow.

Thank you.

^ permalink raw reply

* Re: Linux IPv6 DAD not full conform to RFC 4862 ?
From: David Miller @ 2008-01-09 23:55 UTC (permalink / raw)
  To: yoshfuji; +Cc: kkeil, netdev
In-Reply-To: <20080110.084655.66182071.yoshfuji@linux-ipv6.org>

From: YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org>
Date: Thu, 10 Jan 2008 08:46:55 +0900 (JST)

> In article <20080109.153212.144388472.davem@davemloft.net> (at Wed, 09 Jan 2008 15:32:12 -0800 (PST)), David Miller <davem@davemloft.net> says:
> 
> > I question any RFC mandate that shuts down IP communication on a node
> > because of packets received from remote systems.
> 
> RFC4862 tell us that we SHOULD disable IP communication.
> (IP means IPv6 here; IPv4 is out of scope.)
> In IETF term, a SHOULD is almost a MUST.  We are required to follow
> unless we have very good reason to ignore it.

A DoS by definition is a very good reason.

> > If the TAHI test can trigger this, so can a compromised system on your
> > network and won't that be fun? :-)
> 
> So, I know the specification, but I have ignored it.
> I think it is fine to implent in some way, but I do think we must have
> a switch not to do this.

Because of the above, the existing behavior must still stay the
default.  I hope this is your plan.

By default Linux will not implement this SHOULD, it's a security
issue.

I more and more do not like these conformance tests, they leave no
room whatsoever for handling bugs or ill-specified features in the
specification.

^ permalink raw reply

* Re: [PATCH 1/3] drivers/net/ipg.c: Fix skbuff leak
From: Francois Romieu @ 2008-01-09 23:34 UTC (permalink / raw)
  To: David Miller; +Cc: linux, akpm, netdev
In-Reply-To: <20080109.003952.25384040.davem@davemloft.net>

David Miller <davem@davemloft.net> :
[...]
> all resolved.  All of that code cleanup stuff needs to wait
> until later, let's fix bugs before adding new ones. :-)

Yes.

I should be able to test your r8169 NAPI changes tomorrow.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH 1/3] drivers/net/ipg.c: Fix skbuff leak
From: Francois Romieu @ 2008-01-09 23:30 UTC (permalink / raw)
  To: linux; +Cc: akpm, davem, netdev
In-Reply-To: <20080109003840.22917.qmail@science.horizon.com>

linux@horizon.com <linux@horizon.com> :
[...]
> That doesn't seem to do it.  Not entirely, at least.  After downloading
> and partially re-uploading an 800M file, slabtop reports:

Ok, enjoy this one. It is definitely better wrt the current problem.

More work tomorrow.

diff --git a/drivers/net/ipg.c b/drivers/net/ipg.c
index dbd23bb..42f300d 100644
--- a/drivers/net/ipg.c
+++ b/drivers/net/ipg.c
@@ -860,18 +860,18 @@ static void ipg_nic_txfree(struct net_device *dev)
 	void __iomem *ioaddr = sp->ioaddr;
 	unsigned int curr;
 	u64 txd_map;
-	unsigned int released, pending;
+	unsigned int released, pending, dirty;
 
 	txd_map = (u64)sp->txd_map;
 	curr = ipg_r32(TFD_LIST_PTR_0) -
 		do_div(txd_map, sizeof(struct ipg_tx)) - 1;
 
 	IPG_DEBUG_MSG("_nic_txfree\n");
 
 	pending = sp->tx_current - sp->tx_dirty;
+	dirty = sp->tx_dirty % IPG_TFDLIST_LENGTH;
 
 	for (released = 0; released < pending; released++) {
-		unsigned int dirty = sp->tx_dirty % IPG_TFDLIST_LENGTH;
 		struct sk_buff *skb = sp->TxBuff[dirty];
 		struct ipg_tx *txfd = sp->txd + dirty;
 
@@ -882,8 +884,11 @@ static void ipg_nic_txfree(struct net_device *dev)
 		 * If the TFDDone bit is set, free the associated
 		 * buffer.
 		 */
-		if (dirty == curr)
+		if (!(txfd->tfc & cpu_to_le64(IPG_TFC_TFDDONE))) {
+			printk(KERN_INFO "%s: released = %d pending = %d\n",
+				dev->name, released, pending);
 			break;
+		}
 
 		/* Setup TFDDONE for compatible issue. */
 		txfd->tfc |= cpu_to_le64(IPG_TFC_TFDDONE);
@@ -898,6 +903,7 @@ static void ipg_nic_txfree(struct net_device *dev)
 
 			sp->TxBuff[dirty] = NULL;
 		}
+		dirty = (dirty + 1) % IPG_TFDLIST_LENGTH;
 	}
 
 	sp->tx_dirty += released;
@@ -1943,10 +1948,7 @@ static int ipg_nic_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	if (sp->tenmbpsmode)
 		txfd->tfc |= cpu_to_le64(IPG_TFC_TXINDICATE);
-	else if (!((sp->tx_current - sp->tx_dirty + 1) >
-	    IPG_FRAMESBETWEENTXDMACOMPLETES)) {
-		txfd->tfc |= cpu_to_le64(IPG_TFC_TXDMAINDICATE);
-	}
+	txfd->tfc |= cpu_to_le64(IPG_TFC_TXDMAINDICATE);
 	/* Based on compilation option, determine if FCS is to be
 	 * appended to transmit frame by IPG.
 	 */
@@ -2003,7 +2005,7 @@ static int ipg_nic_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	ipg_w32(IPG_DC_TX_DMA_POLL_NOW, DMA_CTRL);
 
 	if (sp->tx_current == (sp->tx_dirty + IPG_TFDLIST_LENGTH))
-		netif_wake_queue(dev);
+		netif_stop_queue(dev);
 
 	spin_unlock_irqrestore(&sp->lock, flags);
 
-- 
Ueimor

Anybody got a battery for my Ultra 10 ?

^ permalink raw reply related

* Re: Linux IPv6 DAD not full conform to RFC 4862 ?
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-01-09 23:46 UTC (permalink / raw)
  To: davem; +Cc: kkeil, netdev, yoshfuji
In-Reply-To: <20080109.153212.144388472.davem@davemloft.net>

In article <20080109.153212.144388472.davem@davemloft.net> (at Wed, 09 Jan 2008 15:32:12 -0800 (PST)), David Miller <davem@davemloft.net> says:

> I question any RFC mandate that shuts down IP communication on a node
> because of packets received from remote systems.

RFC4862 tell us that we SHOULD disable IP communication.
(IP means IPv6 here; IPv4 is out of scope.)
In IETF term, a SHOULD is almost a MUST.  We are required to follow
unless we have very good reason to ignore it.

> If the TAHI test can trigger this, so can a compromised system on your
> network and won't that be fun? :-)

So, I know the specification, but I have ignored it.
I think it is fine to implent in some way, but I do think we must have
a switch not to do this.

--yoshfuji

^ permalink raw reply

* Re: [PATCH 0/0]: Cassini bug fixes.
From: David Miller @ 2008-01-09 23:33 UTC (permalink / raw)
  To: panther; +Cc: netdev, bazsi, hidden
In-Reply-To: <4784F293.9050707@balabit.hu>

From: Laszlo Attila Toth <panther@balabit.hu>
Date: Wed, 09 Jan 2008 17:13:07 +0100

> We tested the card, it works well, all previous bugs are gone (truesize 
> bug messages and memory comsumption).

Thank you for testing.

^ permalink raw reply

* Re: Linux IPv6 DAD not full conform to RFC 4862 ?
From: David Miller @ 2008-01-09 23:32 UTC (permalink / raw)
  To: kkeil; +Cc: netdev
In-Reply-To: <20080109153656.GA16962@pingi.kke.suse.de>

From: Karsten Keil <kkeil@suse.de>
Date: Wed, 9 Jan 2008 16:36:56 +0100

>    If the address is a link-local address formed from an interface
>    identifier based on the hardware address, which is supposed to be
>    uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
>    operation on the interface SHOULD be disabled.  By disabling IP
>    operation, the node will then:
> 
>    -  not send any IP packets from the interface,
> 
>    -  silently drop any IP packets received on the interface, and
> 
>    -  not forward any IP packets to the interface (when acting as a
>       router or processing a packet with a Routing header).

I question any RFC mandate that shuts down IP communication on a node
because of packets received from remote systems.

If the TAHI test can trigger this, so can a compromised system on your
network and won't that be fun? :-)

^ permalink raw reply

* Re: [Bugme-new]  [Bug 9719] New: when a system is configured as a bridge, and at the same time configured to have multipath weighted route, with one leg goes thru NAT and another without NAT, the nat path will intermittently get packets leaking out using internal IP without being SNAT-ted
From: Andrew Morton @ 2008-01-09 23:28 UTC (permalink / raw)
  To: mingching.tiew; +Cc: bugme-daemon, netdev
In-Reply-To: <bug-9719-10286@http.bugzilla.kernel.org/>


(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).

On Wed,  9 Jan 2008 11:55:50 -0800 (PST)
bugme-daemon@bugzilla.kernel.org wrote:

> http://bugzilla.kernel.org/show_bug.cgi?id=9719
> 
>            Summary: when a system is configured as a bridge, and at the same
>                     time configured to have multipath weighted route, with
>                     one leg goes thru NAT and another without NAT, the nat
>                     path will intermittently get packets leaking out using
>                     internal IP without being SNAT-ted
>            Product: Networking
>            Version: 2.5
>      KernelVersion: 2.6.22.15 and 2.6.23
>           Platform: All
>         OS/Version: Linux
>               Tree: Mainline
>             Status: NEW
>           Severity: normal
>           Priority: P1
>          Component: Netfilter/Iptables
>         AssignedTo: networking_netfilter-iptables@kernel-bugs.osdl.org
>         ReportedBy: mingching.tiew@redtone.com
> 
> 
> Latest working kernel version: 2.6.23
> Earliest failing kernel version: 2.6.22.15

This doesn't make sense.  What we're trying to ask here (and we've been
unable to find a pair of questions which 100% of reporters can successfully
answer) is whether this is a regression, and in which kernel release did we
regress?

In other words: did we break it, and if so, when did we break it?

> Distribution: iptables 1.4.0 was used with kernel 2.6.23 and iptables 1.3.8
> with 2.6.22.15
> Hardware Environment: 3 interfaces, 2 interfaces bridged to form br0, and
> another connects to internet using pppoe.
> Software Environment: bridge, multipath routing
> Problem Description: when a system is configured as a bridge with IP assigned
> to br0 interface, and at the same time it is configured to have multipath
> weighted default route, and one of the default route is NAT-ed and another of
> the default route is not NAT-ed, then it is NAT-ed interface will occasionally
> get packets leaking out to it with packets with private IPs.
> 
> Steps to reproduce: 
> 1) setup the bridge interface and assign an IP to it 
> 2) setup an default gateway on side B of the bridge ( without NAT ) and default
> route the bridge to this gateway. 
> 3) Setup a client on side A of the bridge and default route to the bridge br0
> interface.
> 4) Start ping'ing an internet site, for example www.google.com from the client.
>    Run the ping continuously, for example :-
>          while true
>          do
>             ping -c 1 www.google.com
>             sleep 1
>          done
> 5) after successfully and consistently getting a ping response from the
> www.google.com, on the bridge system start up another uplink to the internet,
> but this uplink is SNAT-ed 
> 
>        ( eg iptables -t nat -A POSTROUTING -o eth2 -j MASQUERADE ) 
> 
> 6) verify and make sure that the second uplink is working.
> 7) change the default route on the bridge to multipath weighted route with
> equal weight on both the uplinks.
> 8) sniff the NAT-ed inteface for packets coming in from the LAN client.
> Occasionallly packets with private IP leaks to the NAT-ed interface.
> 


^ permalink raw reply

* Re: Re : Re : Re : Bonding : Monitoring of 4965 wireless card
From: Jay Vosburgh @ 2008-01-09 23:26 UTC (permalink / raw)
  To: patnel972-linux; +Cc: netdev
In-Reply-To: <416779.67064.qm@web25704.mail.ukl.yahoo.com>

patnel972-linux@yahoo.fr wrote:

>I mean that instead of arp test an ip in lan or else, i want it to test 127.0.0.1 but in order to do this it must go out and re-enter and then use wlan0 to go out.

	In other words, what I think you're saying (and I'm not entirely
sure here) is that you want probes to go to a remote node on the
network, and back, without having to actually know the identity of the
remote node (because, presumably, on a roaming type of wireless
configuration, your gateway and whatnot can change from time to time).

	Is that what you're looking for?

	That isn't available now, but might be straightforward to plug
into the address update system to keep the arp_ip_target up to date as
the current gateway as the gateway changes.  I haven't looked into the
details of doing that, but in theory it sounds straightforward.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com


>
>----- Message d'origine ----
>De : Jay Vosburgh <fubar@us.ibm.com>
>À : patnel972-linux@yahoo.fr
>Cc : John W. Linville <linville@tuxdriver.com>; netdev@vger.kernel.org
>Envoyé le : Mercredi, 9 Janvier 2008, 22h36mn 00s
>Objet : Re: Re : Re : Bonding : Monitoring of 4965 wireless card 
>
>patnel972-linux@yahoo.fr wrote:
>
>>I ignore it, but it seems like it prevent bonding detect link of
> wlan0. I enslave wlan0 and i already use use_carrier=1;
>
>    The default for bonding is use_carrier=1, which makes bonding
>use the device driver's netif_carrier_on/off state for link detection.
>Bonding only checks via ethtool/mii if use_carrier=0.
>
>>I'll try arp monitoring but this is annoying i c'ant test localhost.
> Is there a way to test localhost with arp, without pass through lo ? 
>
>    What do you mean by "test localhost with arp, without pass
>through lo"?  ARP monitoring issues probes (ARPs) to a remote
>destination to confirm that there is connectivity; I'm not sure what
>localhost has to do with it.
>
>    In general, though, I have not tested bonding with wireless
>adapters, so I'm unfamiliar with how well it does or does not work.
>
>    -J
>
>---
>    -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
>
>
>
>
>
>      _____________________________________________________________________________ 
>Ne gardez plus qu'une seule adresse mail ! Copiez vos mails vers Yahoo! Mail http://mail.yahoo.fr
>--
>To unsubscribe from this list: send the line "unsubscribe netdev" in
>the body of a message to majordomo@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 0/3] bonding: 3 fixes for 2.6.24
From: Jay Vosburgh @ 2008-01-09 23:19 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Andy Gospodarek, Krzysztof Oledzki, netdev, Jeff Garzik,
	David Miller
In-Reply-To: <20080109220534.GA2692@gondor.apana.org.au>

Herbert Xu <herbert@gondor.apana.org.au> wrote:

>On Wed, Jan 09, 2008 at 03:17:09PM -0500, Andy Gospodarek wrote:
>>
>> Agreed.  And despite Herbert's opinion that this isn't the correct fix,
>> I think this will work fine.  This is one of the cases where we can take
>> a write_lock(bond->lock) in softirq context, so we need to drop that (or
>> make sure all the read_lock's are read_lock_bh's).  The latter isn't
>> really an option since having a majority of the bonding code run in
>> softirq context was what we are trying to avoid with the workqueue
>> conversion.
>
>No that's not the point.  The point is to move the majority of the code
>into process context so that you can take the RTNL.  Once you have taken
>the RTNL you can disable BH all you want and I don't care one bit.

	I'm not sure how we could move more code into a process context;
much of the bonding driver is at the mercy of its callers, as in this
case.  The monitoring stuff and enslave / deslave is all in a process
context now (workqueue).  The transmit processing functions, for
example, can't be assumed to be in any particular context as they're
called by dev_queue_xmit.

	The function in question here is the dev->set_multicast_list
method function, which is sometimes called with RTNL, and sometimes with
other random locks, but seems to always be called with netif_tx_lock_bh.
Then, bonding's bond_set_multicast_list calls dev_set_promiscuity (on
slaves), which I believe is an "RTNL only" function (which is probably
another discussion).

>In any case, fixing a known dead-lock is important.

	Agreed.

	I'm now thinking for just the topic at hand (the previously
posted lockdep report), something like this will resolve it:

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 77d004d..b7ac10b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3937,8 +3937,6 @@ static void bond_set_multicast_list(struct net_device *bond_dev)
 	struct bonding *bond = bond_dev->priv;
 	struct dev_mc_list *dmi;
 
-	write_lock_bh(&bond->lock);
-
 	/*
 	 * Do promisc before checking multicast_mode
 	 */
@@ -3959,6 +3957,8 @@ static void bond_set_multicast_list(struct net_device *bond_dev)
 		bond_set_allmulti(bond, -1);
 	}
 
+	read_lock_bh(&bond->lock);
+
 	bond->flags = bond_dev->flags;
 
 	/* looking for addresses to add to slaves' mc list */
@@ -3979,7 +3979,7 @@ static void bond_set_multicast_list(struct net_device *bond_dev)
 	bond_mc_list_destroy(bond);
 	bond_mc_list_copy(bond_dev->mc_list, bond, GFP_ATOMIC);
 
-	write_unlock_bh(&bond->lock);
+	read_unlock_bh(&bond->lock);
 }
 
 /*


	This (a) converts the write_lock_bh to a read_lock_bh, and moves
it down to not cover the promisc/allmulti code (which is protected by
RTNL).  I'm not sure that this is really a signficant alteration, but
it's a bit closer to "correct."

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply related

* Re: [PATCH 0/3] bonding: 3 fixes for 2.6.24
From: Herbert Xu @ 2008-01-09 22:05 UTC (permalink / raw)
  To: Andy Gospodarek
  Cc: Jay Vosburgh, Krzysztof Oledzki, netdev, Jeff Garzik,
	David Miller
In-Reply-To: <20080109201709.GF8728@gospo.usersys.redhat.com>

On Wed, Jan 09, 2008 at 03:17:09PM -0500, Andy Gospodarek wrote:
>
> Agreed.  And despite Herbert's opinion that this isn't the correct fix,
> I think this will work fine.  This is one of the cases where we can take
> a write_lock(bond->lock) in softirq context, so we need to drop that (or
> make sure all the read_lock's are read_lock_bh's).  The latter isn't
> really an option since having a majority of the bonding code run in
> softirq context was what we are trying to avoid with the workqueue
> conversion.

No that's not the point.  The point is to move the majority of the code
into process context so that you can take the RTNL.  Once you have taken
the RTNL you can disable BH all you want and I don't care one bit.

In any case, fixing a known dead-lock is important.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox