* [PATCH] fib_hash: fix rcu sparse and logical errors
From: Eric Dumazet @ 2010-10-26 13:24 UTC (permalink / raw)
To: David Miller; +Cc: netdev
While fixing CONFIG_SPARSE_RCU_POINTER errors, I had to fix accesses to
fz->fz_hash for real.
- &fz->fz_hash[fn_hash(f->fn_key, fz)]
+ rcu_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz)
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/ipv4/fib_hash.c | 36 ++++++++++++++++++++----------------
1 file changed, 20 insertions(+), 16 deletions(-)
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 43e1c59..b232375 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -120,11 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
struct fib_node *f;
hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head __rcu *new_head;
+ struct hlist_head *new_head;
hlist_del_rcu(&f->fn_hash);
- new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+ new_head = rcu_dereference_protected(fz->fz_hash, 1) +
+ fn_hash(f->fn_key, fz);
hlist_add_head_rcu(&f->fn_hash, new_head);
}
}
@@ -179,8 +180,8 @@ static void fn_rehash_zone(struct fn_zone *fz)
memcpy(&nfz, fz, sizeof(nfz));
write_seqlock_bh(&fz->fz_lock);
- old_ht = fz->fz_hash;
- nfz.fz_hash = ht;
+ old_ht = rcu_dereference_protected(fz->fz_hash, 1);
+ RCU_INIT_POINTER(nfz.fz_hash, ht);
nfz.fz_hashmask = new_hashmask;
nfz.fz_divisor = new_divisor;
fn_rebuild_zone(&nfz, old_ht, old_divisor);
@@ -236,7 +237,7 @@ fn_new_zone(struct fn_hash *table, int z)
seqlock_init(&fz->fz_lock);
fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
fz->fz_hashmask = fz->fz_divisor - 1;
- fz->fz_hash = fz->fz_embedded_hash;
+ RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
fz->fz_order = z;
fz->fz_revorder = 32 - z;
fz->fz_mask = inet_make_mask(z);
@@ -272,7 +273,7 @@ int fib_table_lookup(struct fib_table *tb,
for (fz = rcu_dereference(t->fn_zone_list);
fz != NULL;
fz = rcu_dereference(fz->fz_next)) {
- struct hlist_head __rcu *head;
+ struct hlist_head *head;
struct hlist_node *node;
struct fib_node *f;
__be32 k;
@@ -282,7 +283,7 @@ int fib_table_lookup(struct fib_table *tb,
seq = read_seqbegin(&fz->fz_lock);
k = fz_key(flp->fl4_dst, fz);
- head = &fz->fz_hash[fn_hash(k, fz)];
+ head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key != k)
continue;
@@ -311,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb,
struct fib_info *last_resort;
struct fn_hash *t = (struct fn_hash *)tb->tb_data;
struct fn_zone *fz = t->fn_zones[0];
+ struct hlist_head *head;
if (fz == NULL)
return;
@@ -320,7 +322,8 @@ void fib_table_select_default(struct fib_table *tb,
order = -1;
rcu_read_lock();
- hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
+ head = rcu_dereference(fz->fz_hash);
+ hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
@@ -374,7 +377,7 @@ out:
/* Insert node F to FZ. */
static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
{
- struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
hlist_add_head_rcu(&f->fn_hash, head);
}
@@ -382,7 +385,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
/* Return the node in FZ matching KEY. */
static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
{
- struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
struct hlist_node *node;
struct fib_node *f;
@@ -662,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
static int fn_flush_list(struct fn_zone *fz, int idx)
{
- struct hlist_head *head = &fz->fz_hash[idx];
+ struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
struct hlist_node *node, *n;
struct fib_node *f;
int found = 0;
@@ -761,14 +764,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
struct fn_zone *fz)
{
int h, s_h;
+ struct hlist_head *head = rcu_dereference(fz->fz_hash);
- if (fz->fz_hash == NULL)
+ if (head == NULL)
return skb->len;
s_h = cb->args[3];
for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(&fz->fz_hash[h]))
+ if (hlist_empty(head + h))
continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) {
+ if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
cb->args[3] = h;
return -1;
}
@@ -872,7 +876,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
if (!iter->zone->fz_nent)
continue;
- iter->hash_head = iter->zone->fz_hash;
+ iter->hash_head = rcu_dereference(iter->zone->fz_hash);
maxslot = iter->zone->fz_divisor;
for (iter->bucket = 0; iter->bucket < maxslot;
@@ -957,7 +961,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
goto out;
iter->bucket = 0;
- iter->hash_head = iter->zone->fz_hash;
+ iter->hash_head = rcu_dereference(iter->zone->fz_hash);
hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
list_for_each_entry(fa, &fn->fn_alias, fa_list) {
^ permalink raw reply related
* Re: [PATCH net-next-2.6] net: Fix some corner cases in dev_can_checksum()
From: Ben Hutchings @ 2010-10-26 13:29 UTC (permalink / raw)
To: Jesse Gross; +Cc: David Miller, netdev
In-Reply-To: <AANLkTi=PiB_oCvaFzQpUNhgV8qsn9d-jy_ejGdbOTzQe@mail.gmail.com>
Jesse Gross wrote:
> On Fri, Oct 22, 2010 at 7:12 AM, Ben Hutchings
> <bhutchings@solarflare.com> wrote:
> > dev_can_checksum() incorrectly returns true in these cases:
> >
> > 1. The skb has both out-of-band and in-band VLAN tags and the device
> > supports checksum offload for the encapsulated protocol but only with
> > one layer of encapsulation.
> > 2. The skb has a VLAN tag and the device supports generic checksumming
> > but not in conjunction with VLAN encapsulation.
> >
> > Rearrange the VLAN tag checks to avoid these.
> >
> > Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
>
> If we assume that cards cannot handle offloading for double tagged
> packets, which is obviously the most conservative approach, we
> probably also need to change the checks for TSO/SG. There's no issue
> with extracting the protocol from the right header but we might assume
> that the card can handle double tag offloading when it can't. For
> both TSO/SG we check if there is either an in-band tag or out-of-band
> tag and use dev->vlan_features if that is the case. Maybe we need to
> handle it in software if it is double tagged.
That's something to check.
> On the other hand, I don't know whether it's true that cards can't
> handle offloading for packets tagged in both manners. I suppose that
> it depends on where the offloading and tagging are in the pipeline.
> For example, when it comes to SG I doubt that the cards care about
> vlan tags much at all.
I do know that current Solarflare controllers can parse two VLAN tags
and generate/validate TCP/IP-style checksums after them. We could add
vlan2_features which would be copied to a VLAN sub-device's
vlan_features, but then what happens when people want to handle triple
VLAN encapsulation?
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Joe Buehler @ 2010-10-26 13:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1287851745.2658.364.camel@edumazet-laptop>
Eric Dumazet wrote:
> With a normal workload, on a dual cpu machine, a missing memory barrier
> can stay un-noticed for quite a long time. The race window is so small
> that probability for the bug might be 0.0000001 % or something like
> that :(
I'm looking at the LINUX source at the moment and not liking what I see
in include/asm-mips/barrier.h:
#define smp_mb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
#ifdef CONFIG_CPU_CAVIUM_OCTEON
#define smp_rmb() barrier()
#define smp_wmb() barrier()
#else
#define smp_rmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
#define smp_wmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
#endif
The Octeon documentation explicitly says that neither loads nor stores
need execute in program order, so the definitions for smp_rmb and
smp_wmb appear to be wrong wrong wrong.
It appears that smp_wmb should be making use of SYNCW and smp_rmb should
be making use of SYNC.
Should I pursue this question on the main LINUX kernel list?
Joe Buehler
^ permalink raw reply
* Re: [PATCH net-next-2.6 1/2] be2net: Adding an option to use INTx instead of MSI-X
From: Ben Hutchings @ 2010-10-26 13:32 UTC (permalink / raw)
To: Michael Ellerman; +Cc: David Miller, somnath.kotur, netdev, linux-pci
In-Reply-To: <1288075928.6578.185.camel@concordia>
Michael Ellerman wrote:
> On Mon, 2010-10-25 at 16:25 -0700, David Miller wrote:
> > From: Ben Hutchings <bhutchings@solarflare.com>
> > Date: Mon, 25 Oct 2010 23:38:53 +0100
> >
> > > David Miller wrote:
> > >> From: Somnath Kotur <somnath.kotur@emulex.com>
> > >> Date: Mon, 25 Oct 2010 16:42:35 +0530
> > >>
> > >> > By default, be2net uses MSIx wherever possible.
> > >> > Adding a module parameter to use INTx for users who do not want to use MSIx.
> > >> >
> > >> > Signed-off-by: Somnath Kotur <somnath.kotur@emulex.com>
> > >>
> > >> Either add a new ethtool flag, or use the PCI subsystem facilities
> > >> for tweaking things to implement this.
> > >>
> > >> Do not use a module option, otherwise every other networking driver
> > >> author will get the same "cool" idea, give the module option
> > >> different names, and the resulting user experience is terrible.
> > >
> > > This has already happened, sadly. So far as I can see it's mostly done
> > > to allow users to work around systems with broken MSIs; I'm not aware of
> > > any other reason to prefer legacy interrupts. However, the PCI subsystem
> > > already implements a blacklist and a kernel parameter for disabling MSIs
> > > on these systems.
> >
> > The PCI subsystem bits I'm totally fine with.
> >
> > But in the drivers themselves, that's what I don't want.
>
> That horse has really really bolted, it's gawn.
>
> I count 26 drivers with "disable MSI/X" parameters. Some even have more
> than one.
>
> 11 of them are network drivers, 9 scsi, 3 ata.
>
> I agree it's a mess for users, but it's probably preferable to a
> non-working driver.
>
> Ethtool would be nice, but only for network drivers. Is there a generic
> solution, quirks are obviously not keeping people happy.
Since this is (normally) a property of the system, pci=nomsi is the
generic solution.
[...]
> Misc:
> sound/pci/hda/hda_intel.c:MODULE_PARM_DESC(enable_msi, "Enable Message Signaled Interrupt (MSI)");
> drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_spi, " Enable MSI Support for SPI \
> drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_fc, " Enable MSI Support for FC \
> drivers/message/fusion/mptbase.c:MODULE_PARM_DESC(mpt_msi_enable_sas, " Enable MSI Support for SAS \
> drivers/net/myri10ge/myri10ge.c:MODULE_PARM_DESC(myri10ge_msi, "Enable Message Signalled Interrupts");
[...]
drivers/net/sfc/efx.c:MODULE_PARM_DESC(interrupt_mode,
drivers/net/sfc/efx.c- "Interrupt mode (0=>MSIX 1=>MSI 2=>legacy)");
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Eric Dumazet @ 2010-10-26 13:36 UTC (permalink / raw)
To: Joe Buehler; +Cc: netdev
In-Reply-To: <4CC6D7CC.5040608@cox.net>
Le mardi 26 octobre 2010 à 09:29 -0400, Joe Buehler a écrit :
> Eric Dumazet wrote:
>
> > With a normal workload, on a dual cpu machine, a missing memory barrier
> > can stay un-noticed for quite a long time. The race window is so small
> > that probability for the bug might be 0.0000001 % or something like
> > that :(
>
> I'm looking at the LINUX source at the moment and not liking what I see
> in include/asm-mips/barrier.h:
>
> #define smp_mb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
> #ifdef CONFIG_CPU_CAVIUM_OCTEON
> #define smp_rmb() barrier()
> #define smp_wmb() barrier()
> #else
> #define smp_rmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
> #define smp_wmb() __asm__ __volatile__(__WEAK_ORDERING_MB : : :"memory")
> #endif
>
> The Octeon documentation explicitly says that neither loads nor stores
> need execute in program order, so the definitions for smp_rmb and
> smp_wmb appear to be wrong wrong wrong.
>
> It appears that smp_wmb should be making use of SYNCW and smp_rmb should
> be making use of SYNC.
>
> Should I pursue this question on the main LINUX kernel list?
Well, it would be surprising this being wrong and crash only once in a
while in fib_rules_lookup
Did you tried my last patch ?
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Joe Buehler @ 2010-10-26 13:53 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1288100208.3169.112.camel@edumazet-laptop>
Eric Dumazet wrote:
> Well, it would be surprising this being wrong and crash only once in a
> while in fib_rules_lookup
> Did you tried my last patch ?
There was a patch to the kernel by David Daney back in January to
improve performance of Octeon memory barriers. The patch changes the
generic MIPS barrier code to introduce optimizations for Octeon. The
LINUX version I am using is from the Octeon SDK and appears to have an
early version of this patch. It's broken however -- the Jan patch has
proper SYNCW instructions in smp_wmb while the SDK version does not.
I have made your changes but will also fold in this change, then start
some load testing.
The real-time scheduler is broken in the LINUX I am using -- I get
kernel crashes -- and I would be most happy if the SYNCW fix fixed that
also.
Joe Buehler
^ permalink raw reply
* Re: [PATCH 1/2] r6040: fix multicast operations
From: Ben Hutchings @ 2010-10-26 14:02 UTC (permalink / raw)
To: Shawn Lin
Cc: Florian Fainelli, netdev, Marc Leclerc, Albert Chen, David Miller
In-Reply-To: <1287649654.1792.98.camel@shawn-desktop>
[-- Attachment #1: Type: text/plain, Size: 2236 bytes --]
On Thu, Oct 21, 2010 at 04:27:34PM +0800, Shawn Lin wrote:
[...]
> > > + /* Use internal multicast address registers
> > > + * if the number of multicast addresses is not greater than MCAST_MAX.
> > > + */
> > > + else if (netdev_mc_empty(dev)) {
> > > + for (i = 0; i < MCAST_MAX ; i++) {
> > > + iowrite16(0, ioaddr + MID_1L + 8 * i);
> > > + iowrite16(0, ioaddr + MID_1M + 8 * i);
> > > + iowrite16(0, ioaddr + MID_1H + 8 * i);
> > > + }
> > > + } else if (netdev_mc_count(dev) <= MCAST_MAX) {
> > > + i = 0;
> > > + netdev_for_each_mc_addr(ha, dev) {
> > > + adrp = (u16 *) ha->addr;
> > > + iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
> > > + iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
> > > + iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
> > > + i++;
> > > + }
> >
> > What about the unused exact match entries? And why is the empty case
> > special?
>
> Unused exact match entries? I am not sure which entries are you
> mentioned.
If there are 1 or 2 addresses in the multicast list then some of the
exact match entries will be used and some will not. But the loop
above does not clear the unused entries.
[...]
> 2) if (netdev_mc_count(dev) <= 3)
> There are two hardware features could be used to filter multicast
> frames:
[...]
> 3) if (netdev_mc_empty(dev))
> because we masked the multicast hash table flag before examine all
> conditions, we only need to clear the addresses in the three
> MAC/Multicast registers.
[...]
But why is this so different from the case of 1-3 addresses? I would
write these two cases as:
else if (netdev_mc_count(dev) <= MCAST_MAX) {
i = 0;
netdev_for_each_mc_addr(ha, dev) {
adrp = (u16 *) ha->addr;
iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
i++;
}
while (i < MCAST_MAX) {
iowrite16(0, ioaddr + MID_1L + 8 * i);
iowrite16(0, ioaddr + MID_1M + 8 * i);
iowrite16(0, ioaddr + MID_1H + 8 * i);
i++;
}
}
Ben.
--
Ben Hutchings
We get into the habit of living before acquiring the habit of thinking.
- Albert Camus
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 827 bytes --]
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Eric Dumazet @ 2010-10-26 14:33 UTC (permalink / raw)
To: Joe Buehler; +Cc: netdev
In-Reply-To: <4CC6DD69.4020502@cox.net>
Le mardi 26 octobre 2010 à 09:53 -0400, Joe Buehler a écrit :
> Eric Dumazet wrote:
>
> > Well, it would be surprising this being wrong and crash only once in a
> > while in fib_rules_lookup
>
> > Did you tried my last patch ?
>
> There was a patch to the kernel by David Daney back in January to
> improve performance of Octeon memory barriers. The patch changes the
> generic MIPS barrier code to introduce optimizations for Octeon. The
> LINUX version I am using is from the Octeon SDK and appears to have an
> early version of this patch. It's broken however -- the Jan patch has
> proper SYNCW instructions in smp_wmb while the SDK version does not.
>
> I have made your changes but will also fold in this change, then start
> some load testing.
>
> The real-time scheduler is broken in the LINUX I am using -- I get
> kernel crashes -- and I would be most happy if the SYNCW fix fixed that
> also.
>
>
Just to make sure, are you using a single syncw, or a double one ?
/*
* We actually use two syncw instructions in a row when we need a write
* memory barrier. This is because the CN3XXX series of Octeons have
* errata Core-401. This can cause a single syncw to not enforce
* ordering under very rare conditions. Even if it is rare, better safe
* than sorry.
*/
#define CVMX_SYNCW asm volatile ("syncw\n\tsyncw" : : : "memory")
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Joe Buehler @ 2010-10-26 14:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1288103614.2622.0.camel@edumazet-laptop>
Eric Dumazet wrote:
> Just to make sure, are you using a single syncw, or a double one ?
I'm using double. I know about the errata. It doesn't apply to all
Octeon versions but in their SDK they always use two without worrying
about the chip variant.
What I actually to patch this was look at the current LINUX prerelease
and copy what it does.
We'll see later today if all these changes fix my crash, it takes a
while to encounter it.
Joe Buehler
^ permalink raw reply
* Re: kernel panic in fib_rules_lookup [2.6.27.7 vendor-patched]
From: Eric Dumazet @ 2010-10-26 13:58 UTC (permalink / raw)
To: Joe Buehler; +Cc: netdev
In-Reply-To: <4CC6DD69.4020502@cox.net>
Le mardi 26 octobre 2010 à 09:53 -0400, Joe Buehler a écrit :
> Eric Dumazet wrote:
>
> > Well, it would be surprising this being wrong and crash only once in a
> > while in fib_rules_lookup
>
> > Did you tried my last patch ?
>
> There was a patch to the kernel by David Daney back in January to
> improve performance of Octeon memory barriers. The patch changes the
> generic MIPS barrier code to introduce optimizations for Octeon. The
> LINUX version I am using is from the Octeon SDK and appears to have an
> early version of this patch. It's broken however -- the Jan patch has
> proper SYNCW instructions in smp_wmb while the SDK version does not.
>
> I have made your changes but will also fold in this change, then start
> some load testing.
>
> The real-time scheduler is broken in the LINUX I am using -- I get
> kernel crashes -- and I would be most happy if the SYNCW fix fixed that
> also.
>
> Joe Buehler
>
Just to make sure, you do use two "syncw", not only one ?
/*
* We actually use two syncw instructions in a row when we need a write
* memory barrier. This is because the CN3XXX series of Octeons have
* errata Core-401. This can cause a single syncw to not enforce
* ordering under very rare conditions. Even if it is rare, better safe
* than sorry.
*/
#define CVMX_SYNCW asm volatile ("syncw\n\tsyncw" : : : "memory")
^ permalink raw reply
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Stephen Hemminger @ 2010-10-26 15:28 UTC (permalink / raw)
To: Lorenzo Colitti; +Cc: netdev
In-Reply-To: <AANLkTinGpt=EQX3aXRaj5mEUMDNpyEapbs4VyOwgtN55@mail.gmail.com>
On Mon, 25 Oct 2010 22:44:03 -0700
Lorenzo Colitti <lorenzo@google.com> wrote:
> On Mon, Oct 25, 2010 at 9:38 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
> > This is incorrect. When link is lost, routes and address should not be
> > flushed. They should be marked as tentative and then go through DAD again
> > on the new network.
>
> That won't help the case I am trying to fix, which is the case where
> the new link has a global prefix different than the old link. Marking
> the addresses as tentative will simply make them pass DAD and come
> back as soon as link comes back. But since they don't match the prefix
> that is assigned to the new link, they are unusable, because packets
> can't be routed back to them.
For IPv4 this is already handled by network manager.
Why couldn't the same apply to IPv6?
> > If you do it this way, you break routing protocols when link is brought
> > down and back up.
>
> The only addresses and routes flushed in this way should be ones that
> aren't manually configured, i.e., the ones created by autoconf
> (addrconf.c:2720 onwards). These won't be used by routing protocols,
> except for link-local addresses. So I assume you're talking about
> link-local here.
Not sure, let me do test it.
> Link-local addresses are immediately recreated in a tentative state as
> soon as link comes back, because on NETDEV_UP addrconf_notify calls
> addrconf_dev_config. So, this patch only makes it so that they become
> tentative when link goes away and comes back. In that time, the router
> that temporarily loses link is unable to send packets for the brief
> period of time that the link is performing DAD, but if the router has
> lost link, it will also fail to send the packet while link is lost.
> What's the additional failure scenario? Will it help if I make it so
> that link-local addresses aren't touched at all?
Link-local works fine.
--
^ permalink raw reply
* Re: [PATCH 1/2 v3] xps: Improvements in TX queue selection
From: Tom Herbert @ 2010-10-26 15:32 UTC (permalink / raw)
To: Helmut Schaa; +Cc: davem, netdev, eric.dumazet
In-Reply-To: <201010260818.47523.helmut.schaa@googlemail.com>
> Wouldn't that break mac80211 QoS again for bridged AP mode interfaces (see
> commit deabc772f39405054a438d711f408d2d94d26d96, "net: fix tx queue selection
> for bridged devices implementing select_queue")?
>
Yes, looks like that would break. I'll fix that.
If a device only has one real TX queue should we still call
ndo_select_queue, or can we bypass it? (to save one conditional)
Thanks,
Tom
^ permalink raw reply
* Re: [PATCH 1/2 v3] xps: Improvements in TX queue selection
From: David Miller @ 2010-10-26 15:35 UTC (permalink / raw)
To: therbert; +Cc: helmut.schaa, netdev, eric.dumazet
In-Reply-To: <AANLkTimMrqhWbnLXk1xbYPY7WT7Gazn5G=S1QvVkZ86Y@mail.gmail.com>
From: Tom Herbert <therbert@google.com>
Date: Tue, 26 Oct 2010 08:32:59 -0700
>> Wouldn't that break mac80211 QoS again for bridged AP mode interfaces (see
>> commit deabc772f39405054a438d711f408d2d94d26d96, "net: fix tx queue selection
>> for bridged devices implementing select_queue")?
>>
> Yes, looks like that would break. I'll fix that.
>
> If a device only has one real TX queue should we still call
> ndo_select_queue, or can we bypass it? (to save one conditional)
Probably bypass, for now.
^ permalink raw reply
* Re: [PATCH 0/4]qlcnic: bug fixes
From: David Miller @ 2010-10-26 15:37 UTC (permalink / raw)
To: amit.salecha; +Cc: netdev, ameen.rahman, anirban.chakraborty
In-Reply-To: <1288085882-11988-1-git-send-email-amit.salecha@qlogic.com>
From: Amit Kumar Salecha <amit.salecha@qlogic.com>
Date: Tue, 26 Oct 2010 02:37:58 -0700
> Hi
> Series of 4 bug fixes. Apply them on net-2.6 branch.
Your patches weren't numbered, making the order I should
apply them ambiguous.
Please fix the subject lines and resubmit.
Thank you.
^ permalink raw reply
* Re: linux-next: Tree for October 25 (netfilter/nf_conntrack_reasm)
From: David Miller @ 2010-10-26 16:09 UTC (permalink / raw)
To: randy.dunlap; +Cc: sfr, netfilter-devel, netdev, linux-next, linux-kernel
In-Reply-To: <20101025.220316.39174493.davem@davemloft.net>
From: David Miller <davem@davemloft.net>
Date: Mon, 25 Oct 2010 22:03:16 -0700 (PDT)
> From: Randy Dunlap <randy.dunlap@oracle.com>
> Date: Mon, 25 Oct 2010 16:55:29 -0700
>
>> On Mon, 25 Oct 2010 14:58:34 +1100 Stephen Rothwell wrote:
>>
>>> Hi all,
>>>
>>> Reminder: do not add 2.6.38 destined stuff to linux-next until after
>>> 2.6.37-rc1 is released.
>>
>> net/ipv6/netfilter/nf_conntrack_reasm.c:628: error: 'nf_ct_frag6_sysctl_header' undeclared (first use in this function)
>> net/ipv6/netfilter/nf_conntrack_reasm.c:628: error: 'nf_net_netfilter_sysctl_path' undeclared (first use in this function)
>> net/ipv6/netfilter/nf_conntrack_reasm.c:629: error: 'nf_ct_frag6_sysctl_table' undeclared (first use in this function)
>> net/ipv6/netfilter/nf_conntrack_reasm.c:640: error: 'nf_ct_frag6_sysctl_header' undeclared (first use in this function)
>>
>>
>> config file is attached.
>
> Should also be fixed by the commit I just pointed you to.
Actually it isn't :-) I'll commit the following to fix this,
thanks!
--------------------
netfilter: Add missing CONFIG_SYSCTL checks in ipv6's nf_conntrack_reasm.c
Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/ipv6/netfilter/nf_conntrack_reasm.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 489d71b..3a3f129 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -625,21 +625,24 @@ int nf_ct_frag6_init(void)
inet_frags_init_net(&nf_init_frags);
inet_frags_init(&nf_frags);
+#ifdef CONFIG_SYSCTL
nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
nf_ct_frag6_sysctl_table);
if (!nf_ct_frag6_sysctl_header) {
inet_frags_fini(&nf_frags);
return -ENOMEM;
}
+#endif
return 0;
}
void nf_ct_frag6_cleanup(void)
{
+#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nf_ct_frag6_sysctl_header);
nf_ct_frag6_sysctl_header = NULL;
-
+#endif
inet_frags_fini(&nf_frags);
nf_init_frags.low_thresh = 0;
--
1.7.3.2
^ permalink raw reply related
* [PATCH] netfilter: Add missing CONFIG_SYSCTL checks in ipv6's nf_conntrack_reasm.c
From: David Miller @ 2010-10-26 16:10 UTC (permalink / raw)
To: netfilter-devel; +Cc: netdev, kaber
Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
I just committed this to net-2.6, just FYI.
net/ipv6/netfilter/nf_conntrack_reasm.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 489d71b..3a3f129 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -625,21 +625,24 @@ int nf_ct_frag6_init(void)
inet_frags_init_net(&nf_init_frags);
inet_frags_init(&nf_frags);
+#ifdef CONFIG_SYSCTL
nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path,
nf_ct_frag6_sysctl_table);
if (!nf_ct_frag6_sysctl_header) {
inet_frags_fini(&nf_frags);
return -ENOMEM;
}
+#endif
return 0;
}
void nf_ct_frag6_cleanup(void)
{
+#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nf_ct_frag6_sysctl_header);
nf_ct_frag6_sysctl_header = NULL;
-
+#endif
inet_frags_fini(&nf_frags);
nf_init_frags.low_thresh = 0;
--
1.7.3.2
^ permalink raw reply related
* [PATCH] drivers/net: sgiseeq: fix return on error
From: Nicolas Kaiser @ 2010-10-26 16:45 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, linux-kernel
If we continue on error, we'd likely free the IRQ that we
didn't get, right?
Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
---
drivers/net/sgiseeq.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/sgiseeq.c b/drivers/net/sgiseeq.c
index 9265315..3a0cc63 100644
--- a/drivers/net/sgiseeq.c
+++ b/drivers/net/sgiseeq.c
@@ -531,7 +531,7 @@ static int sgiseeq_open(struct net_device *dev)
if (request_irq(irq, sgiseeq_interrupt, 0, sgiseeqstr, dev)) {
printk(KERN_ERR "Seeq8003: Can't get irq %d\n", dev->irq);
- err = -EAGAIN;
+ return -EAGAIN;
}
err = init_seeq(dev, sp, sregs);
--
1.7.2.2
^ permalink raw reply related
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Brian Haley @ 2010-10-26 16:58 UTC (permalink / raw)
To: Lorenzo Colitti; +Cc: Stephen Hemminger, netdev
In-Reply-To: <AANLkTinGpt=EQX3aXRaj5mEUMDNpyEapbs4VyOwgtN55@mail.gmail.com>
On 10/26/2010 01:44 AM, Lorenzo Colitti wrote:
> On Mon, Oct 25, 2010 at 9:38 PM, Stephen Hemminger
> <shemminger@vyatta.com> wrote:
>> This is incorrect. When link is lost, routes and address should not be
>> flushed. They should be marked as tentative and then go through DAD again
>> on the new network.
>
> That won't help the case I am trying to fix, which is the case where
> the new link has a global prefix different than the old link. Marking
> the addresses as tentative will simply make them pass DAD and come
> back as soon as link comes back. But since they don't match the prefix
> that is assigned to the new link, they are unusable, because packets
> can't be routed back to them.
The old addresses will become deprecated, and eventually get removed, but
it will take 2 hours.
>> If you do it this way, you break routing protocols when link is brought
>> down and back up.
>
> The only addresses and routes flushed in this way should be ones that
> aren't manually configured, i.e., the ones created by autoconf
> (addrconf.c:2720 onwards). These won't be used by routing protocols,
> except for link-local addresses. So I assume you're talking about
> link-local here.
I posted a very similar patch recently:
http://marc.info/?l=linux-netdev&m=128415231909522&w=2
But the first response pointed out that I didn't test this with just a
simple link flap, in which case all the IPv6 addresses are deleted,
and all sessions using them die. Not good. This changes the current
behavior, and isn't what happens with IPv4 either.
Having these addresses restart DAD is probably about as much as we
can do I think, unless we add a per-device sysctl to remove the addresses
(which I think has been shot-down before). Is this a mobile device that
is actually changing it's point of attachment?
-Brian
^ permalink raw reply
* Re: [PATCH] drivers/net: sgiseeq: fix return on error
From: David Miller @ 2010-10-26 17:02 UTC (permalink / raw)
To: nikai; +Cc: netdev, linux-kernel
In-Reply-To: <20101026184505.077246a2@absol.kitzblitz>
From: Nicolas Kaiser <nikai@nikai.net>
Date: Tue, 26 Oct 2010 18:45:05 +0200
> If we continue on error, we'd likely free the IRQ that we
> didn't get, right?
>
> Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
Yep, applied, thank you.
^ permalink raw reply
* Re: [PATCH net-next-2.6] tg3: use dma_alloc_coherent() instead of pci_alloc_consistent()
From: Matt Carlson @ 2010-10-26 17:04 UTC (permalink / raw)
To: Eric Dumazet
Cc: Amit Salecha, Matthew Carlson, davem@davemloft.net,
netdev@vger.kernel.org, Ameen Rahman, Anirban Chakraborty,
Michael Chan
In-Reply-To: <1288094593.3169.95.camel@edumazet-laptop>
On Tue, Oct 26, 2010 at 05:03:13AM -0700, Eric Dumazet wrote:
> Le mardi 26 octobre 2010 ?? 12:30 +0200, Eric Dumazet a ??crit :
>
> > By the way, you should use dma_alloc_coherent() instead of
> > pci_alloc_consistent() so that you can use GFP_KERNEL allocations
> > instead of GFP_ATOMIC, it might help in low memory conditions (if you
> > dont hold a spinlock at this point)
> >
>
> tg3 being often used as a reference network driver, I believe we should
> change its allocations before too many people copy paste suboptimal
> pci_alloc_consistent() stuff...
>
> Matt, could you please queue this patch and submit it to David when
> net-next-2.6 reopens ?
Sure. Thanks for the patch.
> Thanks
>
> [PATCH] tg3: use dma_alloc_coherent() instead of pci_alloc_consistent()
>
> Using dma_alloc_coherent() permits to use GFP_KERNEL allocations instead
> of GFP_ATOMIC ones. Its better when a machine is out of memory, because
> this allows driver to sleep to get its memory and succeed its init,
> especially when allocating high order pages.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> CC: Matt Carlson <mcarlson@broadcom.com>
> CC: Michael Chan <mchan@broadcom.com>
> ---
> drivers/net/tg3.c | 73 ++++++++++++++++++++++++--------------------
> 1 file changed, 41 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
> index 852e917..7dfa579 100644
> --- a/drivers/net/tg3.c
> +++ b/drivers/net/tg3.c
> @@ -6339,13 +6339,13 @@ static void tg3_rx_prodring_fini(struct tg3 *tp,
> kfree(tpr->rx_jmb_buffers);
> tpr->rx_jmb_buffers = NULL;
> if (tpr->rx_std) {
> - pci_free_consistent(tp->pdev, TG3_RX_STD_RING_BYTES(tp),
> - tpr->rx_std, tpr->rx_std_mapping);
> + dma_free_coherent(&tp->pdev->dev, TG3_RX_STD_RING_BYTES(tp),
> + tpr->rx_std, tpr->rx_std_mapping);
> tpr->rx_std = NULL;
> }
> if (tpr->rx_jmb) {
> - pci_free_consistent(tp->pdev, TG3_RX_JMB_RING_BYTES(tp),
> - tpr->rx_jmb, tpr->rx_jmb_mapping);
> + dma_free_coherent(&tp->pdev->dev, TG3_RX_JMB_RING_BYTES(tp),
> + tpr->rx_jmb, tpr->rx_jmb_mapping);
> tpr->rx_jmb = NULL;
> }
> }
> @@ -6358,8 +6358,10 @@ static int tg3_rx_prodring_init(struct tg3 *tp,
> if (!tpr->rx_std_buffers)
> return -ENOMEM;
>
> - tpr->rx_std = pci_alloc_consistent(tp->pdev, TG3_RX_STD_RING_BYTES(tp),
> - &tpr->rx_std_mapping);
> + tpr->rx_std = dma_alloc_coherent(&tp->pdev->dev,
> + TG3_RX_STD_RING_BYTES(tp),
> + &tpr->rx_std_mapping,
> + GFP_KERNEL);
> if (!tpr->rx_std)
> goto err_out;
>
> @@ -6370,9 +6372,10 @@ static int tg3_rx_prodring_init(struct tg3 *tp,
> if (!tpr->rx_jmb_buffers)
> goto err_out;
>
> - tpr->rx_jmb = pci_alloc_consistent(tp->pdev,
> - TG3_RX_JMB_RING_BYTES(tp),
> - &tpr->rx_jmb_mapping);
> + tpr->rx_jmb = dma_alloc_coherent(&tp->pdev->dev,
> + TG3_RX_JMB_RING_BYTES(tp),
> + &tpr->rx_jmb_mapping,
> + GFP_KERNEL);
> if (!tpr->rx_jmb)
> goto err_out;
> }
> @@ -6491,7 +6494,7 @@ static void tg3_free_consistent(struct tg3 *tp)
> struct tg3_napi *tnapi = &tp->napi[i];
>
> if (tnapi->tx_ring) {
> - pci_free_consistent(tp->pdev, TG3_TX_RING_BYTES,
> + dma_free_coherent(&tp->pdev->dev, TG3_TX_RING_BYTES,
> tnapi->tx_ring, tnapi->tx_desc_mapping);
> tnapi->tx_ring = NULL;
> }
> @@ -6500,25 +6503,26 @@ static void tg3_free_consistent(struct tg3 *tp)
> tnapi->tx_buffers = NULL;
>
> if (tnapi->rx_rcb) {
> - pci_free_consistent(tp->pdev, TG3_RX_RCB_RING_BYTES(tp),
> - tnapi->rx_rcb,
> - tnapi->rx_rcb_mapping);
> + dma_free_coherent(&tp->pdev->dev,
> + TG3_RX_RCB_RING_BYTES(tp),
> + tnapi->rx_rcb,
> + tnapi->rx_rcb_mapping);
> tnapi->rx_rcb = NULL;
> }
>
> tg3_rx_prodring_fini(tp, &tnapi->prodring);
>
> if (tnapi->hw_status) {
> - pci_free_consistent(tp->pdev, TG3_HW_STATUS_SIZE,
> - tnapi->hw_status,
> - tnapi->status_mapping);
> + dma_free_coherent(&tp->pdev->dev, TG3_HW_STATUS_SIZE,
> + tnapi->hw_status,
> + tnapi->status_mapping);
> tnapi->hw_status = NULL;
> }
> }
>
> if (tp->hw_stats) {
> - pci_free_consistent(tp->pdev, sizeof(struct tg3_hw_stats),
> - tp->hw_stats, tp->stats_mapping);
> + dma_free_coherent(&tp->pdev->dev, sizeof(struct tg3_hw_stats),
> + tp->hw_stats, tp->stats_mapping);
> tp->hw_stats = NULL;
> }
> }
> @@ -6531,9 +6535,10 @@ static int tg3_alloc_consistent(struct tg3 *tp)
> {
> int i;
>
> - tp->hw_stats = pci_alloc_consistent(tp->pdev,
> - sizeof(struct tg3_hw_stats),
> - &tp->stats_mapping);
> + tp->hw_stats = dma_alloc_coherent(&tp->pdev->dev,
> + sizeof(struct tg3_hw_stats),
> + &tp->stats_mapping,
> + GFP_KERNEL);
> if (!tp->hw_stats)
> goto err_out;
>
> @@ -6543,9 +6548,10 @@ static int tg3_alloc_consistent(struct tg3 *tp)
> struct tg3_napi *tnapi = &tp->napi[i];
> struct tg3_hw_status *sblk;
>
> - tnapi->hw_status = pci_alloc_consistent(tp->pdev,
> - TG3_HW_STATUS_SIZE,
> - &tnapi->status_mapping);
> + tnapi->hw_status = dma_alloc_coherent(&tp->pdev->dev,
> + TG3_HW_STATUS_SIZE,
> + &tnapi->status_mapping,
> + GFP_KERNEL);
> if (!tnapi->hw_status)
> goto err_out;
>
> @@ -6566,9 +6572,10 @@ static int tg3_alloc_consistent(struct tg3 *tp)
> if (!tnapi->tx_buffers)
> goto err_out;
>
> - tnapi->tx_ring = pci_alloc_consistent(tp->pdev,
> - TG3_TX_RING_BYTES,
> - &tnapi->tx_desc_mapping);
> + tnapi->tx_ring = dma_alloc_coherent(&tp->pdev->dev,
> + TG3_TX_RING_BYTES,
> + &tnapi->tx_desc_mapping,
> + GFP_KERNEL);
> if (!tnapi->tx_ring)
> goto err_out;
> }
> @@ -6601,9 +6608,10 @@ static int tg3_alloc_consistent(struct tg3 *tp)
> if (!i && (tp->tg3_flags3 & TG3_FLG3_ENABLE_RSS))
> continue;
>
> - tnapi->rx_rcb = pci_alloc_consistent(tp->pdev,
> - TG3_RX_RCB_RING_BYTES(tp),
> - &tnapi->rx_rcb_mapping);
> + tnapi->rx_rcb = dma_alloc_coherent(&tp->pdev->dev,
> + TG3_RX_RCB_RING_BYTES(tp),
> + &tnapi->rx_rcb_mapping,
> + GFP_KERNEL);
> if (!tnapi->rx_rcb)
> goto err_out;
>
> @@ -14159,7 +14167,8 @@ static int __devinit tg3_test_dma(struct tg3 *tp)
> u32 *buf, saved_dma_rwctrl;
> int ret = 0;
>
> - buf = pci_alloc_consistent(tp->pdev, TEST_BUFFER_SIZE, &buf_dma);
> + buf = dma_alloc_coherent(&tp->pdev->dev, TEST_BUFFER_SIZE,
> + &buf_dma, GFP_KERNEL);
> if (!buf) {
> ret = -ENOMEM;
> goto out_nofree;
> @@ -14343,7 +14352,7 @@ static int __devinit tg3_test_dma(struct tg3 *tp)
> }
>
> out:
> - pci_free_consistent(tp->pdev, TEST_BUFFER_SIZE, buf, buf_dma);
> + dma_free_coherent(&tp->pdev->dev, TEST_BUFFER_SIZE, buf, buf_dma);
> out_nofree:
> return ret;
> }
>
>
>
^ permalink raw reply
* [PATCH 2.6.36/stable v2] vlan: Fix crash when hwaccel rx pkt for non-existant vlan.
From: Ben Greear @ 2010-10-26 17:06 UTC (permalink / raw)
To: netdev; +Cc: Ben Greear
The vlan_hwaccel_do_receive code expected skb->dev to always
be a vlan device, but if the NIC was promisc, and the VLAN
for a particular VID was not configured, then this method
could receive a packet where skb->dev was NOT a vlan
device. This caused access of bad memory and a crash.
Signed-off-by: Ben Greear <greearb@candelatech.com>
---
v1 -> v2: Simplify patch..no need for setting pkt-type, etc.
:100644 100644 0eb96f7... 0687b6c... M net/8021q/vlan_core.c
:100644 100644 660dd41... 5dc45b9... M net/core/dev.c
net/8021q/vlan_core.c | 3 +++
net/core/dev.c | 5 +++--
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 0eb96f7..0687b6c 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -43,6 +43,9 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
struct net_device *dev = skb->dev;
struct vlan_rx_stats *rx_stats;
+ if (!is_vlan_dev(dev))
+ return 0;
+
skb->dev = vlan_dev_info(dev)->real_dev;
netif_nit_deliver(skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41..5dc45b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2828,8 +2828,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
- if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
- return NET_RX_SUCCESS;
+ if (vlan_tx_tag_present(skb))
+ /* This method cannot fail at this time. */
+ vlan_hwaccel_do_receive(skb);
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
--
1.6.2.5
^ permalink raw reply related
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Lorenzo Colitti @ 2010-10-26 17:09 UTC (permalink / raw)
To: Brian Haley; +Cc: Stephen Hemminger, netdev
In-Reply-To: <4CC708CE.4040404@hp.com>
On Tue, Oct 26, 2010 at 9:58 AM, Brian Haley <brian.haley@hp.com> wrote:
> > That won't help the case I am trying to fix, which is the case where
> > the new link has a global prefix different than the old link. Marking
> > the addresses as tentative will simply make them pass DAD and come
> > back as soon as link comes back. But since they don't match the prefix
> > that is assigned to the new link, they are unusable, because packets
> > can't be routed back to them.
>
> The old addresses will become deprecated, and eventually get removed, but
> it will take 2 hours.
Yes, but they become deprecated only after the preferred lifetime is
expires. Until that happens, the kernel considers them fair game and
will use them for outgoing connections, without knowing that they
won't work. So the user just sees connection timeouts and thinks that
IPv6 is slow.
> http://marc.info/?l=linux-netdev&m=128415231909522&w=2
>
> But the first response pointed out that I didn't test this with just a
> simple link flap, in which case all the IPv6 addresses are deleted,
> and all sessions using them die. Not good. This changes the current
> behavior, and isn't what happens with IPv4 either.
Actually, I just tested this and it works fine. I opened a telnet
session to ipv6.google.com port 80, and while the TCP connection was
open I reassociated with the same wifi link. During the flap, the
patch removed and then readded the same global IPv6 address. While it
was doing it, I typed GET / HTTP/1.0\n\n in the telnet window. When
the address came back, the response came back fine. The connection was
not reset.
> Having these addresses restart DAD is probably about as much as we
> can do I think, unless we add a per-device sysctl to remove the addresses
> (which I think has been shot-down before).
As before, just setting them tentative won't help that case I am
trying to fix. They have to be removed.
> Is this a mobile device that is actually changing it's point of attachment?
This is a laptop that is changing SSIDs, plugging into wired/wireless,
etc. At work we have multiple wireless networks with their own IPv6
ranges. The typical case is that I am connected to the corp network,
then I associate with the guest network... and IPv6 is broken, because
the kernel wants to use my old address and gateway, which don't work.
I'm also doing lab testing of IPv6-capable home routers, each of which
has its own wireless network. As you can imagine, after a couple of
switches, I end up with three IPv6 addresses and between three sets of
default gateways, only one of which works.
^ permalink raw reply
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: David Miller @ 2010-10-26 17:10 UTC (permalink / raw)
To: brian.haley; +Cc: lorenzo, shemminger, netdev
In-Reply-To: <4CC708CE.4040404@hp.com>
From: Brian Haley <brian.haley@hp.com>
Date: Tue, 26 Oct 2010 12:58:54 -0400
> Having these addresses restart DAD is probably about as much as we
> can do I think, unless we add a per-device sysctl to remove the addresses
> (which I think has been shot-down before).
It would be fundamentally flawed, in my opinion, if the normal
mechanisms of ipv6 cannot handle this situation adequately.
That's really why I don't even want to consider the "blow away
the addresses" sysctl option.
^ permalink raw reply
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Lorenzo Colitti @ 2010-10-26 17:11 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20101026082832.4c5af2ef@nehalam>
On Tue, Oct 26, 2010 at 8:28 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
>> That won't help the case I am trying to fix, which is the case where
>> the new link has a global prefix different than the old link. Marking
>> the addresses as tentative will simply make them pass DAD and come
>> back as soon as link comes back. But since they don't match the prefix
>> that is assigned to the new link, they are unusable, because packets
>> can't be routed back to them.
>
> For IPv4 this is already handled by network manager.
> Why couldn't the same apply to IPv6?
I think addresses should be deleted by the entity that configured
them. In IPv4, the addresses are created by userspace daemons like
networkmanager or DHCP, and so they should be deleted by
networkmanager as well. In IPv6, autoconf addresses are created by the
kernel, and so should be deleted by the kernel.
Otherwise, userspace daemons that are not IPv6 aware must listen to
see what addresses are created, deleting them if they think they need
to go away, and possibly even racing with the kernel (e.g., imagine
networkmanager purging IPv6 addresses just before the kernel processes
an RA that adds new IPv6 addresses).
>> What's the additional failure scenario? Will it help if I make it so
>> that link-local addresses aren't touched at all?
>
> Link-local works fine.
Ok, so if global addresses work, then the patch doesn't need to do
anything special with link-local?
^ permalink raw reply
* Re: [PATCH] ipv6: addrconf: clear IPv6 addresses and routes when losing link
From: Lorenzo Colitti @ 2010-10-26 17:13 UTC (permalink / raw)
To: David Miller; +Cc: brian.haley, shemminger, netdev
In-Reply-To: <20101026.101038.260090183.davem@davemloft.net>
On Tue, Oct 26, 2010 at 10:10 AM, David Miller <davem@davemloft.net> wrote:
> It would be fundamentally flawed, in my opinion, if the normal
> mechanisms of ipv6 cannot handle this situation adequately.
I think they can. When you associate with a new link, you naturally
set up link-local addresses, perform DAD for them, and send router
solicitations. You just need to make sure you don't have old addresses
from a previous link you were on before.
Removing autoconfigured addresses is what the kernel currently does
when the link goes down. The patch just makes sure it happens when
carrier is lost as well, since losing link might mean that when
carrier comes back, the host might be on a different link than it was
before.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox