* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Stephen Hemminger @ 2008-01-15 17:47 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Robert Olsson, David Miller, robert.olsson, netdev
In-Reply-To: <20080115182544.98c18d08.dada1@cosmosbay.com>
This is how I did it:
--- a/net/ipv4/fib_trie.c 2008-01-15 09:14:53.000000000 -0800
+++ b/net/ipv4/fib_trie.c 2008-01-15 09:21:48.000000000 -0800
@@ -101,13 +101,6 @@ struct node {
t_key key;
};
-struct leaf {
- unsigned long parent;
- t_key key;
- struct hlist_head list;
- struct rcu_head rcu;
-};
-
struct leaf_info {
struct hlist_node hlist;
struct rcu_head rcu;
@@ -115,6 +108,13 @@ struct leaf_info {
struct list_head falh;
};
+struct leaf {
+ unsigned long parent;
+ t_key key;
+ struct hlist_head list;
+ struct rcu_head rcu;
+};
+
struct tnode {
unsigned long parent;
t_key key;
@@ -321,16 +321,6 @@ static void __leaf_free_rcu(struct rcu_h
kmem_cache_free(trie_leaf_kmem, leaf);
}
-static void __leaf_info_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct leaf_info, rcu));
-}
-
-static inline void free_leaf_info(struct leaf_info *leaf)
-{
- call_rcu(&leaf->rcu, __leaf_info_free_rcu);
-}
-
static struct tnode *tnode_alloc(size_t size)
{
struct page *pages;
@@ -357,7 +347,7 @@ static void __tnode_free_rcu(struct rcu_
free_pages((unsigned long)tn, get_order(size));
}
-static inline void tnode_free(struct tnode *tn)
+static void tnode_free(struct tnode *tn)
{
if (IS_LEAF(tn)) {
struct leaf *l = (struct leaf *) tn;
@@ -376,16 +366,41 @@ static struct leaf *leaf_new(void)
return l;
}
+static void leaf_info_init(struct leaf_info *li, int plen)
+{
+ li->plen = plen;
+ INIT_LIST_HEAD(&li->falh);
+}
+
+static struct leaf_info *leaf_info_first(struct leaf *l, int plen)
+{
+ struct leaf_info *li = (struct leaf_info *) (l + 1);
+ leaf_info_init(li, plen);
+ return li;
+}
+
static struct leaf_info *leaf_info_new(int plen)
{
struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
- if (li) {
- li->plen = plen;
- INIT_LIST_HEAD(&li->falh);
- }
+ if (li)
+ leaf_info_init(li, plen);
+
return li;
}
+static void __leaf_info_free_rcu(struct rcu_head *head)
+{
+ kfree(container_of(head, struct leaf_info, rcu));
+}
+
+static inline void free_leaf_info(struct leaf *l, struct leaf_info *leaf)
+{
+ if (leaf == (struct leaf_info *)(l + 1))
+ return;
+
+ call_rcu(&leaf->rcu, __leaf_info_free_rcu);
+}
+
static struct tnode* tnode_new(t_key key, int pos, int bits)
{
size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
@@ -1047,18 +1062,13 @@ static struct list_head *fib_insert_node
insert_leaf_info(&l->list, li);
goto done;
}
- l = leaf_new();
+ l = leaf_new();
if (!l)
return NULL;
l->key = key;
- li = leaf_info_new(plen);
-
- if (!li) {
- tnode_free((struct tnode *) l);
- return NULL;
- }
+ li = leaf_info_first(l, plen);
fa_head = &li->falh;
insert_leaf_info(&l->list, li);
@@ -1091,7 +1101,7 @@ static struct list_head *fib_insert_node
}
if (!tn) {
- free_leaf_info(li);
+ free_leaf_info(l, li);
tnode_free((struct tnode *) l);
return NULL;
}
@@ -1624,7 +1634,7 @@ static int fn_trie_delete(struct fib_tab
if (list_empty(fa_head)) {
hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
+ free_leaf_info(l, li);
}
if (hlist_empty(&l->list))
@@ -1668,7 +1678,7 @@ static int trie_flush_leaf(struct trie *
if (list_empty(&li->falh)) {
hlist_del_rcu(&li->hlist);
- free_leaf_info(li);
+ free_leaf_info(l, li);
}
}
return found;
@@ -1935,7 +1945,8 @@ void __init fib_hash_init(void)
fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
0, SLAB_PANIC, NULL);
- trie_leaf_kmem = kmem_cache_create("ip_fib_trie", sizeof(struct leaf),
+ trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
+ sizeof(struct leaf) + sizeof(struct leaf_info),
0, SLAB_PANIC, NULL);
}
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Robert Olsson @ 2008-01-15 17:59 UTC (permalink / raw)
To: Eric Dumazet
Cc: Robert Olsson, Stephen Hemminger, David Miller, robert.olsson,
netdev
In-Reply-To: <20080115182544.98c18d08.dada1@cosmosbay.com>
Eric Dumazet writes:
>
> So you think that a leaf cannot have 2 infos, one 'embedded' and one in the list ?
Hello,
The model I thought of is to have either:
1) One leaf_info embedded in leaf. A fast-path leaf. FP-leaf
Or
2) The intact old leaf_info list with an arbitrary number of leaf_infos
If plen_iinfo is >=0 It's a FP-leaf
Cheers.
--ro
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Eric Dumazet @ 2008-01-15 18:10 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, robert.olsson, netdev
In-Reply-To: <20080115094753.32e35823@deepthought>
On Tue, 15 Jan 2008 09:47:53 -0800
Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
> This is how I did it:
>
> --- a/net/ipv4/fib_trie.c 2008-01-15 09:14:53.000000000 -0800
> +++ b/net/ipv4/fib_trie.c 2008-01-15 09:21:48.000000000 -0800
> @@ -101,13 +101,6 @@ struct node {
> t_key key;
> };
>
> -struct leaf {
> - unsigned long parent;
> - t_key key;
> - struct hlist_head list;
> - struct rcu_head rcu;
> -};
> -
> struct leaf_info {
> struct hlist_node hlist;
> struct rcu_head rcu;
> @@ -115,6 +108,13 @@ struct leaf_info {
> struct list_head falh;
> };
>
> +struct leaf {
> + unsigned long parent;
> + t_key key;
> + struct hlist_head list;
> + struct rcu_head rcu;
> +};
I like this :)
Your design is clean, but we waste some space (rcu in leaf_info "included"), we probably can do a little bit better
(moving rcu to the end of leaf_info, and kmem_cache_create("ip_fib_trie", sizeof(struct leaf) + sizeof(struct leaf_info) - sizeof(struct rcu_head)))
> - trie_leaf_kmem = kmem_cache_create("ip_fib_trie", sizeof(struct leaf),
> + trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
> + sizeof(struct leaf) + sizeof(struct leaf_info),
> 0, SLAB_PANIC, NULL);
> }
>
>
Thank you
^ permalink raw reply
* Re: sky2 patch in 2.6.24-rc7-git6 breaks POST - commit 84cd2dfb04d23a961c5f537baa243fa54d0987ac
From: Stephen Hemminger @ 2008-01-15 18:09 UTC (permalink / raw)
To: Ioan Ionita; +Cc: netdev, jeff, Linux Kernel Mailing List
In-Reply-To: <df47b87a0801150301w6b970decwdd4bd256aa2ca9ac@mail.gmail.com>
On Tue, 15 Jan 2008 13:01:47 +0200
"Ioan Ionita" <opslynx@gmail.com> wrote:
> Hi,
>
> I have an Asus Commando motherboard, p965 chipset with Marvell
> 88E8056 and 88E8001 gigabit lan onboard. skge and sky2 are compiled
> in.
> When booting linux-2.6.24-rc7-git6, everything seemed fine, but when
> attempting to reboot, the machine would freeze on POST, right before
> detecting AHCI drives.
> In order to get it to POST, a reset or shutdown from the power button
> is not enough, I actually have to cut power to the motherboard using
> the PSU's power switch!
>
> I first thought that the BIOS or CMOS may have gotten corrupted, but
> eventually I found that linux was the culprit. After a git bisect, I
> found the bad commit below
>
Is Wake On Lan enabled in the BIOS?
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Stephen Hemminger @ 2008-01-15 18:15 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Robert Olsson, David Miller, robert.olsson, netdev
In-Reply-To: <20080115191031.7ce7219b.dada1@cosmosbay.com>
On Tue, 15 Jan 2008 19:10:31 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:
> On Tue, 15 Jan 2008 09:47:53 -0800
> Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
>
> > This is how I did it:
> >
> > --- a/net/ipv4/fib_trie.c 2008-01-15 09:14:53.000000000 -0800
> > +++ b/net/ipv4/fib_trie.c 2008-01-15 09:21:48.000000000 -0800
> > @@ -101,13 +101,6 @@ struct node {
> > t_key key;
> > };
> >
> > -struct leaf {
> > - unsigned long parent;
> > - t_key key;
> > - struct hlist_head list;
> > - struct rcu_head rcu;
> > -};
> > -
> > struct leaf_info {
> > struct hlist_node hlist;
> > struct rcu_head rcu;
> > @@ -115,6 +108,13 @@ struct leaf_info {
> > struct list_head falh;
> > };
> >
> > +struct leaf {
> > + unsigned long parent;
> > + t_key key;
> > + struct hlist_head list;
> > + struct rcu_head rcu;
> > +};
>
> I like this :)
>
> Your design is clean, but we waste some space (rcu in leaf_info "included"), we probably can do a little bit better
> (moving rcu to the end of leaf_info, and kmem_cache_create("ip_fib_trie", sizeof(struct leaf) + sizeof(struct leaf_info) - sizeof(struct rcu_head)))
>
>
> > - trie_leaf_kmem = kmem_cache_create("ip_fib_trie", sizeof(struct leaf),
> > + trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
> > + sizeof(struct leaf) + sizeof(struct leaf_info),
> > 0, SLAB_PANIC, NULL);
> > }
> >
> >
>
> Thank you
Having multiple RCU links is a waste. I started on code that just splices the
leaf_infos off to a free_list and then does a mass free after an RCU barrier.
For the normal case of just freeing a leaf, it could just walk the chain
in the RCU free of the leaf.
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Eric Dumazet @ 2008-01-15 18:32 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Robert Olsson, David Miller, robert.olsson, netdev
In-Reply-To: <20080115101543.44c29fa7@deepthought>
On Tue, 15 Jan 2008 10:15:43 -0800
Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
> On Tue, 15 Jan 2008 19:10:31 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
> > On Tue, 15 Jan 2008 09:47:53 -0800
> > Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
> >
> > > This is how I did it:
> > >
> > > --- a/net/ipv4/fib_trie.c 2008-01-15 09:14:53.000000000 -0800
> > > +++ b/net/ipv4/fib_trie.c 2008-01-15 09:21:48.000000000 -0800
> > > @@ -101,13 +101,6 @@ struct node {
> > > t_key key;
> > > };
> > >
> > > -struct leaf {
> > > - unsigned long parent;
> > > - t_key key;
> > > - struct hlist_head list;
> > > - struct rcu_head rcu;
> > > -};
> > > -
> > > struct leaf_info {
> > > struct hlist_node hlist;
> > > struct rcu_head rcu;
> > > @@ -115,6 +108,13 @@ struct leaf_info {
> > > struct list_head falh;
> > > };
> > >
> > > +struct leaf {
> > > + unsigned long parent;
> > > + t_key key;
> > > + struct hlist_head list;
> > > + struct rcu_head rcu;
> > > +};
> >
> > I like this :)
> >
> > Your design is clean, but we waste some space (rcu in leaf_info "included"), we probably can do a little bit better
> > (moving rcu to the end of leaf_info, and kmem_cache_create("ip_fib_trie", sizeof(struct leaf) + sizeof(struct leaf_info) - sizeof(struct rcu_head)))
> >
> >
> > > - trie_leaf_kmem = kmem_cache_create("ip_fib_trie", sizeof(struct leaf),
> > > + trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
> > > + sizeof(struct leaf) + sizeof(struct leaf_info),
> > > 0, SLAB_PANIC, NULL);
> > > }
> > >
> > >
> >
> > Thank you
>
> Having multiple RCU links is a waste. I started on code that just splices the
> leaf_infos off to a free_list and then does a mass free after an RCU barrier.
>
> For the normal case of just freeing a leaf, it could just walk the chain
> in the RCU free of the leaf.
Well, if you take this path, we can also copy the leaf itself, and
just use a variable size array of infos[], no more list, and maximal locality
kmalloc(sizeof(leaf) + nb_infos * sizeof(info))
(Still we can use a kmem cache for nb_infos=1 leaves)
^ permalink raw reply
* Re: sky2 patch in 2.6.24-rc7-git6 breaks POST - commit 84cd2dfb04d23a961c5f537baa243fa54d0987ac
From: Ioan Ionita @ 2008-01-15 19:00 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, jeff, Linux Kernel Mailing List
In-Reply-To: <20080115100947.4ec6cda0@deepthought>
On Jan 15, 2008 8:09 PM, Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
> On Tue, 15 Jan 2008 13:01:47 +0200
> "Ioan Ionita" <opslynx@gmail.com> wrote:
>
>
> Is Wake On Lan enabled in the BIOS?
I don't have a Wake on Lan option in BIOS, only wake on pci, i assume
it's the same. It was disabled. I enabled it and the regression
behaved the same way. Wouldn't POST after shutdown or soft reset.
Regards
^ permalink raw reply
* Re: sky2 patch in 2.6.24-rc7-git6 breaks POST - commit 84cd2dfb04d23a961c5f537baa243fa54d0987ac
From: Stephen Hemminger @ 2008-01-15 19:16 UTC (permalink / raw)
To: Ioan Ionita; +Cc: netdev, jeff, Linux Kernel Mailing List
In-Reply-To: <df47b87a0801151100qe7c39f1y3296176f343667eb@mail.gmail.com>
On Tue, 15 Jan 2008 21:00:13 +0200
"Ioan Ionita" <opslynx@gmail.com> wrote:
> On Jan 15, 2008 8:09 PM, Stephen Hemminger <stephen.hemminger@vyatta.com> wrote:
> > On Tue, 15 Jan 2008 13:01:47 +0200
> > "Ioan Ionita" <opslynx@gmail.com> wrote:
> >
>
> >
> > Is Wake On Lan enabled in the BIOS?
>
> I don't have a Wake on Lan option in BIOS, only wake on pci, i assume
> it's the same. It was disabled. I enabled it and the regression
> behaved the same way. Wouldn't POST after shutdown or soft reset.
>
I assume if you do:
ethtool -s eth0 wol d
it will POST okay. Okay, for now I'll have Jeff revert the patch.
Looks like the BIOS in your system is broken for WOL.
--
Stephen Hemminger <stephen.hemminger@vyatta.com>
^ permalink raw reply
* Re: [PATCH] net: EMAC: Fix problem with mtu > 4080 on non TAH equipped 4xx PPC's
From: Stefan Roese @ 2008-01-15 19:46 UTC (permalink / raw)
To: Eugene Surovegin; +Cc: linuxppc-dev, netdev, benh
In-Reply-To: <20080115173202.GA1268@gate.ebshome.net>
On Tuesday 15 January 2008, Eugene Surovegin wrote:
> On Tue, Jan 15, 2008 at 01:40:09PM +0100, Stefan Roese wrote:
> > Currently, all non TAH equipped 4xx PPC's call emac_start_xmit() upon
> > xmit. This routine doesn't check if the frame length exceeds the max.
> > MAL buffer size.
> >
> > This patch now changes the driver to call emac_start_xmit_sg() on all
> > platforms and not only the TAH equipped ones (440GX). This enables an
> > MTU of 9000 instead of 4080.
> >
> > Tested on Kilauea (405EX) with gbit link -> jumbo frames enabled.
> >
> > Signed-off-by: Stefan Roese <sr@denx.de>
> > ---
> > Eugene & Ben, do you see any problems with this patch? If not, then I'll
> > send another version for the newemac driver too.
>
> Hmm, so why not make GigE support a condition to hook SG version of
> xmit then? I don't like when you change behaviour for chips where it is
> perfectly legal not to do this check because you cannot change MTU
> anyways.
OK. But how do we detect GigE support? Seems like GigE enabled devices have
CONFIG_IBM_EMAC4 defined. If nobody objects I'll fix up another version
tomorrow.
Thanks.
Best regards,
Stefan
^ permalink raw reply
* [PATCH 01/03] ISATAP V2 (header file changes)
From: Templin, Fred L @ 2008-01-15 19:57 UTC (permalink / raw)
To: netdev; +Cc: YOSHIFUJI Hideaki / 吉藤英明
In-Reply-To: <20071129.195459.55971471.yoshfuji@linux-ipv6.org>
This patch updates the Linux Intra-Site Automatic Tunnel Addressing
Protocol (ISATAP) implementation. It places the ISATAP potential router
list (PRL) in the kernel and adds three new private ioctls for PRL
management. The diffs are specific to the netdev net-2.6.25 development
tree taken by "git pull" on 1/14/08.
Signed-off-by: Fred L. Templin <fred.l.templin@boeing.com>
--- net-2.6.25/include/linux/skbuff.h.orig 2008-01-14 15:33:36.000000000 -0800
+++ net-2.6.25/include/linux/skbuff.h 2008-01-14 15:43:06.000000000 -0800
@@ -311,7 +311,8 @@ struct sk_buff {
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
- /* 2 byte hole */
+ __u8 rtr_type;
+ /* 1 byte hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
--- net-2.6.25/include/linux/if_tunnel.h.orig 2008-01-14 15:33:36.000000000 -0800
+++ net-2.6.25/include/linux/if_tunnel.h 2008-01-14 15:42:14.000000000 -0800
@@ -7,6 +7,9 @@
#define SIOCADDTUNNEL (SIOCDEVPRIVATE + 1)
#define SIOCDELTUNNEL (SIOCDEVPRIVATE + 2)
#define SIOCCHGTUNNEL (SIOCDEVPRIVATE + 3)
+#define SIOCADDPRL (SIOCDEVPRIVATE + 4)
+#define SIOCDELPRL (SIOCDEVPRIVATE + 5)
+#define SIOCCHGPRL (SIOCDEVPRIVATE + 6)
#define GRE_CSUM __constant_htons(0x8000)
#define GRE_ROUTING __constant_htons(0x4000)
@@ -17,9 +20,6 @@
#define GRE_FLAGS __constant_htons(0x00F8)
#define GRE_VERSION __constant_htons(0x0007)
-/* i_flags values for SIT mode */
-#define SIT_ISATAP 0x0001
-
struct ip_tunnel_parm
{
char name[IFNAMSIZ];
@@ -30,5 +30,15 @@ struct ip_tunnel_parm
__be32 o_key;
struct iphdr iph;
};
+/* SIT-mode i_flags */
+#define SIT_ISATAP 0x0001
+
+struct ip_tunnel_prladdr {
+ __be32 addr;
+ __be16 flags;
+ __be16 rsvd;
+};
+/* PRL flags */
+#define PRL_BORDER 0x0001
#endif /* _IF_TUNNEL_H_ */
--- net-2.6.25/include/net/ipip.h.orig 2008-01-14 15:33:36.000000000 -0800
+++ net-2.6.25/include/net/ipip.h 2008-01-14 15:41:21.000000000 -0800
@@ -24,6 +24,13 @@ struct ip_tunnel
int mlink;
struct ip_tunnel_parm parms;
+ struct ip_tunnel_prlent *prl; /* potential router list */
+};
+
+struct ip_tunnel_prlent
+{
+ struct ip_tunnel_prlent *next;
+ struct ip_tunnel_prladdr ent;
};
#define IPTUNNEL_XMIT() do { \
--- net-2.6.25/include/net/ndisc.h.orig 2008-01-14 15:40:28.000000000 -0800
+++ net-2.6.25/include/net/ndisc.h 2008-01-15 08:43:21.000000000 -0800
@@ -12,6 +12,16 @@
#define NDISC_REDIRECT 137
/*
+ * Router type: cross-layer information from link-layer to
+ * IPv6 layer reported by certain link types (e.g., RFC4214).
+ */
+
+#define RTRTYPE_UNSPEC 0 /* unspecified (default) */
+#define RTRTYPE_HOST 1 /* host or unauthorized router */
+#define RTRTYPE_INTERIOR 2 /* site-interior router */
+#define RTRTYPE_BORDER 3 /* site border router */
+
+/*
* ndisc options
*/
^ permalink raw reply
* Re: [PATCH] net: EMAC: Fix problem with mtu > 4080 on non TAH equipped 4xx PPC's
From: Eugene Surovegin @ 2008-01-15 20:00 UTC (permalink / raw)
To: Stefan Roese; +Cc: linuxppc-dev, netdev, benh
In-Reply-To: <200801152046.01881.sr@denx.de>
On Tue, Jan 15, 2008 at 08:46:01PM +0100, Stefan Roese wrote:
> On Tuesday 15 January 2008, Eugene Surovegin wrote:
> > On Tue, Jan 15, 2008 at 01:40:09PM +0100, Stefan Roese wrote:
> > > Currently, all non TAH equipped 4xx PPC's call emac_start_xmit() upon
> > > xmit. This routine doesn't check if the frame length exceeds the max.
> > > MAL buffer size.
> > >
> > > This patch now changes the driver to call emac_start_xmit_sg() on all
> > > platforms and not only the TAH equipped ones (440GX). This enables an
> > > MTU of 9000 instead of 4080.
> > >
> > > Tested on Kilauea (405EX) with gbit link -> jumbo frames enabled.
> > >
> > > Signed-off-by: Stefan Roese <sr@denx.de>
> > > ---
> > > Eugene & Ben, do you see any problems with this patch? If not, then I'll
> > > send another version for the newemac driver too.
> >
> > Hmm, so why not make GigE support a condition to hook SG version of
> > > xmit then? I don't like when you change behaviour for chips where it is
> > > perfectly legal not to do this check because you cannot change MTU
> > anyways.
>
> OK. But how do we detect GigE support? Seems like GigE enabled devices have
> CONFIG_IBM_EMAC4 defined. If nobody objects I'll fix up another version
> tomorrow.
Look a couple of lines down where I set the MTU changing hook. If you cannot
change MTU you cannot get big frames.
--
Eugene
^ permalink raw reply
* [PATCH 03/03] ISATAP V2 (sit.c changes)
From: Templin, Fred L @ 2008-01-15 20:00 UTC (permalink / raw)
To: netdev; +Cc: YOSHIFUJI Hideaki / 吉藤英明
In-Reply-To: <39C363776A4E8C4A94691D2BD9D1C9A1029EDDAC@XCH-NW-7V2.nw.nos.boeing.com>
This patch updates the Linux Intra-Site Automatic Tunnel Addressing
Protocol (ISATAP) implementation. It places the ISATAP potential router
list (PRL) in the kernel and adds three new private ioctls for PRL
management. The diffs are specific to the netdev net-2.6.25 development
tree taken by "git pull" on 1/14/08.
Signed-off-by: Fred L. Templin <fred.l.templin@boeing.com>
--- net-2.6.25/net/ipv6/sit.c.orig 2008-01-14 15:33:36.000000000 -0800
+++ net-2.6.25/net/ipv6/sit.c 2008-01-15 10:21:31.000000000 -0800
@@ -16,7 +16,7 @@
* Changes:
* Roger Venning <r.venning@telstra.com>: 6to4 support
* Nate Thompson <nate@thebog.net>: 6to4 support
- * Fred L. Templin <fltemplin@acm.org>: isatap support
+ * Fred Templin <fred.l.templin@boeing.com>: isatap support
*/
#include <linux/module.h>
@@ -200,6 +200,118 @@ failed:
return NULL;
}
+static struct ip_tunnel_prlent *
+ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
+{
+ struct ip_tunnel_prlent *p = (struct ip_tunnel_prlent *)NULL;
+
+ for (p = t->prl; p; p = p->next)
+ if (p->ent.addr == addr)
+ break;
+ return p;
+
+}
+
+static int
+ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prladdr *a, int chg)
+{
+ struct ip_tunnel_prlent *p;
+
+ for (p = t->prl; p; p = p->next) {
+ if (p->ent.addr == a->addr) {
+ if (chg) {
+ p->ent = *a;
+ return 0;
+ }
+ return -EEXIST;
+ }
+ }
+
+ if (chg)
+ return -ENXIO;
+
+ if (!(p = kzalloc(sizeof(struct ip_tunnel_prlent), GFP_KERNEL)))
+ return -ENOBUFS;
+
+ p->ent = *a;
+ p->next = t->prl;
+ t->prl = p;
+ return 0;
+}
+
+static int
+ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prladdr *a)
+{
+ struct ip_tunnel_prlent *x, **p;
+
+ if (a) {
+ for (p = &t->prl; *p; p = &(*p)->next) {
+ if ((*p)->ent.addr == a->addr) {
+ x = *p;
+ *p = x->next;
+ kfree(x);
+ return 0;
+ }
+ }
+ return -ENXIO;
+ } else {
+ while (t->prl) {
+ x = t->prl;
+ t->prl = t->prl->next;
+ kfree(x);
+ }
+ }
+ return 0;
+}
+
+/* copied directly from anycast.c */
+static int
+ipip6_onlink(struct in6_addr *addr, struct net_device *dev)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa;
+ int onlink;
+
+ onlink = 0;
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) {
+ onlink = ipv6_prefix_equal(addr, &ifa->addr,
+ ifa->prefix_len);
+ if (onlink)
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return onlink;
+}
+
+static int
+isatap_chksrc(struct sk_buff *skb, struct iphdr *iph, struct ip_tunnel *t)
+{
+ struct ip_tunnel_prlent *p = ipip6_tunnel_locate_prl(t, iph->saddr);
+ int ok = 1;
+
+ if (p) {
+ if (p->ent.flags & PRL_BORDER)
+ skb->rtr_type = RTRTYPE_BORDER;
+ else
+ skb->rtr_type = RTRTYPE_INTERIOR;
+ } else {
+ struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr;
+ if (ipv6_addr_is_isatap(addr6) &&
+ (addr6->s6_addr32[3] == iph->saddr) &&
+ ipip6_onlink(addr6, t->dev))
+ skb->rtr_type = RTRTYPE_HOST;
+ else
+ ok = 0;
+ }
+ return ok;
+}
+
static void ipip6_tunnel_uninit(struct net_device *dev)
{
if (dev == ipip6_fb_tunnel_dev) {
@@ -209,6 +321,7 @@ static void ipip6_tunnel_uninit(struct n
dev_put(dev);
} else {
ipip6_tunnel_unlink(netdev_priv(dev));
+ ipip6_tunnel_del_prl(netdev_priv(dev), 0);
dev_put(dev);
}
}
@@ -368,48 +481,6 @@ static inline void ipip6_ecn_decapsulate
IP6_ECN_set_ce(ipv6_hdr(skb));
}
-/* ISATAP (RFC4214) - check source address */
-static int
-isatap_srcok(struct sk_buff *skb, struct iphdr *iph, struct net_device *dev)
-{
- struct neighbour *neigh;
- struct dst_entry *dst;
- struct rt6_info *rt;
- struct flowi fl;
- struct in6_addr *addr6;
- struct in6_addr rtr;
- struct ipv6hdr *iph6;
- int ok = 0;
-
- /* from onlink default router */
- ipv6_addr_set(&rtr, htonl(0xFE800000), 0, 0, 0);
- ipv6_isatap_eui64(rtr.s6_addr + 8, iph->saddr);
- if ((rt = rt6_get_dflt_router(&rtr, dev))) {
- dst_release(&rt->u.dst);
- return 1;
- }
-
- iph6 = ipv6_hdr(skb);
- memset(&fl, 0, sizeof(fl));
- fl.proto = iph6->nexthdr;
- ipv6_addr_copy(&fl.fl6_dst, &iph6->saddr);
- fl.oif = dev->ifindex;
- security_skb_classify_flow(skb, &fl);
-
- dst = ip6_route_output(NULL, &fl);
- if (!dst->error && (dst->dev == dev) && (neigh = dst->neighbour)) {
-
- addr6 = (struct in6_addr*)&neigh->primary_key;
-
- /* from correct previous hop */
- if (ipv6_addr_is_isatap(addr6) &&
- (addr6->s6_addr32[3] == iph->saddr))
- ok = 1;
- }
- dst_release(dst);
- return ok;
-}
-
static int ipip6_rcv(struct sk_buff *skb)
{
struct iphdr *iph;
@@ -430,7 +501,7 @@ static int ipip6_rcv(struct sk_buff *skb
skb->pkt_type = PACKET_HOST;
if ((tunnel->dev->priv_flags & IFF_ISATAP) &&
- !isatap_srcok(skb, iph, tunnel->dev)) {
+ !isatap_chksrc(skb, iph, tunnel)) {
tunnel->stat.rx_errors++;
read_unlock(&ipip6_lock);
kfree_skb(skb);
@@ -710,6 +781,7 @@ ipip6_tunnel_ioctl (struct net_device *d
{
int err = 0;
struct ip_tunnel_parm p;
+ struct ip_tunnel_prladdr prl;
struct ip_tunnel *t;
switch (cmd) {
@@ -809,6 +881,31 @@ ipip6_tunnel_ioctl (struct net_device *d
err = 0;
break;
+ case SIOCADDPRL:
+ case SIOCDELPRL:
+ case SIOCCHGPRL:
+ err = -EPERM;
+ if (!capable(CAP_NET_ADMIN))
+ goto done;
+ err = -EINVAL;
+ if (dev == ipip6_fb_tunnel_dev)
+ goto done;
+ err = -EFAULT;
+ if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
+ goto done;
+ err = -ENOENT;
+ if (!(t = netdev_priv(dev)))
+ goto done;
+
+ ipip6_tunnel_unlink(t);
+ if (cmd == SIOCDELPRL)
+ err = ipip6_tunnel_del_prl(t, &prl);
+ else
+ err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+ ipip6_tunnel_link(t);
+ netdev_state_change(dev);
+ break;
+
default:
err = -EINVAL;
}
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Robert Olsson @ 2008-01-15 20:18 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, robert.olsson, netdev
In-Reply-To: <20080115094753.32e35823@deepthought>
Stephen Hemminger writes:
> This is how I did it:
Yes looks like an elegant solution. Did you even test it?
Maybe we see some effects in just dumping a full table?
Anyway lookup should be tested in some way. We can do a lot
of analyzing before getting to the right entry, local_table
backtracking, main lookup w. ev. backtracking etc. So
hopefully we get paid for this work.
Also it might be an idea to do some analysis of the fib_aliases
list. Maybe the trick can be done again? ;)
Cheers
--ro
> --- a/net/ipv4/fib_trie.c 2008-01-15 09:14:53.000000000 -0800
> +++ b/net/ipv4/fib_trie.c 2008-01-15 09:21:48.000000000 -0800
> @@ -101,13 +101,6 @@ struct node {
> t_key key;
> };
>
> -struct leaf {
> - unsigned long parent;
> - t_key key;
> - struct hlist_head list;
> - struct rcu_head rcu;
> -};
> -
> struct leaf_info {
> struct hlist_node hlist;
> struct rcu_head rcu;
> @@ -115,6 +108,13 @@ struct leaf_info {
> struct list_head falh;
> };
>
> +struct leaf {
> + unsigned long parent;
> + t_key key;
> + struct hlist_head list;
> + struct rcu_head rcu;
> +};
> +
> struct tnode {
> unsigned long parent;
> t_key key;
> @@ -321,16 +321,6 @@ static void __leaf_free_rcu(struct rcu_h
> kmem_cache_free(trie_leaf_kmem, leaf);
> }
>
> -static void __leaf_info_free_rcu(struct rcu_head *head)
> -{
> - kfree(container_of(head, struct leaf_info, rcu));
> -}
> -
> -static inline void free_leaf_info(struct leaf_info *leaf)
> -{
> - call_rcu(&leaf->rcu, __leaf_info_free_rcu);
> -}
> -
> static struct tnode *tnode_alloc(size_t size)
> {
> struct page *pages;
> @@ -357,7 +347,7 @@ static void __tnode_free_rcu(struct rcu_
> free_pages((unsigned long)tn, get_order(size));
> }
>
> -static inline void tnode_free(struct tnode *tn)
> +static void tnode_free(struct tnode *tn)
> {
> if (IS_LEAF(tn)) {
> struct leaf *l = (struct leaf *) tn;
> @@ -376,16 +366,41 @@ static struct leaf *leaf_new(void)
> return l;
> }
>
> +static void leaf_info_init(struct leaf_info *li, int plen)
> +{
> + li->plen = plen;
> + INIT_LIST_HEAD(&li->falh);
> +}
> +
> +static struct leaf_info *leaf_info_first(struct leaf *l, int plen)
> +{
> + struct leaf_info *li = (struct leaf_info *) (l + 1);
> + leaf_info_init(li, plen);
> + return li;
> +}
> +
> static struct leaf_info *leaf_info_new(int plen)
> {
> struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
> - if (li) {
> - li->plen = plen;
> - INIT_LIST_HEAD(&li->falh);
> - }
> + if (li)
> + leaf_info_init(li, plen);
> +
> return li;
> }
>
> +static void __leaf_info_free_rcu(struct rcu_head *head)
> +{
> + kfree(container_of(head, struct leaf_info, rcu));
> +}
> +
> +static inline void free_leaf_info(struct leaf *l, struct leaf_info *leaf)
> +{
> + if (leaf == (struct leaf_info *)(l + 1))
> + return;
> +
> + call_rcu(&leaf->rcu, __leaf_info_free_rcu);
> +}
> +
> static struct tnode* tnode_new(t_key key, int pos, int bits)
> {
> size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
> @@ -1047,18 +1062,13 @@ static struct list_head *fib_insert_node
> insert_leaf_info(&l->list, li);
> goto done;
> }
> - l = leaf_new();
>
> + l = leaf_new();
> if (!l)
> return NULL;
>
> l->key = key;
> - li = leaf_info_new(plen);
> -
> - if (!li) {
> - tnode_free((struct tnode *) l);
> - return NULL;
> - }
> + li = leaf_info_first(l, plen);
>
> fa_head = &li->falh;
> insert_leaf_info(&l->list, li);
> @@ -1091,7 +1101,7 @@ static struct list_head *fib_insert_node
> }
>
> if (!tn) {
> - free_leaf_info(li);
> + free_leaf_info(l, li);
> tnode_free((struct tnode *) l);
> return NULL;
> }
> @@ -1624,7 +1634,7 @@ static int fn_trie_delete(struct fib_tab
>
> if (list_empty(fa_head)) {
> hlist_del_rcu(&li->hlist);
> - free_leaf_info(li);
> + free_leaf_info(l, li);
> }
>
> if (hlist_empty(&l->list))
> @@ -1668,7 +1678,7 @@ static int trie_flush_leaf(struct trie *
>
> if (list_empty(&li->falh)) {
> hlist_del_rcu(&li->hlist);
> - free_leaf_info(li);
> + free_leaf_info(l, li);
> }
> }
> return found;
> @@ -1935,7 +1945,8 @@ void __init fib_hash_init(void)
> fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
> 0, SLAB_PANIC, NULL);
>
> - trie_leaf_kmem = kmem_cache_create("ip_fib_trie", sizeof(struct leaf),
> + trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
> + sizeof(struct leaf) + sizeof(struct leaf_info),
> 0, SLAB_PANIC, NULL);
> }
>
^ permalink raw reply
* Re: questions on NAPI processing latency and dropped network packets
From: Jarek Poplawski @ 2008-01-15 20:29 UTC (permalink / raw)
To: Chris Friesen; +Cc: David Miller, netdev, linux-kernel
In-Reply-To: <478CC76B.1020804@nortel.com>
On Tue, Jan 15, 2008 at 08:47:07AM -0600, Chris Friesen wrote:
> Jarek Poplawski wrote:
>
>> IMHO, checking this with a current stable, which probably you are going
>> to do some day, anyway, should be 100% acceptable: giving some input to
>> netdev, while still working for yourself.
>
> While I would love to do this, it's not that simple.
...Hmm... As a matter of fact, I expected you'd treat my point less
literally... Of course, I know it could be sometimes very hard to get
something working even after upgrading one version, let alone several
at once.
So, it was more a rhetorical trick (sorry!) to suggest, that such a
business model of being always late with kernels might be quite
practical and reasonable for many companies, but looks like the
worst possible development model for Linux.
On the other hand, it seems there are not so many, nor so expensive,
changes needed (a bit more perspective thinking?) to make everybody
happy...
Jarek P.
^ permalink raw reply
* [PATCH 02/03] ISATAP V2 (ndisc.c; route.c changes)
From: Templin, Fred L @ 2008-01-15 19:59 UTC (permalink / raw)
To: netdev; +Cc: YOSHIFUJI Hideaki / 吉藤英明
In-Reply-To: <39C363776A4E8C4A94691D2BD9D1C9A1029EDDAC@XCH-NW-7V2.nw.nos.boeing.com>
This patch updates the Linux Intra-Site Automatic Tunnel Addressing
Protocol (ISATAP) implementation. It places the ISATAP potential router
list (PRL) in the kernel and adds three new private ioctls for PRL
management. The diffs are specific to the netdev net-2.6.25 development
tree taken by "git pull" on 1/14/08.
Signed-off-by: Fred L. Templin <fred.l.templin@boeing.com>
--- net-2.6.25/net/ipv6/ndisc.c.orig 2008-01-14 15:35:55.000000000 -0800
+++ net-2.6.25/net/ipv6/ndisc.c 2008-01-15 09:02:23.000000000 -0800
@@ -1090,6 +1090,12 @@ static void ndisc_router_discovery(struc
return;
}
+ if (skb->rtr_type == RTRTYPE_HOST) {
+ ND_PRINTK2(KERN_WARNING
+ "ICMPv6 RA: from host or unauthorized router\n");
+ return;
+ }
+
/*
* set the RA_RECV flag in the interface
*/
@@ -1113,6 +1119,10 @@ static void ndisc_router_discovery(struc
return;
}
+ /* skip link-specific parameters from interior routers */
+ if (skb->rtr_type == RTRTYPE_INTERIOR)
+ goto skip_linkparms;
+
if (in6_dev->if_flags & IF_RS_SENT) {
/*
* flag that an RA was received after an RS was sent
@@ -1227,6 +1237,8 @@ skip_defrtr:
}
}
+skip_linkparms:
+
/*
* Process options.
*/
@@ -1266,6 +1278,10 @@ skip_defrtr:
}
#endif
+ /* skip link-specific ndopts from interior routers */
+ if (skb->rtr_type == RTRTYPE_INTERIOR)
+ goto out;
+
if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_pi;
@@ -1329,6 +1345,14 @@ static void ndisc_redirect_rcv(struct sk
int optlen;
u8 *lladdr = NULL;
+ switch (skb->rtr_type) {
+ case RTRTYPE_HOST:
+ case RTRTYPE_INTERIOR:
+ ND_PRINTK2(KERN_WARNING
+ "ICMPv6 Redirect: from host or unauthorized router\n");
+ return;
+ }
+
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
ND_PRINTK2(KERN_WARNING
"ICMPv6 Redirect: source address is not link-local.\n");
--- net-2.6.25/net/ipv6/route.c.orig 2008-01-14 15:39:40.000000000 -0800
+++ net-2.6.25/net/ipv6/route.c 2008-01-14 15:39:55.000000000 -0800
@@ -1655,8 +1655,6 @@ struct rt6_info *rt6_get_dflt_router(str
return rt;
}
-EXPORT_SYMBOL(rt6_get_dflt_router);
-
struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
struct net_device *dev,
unsigned int pref)
^ permalink raw reply
* SO_RCVBUF doesn't change receiver advertised window
From: Ritesh Kumar @ 2008-01-15 20:36 UTC (permalink / raw)
To: netdev
Hi,
I am using linux 2.6.20 and am trying to limit the receiver window
size for a TCP connection. However, it seems that auto tuning is not
turning itself off even after I use the syscall
rwin=65536
setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rwin, sizeof(rwin));
and verify using
getsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rwin, &rwin_size);
that RCVBUF indeed is getting set (the value returned from getsockopt
is double that, 131072).
The above calls are made before connect() on the client side and
before bind(), accept() on the server side. Bulk data is being sent
from the client to the server. The client and the server machines also
have tcp_moderate_rcvbuf set to 0 (though I don't think that's really
needed; setting a value for SO_RCVBUF should automatically turn off auto
tuning).
However the tcp trace shows the SYN, SYN/ACK and the first few packets as:
14:34:18.831703 IP 192.168.1.153.45038 > 192.168.2.204.9999: S
3947298186:3947298186(0) win 5840 <mss 1460,sackOK,timestamp 2842625
0,nop,wscale 5>
14:34:18.836000 IP 192.168.2.204.9999 > 192.168.1.153.45038: S
3955381015:3955381015(0) ack 3947298187 win 5792 <mss
1460,sackOK,timestamp 2843649 2842625,nop,wscale 2>
14:34:18.837654 IP 192.168.1.153.45038 > 192.168.2.204.9999: . ack 1
win 183 <nop,nop,timestamp 2842634 2843649>
14:34:18.837849 IP 192.168.1.153.45038 > 192.168.2.204.9999: .
1:1449(1448) ack 1 win 183 <nop,nop,timestamp 2842634 2843649>
14:34:18.837851 IP 192.168.1.153.45038 > 192.168.2.204.9999: P
1449:1461(12) ack 1 win 183 <nop,nop,timestamp 2842634 2843649>
14:34:18.839001 IP 192.168.2.204.9999 > 192.168.1.153.45038: . ack
1449 win 2172 <nop,nop,timestamp 2843652 2842634>
14:34:18.839011 IP 192.168.2.204.9999 > 192.168.1.153.45038: . ack
1461 win 2172 <nop,nop,timestamp 2843652 2842634>
14:34:18.840875 IP 192.168.1.153.45038 > 192.168.2.204.9999: .
1461:2909(1448) ack 1 win 183 <nop,nop,timestamp 2842637 2843652>
14:34:18.840997 IP 192.168.1.153.45038 > 192.168.2.204.9999: .
2909:4357(1448) ack 1 win 183 <nop,nop,timestamp 2842637 2843652>
14:34:18.841120 IP 192.168.1.153.45038 > 192.168.2.204.9999: .
4357:5805(1448) ack 1 win 183 <nop,nop,timestamp 2842637 2843652>
14:34:18.841244 IP 192.168.1.153.45038 > 192.168.2.204.9999: .
5805:7253(1448) ack 1 win 183 <nop,nop,timestamp 2842637 2843652>
14:34:18.841388 IP 192.168.2.204.9999 > 192.168.1.153.45038: . ack
2909 win 2896 <nop,nop,timestamp 2843655 2842637>
14:34:18.841399 IP 192.168.2.204.9999 > 192.168.1.153.45038: . ack
4357 win 3620 <nop,nop,timestamp 2843655 2842637>
14:34:18.841413 IP 192.168.2.204.9999 > 192.168.1.153.45038: . ack
5805 win 4344 <nop,nop,timestamp 2843655 2842637>
As you can see, the SYN and SYN/ACK advertise receive windows of 5840 and
5792, and the receiver's window then grows automatically from 2172 to
4344, and later in the trace up to 24214.
The values for the tcp sysctl variables are given below:
/proc/sys/net/ipv4/tcp_moderate_rcvbuf 0
/proc/sys/net/ipv4/tcp_mem 32768 43690 65536
/proc/sys/net/ipv4/tcp_rmem 4096 87380 1398080
/proc/sys/net/ipv4/tcp_wmem 4096 16384 1398080
/proc/sys/net/core/rmem_max 131071
/proc/sys/net/core/wmem_max 131071
/proc/sys/net/core/wmem_default 109568
/proc/sys/net/core/rmem_default 109568
I will really appreciate your help,
Ritesh
^ permalink raw reply
* Re: [PATCH 1/3] skb_partial_csum_set
From: Rusty Russell @ 2008-01-15 21:03 UTC (permalink / raw)
To: David Miller; +Cc: netdev, virtualization
In-Reply-To: <20080115.031422.06112893.davem@davemloft.net>
On Tuesday 15 January 2008 22:14:22 David Miller wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Tue, 15 Jan 2008 21:41:55 +1100
>
> > Implement skb_partial_csum_set, for setting partial csums on untrusted
> > packets.
> >
> > Use it in virtio_net (replacing buggy version there), it's also going
> > to be used by TAP for partial csum support.
> >
> > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>
> Looks fine to me.
>
> Acked-by: David S. Miller <davem@davemloft.net>
>
> If you like I can merge this into my net-2.6.25 tree, or alternatively
> if it makes your life easier then you can handle it yourself.
Thanks, that will reduce coordination pain.
Cheers,
Rusty.
^ permalink raw reply
* Is there an easy way for non-privileged users to determine an interface's speed?
From: Mark Seger @ 2008-01-15 21:07 UTC (permalink / raw)
To: netdev
I think the current answer is they can't but I also wanted to confirm it
with this list. I had hoped I might be able to find it in /proc, /sys
or with a utility like ethtool or even ifconfig, but alas it's nowhere
to be seen. So if I'm correct, that leads to the second question of why
not? Would it be that tough to put it in under /sys/class/net/*? One
of the reasons I ask is that I'd like to be able to tell if a network
pipe is nearing its capacity and so be able to alert someone about it
when it occurs.
-mark
^ permalink raw reply
* Re: Packetlost when "tc qdisc del dev eth0 root"
From: Jarek Poplawski @ 2008-01-15 21:15 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Badalian Vyacheslav, netdev
In-Reply-To: <478CD9D6.3000504@trash.net>
Patrick McHardy wrote, On 01/15/2008 05:05 PM:
> Badalian Vyacheslav wrote:
...
> Yes, packets in the old qdisc are lost.
>
>> Maybe if tc do changes - need create second queue (hash of rules or how
>> you named it?) and do changes at it. Then replace old queue rules by
>> created new.
>> Logic -
>> 1. Do snapshot
>> 2. Do changes in snapshot
>> 3. All new packets go to snapshot
>> 4. If old queue not have packets - delete it.
>> 5. Snapshot its default.
>
>
> That doesn't really work since qdiscs keep internal state that
> in large parts depends on the packets queued. Take the qlen as
> a simple example, the new qdisc doesn't know about the packets
> in the old one and will exceed the limit.
But, some similar alternative to killing packets 'to death' could
be imagined, I suppose (in the future, of course!). So, e.g. doing
the switch automatically after last packet has been dequeued (maybe
even with some 'special' function/mode for this). After all even
with accuracy lost, it could be less visible for clients than
current way?
Regards,
Jarek P.
^ permalink raw reply
* Re: [RFC 6/6] fib_trie: combine leaf and info
From: Eric Dumazet @ 2008-01-15 21:16 UTC (permalink / raw)
To: Robert Olsson; +Cc: Stephen Hemminger, David Miller, robert.olsson, netdev
In-Reply-To: <18317.5368.952926.46132@robur.slu.se>
Robert Olsson a écrit :
>
> Stephen Hemminger writes:
> > This is how I did it:
>
> Yes looks like an elegant solution. Did you even test it?
> Maybe we see some effects in just dumping a full table?
>
> Anyway lookup should be tested in some way. We can do a lot
> of analyzing before getting to right entry, local_table
> backtracking, main lookup w. ev. backtracking etc. So
> hopefully we get paid for this work.
>
> Also it might be idea to do some analysis of the fib_aliases
> list. Maybe the trick can be done again? ;)
>
Back in 2.6.9 times, sizeof(bi_alias) was 16 bytes on i386
Nowadays, 64/128 bytes are the norm :(
SLAB_HWCACHE_ALIGN is not our friend.
^ permalink raw reply
* Re: Packetlost when "tc qdisc del dev eth0 root"
From: slavon @ 2008-01-15 21:46 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Patrick McHardy, Badalian Vyacheslav, netdev
In-Reply-To: <478D226E.1050209@gmail.com>
Quoting Jarek Poplawski <jarkao2@gmail.com>:
> Patrick McHardy wrote, On 01/15/2008 05:05 PM:
>
>> Badalian Vyacheslav wrote:
>
> ...
>
>> Yes, packets in the old qdisc are lost.
>>
>>> Maybe if tc do changes - need create second queue (hash of rules or how
>>> you named it?) and do changes at it. Then replace old queue rules by
>>> created new.
>>> Logic -
>>> 1. Do snapshot
>>> 2. Do changes in snapshot
>>> 3. All new packets go to snapshot
>>> 4. If old queue not have packets - delete it.
>>> 5. Snapshot its default.
>>
>>
>> That doesn't really work since qdiscs keep internal state that
>> in large parts depends on the packets queued. Take the qlen as
>> a simple example, the new qdisc doesn't know about the packets
>> in the old one and will exceed the limit.
>
> But, some similar alternative to killing packets 'to death' could
> be imagined, I suppose (in the future, of course!). So, e.g. doing
> the switch automatically after last packet has been dequeued (maybe
> even with some 'special' function/mode for this). After all even
> with accuracy lost, it could be less visible for clients than
> current way?
>
> Regards,
> Jarek P.
Hmmm... i found way to fix this for me... but its not look good
Scheme look like:
Root - prio bands 3 priomap 0 0 0 0 ....
--- Class 1
--- Class 2
-------- Copy of all table (Last this qdisc be root)
--- Class 3
-------- Copy of all table (Last this qdisc be root)
2. Add filter to root - flowid all packets to class 2
3. Delete qdisc at class 3
4. Create all table on class 3 (~20k qdiscs and 20k classes)
5. Replace filter on root - flowid all packets to class 3
6. If need update go to step 3, but use class 2
All work good... and packets not dropped =)
But i have above 45 k classes and qdiscs.... After some time i will
need patch to up max qdisc and classes more than 65k (> 0xfffe) =)))
Also i have very bad TC commands performance when i have more than 10k rules.
Thanks =)
----------------------------------------------------------------
This message was sent using IMP, the Internet Messaging Program.
^ permalink raw reply
* RE: [REGRESSION] 2.6.24-rc7: e1000: Detected Tx Unit Hang
From: Brandeburg, Jesse @ 2008-01-15 21:53 UTC (permalink / raw)
To: slavon, Frans Pop; +Cc: David Miller, netdev, linux-kernel
In-Reply-To: <20080115190458.rxt3yhb2o8o404kc@mail.bigtelecom.ru>
slavon@bigtelecom.ru wrote:
> Quoting Frans Pop <elendil@planet.nl>:
>>> (Note this isn't the final correct patch we should apply. There is
>>> no reason why this revert back to the older ->poll() logic here
>>> should have any effect on the TX hang triggering...)
>>
>> s/no reason/no obvious reason/ ? ;-)
The tx code has an "early exit" that tries to limit the amount of tx
packets handled in a single poll loop and requires napi or interrupt
rescheduling based on the return value from e1000_clean_tx_irq.
see this code in e1000_clean_tx_irq
4005 #ifdef CONFIG_E1000_NAPI
4006 #define E1000_TX_WEIGHT 64
4007         /* weight of a sort for tx, to avoid endless transmit cleanup */
4008         if (count++ == E1000_TX_WEIGHT) break;
4009 #endif
I think that is probably related. For a test you could apply the
original patch, and remove this "break" just by commenting out line
4008. This would guarantee all tx work is cleaned at every e1000_clean
Jesse
^ permalink raw reply
* Re: Not understand some in htb_do_events function
From: Martin Devera @ 2008-01-15 21:58 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Badalian Vyacheslav, netdev
In-Reply-To: <478CD741.7040004@trash.net>
>
> So this was meant to protect against endless loops?
>
>> We want way to smooth big burst of events over more dequeue invocations
>> in order to not slow dequeue too much. Constant 500 is max. allowed
>> "slowdown" of dequeue.
>> Any bright idea how to do it more elegant, Patrick ?
>
>
> Unfortunately not, but I believe simply removing the limit
> completely would be better than picking an arbitrary value.
Grrr my comp crashed while I was writing this mail. Well the second
attempt.
When we allow unlimited events per dequeue, then there is case where
all N classes in qdisc can be in the event queue with the same target
time. Then they will all be acted on in the loop within single dequeue,
costing us say some milliseconds. Additionally, it tends to repeat itself
then in cycles.
Maybe it is acceptable but it seemed to me as rather big latency.
Thus I wanted to do only limited work per dequeue call. One possibility
is to remove the limit and "see what happens in the wild".
What do you think about doing a limited number of transitions and then
scheduling a tasklet to do the rest (again in limited buckets)?
^ permalink raw reply
* Re: Packetlost when "tc qdisc del dev eth0 root"
From: slavon @ 2008-01-15 22:04 UTC (permalink / raw)
To: slavon; +Cc: Jarek Poplawski, Patrick McHardy, netdev
In-Reply-To: <20080116004602.zn4y94e8sg0w4o8k@mail.bigtelecom.ru>
Good night! =)
Sorry... i was wrong...
I see that problem more serious....
Lets see to scheme
Class 1
---qdisc
------- 10k classes
Class 2
---qdisc
------- 10k classes
All traffic goes to class 2... the class 1 qdisc has no packets, so if we
delete it no packets should be lost... in theory... let's try deleting the
class 1 qdisc (all its children are deleted too)...
The PC freezes for 2-5 seconds... it does not forward any traffic during
that time... is it one big tree lock?
Is this normal, or does the code need finer-grained locking?
Thanks!
> Quoting Jarek Poplawski <jarkao2@gmail.com>:
>
>> Patrick McHardy wrote, On 01/15/2008 05:05 PM:
>>
>>> Badalian Vyacheslav wrote:
>>
>> ...
>>
>>> Yes, packets in the old qdisc are lost.
>>>
>>>> Maybe if tc do changes - need create second queue (hash of rules or how
>>>> you named it?) and do changes at it. Then replace old queue rules by
>>>> created new.
>>>> Logic -
>>>> 1. Do snapshot
>>>> 2. Do changes in snapshot
>>>> 3. All new packets go to snapshot
>>>> 4. If old queue not have packets - delete it.
>>>> 5. Snapshot its default.
>>>
>>>
>>> That doesn't really work since qdiscs keep internal state that
>>> in large parts depends on the packets queued. Take the qlen as
>>> a simple example, the new qdisc doesn't know about the packets
>>> in the old one and will exceed the limit.
>>
>> But, some similar alternative to killing packets 'to death' could
>> be imagined, I suppose (in the future, of course!). So, e.g. doing
>> the switch automatically after last packet has been dequeued (maybe
>> even with some 'special' function/mode for this). After all even
>> with accuracy lost, it could be less visible for clients than
>> current way?
>>
>> Regards,
>> Jarek P.
>
> Hmmm... i found way to fix this for me... but its not look good
>
> Scheme look like:
> Root - prio bands 3 priomap 0 0 0 0 ....
> --- Class 1
> --- Class 2
> -------- Copy of all table (Last this qdisc be root)
> --- Class 3
> -------- Copy of all table (Last this qdisc be root)
>
> 2. Add filter to root - flowid all packets to class 2
> 3. Delete qdisc at class 3
> 4. Create all table on class 3 (~20k qdiscs and 20k classes)
> 5. Replace filter on root - flowid all packets to class 3
> 6. If need update go to step 3, but use class 2
>
> All work good... and packets not dropped =)
> But i have above 45 k classes and qdiscs.... After some time i will
> need patch to up max qdisc and classes more than 65k (> 0xfffe) =)))
> Also i have very bad TC commands performance when i have more than 10k rules.
>
> Thanks =)
>
> ----------------------------------------------------------------
> This message was sent using IMP, the Internet Messaging Program.
----------------------------------------------------------------
This message was sent using IMP, the Internet Messaging Program.
^ permalink raw reply
* [PATCH] 8390: fix CONFIG_LOCKDEP error, 2.6.24-rc7
From: Frank Rowand @ 2008-01-15 22:23 UTC (permalink / raw)
To: p_gortmaker; +Cc: netdev
From: Frank Rowand <frank.rowand@am.sony.com>
Turning on CONFIG_LOCKDEP for CONFIG_PREEMPT invokes a path which may
sleep with IRQs disabled. Change disable_irq_nosync_lockdep() to
disable_irq_nosync(), etc. Note the comment near the top of
drivers/net/lib8390.c, which is an lkml email from Alan Cox, presaging
the need of this patch.
Signed-off-by: Frank Rowand <frank.rowand@am.sony.com>
---
drivers/net/lib8390.c | 14 7 + 7 - 0 !
1 files changed, 7 insertions(+), 7 deletions(-)
Index: linux-2.6.24-rc5/drivers/net/lib8390.c
===================================================================
--- linux-2.6.24-rc5.orig/drivers/net/lib8390.c
+++ linux-2.6.24-rc5/drivers/net/lib8390.c
@@ -284,7 +284,7 @@ static void ei_tx_timeout(struct net_dev
/* Ugly but a reset can be slow, yet must be protected */
- disable_irq_nosync_lockdep(dev->irq);
+ disable_irq_nosync(dev->irq);
spin_lock(&ei_local->page_lock);
/* Try to restart the card. Perhaps the user has fixed something. */
@@ -292,7 +292,7 @@ static void ei_tx_timeout(struct net_dev
__NS8390_init(dev, 1);
spin_unlock(&ei_local->page_lock);
- enable_irq_lockdep(dev->irq);
+ enable_irq(dev->irq);
netif_wake_queue(dev);
}
@@ -334,7 +334,7 @@ static int ei_start_xmit(struct sk_buff
* Slow phase with lock held.
*/
- disable_irq_nosync_lockdep_irqsave(dev->irq, &flags);
+ disable_irq_nosync(dev->irq);
spin_lock(&ei_local->page_lock);
@@ -373,7 +373,7 @@ static int ei_start_xmit(struct sk_buff
netif_stop_queue(dev);
ei_outb_p(ENISR_ALL, e8390_base + EN0_IMR);
spin_unlock(&ei_local->page_lock);
- enable_irq_lockdep_irqrestore(dev->irq, &flags);
+ enable_irq(dev->irq);
ei_local->stat.tx_errors++;
return 1;
}
@@ -414,7 +414,7 @@ static int ei_start_xmit(struct sk_buff
ei_outb_p(ENISR_ALL, e8390_base + EN0_IMR);
spin_unlock(&ei_local->page_lock);
- enable_irq_lockdep_irqrestore(dev->irq, &flags);
+ enable_irq(dev->irq);
dev_kfree_skb (skb);
ei_local->stat.tx_bytes += send_length;
@@ -530,9 +530,9 @@ static irqreturn_t __ei_interrupt(int ir
#ifdef CONFIG_NET_POLL_CONTROLLER
static void __ei_poll(struct net_device *dev)
{
- disable_irq_lockdep(dev->irq);
+ disable_irq(dev->irq);
__ei_interrupt(dev->irq, dev);
- enable_irq_lockdep(dev->irq);
+ enable_irq(dev->irq);
}
#endif
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox