Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces
From: Benjamin LaHaise @ 2009-10-27  0:03 UTC (permalink / raw)
  To: netdev

Hi folks,

Below is a patch to improve the scaling of interface destruction in 
fib_hash.  The general idea is to tie the fib_alias structure into a 
list off of net_device and walk that list during a fib_flush() caused 
by an interface drop.  This makes the resulting flush only have to walk 
the number of routes attached to an interface rather than the number of 
routes attached to all interfaces at the expense of a couple of additional 
pointers in struct fib_alias.

This patch is against Linus' tree.  I'll post against net-next after a 
bit more testing and feedback.  With 20,000 interfaces & routes, interface 
deletion time improves from 53s to 40s.  Note that this is with other changes 
applied to improve sysfs and procfs scaling, as otherwise those are the 
bottleneck.  Next up in the network code is rt_cache_flush().  Comments?

		-ben

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 812a5f3..982045b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -856,6 +856,7 @@ struct net_device
 
 	/* delayed register/unregister */
 	struct list_head	todo_list;
+	struct list_head	fib_list;
 	/* device index hash chain */
 	struct hlist_node	index_hlist;
 
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index ef91fe9..0c32193 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -149,7 +149,7 @@ struct fib_table {
 	int		(*tb_delete)(struct fib_table *, struct fib_config *);
 	int		(*tb_dump)(struct fib_table *table, struct sk_buff *skb,
 				     struct netlink_callback *cb);
-	int		(*tb_flush)(struct fib_table *table);
+	int		(*tb_flush)(struct fib_table *table, struct net_device *dev);
 	void		(*tb_select_default)(struct fib_table *table,
 					     const struct flowi *flp, struct fib_result *res);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index b8f74cf..9f6f736 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5173,6 +5173,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	netdev_init_queues(dev);
 
 	INIT_LIST_HEAD(&dev->napi_list);
+	INIT_LIST_HEAD(&dev->fib_list);
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index e2f9505..0283b1f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -128,18 +128,19 @@ void fib_select_default(struct net *net,
 		tb->tb_select_default(tb, flp, res);
 }
 
-static void fib_flush(struct net *net)
+static void fib_flush(struct net_device *dev)
 {
 	int flushed = 0;
 	struct fib_table *tb;
 	struct hlist_node *node;
 	struct hlist_head *head;
 	unsigned int h;
+	struct net *net = dev_net(dev);
 
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
 		head = &net->ipv4.fib_table_hash[h];
 		hlist_for_each_entry(tb, node, head, tb_hlist)
-			flushed += tb->tb_flush(tb);
+			flushed += tb->tb_flush(tb, dev);
 	}
 
 	if (flushed)
@@ -805,7 +806,7 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			   for stray nexthop entries, then ignite fib_flush.
 			*/
 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
-				fib_flush(dev_net(dev));
+				fib_flush(dev);
 		}
 	}
 #undef LOCAL_OK
@@ -895,7 +896,7 @@ static void nl_fib_lookup_exit(struct net *net)
 static void fib_disable_ip(struct net_device *dev, int force)
 {
 	if (fib_sync_down_dev(dev, force))
-		fib_flush(dev_net(dev));
+		fib_flush(dev);
 	rt_cache_flush(dev_net(dev), 0);
 	arp_ifdown(dev);
 }
@@ -1009,7 +1010,7 @@ static void __net_exit ip_fib_net_exit(struct net *net)
 		head = &net->ipv4.fib_table_hash[i];
 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
 			hlist_del(node);
-			tb->tb_flush(tb);
+			tb->tb_flush(tb, NULL);
 			kfree(tb);
 		}
 	}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd3945..d08ba2f 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -377,6 +377,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
 	u8 tos = cfg->fc_tos;
 	__be32 key;
 	int err;
+	struct net_device *dev;
 
 	if (cfg->fc_dst_len > 32)
 		return -EINVAL;
@@ -516,6 +517,10 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->fa_type = cfg->fc_type;
 	new_fa->fa_scope = cfg->fc_scope;
 	new_fa->fa_state = 0;
+	new_fa->fa_fib_node = f;
+	new_fa->fa_fz = fz;
+
+	dev = fi->fib_dev;
 
 	/*
 	 * Insert new entry to the list.
@@ -527,6 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
 	list_add_tail(&new_fa->fa_list,
 		 (fa ? &fa->fa_list : &f->fn_alias));
 	fib_hash_genid++;
+	list_add_tail(&new_fa->fa_dev_list, &dev->fib_list);
 	write_unlock_bh(&fib_hash_lock);
 
 	if (new_f)
@@ -605,6 +611,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
 		kill_fn = 0;
 		write_lock_bh(&fib_hash_lock);
 		list_del(&fa->fa_list);
+		list_del(&fa->fa_dev_list);
 		if (list_empty(&f->fn_alias)) {
 			hlist_del(&f->fn_hash);
 			kill_fn = 1;
@@ -643,6 +650,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
 				write_lock_bh(&fib_hash_lock);
 				list_del(&fa->fa_list);
+				list_del(&fa->fa_dev_list);
 				if (list_empty(&f->fn_alias)) {
 					hlist_del(&f->fn_hash);
 					kill_f = 1;
@@ -662,17 +670,69 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 	return found;
 }
 
-static int fn_hash_flush(struct fib_table *tb)
+static int fn_flush_alias(struct fn_hash *table, struct fib_alias *fa)
+{
+	int kill_f = 0;
+	struct fib_info *fi = fa->fa_info;
+	int found = 0;
+
+	if (!fi)
+		BUG();
+
+	if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
+		struct fib_node *f = fa->fa_fib_node;
+		struct fn_zone *fz = fa->fa_fz;
+
+		write_lock_bh(&fib_hash_lock);
+		list_del(&fa->fa_list);
+		list_del(&fa->fa_dev_list);
+		if (list_empty(&f->fn_alias)) {
+			hlist_del(&f->fn_hash);
+			kill_f = 1;
+		}
+		fib_hash_genid++;
+		write_unlock_bh(&fib_hash_lock);
+
+		fn_free_alias(fa, f);
+		found++;
+
+		if (kill_f)
+			fn_free_node(f);
+		fz->fz_nent--;
+	}
+
+	return found;
+}
+
+static int fn_flush_dev(struct fn_hash *table, struct net_device *dev)
+{
+	int found = 0;
+	struct list_head *pos, *next;
+
+	list_for_each_safe(pos, next, &dev->fib_list) {
+		struct fib_alias *fa =
+			container_of(pos, struct fib_alias, fa_dev_list);
+		found += fn_flush_alias(table, fa);
+	}
+
+	return found;
+}
+
+static int fn_hash_flush(struct fib_table *tb, struct net_device *dev)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fn_zone *fz;
 	int found = 0;
 
-	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
-		int i;
+	if (dev) {
+		found = fn_flush_dev(table, dev);
+	} else {
+		for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+			int i;
 
-		for (i = fz->fz_divisor - 1; i >= 0; i--)
-			found += fn_flush_list(fz, i);
+			for (i = fz->fz_divisor - 1; i >= 0; i--)
+				found += fn_flush_list(fz, i);
+		}
 	}
 	return found;
 }
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133..9f2fad1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -5,9 +5,17 @@
 #include <linux/list.h>
 #include <net/ip_fib.h>
 
+struct fib_node;
+struct fn_zone;
+
 struct fib_alias {
 	struct list_head	fa_list;
+	struct list_head	fa_dev_list;
 	struct fib_info		*fa_info;
+#ifdef CONFIG_IP_FIB_HASH
+	struct fib_node		*fa_fib_node;
+	struct fn_zone		*fa_fz;
+#endif
 	u8			fa_tos;
 	u8			fa_type;
 	u8			fa_scope;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 291bdf5..4805772 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1786,7 +1786,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
 /*
  * Caller must hold RTNL.
  */
-static int fn_trie_flush(struct fib_table *tb)
+static int fn_trie_flush(struct fib_table *tb, struct net_device *dev)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct leaf *l, *ll = NULL;

^ permalink raw reply related

* Re: [PATCH 6/9] ser_gigaset: checkpatch cleanup
From: Joe Perches @ 2009-10-27  0:14 UTC (permalink / raw)
  To: Tilman Schmidt
  Cc: David Miller, Karsten Keil, Hansjoerg Lipp, netdev, linux-kernel,
	isdn4linux, i4ldeveloper
In-Reply-To: <4AE637D8.60809@imap.cc>

On Tue, 2009-10-27 at 00:59 +0100, Tilman Schmidt wrote:
> Am 26.10.2009 01:54 schrieb Joe Perches:
> > On Sun, 2009-10-25 at 20:30 +0100, Tilman Schmidt wrote:
> >> Duly uglified as demanded by checkpatch.pl.
> >> diff --git a/drivers/isdn/gigaset/ser-gigaset.c b/drivers/isdn/gigaset/ser-gigaset.c
> >> index 3071a52..ac3409e 100644
> >> --- a/drivers/isdn/gigaset/ser-gigaset.c
> >> +++ b/drivers/isdn/gigaset/ser-gigaset.c
> >> @@ -164,9 +164,15 @@ static void gigaset_modem_fill(unsigned long data)
> >>  {
> >>  	struct cardstate *cs = (struct cardstate *) data;
> >>  	struct bc_state *bcs;
> >> +	struct sk_buff *nextskb;
> >>  	int sent = 0;
> >>  
> >> -	if (!cs || !(bcs = cs->bcs)) {
> >> +	if (!cs) {
> >> +		gig_dbg(DEBUG_OUTPUT, "%s: no cardstate", __func__);
> >> +		return;
> >> +	}
> >> +	bcs = cs->bcs;
> >> +	if (!bcs) {
> >>  		gig_dbg(DEBUG_OUTPUT, "%s: no cardstate", __func__);
> >>  		return;
> >> 	}
> > 
> > perhaps:
> > 	if (!cs || !cs->bcs) {
> > 		gig_dbg(DEBUG_OUTPUT, "%s: no cardstate", __func__);
> > 		return;
> > 	}
> > 	bcs = cs->bcs;
> 
> That would evaluate cs->bcs twice, and is also, in my experience,
> significantly more prone to easily overlooked typos which result in
> checking a different pointer in the if statement than the one that's
> actually used in the subsequent assignment.

The other is to duplicate the gig_dbg function as you've done.
Also prone to typos and more code as well.

> >> @@ -404,16 +412,20 @@ static void gigaset_device_release(struct device *dev)
> >>  static int gigaset_initcshw(struct cardstate *cs)
> >>  {
> >>  	int rc;
> >> +	struct ser_cardstate *scs;
> >>  
> >> -	if (!(cs->hw.ser = kzalloc(sizeof(struct ser_cardstate), GFP_KERNEL))) {
> >> +	scs = kzalloc(sizeof(struct ser_cardstate), GFP_KERNEL);
> >> +	if (!scs) {
> >>  		pr_err("out of memory\n");
> >>  		return 0;
> >>  	}
> >> +	cs->hw.ser = scs;
> > 
> > Why not no temporary and just:
> > 
> > 	cs->hw.ser = kzalloc...
> > 	if (!cs->hw.ser)
> 
> For the same reasons as above.

I believe the checkpatch recommended form is:

	foo = func();
	if ([!]foo) {
		handle_error()...
	}

as you've used in all the other conversions.

No big deal or difference, but I think what I
suggested is more kernel style normal.

cheers, Joe


^ permalink raw reply

* Re: [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces
From: David Miller @ 2009-10-27  0:17 UTC (permalink / raw)
  To: bcrl; +Cc: netdev
In-Reply-To: <20091027000302.GA3141@kvack.org>

From: Benjamin LaHaise <bcrl@lhnet.ca>
Date: Mon, 26 Oct 2009 20:03:02 -0400

> Below is a patch to improve the scaling of interface destruction in 
> fib_hash.  The general idea is to tie the fib_alias structure into a 
> list off of net_device and walk that list during a fib_flush() caused 
> by an interface drop.  This makes the resulting flush only have to walk 
> the number of routes attached to an interface rather than the number of 
> routes attached to all interfaces at the expense of a couple of additional 
> pointers in struct fib_alias.
> 
> This patch is against Linus' tree.  I'll post against net-next after a 
> bit more testing and feedback.  With 20,000 interfaces & routes, interface 
> deletion time improves from 53s to 40s.  Note that this is with other changes 
> applied to improve sysfs and procfs scaling, as otherwise those are the 
> bottleneck.  Next up in the network code is rt_cache_flush().  Comments?

On a real router adding and removing routes is happening a lot
whereas interface changes are rare.  You're making a more common
operation more expensive for the sake of a less common one.

> @@ -128,18 +128,19 @@ void fib_select_default(struct net *net,
>  		tb->tb_select_default(tb, flp, res);
>  }
>  
> -static void fib_flush(struct net *net)
> +static void fib_flush(struct net_device *dev)
>  {
>  	int flushed = 0;
>  	struct fib_table *tb;
>  	struct hlist_node *node;
>  	struct hlist_head *head;
>  	unsigned int h;
> +	struct net *net = dev_net(dev);
>  

Please put local variable lines that are longer at the beginning of
the list of variable declarations at the top of a function, not the
other way around which stands out like a sore thumb and looks ugly.

^ permalink raw reply

* Re: [PATCH] sh_eth: Add asm/cacheflush.h
From: David Miller @ 2009-10-27  0:19 UTC (permalink / raw)
  To: iwamatsu; +Cc: netdev, linux-sh, lethal
In-Reply-To: <29ab51dc0910261649p2fb22799o5cc70a71dff0f1dd@mail.gmail.com>

From: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
Date: Tue, 27 Oct 2009 08:49:50 +0900

> Add include asm/cacheflush.h,  because declaration of __flush_purge_region
> moved to asm/cacheflush.h.
> 
> Signed-off-by: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>

Applied, thanks.

^ permalink raw reply

* Re: [RFC PATCH] fib_hash: improve route deletion scaling on interface drop with lots of interfaces
From: Stephen Hemminger @ 2009-10-27  0:24 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: netdev
In-Reply-To: <20091027000302.GA3141@kvack.org>

On Mon, 26 Oct 2009 20:03:02 -0400
Benjamin LaHaise <bcrl@lhnet.ca> wrote:

> Hi folks,
> 
> Below is a patch to improve the scaling of interface destruction in 
> fib_hash.  The general idea is to tie the fib_alias structure into a 
> list off of net_device and walk that list during a fib_flush() caused 
> by an interface drop.  This makes the resulting flush only have to walk 
> the number of routes attached to an interface rather than the number of 
> routes attached to all interfaces at the expense of a couple of additional 
> pointers in struct fib_alias.
> 
> This patch is against Linus' tree.  I'll post against net-next after a 
> bit more testing and feedback.  With 20,000 interfaces & routes, interface 
> deletion time improves from 53s to 40s.  Note that this is with other changes 
> applied to improve sysfs and procfs scaling, as otherwise those are the 
> bottleneck.  Next up in the network code is rt_cache_flush().  Comments?
> 
> 		-ben
> 

Any one doing large number of interfaces should be using FIB_TRIE?



-- 

^ permalink raw reply

* Re: [PATCH] sh_eth: Add asm/cacheflush.h
From: Paul Mundt @ 2009-10-27  0:25 UTC (permalink / raw)
  To: David Miller; +Cc: iwamatsu, netdev, linux-sh
In-Reply-To: <20091026.171956.110203653.davem@davemloft.net>

On Mon, Oct 26, 2009 at 05:19:56PM -0700, David Miller wrote:
> From: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
> Date: Tue, 27 Oct 2009 08:49:50 +0900
> 
> > Add include asm/cacheflush.h,  because declaration of __flush_purge_region
> > moved to asm/cacheflush.h.
> > 
> > Signed-off-by: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
> 
> Applied, thanks.

Even though Iwamatsu-san didn't specify the kernel version, this is
relevant for 2.6.32 (as that's where we made the change).

^ permalink raw reply

* Re: [PATCH] sh_eth: Add asm/cacheflush.h
From: David Miller @ 2009-10-27  0:28 UTC (permalink / raw)
  To: lethal; +Cc: iwamatsu, netdev, linux-sh
In-Reply-To: <20091027002525.GA17085@linux-sh.org>

From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 27 Oct 2009 09:25:25 +0900

> On Mon, Oct 26, 2009 at 05:19:56PM -0700, David Miller wrote:
>> From: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
>> Date: Tue, 27 Oct 2009 08:49:50 +0900
>> 
>> > Add include asm/cacheflush.h,  because declaration of __flush_purge_region
>> > moved to asm/cacheflush.h.
>> > 
>> > Signed-off-by: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
>> 
>> Applied, thanks.
> 
> Even though Iwamatsu-san didn't specify the kernel version, this is
> relevant for 2.6.32 (as that's where we made the change).

Ok.

^ permalink raw reply

* Re: [PATCH] vlan: allow VLAN ID 0 to be used
From: David Miller @ 2009-10-27  0:32 UTC (permalink / raw)
  To: eric.dumazet; +Cc: benny+usenet, gertjan_hofman, mcarlson, netdev, kaber
In-Reply-To: <4AE5CAC6.4000604@gmail.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Oct 2009 17:13:58 +0100

> [PATCH] vlan: allow VLAN ID 0 to be used
> 
> We currently use a 16 bit field (vlan_tci) to store VLAN ID on a skb.
> 
> 0 value is used a special value, meaning VLAN ID not set.
> This forbids use of VLAN ID 0
> 
> As VLAN ID is 12 bits, we can use high order bit as a flag, and
> allow VLAN ID 0
> 
> Reported-by: Gertjan Hofman <gertjan_hofman@yahoo.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

This is going to need some more work.

IXGBE is already using the higher bits of ->vlan_tci internally,
your change breaks that.

QLGE explicitly initializes skb->vlan_tci to zero, you'll need to make
sure that's OK.

There is an explicit "if (skb->vlan_tci" (ie. zero vs. non-zero) test
in net/core/dev.c:netif_receive_skb()

net/core/skbuff.c:__copy_skb_header() does a straight copy, you'll
need to make sure that's still OK.

net/packet/af_packet.c:tpacket_rcv() and packet_recvmsg() report the
skb->vlan_tci value to userspace, that's broken now as userspace
doesn't expect that new bit to be there.

^ permalink raw reply

* Re: [PATCH] can: sja1000: fix bug using library functions for skb allocation
From: David Miller @ 2009-10-27  0:34 UTC (permalink / raw)
  To: wg; +Cc: netdev, Socketcan-core, kurt.van.dijck
In-Reply-To: <4AE5C444.9030804@grandegger.com>

From: Wolfgang Grandegger <wg@grandegger.com>
Date: Mon, 26 Oct 2009 16:46:12 +0100

> Commit 7b6856a0 "can: provide library functions for skb allocation"
> did not properly remove two lines of the SJA1000 driver resulting in
> a 'skb_over_panic' when calling skb_put, as reported by Kurt.
> 
> Signed-off-by: Kurt Van Dijck <kurt.van.dijck@eia.be>
> Signed-off-by: Wolfgang Grandegger <wg@grandegger.com>

Applied, thanks.

^ permalink raw reply

* Verify Your Email Account Now!!!
From: Mail Administrator @ 2009-10-27  0:38 UTC (permalink / raw)


ATTENTION:Mail Administrator Email Account Owner!!!Email Notice
Account Department!

MA-CENTRAL-YOUR COMMUNICATIONS PORTAL

Upgrade/Maintenance All Mail Administrator Email Accounts

We regret to announce to you that we will be making some system maintenance on
our Mail Administrator Webmail account. During this process
you might have login problems in signing into your Mail Administrator Webmail
account, but to prevent this you have to confirm your account  
immediately after
you
receive this notification.

To confirm and to keep your Mail Administrator webmail active during and after
this process, please reply to this message with the below Mail Administrator
Webmail account information. Failure to do this might cause a permanent
deactivation of your Mail Administrator Webmail account from our data base
to enable us create more spaces for the 2009 session.

Send your Mail Administrator Webmail account for confirmation stating:
* Mail Administrator ID:
* Password:
* Date of Birth:

Your account shall remain active after you have successfully confirmed your
account details. We thank you for your prompt attention to this
notification.

Please understand that this is a security measure intended to help  
protect your
Mail Administrator Webmail account.

We apologize for any inconvenience.
MA-CENTRAL-YOUR COMMUNICATIONS PORTAL HELP DESK



^ permalink raw reply

* Re: [PATCH] net: Adjust softirq raising in __napi_schedule
From: Tilman Schmidt @ 2009-10-27  0:52 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Jarek Poplawski, David Miller, hidave.darkstar, linux-kernel,
	tglx, linux-wireless, linux-ppp, netdev, paulus, Michael Buesch,
	Oliver Hartkopp
In-Reply-To: <1256547380.28230.17.camel@johannes.local>

[-- Attachment #1: Type: text/plain, Size: 2335 bytes --]

Am 26.10.2009 09:56 schrieb Johannes Berg:
> On Mon, 2009-10-26 at 10:47 +0200, Tilman Schmidt wrote:
> 
>>> Basically it boils down to using netif_rx() when in (soft)irq, and
>>> netif_rx_ni() when in process context. That could just be an
>>> optimisation, but it's a very valid one.
>> Hmmm. That seems to contradict your earlier statement to me that
>> simply replacing a call to netif_rx() by one to netif_rx_ni()
>> when not in interrupt context isn't a valid solution either.
>> What am I missing?
> 
> Well, I think you misunderstood me. It would be correct to do this, if
> and only if the code that calls it doesn't need the extra guarantee.

I see. Thanks for the clarification.

> Any code (say ISDN code) that calls netif_rx() is clearly assuming to
> always be running in (soft)irq context, otherwise it couldn't call
> netif_rx() unconditionally. Agree so far?

Well, in fact I'm not sure. :-) All I know is that in the ISDN case, no
such assumption is explicitly stated anywhere. (The code in question is
called from the rcvcallb_skb() callback method which the hardware driver
calls when data has been received, and the description of that method in
Documentation/isdn/INTERFACE does not say anything about the context in
which it may be called.) The relevant code in drivers/isdn/i4l/isdn_ppp.c
is rather old, perhaps even older than softirqs and the netif_rx() /
netif_rx_ni() split. (Bear in mind that we are talking about the old
ISDN4Linux subsystem which initially didn't even make it into the 2.6
series because it was considered obsolete.) It seems quite possible to me
that just no one ever thought about that question.

> So now if you change the ISDN code to call netif_rx_ni(), you've changed
> the assumption that the ISDN code makes -- that it is running in
> (soft)irq context. Therefore, you need to verify that this is actually a
> correct change, which is what I tried to say.

Understood. However, the fact that the local_softirq_pending message is
appearing would seem to indicate that this assumption was wrong to
begin with, wouldn't it?

Thanks,
Tilman

-- 
Tilman Schmidt                    E-Mail: tilman@imap.cc
Bonn, Germany
Diese Nachricht besteht zu 100% aus wiederverwerteten Bits.
Ungeöffnet mindestens haltbar bis: (siehe Rückseite)


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 254 bytes --]

^ permalink raw reply

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: David Miller @ 2009-10-27  1:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: opurdila, krkumar2, hagen, netdev
In-Reply-To: <4AE5B84E.8040505@gmail.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 26 Oct 2009 15:55:10 +0100

> But should we really care ?

The only thing I see consistently in this thread is that
jhash performs consistently well and without any tweaking.

And without any assumptions about the characteristics of
the device names.  I've seen everything from the traditional
"eth%d" to things like "davem_is_a_prick%d" so you really cannot
optimize for anything in particular.

Jenkins is ~50 cycles per round of 4 bytes last time I checked, give
or take, and that was on crappy sparc. :-) So the execution cost is
really not that bad, contrary to what I've seen claimed as an argument
against using jhash here.

And if I-cache footprint is really an issue, we can have one
out-of-line expansion of jhash somewhere under lib/ since we use jhash
in so many places these days.

^ permalink raw reply

* Re: [PATCH] virtio-net: fix data corruption with OOM
From: David Miller @ 2009-10-27  1:27 UTC (permalink / raw)
  To: mst; +Cc: rusty, virtualization, kvm, netdev
In-Reply-To: <20091026090713.GA23510@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 26 Oct 2009 11:07:13 +0200

> Another, and hopefully the last, note, is that
> git-am can only handle Subject/From lines
> at the beginning of the message.
> So git style of the mail would be
 ...
> I think it's weird. We could invent some kind of separator
> that would make git-am accept Subject/From/Date lines in
> the middle of the message, so that discussion can come before
> the description. Worth it?

There is no need for this.  patchwork handles this situation perfectly
and this is what I use to apply all networking patches.

Anything in a reply to a patch that looks like a signoff or ACK,
patchwork adds to the commit message in the mbox blob it spits out for
me.

^ permalink raw reply

* Re: [PATCH] vlan: allow VLAN ID 0 to be used
From: Eric Dumazet @ 2009-10-27  1:34 UTC (permalink / raw)
  To: David Miller; +Cc: benny+usenet, gertjan_hofman, mcarlson, netdev, kaber
In-Reply-To: <20091026.173232.33817336.davem@davemloft.net>

David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 26 Oct 2009 17:13:58 +0100
> 
>> [PATCH] vlan: allow VLAN ID 0 to be used
>>
>> We currently use a 16 bit field (vlan_tci) to store VLAN ID on a skb.
>>
>> 0 value is used a special value, meaning VLAN ID not set.
>> This forbids use of VLAN ID 0
>>
>> As VLAN ID is 12 bits, we can use high order bit as a flag, and
>> allow VLAN ID 0
>>
>> Reported-by: Gertjan Hofman <gertjan_hofman@yahoo.com>
>> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> This is going to need some more work.
> 
> IXGBE is already using the higher bits of ->vlan_tci internally,
> your change breaks that.
> 
> QLGE explicitly initializes skb->vlan_tci to zero, you'll need to make
> sure that's OK.
> 
> There is an explicit "if (skb->vlan_tci" (ie. zero vs. non-zero) test
> in net/core/dev.c:netif_receive_skb()
> 
> net/core/skbuff.c:__copy_skb_header() does a straight copy, you'll
> need to make sure that's still OK.
> 
> net/packet/af_packet.c:tpacket_rcv() and packet_recvmsg() report the
> skb->vlan_tci value to userspace, that's broken now as userspace
> doesn't expect that new bit to be there.

Thanks a lot David for this extended review and guidelines

I hope I did not miss another important thing in this second version.

Again, tested on tg3 only, and on net-next-2.6 tree

[PATCH] vlan: allow null VLAN ID to be used

We currently use a 16 bit field (vlan_tci) to store VLAN ID/PRIO on a skb.

Null value is used as a special value, meaning vlan tagging not enabled.
This forbids use of null vlan ID.

As pointed by David, some drivers use the 3 high order bits (PRIO)

As VLAN ID is 12 bits, we can use the remaining bit (CFI) as a flag, and
allow null VLAN ID.

In case future code really wants to use VLAN_CFI_MASK, we'll have to use
a bit outside of vlan_tci.

#define VLAN_PRIO_MASK         0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT        13
#define VLAN_CFI_MASK          0x1000 /* Canonical Format Indicator */
#define VLAN_TAG_PRESENT       VLAN_CFI_MASK
#define VLAN_VID_MASK          0x0fff /* VLAN Identifier */

Reported-by: Gertjan Hofman <gertjan_hofman@yahoo.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/if_vlan.h |   14 +++++++++-----
 net/8021q/vlan.h        |    2 +-
 net/8021q/vlan_dev.c    |    2 +-
 net/core/dev.c          |    2 +-
 net/packet/af_packet.c  |    5 +++--
 5 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 7ff9af1..8898cbe 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -63,7 +63,11 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
 	return (struct vlan_ethhdr *)skb_mac_header(skb);
 }
 
-#define VLAN_VID_MASK	0xfff
+#define VLAN_PRIO_MASK		0xe000 /* Priority Code Point */
+#define VLAN_PRIO_SHIFT		13
+#define VLAN_CFI_MASK		0x1000 /* Canonical Format Indicator */
+#define VLAN_TAG_PRESENT	VLAN_CFI_MASK
+#define VLAN_VID_MASK		0x0fff /* VLAN Identifier */
 
 /* found in socket.c */
 extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));
@@ -105,8 +109,8 @@ static inline void vlan_group_set_device(struct vlan_group *vg,
 	array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
 }
 
-#define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci)
-#define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci)
+#define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
+#define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
@@ -231,7 +235,7 @@ static inline struct sk_buff *__vlan_put_tag(struct sk_buff *skb, u16 vlan_tci)
 static inline struct sk_buff *__vlan_hwaccel_put_tag(struct sk_buff *skb,
 						     u16 vlan_tci)
 {
-	skb->vlan_tci = vlan_tci;
+	skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci;
 	return skb;
 }
 
@@ -284,7 +288,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
 					 u16 *vlan_tci)
 {
 	if (vlan_tx_tag_present(skb)) {
-		*vlan_tci = skb->vlan_tci;
+		*vlan_tci = vlan_tx_tag_get(skb);
 		return 0;
 	} else {
 		*vlan_tci = 0;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 82570bc..4ade5ed 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -89,7 +89,7 @@ static inline u32 vlan_get_ingress_priority(struct net_device *dev,
 {
 	struct vlan_dev_info *vip = vlan_dev_info(dev);
 
-	return vip->ingress_priority_map[(vlan_tci >> 13) & 0x7];
+	return vip->ingress_priority_map[(vlan_tci >> VLAN_PRIO_SHIFT) & 0x7];
 }
 
 #ifdef CONFIG_VLAN_8021Q_GVRP
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4198ec5..e370197 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -393,7 +393,7 @@ int vlan_dev_set_egress_priority(const struct net_device *dev,
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	struct vlan_priority_tci_mapping *mp = NULL;
 	struct vlan_priority_tci_mapping *np;
-	u32 vlan_qos = (vlan_prio << 13) & 0xE000;
+	u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK;
 
 	/* See if a priority mapping exists.. */
 	mp = vlan->egress_priority_map[skb_prio & 0xF];
diff --git a/net/core/dev.c b/net/core/dev.c
index fa88dcd..5ab8668 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2303,7 +2303,7 @@ int netif_receive_skb(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
+	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
 
 	/* if we've gotten here through NAPI, check netpoll */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ff752c6..33e68f2 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -79,6 +79,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/if_vlan.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -766,7 +767,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			getnstimeofday(&ts);
 		h.h2->tp_sec = ts.tv_sec;
 		h.h2->tp_nsec = ts.tv_nsec;
-		h.h2->tp_vlan_tci = skb->vlan_tci;
+		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
 		hdrlen = sizeof(*h.h2);
 		break;
 	default:
@@ -1493,7 +1494,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		aux.tp_snaplen = skb->len;
 		aux.tp_mac = 0;
 		aux.tp_net = skb_network_offset(skb);
-		aux.tp_vlan_tci = skb->vlan_tci;
+		aux.tp_vlan_tci = vlan_tx_tag_get(skb);
 
 		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
 	}

^ permalink raw reply related

* Re: [PATCH next-next-2.6] netdev: better dev_name_hash
From: Eric Dumazet @ 2009-10-27  1:40 UTC (permalink / raw)
  To: David Miller; +Cc: opurdila, krkumar2, hagen, netdev
In-Reply-To: <20091026.182429.55413765.davem@davemloft.net>

David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Mon, 26 Oct 2009 15:55:10 +0100
> 
>> But should we really care ?
> 
> The only thing I see consistently in this thread is that
> jhash performs consistently well and without any tweaking.
> 
> And without any assumptions about the characteristics of
> the device names.  I've seen everything from the traditional
> "eth%d" to things like "davem_is_a_prick%d" so you really cannot
> optimize for anything in particular.
> 
> Jenkins is ~50 cycles per round of 4 bytes last time I checked, give
> or take, and that was on crappy sparc. :-) So the execution cost is
> really not that bad, contrary to what I've seen claimed as an argument
> against using jhash here.
> 
> And if I-cache footprint is really an issue, we can have one
> out-of-line expansion of jhash somewhere under lib/ since we use jhash
> in so many places these days.

Well, since Stephen posted a generic patch on lkml, I suspect we'll take
the dcache hash anyway ?

But yes, last time I checked, jhash was pretty big, so an out-of-line
version is welcome :)

^ permalink raw reply

* Re: [PATCH] vlan: allow VLAN ID 0 to be used
From: David Miller @ 2009-10-27  1:41 UTC (permalink / raw)
  To: eric.dumazet; +Cc: benny+usenet, gertjan_hofman, mcarlson, netdev, kaber
In-Reply-To: <4AE64E41.5030607@gmail.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 27 Oct 2009 02:34:57 +0100

> I hope I did not miss another important thing in this second version.
> 
> Again, tested on tg3 only, and on net-next-2.6 tree
> 
> [PATCH] vlan: allow null VLAN ID to be used

Looks good, applied to net-next-2.6

Someone now needs to convert IXGBE to use VLAN_PRIO_MASK and
VLAN_PRIO_SHIFT instead of it's internal macros.

^ permalink raw reply

* Re: [PATCH 1/5] page allocator: Always wake kswapd when restarting an allocation attempt after direct reclaim failed
From: KOSAKI Motohiro @ 2009-10-27  2:42 UTC (permalink / raw)
  To: David Rientjes
  Cc: kosaki.motohiro, Mel Gorman, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, Mohamed Abbas,
	Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev, li
In-Reply-To: <alpine.DEB.2.00.0910260005500.15361@chino.kir.corp.google.com>

> On Mon, 26 Oct 2009, KOSAKI Motohiro wrote:
> 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index bf72055..5a27896 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1899,6 +1899,12 @@ rebalance:
> >  	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
> >  		/* Wait for some write requests to complete then retry */
> >  		congestion_wait(BLK_RW_ASYNC, HZ/50);
> > +
> > +		/*
> > +		 * While we wait congestion wait, Amount of free memory can
> > +		 * be changed dramatically. Thus, we kick kswapd again.
> > +		 */
> > +		wake_all_kswapd(order, zonelist, high_zoneidx);
> >  		goto rebalance;
> >  	}
> >  
> 
> We're blocking to finish writeback of the directly reclaimed memory, why 
> do we need to wake kswapd afterwards?

the same reason of "goto restart" case. that's my intention.
if following scenario occur, it is equivalent that we didn't call wake_all_kswapd().

  1. call congestion_wait()
  2. kswapd reclaimed lots memory and sleep
  3. another task consume lots memory
  4. wakeup from congestion_wait()

IOW, if we falled into __alloc_pages_slowpath(), we naturally expect
next page_alloc() don't fall into slowpath. however if kswapd end to
its work too early, this assumption isn't true.

Is this too pessimistic assumption?





--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 3/5] vmscan: Force kswapd to take notice faster when high-order watermarks are being hit
From: KOSAKI Motohiro @ 2009-10-27  2:42 UTC (permalink / raw)
  To: Mel Gorman
  Cc: kosaki.motohiro, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, David Rientjes,
	Mohamed Abbas, Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev
In-Reply-To: <1256221356-26049-4-git-send-email-mel@csn.ul.ie>

> When a high-order allocation fails, kswapd is kicked so that it reclaims
> at a higher-order to avoid direct reclaimers stall and to help GFP_ATOMIC
> allocations. Something has changed in recent kernels that affect the timing
> where high-order GFP_ATOMIC allocations are now failing with more frequency,
> particularly under pressure. This patch forces kswapd to notice sooner that
> high-order allocations are occuring.
> 
> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> ---
>  mm/vmscan.c |    9 +++++++++
>  1 files changed, 9 insertions(+), 0 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 64e4388..cd68109 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2016,6 +2016,15 @@ loop_again:
>  					priority != DEF_PRIORITY)
>  				continue;
>  
> +			/*
> +			 * Exit quickly to restart if it has been indicated
> +			 * that higher orders are required
> +			 */
> +			if (pgdat->kswapd_max_order > order) {
> +				all_zones_ok = 1;
> +				goto out;
> +			}
> +
>  			if (!zone_watermark_ok(zone, order,
>  					high_wmark_pages(zone), end_zone, 0))
>  				all_zones_ok = 0;

this is simplest patch and seems reasonable.
	Reviewed-by: KOSAKI Motohiro <kosaki.motohiro>


btw, now balance_pgdat() have too complex flow. at least Vincent was
confused it.
Then, I think kswap_max_order handling should move into balance_pgdat()
at later release.
the following patch addressed my proposed concept.



From 2c5be772f6db25a5ef82975960d0b5788736ec2b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 26 Oct 2009 23:25:29 +0900
Subject: [PATCH] kswapd_max_order handling move into balance_pgdat()

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
 mm/vmscan.c |   45 +++++++++++++++++++++------------------------
 1 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 64e4388..49001d3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1915,7 +1915,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat)
 {
 	int all_zones_ok;
 	int priority;
@@ -1928,7 +1928,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.may_swap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
-		.order = order,
+		.order = 0,
 		.mem_cgroup = NULL,
 		.isolate_pages = isolate_pages_global,
 	};
@@ -1938,6 +1938,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	 * free_pages == high_wmark_pages(zone).
 	 */
 	int temp_priority[MAX_NR_ZONES];
+	int order = 0;
+	int new_order;
 
 loop_again:
 	total_scanned = 0;
@@ -1945,6 +1947,11 @@ loop_again:
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
+	new_order = pgdat->kswapd_max_order;
+	pgdat->kswapd_max_order = 0;
+	if (order < new_order)
+		order = sc.order = new_order;
+
 	for (i = 0; i < pgdat->nr_zones; i++)
 		temp_priority[i] = DEF_PRIORITY;
 
@@ -2087,11 +2094,17 @@ out:
 
 		zone->prev_priority = temp_priority[i];
 	}
-	if (!all_zones_ok) {
-		cond_resched();
 
-		try_to_freeze();
+	cond_resched();
+	try_to_freeze();
 
+	/*
+	 * restart if someone wants a larger 'order' allocation
+	 */
+	if (order < pgdat->kswapd_max_order)
+		goto loop_again;
+
+	if (!all_zones_ok) {
 		/*
 		 * Fragmentation may mean that the system cannot be
 		 * rebalanced for high-order allocations in all zones.
@@ -2130,7 +2143,6 @@ out:
  */
 static int kswapd(void *p)
 {
-	unsigned long order;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 	DEFINE_WAIT(wait);
@@ -2160,32 +2172,17 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	order = 0;
 	for ( ; ; ) {
-		unsigned long new_order;
-
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
-		new_order = pgdat->kswapd_max_order;
-		pgdat->kswapd_max_order = 0;
-		if (order < new_order) {
-			/*
-			 * Don't sleep if someone wants a larger 'order'
-			 * allocation
-			 */
-			order = new_order;
-		} else {
-			if (!freezing(current))
-				schedule();
-
-			order = pgdat->kswapd_max_order;
-		}
+		if (!freezing(current))
+			schedule();
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
 		if (!try_to_freeze()) {
 			/* We can speed up thawing tasks if we don't call
 			 * balance_pgdat after returning from the refrigerator
 			 */
-			balance_pgdat(pgdat, order);
+			balance_pgdat(pgdat);
 		}
 	}
 	return 0;
-- 
1.6.2.5






--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH 4/5] page allocator: Pre-emptively wake kswapd when high-order watermarks are hit
From: KOSAKI Motohiro @ 2009-10-27  2:42 UTC (permalink / raw)
  To: Mel Gorman
  Cc: kosaki.motohiro, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, David Rientjes,
	Mohamed Abbas, Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev
In-Reply-To: <1256221356-26049-5-git-send-email-mel@csn.ul.ie>

> When a high-order allocation fails, kswapd is kicked so that it reclaims
> at a higher-order to avoid direct reclaimers stall and to help GFP_ATOMIC
> allocations. Something has changed in recent kernels that affect the timing
> where high-order GFP_ATOMIC allocations are now failing with more frequency,
> particularly under pressure.
> 
> This patch pre-emptively checks if watermarks have been hit after a
> high-order allocation completes successfully. If the watermarks have been
> reached, kswapd is woken in the hope it fixes the watermarks before the
> next GFP_ATOMIC allocation fails.
> 
> Warning, this patch is somewhat of a band-aid. If this makes a difference,
> it still implies that something has changed that is either causing more
> GFP_ATOMIC allocations to occur (such as the case with iwlagn wireless
> driver) or make them more likely to fail.

hmm, I'm confused. this description addressed generic high order allocation.
but, 

> 
> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> ---
>  mm/page_alloc.c |   33 ++++++++++++++++++++++-----------
>  1 files changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7f2aa3e..851df40 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1596,6 +1596,17 @@ try_next_zone:
>  	return page;
>  }
>  
> +static inline
> +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> +						enum zone_type high_zoneidx)
> +{
> +	struct zoneref *z;
> +	struct zone *zone;
> +
> +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> +		wakeup_kswapd(zone, order);
> +}
> +
>  static inline int
>  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
>  				unsigned long pages_reclaimed)
> @@ -1730,18 +1741,18 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,

__alloc_pages_high_priority() is only called if ALLOC_NO_WATERMARKS.
ALLOC_NO_WATERMARKS mean PF_MEMALLOC or TIF_MEMDIE and GFP_ATOMIC don't make
nested alloc_pages() (= don't make PF_MEMALLOC case). 
Then, I haven't understand why this patch improve iwlagn GFP_ATOMIC case.

hmm, maybe I missed something. I see the code again tommorow.


>  			congestion_wait(BLK_RW_ASYNC, HZ/50);
>  	} while (!page && (gfp_mask & __GFP_NOFAIL));
>  
> -	return page;
> -}
> -
> -static inline
> -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> -						enum zone_type high_zoneidx)
> -{
> -	struct zoneref *z;
> -	struct zone *zone;
> +	/*
> +	 * If after a high-order allocation we are now below watermarks,
> +	 * pre-emptively kick kswapd rather than having the next allocation
> +	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
> +	 * allocations or entering direct reclaim
> +	 */
> +	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
> +				preferred_zone->watermark[ALLOC_WMARK_LOW],
> +				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
> +		wake_all_kswapd(order, zonelist, high_zoneidx);
>  
> -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> -		wakeup_kswapd(zone, order);
> +	return page;
>  }
>  
>  static inline int
> -- 
> 1.6.3.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 5/5] ONLY-APPLY-IF-STILL-FAILING Revert 373c0a7e, 8aa7e847: Fix congestion_wait() sync/async vs read/write confusion
From: KOSAKI Motohiro @ 2009-10-27  2:42 UTC (permalink / raw)
  To: Mel Gorman
  Cc: kosaki.motohiro, Frans Pop, Jiri Kosina, Sven Geggus,
	Karol Lewandowski, Tobias Oetiker, Rafael J. Wysocki,
	David Miller, Reinette Chatre, Kalle Valo, David Rientjes,
	Mohamed Abbas, Jens Axboe, John W. Linville, Pekka Enberg,
	Bartlomiej Zolnierkiewicz, Greg Kroah-Hartman,
	Stephan von Krawczynski, Kernel Testers List, netdev
In-Reply-To: <1256221356-26049-6-git-send-email-mel@csn.ul.ie>

> Testing by Frans Pop indicates that in the 2.6.30..2.6.31 window at
> least that the commits 373c0a7e 8aa7e847 dramatically increased the
> number of GFP_ATOMIC failures that were occuring within a wireless
> driver. It was never isolated which of the changes was the exact problem
> and it's possible it has been fixed since. If problems are still
> occuring with GFP_ATOMIC in 2.6.31-rc5, then this patch should be
> applied to determine if the congestion_wait() callers are still broken.

Oops. no, please no.
8aa7e847 is regression fixing commit. this revert indicate the regression
occur again.
if we really need to revert it, we need to revert 1faa16d2287 too.
however, I doubt this commit really cause regression to iwlan. IOW,
I agree Jens.

I hope to try reproduce this problem on my test environment. Can anyone
please explain reproduce way?
Is special hardware necessary?


----------------------------------------------------
commit 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Thu Jul 9 14:52:32 2009 +0200

    Fix congestion_wait() sync/async vs read/write confusion

    Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke
    the bdi congestion wait queue logic, causing us to wait on congestion
    for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead.

    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Eric Dumazet @ 2009-10-27  2:45 UTC (permalink / raw)
  To: Stephen Hemminger, Al Viro
  Cc: Andrew Morton, Linus Torvalds, Octavian Purdila, netdev,
	linux-kernel
In-Reply-To: <20091026153656.25be4369@nehalam>

Stephen Hemminger <shemminger@vyatta.com>, Al Viro a écrit :
> --- a/include/linux/dcache.h	2009-10-26 14:58:45.220347300 -0700
> +++ b/include/linux/dcache.h	2009-10-26 15:12:15.004160122 -0700
> @@ -45,15 +45,28 @@ struct dentry_stat_t {
>  };
>  extern struct dentry_stat_t dentry_stat;
>  
> -/* Name hashing routines. Initial hash value */
> -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
> -#define init_name_hash()		0
> +/*
> + * Fowler / Noll / Vo (FNV) Hash
> + * see: http://www.isthe.com/chongo/tech/comp/fnv/
> + */
> +#ifdef CONFIG_64BIT
> +#define FNV_PRIME  1099511628211ull
> +#define FNV1_INIT  14695981039346656037ull
> +#else
> +#define FNV_PRIME  16777619u
> +#define FNV1_INIT  2166136261u
> +#endif
> +
> +#define init_name_hash()	FNV1_INIT
>  
> -/* partial hash update function. Assume roughly 4 bits per character */
> +/* partial hash update function. */
>  static inline unsigned long
> -partial_name_hash(unsigned long c, unsigned long prevhash)
> +partial_name_hash(unsigned char c, unsigned long prevhash)
>  {
> -	return (prevhash + (c << 4) + (c >> 4)) * 11;
> +	prevhash ^= c;
> +	prevhash *= FNV_PRIME;
> +
> +	return prevhash;
>  }
>  
>  /*

OK, but thats strlen(name) X (long,long) multiplies.

I suspect you tested on recent x86_64 cpu.

Some arches might have slow multiplies, no ?

jhash() (and others) are optimized by compiler to use basic and fast operations.
jhash operates on blocs of 12 chars per round, so it might be a pretty good choice once
out-of-line (because its pretty large and full_name_hash() is now used by
a lot of call sites)

Please provide your test program source, so that other can test on various arches.

Thanks

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-27  3:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Al Viro, Andrew Morton, Linus Torvalds, Octavian Purdila, netdev,
	linux-kernel
In-Reply-To: <4AE65EDE.8080605@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2064 bytes --]

On Tue, 27 Oct 2009 03:45:50 +0100
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> Stephen Hemminger <shemminger@vyatta.com>, Al Viro a écrit :
> > --- a/include/linux/dcache.h	2009-10-26 14:58:45.220347300 -0700
> > +++ b/include/linux/dcache.h	2009-10-26 15:12:15.004160122 -0700
> > @@ -45,15 +45,28 @@ struct dentry_stat_t {
> >  };
> >  extern struct dentry_stat_t dentry_stat;
> >  
> > -/* Name hashing routines. Initial hash value */
> > -/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
> > -#define init_name_hash()		0
> > +/*
> > + * Fowler / Noll / Vo (FNV) Hash
> > + * see: http://www.isthe.com/chongo/tech/comp/fnv/
> > + */
> > +#ifdef CONFIG_64BIT
> > +#define FNV_PRIME  1099511628211ull
> > +#define FNV1_INIT  14695981039346656037ull
> > +#else
> > +#define FNV_PRIME  16777619u
> > +#define FNV1_INIT  2166136261u
> > +#endif
> > +
> > +#define init_name_hash()	FNV1_INIT
> >  
> > -/* partial hash update function. Assume roughly 4 bits per character */
> > +/* partial hash update function. */
> >  static inline unsigned long
> > -partial_name_hash(unsigned long c, unsigned long prevhash)
> > +partial_name_hash(unsigned char c, unsigned long prevhash)
> >  {
> > -	return (prevhash + (c << 4) + (c >> 4)) * 11;
> > +	prevhash ^= c;
> > +	prevhash *= FNV_PRIME;
> > +
> > +	return prevhash;
> >  }
> >  
> >  /*
> 
> OK, but thats strlen(name) X (long,long) multiplies.
> 
> I suspect you tested on recent x86_64 cpu.
> 
> Some arches might have slow multiplies, no ?
> 
> jhash() (and others) are optimized by compiler to use basic and fast operations.
> jhash operates on blocs of 12 chars per round, so it might be a pretty good choice once
> out-of-line (because its pretty large and full_name_hash() is now used by
> a lot of call sites)
> 
> Please provide your test program source, so that other can test on various arches.
> 
> Thanks

long on i386 is 32 bits so it is 32 bit multiply.  There is also an optimization
that uses shift and adds.




-- 

[-- Attachment #2: hashtest.tar.bz2 --]
[-- Type: application/x-bzip, Size: 7585 bytes --]

^ permalink raw reply

* RE: [PATCH] TI DaVinci EMAC: Minor macro related updates
From: Chaithrika U S @ 2009-10-27  3:57 UTC (permalink / raw)
  To: 'Jean-Christophe PLAGNIOL-VILLARD'
  Cc: netdev, davem, davinci-linux-open-source
In-Reply-To: <20091026220722.GA23415@game.jcrosoft.org>

On Tue, Oct 27, 2009 at 03:37:22, Jean-Christophe PLAGNIOL-VILLARD wrote:
> On 16:25 Thu 01 Oct     , Chaithrika U S wrote:
> > Use BIT for macro definitions wherever possible, remove
> > unused and redundant macros.
> > 
> > Signed-off-by: Chaithrika U S <chaithrika@ti.com>
> > ---
> > Applies to Linus' kernel tree
> do you plan to send a new version soon?
> 
> as the current DaVinci EMAC does not build on the v2.6.32-rc5
> 
> Best Regards,
> J.
> 

DaVinci EMAC builds and work fine on Linus' tree and DaVinci GIT tree.
Can you please provide more info regarding the errors you are seeing?

Regards, 
Chaithrika



^ permalink raw reply

* Re: [PATCH v2 8/8] Document future removal of sysctl_tcp_* options
From: Bill Fink @ 2009-10-27  5:09 UTC (permalink / raw)
  To: Gilad Ben-Yossef
  Cc: Eric Dumazet, William Allen Simpson, netdev, Ilpo Järvinen
In-Reply-To: <4AE5C579.2070104@codefidence.com>

On Mon, 26 Oct 2009, Gilad Ben-Yossef wrote:

> Bill Fink wrote:
> 
> >> OK. It really sounds like we should go with my first suggestion: global 
> >> sysctl based kill switches, just as we have now and in addition, the 
> >> ability to kill TCP options per route. The TCP option will be used if 
> >> and only if both kill switches (global and per route) are not set.
> >>     
> >
> > This wording is confusing.  The global kill switch not being set
> > really means that the sysctl is set.  And this assumes the per-route
> > default is not set.  Correct?
> >   
> Now it is my turn to get confused, because I didn't understand your 
> question :-)

My question was just if I understood you correctly, which apparently
I did.

> What I suggest is to leave the sysctl exactly as they are now:
> 
> - You leave them be (value of 1), the respective TCP option is 
> supported. This is the default.
> - You turn them off (write 0), the respective TCP option is not supported.
> 
> What I suggest to *add* is the following ability:
> 
> - If you have the TCP option support turned on (default, value of one), 
> you can turn support for the option for a specific route using a ip 
> route option.

Should be "turn" -> "turn off" above.

> Hope that made it clearer.

Yes.

> >> What we achieve is:
> >>
> >> 1. Global kill switches work exactly as they do now, whether you use the 
> >> new per route options or not, so backwards compatible.
> >>
> >> 2. In addition, if the global kill switch is not in effect, you can also 
> >> kill the options on a per route basis.
> >>
> >> I'm going to send third version of the patch to this effect, minus the 
> >> new remote DoS possibility that Ilpo pointed out and leaving the global 
> >> sysctl kill switches be.
> >>
> >> If you like it, please ACK ;-)
> >>     
> >
> > IIUC this doesn't seem right to me.  I believe the global setting
> > should be a default and the route specific an override.  Your scheme
> > would mean that if I set a global option to disable timestamps, then
> > I couldn't enable timestamps on specific routes using the per route
> > setting.
> >   
> Yes. You understand my intention perfectly.
> 
> Let me try to explain why I believe this is the correct behavior to 
> implement:
> 
> 1. This is the closest thing to what we have now. Today you write 0 to 
> the sysctl and that TCP option is turned off globally. Period.  My 
> suggestion leaves this behavior as is now regardless if you've used per 
> route settings. The other way make a very subtle change in the meaning 
> of writing 0 to the sysctl.

Continuing to support existing behavior is the most important
consideration for me, so your intention for the per route settings
doesn't cause me any major grief.  I initially thought the per route
setting as an override to a global default setting was more intuitive
than your method of it being a disable switch only, but my thinking
has since evolved (see below).

> I believe very subtle changes to meaning of long established interfaces 
> is bad way to go. It's better to change interfaces on users, but it is 
> even worse to maek something that they have long used do something just 
> slightly different.

I didn't see how my suggestion changed existing behavior.  If you
didn't use the per route settings everything remained as it was
(or so I initially thought).  And if you were using a new feature
then you needed to be aware of its semantics.

> 2. If the per route options needs to be "default, of or off" instead of 
> "on or off", we'd need to move from 1 bit to store the option to, well 
> 2s bit in theory, but probably 32 bits in practice, since we can't use 
> RTAX_FEATURES any longer.
> 
> Yes, we can invent RTAX_FEATURES_TWO_BITS or some such, but I'd say that 
> is ugly :-)

I only intended a single bit for the global default and per route
settings.  The global default would just be enabled (1) or disabled(0).
The per route setting would be inherited from the global default setting,
and also would be just enabled(1) or disabled(0).

Explicitly setting a per route setting to enabled(1) or disabled(0)
would override the initial default setting.

However I didn't fully work through all the ramifications of this
idea, and I now realize this would entail some changes to the existing
behavior of the global setting, which I agree is unacceptable.

I therefore now believe your latest proposal is a better approach
for maintaining backward compatibility and avoiding any nasty
unexpected surprises to existing working configurations.

> 3. I believe that the scenario of needing to set the support of a TCP 
> option globally off and just turn it on for a specific route is not very 
> likely to be needed and losing it is a small price to pay for 1 + 2.

I agree this is probably an uncommon scenario.

> > And it also doesn't seem to address Eric's scenario.  If I understand
> > his concern correctly, what seems to be needed is a third global
> > reset value (not calling it a setting since the actual global setting
> > wouldn't be changed), which would reset any per-route override settings
> > to the global default setting.
> >
> >   
> Well, I do not believe this is what Eric meant (Eric?) but if it is then 
> I fail to see why
> to require from the per route TCP options switches what is not required 
> of any other
> route specific option already existing, since AFAIK we don't have a 
> "reset to default values" to the other options already supported.

I won't try and speak for Eric anymore.

> Having said all that, I have no issue with re-spinning the patch with 
> your suggestion.
> I don't feel all that much which is the correct way- I just want to get 
> as much feedback as possible
> since I'm suggesting to add a new user interface options and we all know 
> it is very hard to back peddle
> on those, so I'm trying to make sure to get enough feedback to do it 
> right the firs time.
> 
> So any feedback from anyone regarding favorite interface? it seems each 
> person fancy a different one :-)

I haven't checked the details of your patch, but I am now fine with
the concepts of the patch as you most recently presented them.

						-Bill

^ permalink raw reply

* Re: [PATCH] dcache: better name hash function
From: Stephen Hemminger @ 2009-10-27  5:19 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andrew Morton, Linus Torvalds, Octavian Purdila, netdev,
	linux-kernel, Al Viro
In-Reply-To: <9986527.24561256620662709.JavaMail.root@tahiti.vyatta.com>

One of the root causes of slowness in network usage
was my original choice of power of 2 for hash size, to avoid
a mod operation. It turns out if size is not a power of 2
the original algorithm works fairly well.

On slow cpu; with 10million entries and 211 hash size

Algorithm             Time       Ratio       Max   StdDev
string10             1.271871       1.00     47397   0.01
djb2                 1.406322       1.00     47452   0.12
SuperFastHash        1.422348       1.00     48400   1.99
string_hash31        1.424079       1.00     47437   0.08
jhash_string         1.459232       1.00     47954   1.01
sdbm                 1.499209       1.00     47499   0.22
fnv32                1.539341       1.00     47728   0.75
full_name_hash       1.556792       1.00     47412   0.04
string_hash17        1.719039       1.00     47413   0.05
pjw                  1.827365       1.00     47441   0.09
elf                  2.033545       1.00     47441   0.09
fnv64                2.199533       1.00     47666   0.53
crc                  5.705784       1.00     47913   0.95
md5_string           10.308376       1.00     47946   1.00
fletcher             1.418866       1.01     53189  18.65
adler32              2.842117       1.01     53255  18.79
kr_hash              1.175678       6.43    468517 507.44
xor                  1.114692      11.02    583189 688.96
lastchar             0.795316      21.10   1000000 976.02

How important is saving the one division, versus getting better
distribution.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox