Netdev List
 help / color / mirror / Atom feed
* [PATCH 16/31] netvm: INET reserves.
From: Suresh Jayaraman @ 2009-10-01 14:07 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Add reserves for INET.

The two big users seem to be the route cache and ip-fragment cache.

Reserve the route cache under generic RX reserve, its usage is bounded by
the high reclaim watermark, and thus does not need further accounting.

Reserve the ip-fragement caches under SKB data reserve, these add to the
SKB RX limit. By ensuring we can at least receive as much data as fits in
the reassmbly line we avoid fragment attack deadlocks.

Adds to the reserve tree:

  total network reserve
    network TX reserve
      protocol TX pages
    network RX reserve
+     IPv6 route cache
+     IPv4 route cache
      SKB data reserve
+       IPv6 fragment cache
+       IPv4 fragment cache

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/net/inet_frag.h  |    7 +++
 include/net/netns/ipv6.h |    4 ++
 net/ipv4/inet_fragment.c |    3 +
 net/ipv4/ip_fragment.c   |   86 +++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/route.c         |   70 +++++++++++++++++++++++++++++++++++++-
 net/ipv6/reassembly.c    |   85 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/route.c         |   77 ++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 325 insertions(+), 7 deletions(-)

Index: mmotm/net/ipv4/ip_fragment.c
===================================================================
--- mmotm.orig/net/ipv4/ip_fragment.c
+++ mmotm/net/ipv4/ip_fragment.c
@@ -42,6 +42,8 @@
 #include <linux/udp.h>
 #include <linux/inet.h>
 #include <linux/netfilter_ipv4.h>
+#include <linux/reserve.h>
+#include <linux/nsproxy.h>
 
 /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
  * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -599,6 +601,63 @@ int ip_defrag(struct sk_buff *skb, u32 u
 }
 
 #ifdef CONFIG_SYSCTL
+static int
+proc_dointvec_fragment(struct ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.frags.high_thresh);
+	ctl_table tmp = *table;
+	int new_bytes, ret;
+
+	mutex_lock(&net->ipv4.frags.lock);
+	if (write) {
+		tmp.data = &new_bytes;
+		table = &tmp;
+	}
+
+	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
+				new_bytes);
+		if (!ret)
+			net->ipv4.frags.high_thresh = new_bytes;
+	}
+	mutex_unlock(&net->ipv4.frags.lock);
+
+	return ret;
+}
+
+static int sysctl_intvec_fragment(struct ctl_table *table,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv4.frags.high_thresh);
+	int write = (newval && newlen);
+	ctl_table tmp = *table;
+	int new_bytes, ret;
+
+	mutex_lock(&net->ipv4.frags.lock);
+	if (write) {
+		tmp.data = &new_bytes;
+		table = &tmp;
+	}
+
+	ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
+				new_bytes);
+		if (!ret)
+			net->ipv4.frags.high_thresh = new_bytes;
+	}
+	mutex_unlock(&net->ipv4.frags.lock);
+
+	return ret;
+}
+
 static int zero;
 
 static struct ctl_table ip4_frags_ns_ctl_table[] = {
@@ -608,7 +667,8 @@ static struct ctl_table ip4_frags_ns_ctl
 		.data		= &init_net.ipv4.frags.high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= &proc_dointvec_fragment,
+		.strategy	= &sysctl_intvec_fragment,
 	},
 	{
 		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
@@ -711,6 +771,8 @@ static inline void ip4_frags_ctl_registe
 
 static int ipv4_frags_init_net(struct net *net)
 {
+	int ret;
+
 	/*
 	 * Fragment cache limits. We will commit 256K at one time. Should we
 	 * cross that limit we will prune down to 192K. This should cope with
@@ -728,11 +790,31 @@ static int ipv4_frags_init_net(struct ne
 
 	inet_frags_init_net(&net->ipv4.frags);
 
-	return ip4_frags_ns_ctl_register(net);
+	ret = ip4_frags_ns_ctl_register(net);
+	if (ret)
+		goto out_reg;
+
+	mem_reserve_init(&net->ipv4.frags.reserve, "IPv4 fragment cache",
+			&net_skb_reserve);
+	ret = mem_reserve_kmalloc_set(&net->ipv4.frags.reserve,
+			net->ipv4.frags.high_thresh);
+	if (ret)
+		goto out_reserve;
+
+	return 0;
+
+out_reserve:
+	mem_reserve_disconnect(&net->ipv4.frags.reserve);
+	ip4_frags_ns_ctl_unregister(net);
+out_reg:
+	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+
+	return ret;
 }
 
 static void ipv4_frags_exit_net(struct net *net)
 {
+	mem_reserve_disconnect(&net->ipv4.frags.reserve);
 	ip4_frags_ns_ctl_unregister(net);
 	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
 }
Index: mmotm/net/ipv6/reassembly.c
===================================================================
--- mmotm.orig/net/ipv6/reassembly.c
+++ mmotm/net/ipv6/reassembly.c
@@ -41,6 +41,7 @@
 #include <linux/random.h>
 #include <linux/jhash.h>
 #include <linux/skbuff.h>
+#include <linux/reserve.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -634,6 +635,63 @@ static struct inet6_protocol frag_protoc
 };
 
 #ifdef CONFIG_SYSCTL
+static int
+proc_dointvec_fragment(struct ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv6.frags.high_thresh);
+	ctl_table tmp = *table;
+	int new_bytes, ret;
+
+	mutex_lock(&net->ipv6.frags.lock);
+	if (write) {
+		tmp.data = &new_bytes;
+		table = &tmp;
+	}
+
+	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
+					      new_bytes);
+		if (!ret)
+			net->ipv6.frags.high_thresh = new_bytes;
+	}
+	mutex_unlock(&net->ipv6.frags.lock);
+
+	return ret;
+}
+
+static int sysctl_intvec_fragment(struct ctl_table *table,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv6.frags.high_thresh);
+	int write = (newval && newlen);
+	ctl_table tmp = *table;
+	int new_bytes, ret;
+
+	mutex_lock(&net->ipv6.frags.lock);
+	if (write) {
+		tmp.data = &new_bytes;
+		table = &tmp;
+	}
+
+	ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
+					      new_bytes);
+		if (!ret)
+			net->ipv6.frags.high_thresh = new_bytes;
+	}
+	mutex_unlock(&net->ipv6.frags.lock);
+
+	return ret;
+}
+
 static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{
 		.ctl_name	= NET_IPV6_IP6FRAG_HIGH_THRESH,
@@ -641,7 +699,8 @@ static struct ctl_table ip6_frags_ns_ctl
 		.data		= &init_net.ipv6.frags.high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= &proc_dointvec_fragment,
+		.strategy	= &sysctl_intvec_fragment,
 	},
 	{
 		.ctl_name	= NET_IPV6_IP6FRAG_LOW_THRESH,
@@ -750,17 +809,39 @@ static inline void ip6_frags_sysctl_unre
 
 static int ipv6_frags_init_net(struct net *net)
 {
+	int ret;
+
 	net->ipv6.frags.high_thresh = 256 * 1024;
 	net->ipv6.frags.low_thresh = 192 * 1024;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
 	inet_frags_init_net(&net->ipv6.frags);
 
-	return ip6_frags_ns_sysctl_register(net);
+	ret = ip6_frags_ns_sysctl_register(net);
+	if (ret)
+		goto out_reg;
+
+	mem_reserve_init(&net->ipv6.frags.reserve, "IPv6 fragment cache",
+			 &net_skb_reserve);
+	ret = mem_reserve_kmalloc_set(&net->ipv6.frags.reserve,
+				      net->ipv6.frags.high_thresh);
+	if (ret)
+		goto out_reserve;
+
+	return 0;
+
+out_reserve:
+	mem_reserve_disconnect(&net->ipv6.frags.reserve);
+	ip6_frags_ns_sysctl_unregister(net);
+out_reg:
+	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+
+	return ret;
 }
 
 static void ipv6_frags_exit_net(struct net *net)
 {
+	mem_reserve_disconnect(&net->ipv6.frags.reserve);
 	ip6_frags_ns_sysctl_unregister(net);
 	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
 }
Index: mmotm/net/ipv4/route.c
===================================================================
--- mmotm.orig/net/ipv4/route.c
+++ mmotm/net/ipv4/route.c
@@ -107,6 +107,7 @@
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
+#include <linux/reserve.h>
 
 #define RT_FL_TOS(oldflp) \
     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -271,6 +272,8 @@ static inline int rt_genid(struct net *n
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
+static struct mem_reserve ipv4_route_reserve;
+
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
 	struct seq_net_private p;
@@ -400,6 +403,61 @@ static int rt_cache_seq_show(struct seq_
 	return 0;
 }
 
+static struct mutex ipv4_route_lock;
+
+static int
+proc_dointvec_route(struct ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	ctl_table tmp = *table;
+	int new_size, ret;
+
+	mutex_lock(&ipv4_route_lock);
+	if (write) {
+		tmp.data = &new_size;
+		table = &tmp;
+	}
+
+	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
+				ipv4_dst_ops.kmem_cachep, new_size);
+		if (!ret)
+			ip_rt_max_size = new_size;
+	}
+	mutex_unlock(&ipv4_route_lock);
+
+	return ret;
+}
+
+static int sysctl_intvec_route(struct ctl_table *table,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen)
+{
+	int write = (newval && newlen);
+	ctl_table tmp = *table;
+	int new_size, ret;
+
+	mutex_lock(&ipv4_route_lock);
+	if (write) {
+		tmp.data = &new_size;
+		table = &tmp;
+	}
+
+	ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
+				ipv4_dst_ops.kmem_cachep, new_size);
+		if (!ret)
+			ip_rt_max_size = new_size;
+	}
+	mutex_unlock(&ipv4_route_lock);
+
+	return ret;
+}
+
 static const struct seq_operations rt_cache_seq_ops = {
 	.start  = rt_cache_seq_start,
 	.next   = rt_cache_seq_next,
@@ -3145,7 +3203,8 @@ static ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_max_size,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= &proc_dointvec_route,
+		.strategy	= &sysctl_intvec_route,
 	},
 	{
 		/*  Deprecated. Use gc_min_interval_ms */
@@ -3424,6 +3483,15 @@ int __init ip_rt_init(void)
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
+#ifdef CONFIG_PROCFS
+	mutex_init(&ipv4_route_lock);
+#endif
+
+	mem_reserve_init(&ipv4_route_reserve, "IPv4 route cache",
+			&net_rx_reserve);
+	mem_reserve_kmem_cache_set(&ipv4_route_reserve,
+			ipv4_dst_ops.kmem_cachep, ip_rt_max_size);
+
 	devinet_init();
 	ip_fib_init();
 
Index: mmotm/net/ipv6/route.c
===================================================================
--- mmotm.orig/net/ipv6/route.c
+++ mmotm/net/ipv6/route.c
@@ -37,6 +37,7 @@
 #include <linux/mroute6.h>
 #include <linux/init.h>
 #include <linux/if_arp.h>
+#include <linux/reserve.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/nsproxy.h>
@@ -2537,6 +2538,63 @@ int ipv6_sysctl_rtcache_flush(ctl_table
 		return -EINVAL;
 }
 
+static int
+proc_dointvec_route(struct ctl_table *table, int write, struct file *filp,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv6.sysctl.ip6_rt_max_size);
+	ctl_table tmp = *table;
+	int new_size, ret;
+
+	mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
+	if (write) {
+		tmp.data = &new_size;
+		table = &tmp;
+	}
+
+	ret = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
+				net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
+		if (!ret)
+			net->ipv6.sysctl.ip6_rt_max_size = new_size;
+	}
+	mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
+
+	return ret;
+}
+
+static int sysctl_intvec_route(struct ctl_table *table,
+		void __user *oldval, size_t __user *oldlenp,
+		void __user *newval, size_t newlen)
+{
+	struct net *net = container_of(table->data, struct net,
+				       ipv6.sysctl.ip6_rt_max_size);
+	int write = (newval && newlen);
+	ctl_table tmp = *table;
+	int new_size, ret;
+
+	mutex_lock(&net->ipv6.sysctl.ip6_rt_lock);
+	if (write) {
+		tmp.data = &new_size;
+		table = &tmp;
+	}
+
+	ret = sysctl_intvec(table, oldval, oldlenp, newval, newlen);
+
+	if (!ret && write) {
+		ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
+				net->ipv6.ip6_dst_ops->kmem_cachep, new_size);
+		if (!ret)
+			net->ipv6.sysctl.ip6_rt_max_size = new_size;
+	}
+	mutex_unlock(&net->ipv6.sysctl.ip6_rt_lock);
+
+	return ret;
+}
+
 ctl_table ipv6_route_table_template[] = {
 	{
 		.procname	=	"flush",
@@ -2559,7 +2617,8 @@ ctl_table ipv6_route_table_template[] =
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	&proc_dointvec_route,
+		.strategy	= 	&sysctl_intvec_route,
 	},
 	{
 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
@@ -2647,6 +2706,8 @@ struct ctl_table *ipv6_route_sysctl_init
 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
 	}
 
+	mutex_init(&net->ipv6.sysctl.ip6_rt_lock);
+
 	return table;
 }
 #endif
@@ -2700,6 +2761,14 @@ static int ip6_route_net_init(struct net
 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
 
+	mem_reserve_init(&net->ipv6.ip6_rt_reserve, "IPv6 route cache",
+			 &net_rx_reserve);
+	ret = mem_reserve_kmem_cache_set(&net->ipv6.ip6_rt_reserve,
+			net->ipv6.ip6_dst_ops->kmem_cachep,
+			net->ipv6.sysctl.ip6_rt_max_size);
+	if (ret)
+		goto out_reserve_fail;
+
 #ifdef CONFIG_PROC_FS
 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
@@ -2710,12 +2779,15 @@ static int ip6_route_net_init(struct net
 out:
 	return ret;
 
+out_reserve_fail:
+	mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	kfree(net->ipv6.ip6_blk_hole_entry);
 out_ip6_prohibit_entry:
 	kfree(net->ipv6.ip6_prohibit_entry);
 out_ip6_null_entry:
-	kfree(net->ipv6.ip6_null_entry);
 #endif
+	kfree(net->ipv6.ip6_null_entry);
 out_ip6_dst_ops:
 	release_net(net->ipv6.ip6_dst_ops->dst_net);
 	kfree(net->ipv6.ip6_dst_ops);
@@ -2728,6 +2800,7 @@ static void ip6_route_net_exit(struct ne
 	proc_net_remove(net, "ipv6_route");
 	proc_net_remove(net, "rt6_stats");
 #endif
+	mem_reserve_disconnect(&net->ipv6.ip6_rt_reserve);
 	kfree(net->ipv6.ip6_null_entry);
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	kfree(net->ipv6.ip6_prohibit_entry);
Index: mmotm/include/net/inet_frag.h
===================================================================
--- mmotm.orig/include/net/inet_frag.h
+++ mmotm/include/net/inet_frag.h
@@ -1,6 +1,9 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include <linux/reserve.h>
+#include <linux/mutex.h>
+
 struct netns_frags {
 	int			nqueues;
 	atomic_t		mem;
@@ -10,6 +13,10 @@ struct netns_frags {
 	int			timeout;
 	int			high_thresh;
 	int			low_thresh;
+
+	/* reserves */
+	struct mutex		lock;
+	struct mem_reserve	reserve;
 };
 
 struct inet_frag_queue {
Index: mmotm/net/ipv4/inet_fragment.c
===================================================================
--- mmotm.orig/net/ipv4/inet_fragment.c
+++ mmotm/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
 #include <linux/random.h>
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
+#include <linux/reserve.h>
 
 #include <net/inet_frag.h>
 
@@ -74,6 +75,8 @@ void inet_frags_init_net(struct netns_fr
 	nf->nqueues = 0;
 	atomic_set(&nf->mem, 0);
 	INIT_LIST_HEAD(&nf->lru_list);
+	mutex_init(&nf->lock);
+	mem_reserve_init(&nf->reserve, "IP fragement cache", NULL);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
 
Index: mmotm/include/net/netns/ipv6.h
===================================================================
--- mmotm.orig/include/net/netns/ipv6.h
+++ mmotm/include/net/netns/ipv6.h
@@ -24,6 +24,8 @@ struct netns_sysctl_ipv6 {
 	int ip6_rt_mtu_expires;
 	int ip6_rt_min_advmss;
 	int icmpv6_time;
+
+	struct mutex ip6_rt_lock;
 };
 
 struct netns_ipv6 {
@@ -55,6 +57,8 @@ struct netns_ipv6 {
 	struct sock             *ndisc_sk;
 	struct sock             *tcp_sk;
 	struct sock             *igmp_sk;
+
+	struct mem_reserve	ip6_rt_reserve;
 #ifdef CONFIG_IPV6_MROUTE
 	struct sock		*mroute6_sk;
 	struct mfc6_cache	**mfc6_cache_array;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 17/31] Fix initialization of ipv4_route_lock
From: Suresh Jayaraman @ 2009-10-01 14:08 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Jeff Mahoney, Suresh Jayaraman

From: Jeff Mahoney <jeffm@suse.com>

 It's CONFIG_PROC_FS, not CONFIG_PROCFS.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 net/ipv4/route.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: mmotm/net/ipv4/route.c
===================================================================
--- mmotm.orig/net/ipv4/route.c
+++ mmotm/net/ipv4/route.c
@@ -3483,7 +3483,7 @@ int __init ip_rt_init(void)
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
-#ifdef CONFIG_PROCFS
+#ifdef CONFIG_PROC_FS
 	mutex_init(&ipv4_route_lock);
 #endif
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 18/31] netvm: hook skb allocation to reserves
From: Suresh Jayaraman @ 2009-10-01 14:08 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Change the skb allocation api to indicate RX usage and use this to fall back to
the reserve when needed. SKBs allocated from the reserve are tagged in
skb->emergency.

Teach all other skb ops about emergency skbs and the reserve accounting.

Use the (new) packet split API to allocate and track fragment pages from the
emergency reserve. Do this using an atomic counter in page->index. This is
needed because the fragments have a different sharing semantic than that
indicated by skb_shinfo()->dataref.

Note that the decision to distinguish between regular and emergency SKBs allows
the accounting overhead to be limited to the later kind.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/linux/mm_types.h |    1 
 include/linux/skbuff.h   |   28 +++++++--
 net/core/skbuff.c        |  137 +++++++++++++++++++++++++++++++++++++----------
 3 files changed, 133 insertions(+), 33 deletions(-)

Index: mmotm/include/linux/mm_types.h
===================================================================
--- mmotm.orig/include/linux/mm_types.h
+++ mmotm/include/linux/mm_types.h
@@ -78,6 +78,7 @@ struct page {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
 		int reserve;		/* page_alloc: page is a reserve page */
+		atomic_t frag_count;	/* skb fragment use count */
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
Index: mmotm/include/linux/skbuff.h
===================================================================
--- mmotm.orig/include/linux/skbuff.h
+++ mmotm/include/linux/skbuff.h
@@ -384,8 +384,10 @@ struct sk_buff {
 	__u8			do_not_encrypt:1;
 #endif
 	kmemcheck_bitfield_end(flags2);
-
-	/* 0/13/14 bit hole */
+#ifdef CONFIG_NETVM
+	__u8			emergency:1;
+#endif
+	/* 0/14 bit hole */
 
 #ifdef CONFIG_NET_DMA
 	dma_cookie_t		dma_cookie;
@@ -426,6 +428,18 @@ extern void skb_dma_unmap(struct device
 			  enum dma_data_direction dir);
 #endif
 
+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
+static inline bool skb_emergency(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NETVM
+	return unlikely(skb->emergency);
+#else
+	return false;
+#endif
+}
+
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
 	return (struct dst_entry *)skb->_skb_dst;
@@ -445,7 +459,7 @@ extern void kfree_skb(struct sk_buff *sk
 extern void consume_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone, int node);
+				   gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -455,7 +469,7 @@ static inline struct sk_buff *alloc_skb(
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1, -1);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1466,7 +1480,8 @@ static inline void __skb_queue_purge(str
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb =
+		__alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
 	return skb;
@@ -1497,6 +1512,7 @@ static inline struct sk_buff *netdev_all
 }
 
 extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
+extern void __netdev_free_page(struct net_device *dev, struct page *page);
 
 /**
  *	netdev_alloc_page - allocate a page for ps-rx on a specific device
@@ -1513,7 +1529,7 @@ static inline struct page *netdev_alloc_
 
 static inline void netdev_free_page(struct net_device *dev, struct page *page)
 {
-	__free_page(page);
+	__netdev_free_page(dev, page);
 }
 
 /**
Index: mmotm/net/core/skbuff.c
===================================================================
--- mmotm.orig/net/core/skbuff.c
+++ mmotm/net/core/skbuff.c
@@ -170,23 +170,29 @@ EXPORT_SYMBOL(skb_under_panic);
  *	%GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	int emergency = 0;
+	int memalloc = sk_memalloc_socks();
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	size = SKB_DATA_ALIGN(size);
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (memalloc && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
 	if (!skb)
 		goto out;
 
-	size = SKB_DATA_ALIGN(size);
-	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-			gfp_mask, node);
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, node, &net_skb_reserve, &emergency);
 	if (!data)
 		goto nodata;
 
@@ -196,6 +202,9 @@ struct sk_buff *__alloc_skb(unsigned int
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
+#ifdef CONFIG_NETVM
+	skb->emergency = emergency;
+#endif
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -220,7 +229,7 @@ struct sk_buff *__alloc_skb(unsigned int
 	skb_frag_list_init(skb);
 	memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -230,6 +239,9 @@ struct sk_buff *__alloc_skb(unsigned int
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+#ifdef CONFIG_NETVM
+		child->emergency = skb->emergency;
+#endif
 	}
 out:
 	return skb;
@@ -259,7 +271,7 @@ struct sk_buff *__netdev_alloc_skb(struc
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct sk_buff *skb;
 
-	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -273,11 +285,19 @@ struct page *__netdev_alloc_page(struct
 	int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1;
 	struct page *page;
 
-	page = alloc_pages_node(node, gfp_mask, 0);
+	page = alloc_pages_reserve(node, gfp_mask | __GFP_MEMALLOC, 0,
+			&net_skb_reserve, NULL);
+
 	return page;
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+void __netdev_free_page(struct net_device *dev, struct page *page)
+{
+	free_pages_reserve(page, 0, &net_skb_reserve, page->reserve);
+}
+EXPORT_SYMBOL(__netdev_free_page);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		int size)
 {
@@ -285,6 +305,27 @@ void skb_add_rx_frag(struct sk_buff *skb
 	skb->len += size;
 	skb->data_len += size;
 	skb->truesize += size;
+
+#ifdef CONFIG_NETVM
+	/*
+	 * In the rare case that skb_emergency() != page->reserved we'll
+	 * skew the accounting slightly, but since its only a 'small' constant
+	 * shift its ok.
+	 */
+	if (skb_emergency(skb)) {
+		/*
+		 * We need to track fragment pages so that we properly
+		 * release their reserve in skb_put_page().
+		 */
+		atomic_set(&page->frag_count, 1);
+	} else if (unlikely(page->reserve)) {
+		/*
+		 * Release the reserve now, because normal skbs don't
+		 * do the emergency accounting.
+		 */
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	}
+#endif
 }
 EXPORT_SYMBOL(skb_add_rx_frag);
 
@@ -336,21 +377,38 @@ static void skb_clone_fraglist(struct sk
 		skb_get(list);
 }
 
+static void skb_get_page(struct sk_buff *skb, struct page *page)
+{
+	get_page(page);
+	if (skb_emergency(skb))
+		atomic_inc(&page->frag_count);
+}
+
+static void skb_put_page(struct sk_buff *skb, struct page *page)
+{
+	if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count))
+		mem_reserve_pages_charge(&net_skb_reserve, -1);
+	put_page(page);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
+
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
-			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-				put_page(skb_shinfo(skb)->frags[i].page);
+			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+				skb_put_page(skb,
+					     skb_shinfo(skb)->frags[i].page);
+			}
 		}
 
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
 
-		kfree(skb->head);
+		kfree_reserve(skb->head, &net_skb_reserve, skb_emergency(skb));
 	}
 }
 
@@ -544,6 +602,9 @@ static void __copy_skb_header(struct sk_
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	new->ipvs_property	= old->ipvs_property;
 #endif
+#ifdef CONFIG_NETVM
+	new->emergency		= old->emergency;
+#endif
 	new->protocol		= old->protocol;
 	new->mark		= old->mark;
 	new->iif		= old->iif;
@@ -641,6 +702,9 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (skb_emergency(skb))
+			gfp_mask |= __GFP_MEMALLOC;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -677,6 +741,14 @@ static void copy_skb_header(struct sk_bu
 	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
 }
 
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+	if (skb_emergency(skb))
+		return SKB_ALLOC_RX;
+
+	return 0;
+}
+
 /**
  *	skb_copy	-	create private copy of an sk_buff
  *	@skb: buffer to copy
@@ -697,15 +769,17 @@ static void copy_skb_header(struct sk_bu
 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
+	int size;
 	/*
 	 *	Allocate the copy buffer
 	 */
 	struct sk_buff *n;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end + skb->data_len, gfp_mask);
+	size = skb->end + skb->data_len;
 #else
-	n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask);
+	size = skb->end - skb->head + skb->data_len;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		return NULL;
 
@@ -740,12 +814,14 @@ struct sk_buff *pskb_copy(struct sk_buff
 	/*
 	 *	Allocate the copy buffer
 	 */
+	int size;
 	struct sk_buff *n;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-	n = alloc_skb(skb->end, gfp_mask);
+	size = skb->end;
 #else
-	n = alloc_skb(skb->end - skb->head, gfp_mask);
+	size = skb->end - skb->head;
 #endif
+	n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), -1);
 	if (!n)
 		goto out;
 
@@ -764,8 +840,9 @@ struct sk_buff *pskb_copy(struct sk_buff
 		int i;
 
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
-			get_page(skb_shinfo(n)->frags[i].page);
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			skb_shinfo(n)->frags[i] = *frag;
+			skb_get_page(n, frag->page);
 		}
 		skb_shinfo(n)->nr_frags = i;
 	}
@@ -816,7 +893,11 @@ int pskb_expand_head(struct sk_buff *skb
 
 	size = SKB_DATA_ALIGN(size);
 
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	if (skb_emergency(skb))
+		gfp_mask |= __GFP_MEMALLOC;
+
+	data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+			gfp_mask, -1, &net_skb_reserve, NULL);
 	if (!data)
 		goto nodata;
 
@@ -831,7 +912,7 @@ int pskb_expand_head(struct sk_buff *skb
 	       sizeof(struct skb_shared_info));
 
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-		get_page(skb_shinfo(skb)->frags[i].page);
+		skb_get_page(skb, skb_shinfo(skb)->frags[i].page);
 
 	if (skb_has_frags(skb))
 		skb_clone_fraglist(skb);
@@ -912,8 +993,8 @@ struct sk_buff *skb_copy_expand(const st
 	/*
 	 *	Allocate the copy buffer
 	 */
-	struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
-				      gfp_mask);
+	struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+					gfp_mask, skb_alloc_rx_flag(skb), -1);
 	int oldheadroom = skb_headroom(skb);
 	int head_copy_len, head_copy_off;
 	int off;
@@ -1105,7 +1186,7 @@ drop_pages:
 		skb_shinfo(skb)->nr_frags = i;
 
 		for (; i < nfrags; i++)
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
 
 		if (skb_has_frags(skb))
 			skb_drop_fraglist(skb);
@@ -1274,7 +1355,7 @@ pull_pages:
 	k = 0;
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		if (skb_shinfo(skb)->frags[i].size <= eat) {
-			put_page(skb_shinfo(skb)->frags[i].page);
+			skb_put_page(skb, skb_shinfo(skb)->frags[i].page);
 			eat -= skb_shinfo(skb)->frags[i].size;
 		} else {
 			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -2052,6 +2133,7 @@ static inline void skb_split_no_header(s
 			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
 
 			if (pos < len) {
+				struct page *page = skb_shinfo(skb)->frags[i].page;
 				/* Split frag.
 				 * We have two variants in this case:
 				 * 1. Move all the frag to the second
@@ -2060,7 +2142,7 @@ static inline void skb_split_no_header(s
 				 *    where splitting is expensive.
 				 * 2. Split is accurately. We make this.
 				 */
-				get_page(skb_shinfo(skb)->frags[i].page);
+				skb_get_page(skb1, page);
 				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
 				skb_shinfo(skb1)->frags[0].size -= len - pos;
 				skb_shinfo(skb)->frags[i].size	= len - pos;
@@ -2559,8 +2641,9 @@ struct sk_buff *skb_segment(struct sk_bu
 			skb_release_head_state(nskb);
 			__skb_push(nskb, doffset);
 		} else {
-			nskb = alloc_skb(hsize + doffset + headroom,
-					 GFP_ATOMIC);
+			nskb = __alloc_skb(hsize + doffset + headroom,
+					 GFP_ATOMIC, skb_alloc_rx_flag(skb),
+					 -1);
 
 			if (unlikely(!nskb))
 				goto err;
@@ -2602,7 +2685,7 @@ struct sk_buff *skb_segment(struct sk_bu
 
 		while (pos < offset + len && i < nfrags) {
 			*frag = skb_shinfo(skb)->frags[i];
-			get_page(frag->page);
+			skb_get_page(nskb, frag->page);
 			size = frag->size;
 
 			if (pos < offset) {

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 19/31] netvm: filter emergency skbs.
From: Suresh Jayaraman @ 2009-10-01 14:08 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Toss all emergency packets not for a SOCK_MEMALLOC socket. This ensures our
precious memory reserve doesn't get stuck waiting for user-space.

The correctness of this approach relies on the fact that networks must be
assumed lossy.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 net/core/filter.c |    3 +++
 1 file changed, 3 insertions(+)

Index: mmotm/net/core/filter.c
===================================================================
--- mmotm.orig/net/core/filter.c
+++ mmotm/net/core/filter.c
@@ -81,6 +81,9 @@ int sk_filter(struct sock *sk, struct sk
 	int err;
 	struct sk_filter *filter;
 
+	if (skb_emergency(skb) && !sk_has_memalloc(sk))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 20/31] netvm: prevent a stream specific deadlock
From: Suresh Jayaraman @ 2009-10-01 14:08 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers
from receiving data, which will prevent userspace from running, which is needed
to reduce the buffered data.

Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/net/sock.h   |    7 ++++---
 net/core/sock.c      |    2 +-
 net/ipv4/tcp_input.c |   12 ++++++------
 net/sctp/ulpevent.c  |    2 +-
 4 files changed, 12 insertions(+), 11 deletions(-)

Index: mmotm/include/net/sock.h
===================================================================
--- mmotm.orig/include/net/sock.h
+++ mmotm/include/net/sock.h
@@ -882,12 +882,13 @@ static inline int sk_wmem_schedule(struc
 		__sk_mem_schedule(sk, size, SK_MEM_SEND);
 }
 
-static inline int sk_rmem_schedule(struct sock *sk, int size)
+static inline int sk_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
 	if (!sk_has_account(sk))
 		return 1;
-	return size <= sk->sk_forward_alloc ||
-		__sk_mem_schedule(sk, size, SK_MEM_RECV);
+	return skb->truesize <= sk->sk_forward_alloc ||
+		__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV) ||
+		skb_emergency(skb);
 }
 
 static inline void sk_mem_reclaim(struct sock *sk)
Index: mmotm/net/core/sock.c
===================================================================
--- mmotm.orig/net/core/sock.c
+++ mmotm/net/core/sock.c
@@ -390,7 +390,7 @@ int sock_queue_rcv_skb(struct sock *sk,
 	if (err)
 		goto out;
 
-	if (!sk_rmem_schedule(sk, skb->truesize)) {
+	if (!sk_rmem_schedule(sk, skb)) {
 		err = -ENOBUFS;
 		goto out;
 	}
Index: mmotm/net/ipv4/tcp_input.c
===================================================================
--- mmotm.orig/net/ipv4/tcp_input.c
+++ mmotm/net/ipv4/tcp_input.c
@@ -4269,19 +4269,19 @@ static void tcp_ofo_queue(struct sock *s
 static int tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    !sk_rmem_schedule(sk, size)) {
+	    !sk_rmem_schedule(sk, skb)) {
 
 		if (tcp_prune_queue(sk) < 0)
 			return -1;
 
-		if (!sk_rmem_schedule(sk, size)) {
+		if (!sk_rmem_schedule(sk, skb)) {
 			if (!tcp_prune_ofo_queue(sk))
 				return -1;
 
-			if (!sk_rmem_schedule(sk, size))
+			if (!sk_rmem_schedule(sk, skb))
 				return -1;
 		}
 	}
@@ -4333,7 +4333,7 @@ static void tcp_data_queue(struct sock *
 		if (eaten <= 0) {
 queue_and_out:
 			if (eaten < 0 &&
-			    tcp_try_rmem_schedule(sk, skb->truesize))
+			    tcp_try_rmem_schedule(sk, skb))
 				goto drop;
 
 			skb_set_owner_r(skb, sk);
@@ -4404,7 +4404,7 @@ drop:
 
 	TCP_ECN_check_ce(tp, skb);
 
-	if (tcp_try_rmem_schedule(sk, skb->truesize))
+	if (tcp_try_rmem_schedule(sk, skb))
 		goto drop;
 
 	/* Disable header prediction. */
Index: mmotm/net/sctp/ulpevent.c
===================================================================
--- mmotm.orig/net/sctp/ulpevent.c
+++ mmotm/net/sctp/ulpevent.c
@@ -701,7 +701,7 @@ struct sctp_ulpevent *sctp_ulpevent_make
 	if (rx_count >= asoc->base.sk->sk_rcvbuf) {
 
 		if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
-		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
+		    (!sk_rmem_schedule(asoc->base.sk, chunk->skb)))
 			goto fail;
 	}
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 21/31] netfilter: NF_QUEUE vs emergency skbs
From: Suresh Jayaraman @ 2009-10-01 14:08 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Avoid memory getting stuck waiting for userspace, drop all emergency packets.
This of course requires the regular storage route to not include an NF_QUEUE
target ;-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 net/netfilter/core.c |    3 +++
 1 file changed, 3 insertions(+)

Index: mmotm/net/netfilter/core.c
===================================================================
--- mmotm.orig/net/netfilter/core.c
+++ mmotm/net/netfilter/core.c
@@ -175,9 +175,12 @@ next_hook:
 	if (verdict == NF_ACCEPT || verdict == NF_STOP) {
 		ret = 1;
 	} else if (verdict == NF_DROP) {
+drop:
 		kfree_skb(skb);
 		ret = -EPERM;
 	} else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
+		if (skb_emergency(skb))
+			goto drop;
 		if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
 			      verdict >> NF_VERDICT_BITS))
 			goto next_hook;

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 22/31] netvm: skb processing
From: Suresh Jayaraman @ 2009-10-01 14:09 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

In order to make sure emergency packets receive all memory needed to proceed
ensure processing of emergency SKBs happens under PF_MEMALLOC.

Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing.

Skip taps, since those are user-space again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/net/sock.h |    5 ++++
 net/core/dev.c     |   57 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 net/core/sock.c    |   16 ++++++++++++++
 3 files changed, 73 insertions(+), 5 deletions(-)

Index: mmotm/include/net/sock.h
===================================================================
--- mmotm.orig/include/net/sock.h
+++ mmotm/include/net/sock.h
@@ -619,8 +619,13 @@ static inline void sk_add_backlog(struct
 	skb->next = NULL;
 }
 
+extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	if (skb_emergency(skb))
+		return __sk_backlog_rcv(sk, skb);
+
 	return sk->sk_backlog_rcv(sk, skb);
 }
 
Index: mmotm/net/core/dev.c
===================================================================
--- mmotm.orig/net/core/dev.c
+++ mmotm/net/core/dev.c
@@ -2225,6 +2225,30 @@ void netif_nit_deliver(struct sk_buff *s
 	rcu_read_unlock();
 }
 
+/*
+ * Filter the protocols for which the reserves are adequate.
+ *
+ * Before adding a protocol make sure that it is either covered by the existing
+ * reserves, or add reserves covering the memory need of the new protocol's
+ * packet processing.
+ */
+static int skb_emergency_protocol(struct sk_buff *skb)
+{
+	if (skb_emergency(skb))
+		switch (skb->protocol) {
+		case __constant_htons(ETH_P_ARP):
+		case __constant_htons(ETH_P_IP):
+		case __constant_htons(ETH_P_IPV6):
+		case __constant_htons(ETH_P_8021Q):
+			break;
+
+		default:
+			return 0;
+		}
+
+	return 1;
+}
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
@@ -2247,13 +2271,26 @@ int netif_receive_skb(struct sk_buff *sk
 	struct net_device *null_or_orig;
 	int ret = NET_RX_DROP;
 	__be16 type;
+	unsigned long pflags = current->flags;
 
 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
 		return NET_RX_SUCCESS;
 
+	/* Emergency skb are special, they should
+	 *  - be delivered to SOCK_MEMALLOC sockets only
+	 *  - stay away from userspace
+	 *  - have bounded memory usage
+	 *
+	 * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
+	 * This saves us from propagating the allocation context down to all
+	 * allocation sites.
+	 */
+	if (skb_emergency(skb))
+		current->flags |= PF_MEMALLOC;
+
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
-		return NET_RX_DROP;
+		goto out;
 
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
@@ -2287,6 +2324,9 @@ int netif_receive_skb(struct sk_buff *sk
 	}
 #endif
 
+	if (skb_emergency(skb))
+		goto skip_taps;
+
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
 		    ptype->dev == orig_dev) {
@@ -2296,19 +2336,23 @@ int netif_receive_skb(struct sk_buff *sk
 		}
 	}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 ncls:
 #endif
 
+	if (!skb_emergency_protocol(skb))
+		goto drop;
+
 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype,
@@ -2325,6 +2369,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -2332,8 +2377,10 @@ ncls:
 		ret = NET_RX_DROP;
 	}
 
-out:
+unlock:
 	rcu_read_unlock();
+out:
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 	return ret;
 }
 
Index: mmotm/net/core/sock.c
===================================================================
--- mmotm.orig/net/core/sock.c
+++ mmotm/net/core/sock.c
@@ -315,6 +315,22 @@ int sk_clear_memalloc(struct sock *sk)
 	return set;
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	unsigned long pflags = current->flags;
+
+	/* these should have been dropped before queueing */
+	BUG_ON(!sk_has_memalloc(sk));
+
+	current->flags |= PF_MEMALLOC;
+	ret = sk->sk_backlog_rcv(sk, skb);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+	return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
 #endif
 
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 23/31] mm: add support for non block device backed swap files
From: Suresh Jayaraman @ 2009-10-01 14:09 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

New addres_space_operations methods are added:
  int swapon(struct file *);
  int swapoff(struct file *);
  int swap_out(struct file *, struct page *, struct writeback_control *);
  int swap_in(struct file *, struct page *);

When during sys_swapon() the ->swapon() method is found and returns no error
the swapper_space.a_ops will proxy to sis->swap_file->f_mapping->a_ops, and
make use of ->swap_{out,in}() to write/read swapcache pages.

The ->swapon() method will be used to communicate to the file that the VM
relies on it, and the address_space should take adequate measures (like
reserving memory for mempools or the like). The ->swapoff() method will be
called on sys_swapoff() when ->swapon() was found and returned no error.

This new interface can be used to obviate the need for ->bmap in the swapfile
code. A filesystem would need to load (and maybe even allocate) the full block
map for a file into memory and pin it there on ->swapon() so that
->swap_{out,in}() have instant access to it. It can be released on ->swapoff().

The reason to provide ->swap_{out,in}() over using {write,read}page() is to
 1) make a distinction between swapcache and pagecache pages, and
 2) to provide a struct file * for credential context (normally not needed
    in the context of writepage, as the page content is normally dirtied
    using either of the following interfaces:
      write_{begin,end}()
      {prepare,commit}_write()
      page_mkwrite()
    which do have the file context.

[miklos@szeredi.hu: cleanups]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 Documentation/filesystems/Locking |   22 ++++++++++++++++
 Documentation/filesystems/vfs.txt |   18 +++++++++++++
 include/linux/buffer_head.h       |    1 
 include/linux/fs.h                |    9 ++++++
 include/linux/swap.h              |    4 ++
 mm/page_io.c                      |   51 ++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c                   |    4 +-
 mm/swapfile.c                     |   30 ++++++++++++++++++++--
 8 files changed, 135 insertions(+), 4 deletions(-)

Index: mmotm/Documentation/filesystems/Locking
===================================================================
--- mmotm.orig/Documentation/filesystems/Locking
+++ mmotm/Documentation/filesystems/Locking
@@ -174,6 +174,10 @@ prototypes:
 	int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
 	int (*launder_page) (struct page *);
+	int (*swapon) (struct file *);
+	int (*swapoff) (struct file *);
+	int (*swap_out) (struct file *, struct page *, struct writeback_control *);
+	int (*swap_in)  (struct file *, struct page *);
 
 locking rules:
 	All except set_page_dirty may block
@@ -193,6 +197,10 @@ invalidatepage:		no	yes
 releasepage:		no	yes
 direct_IO:		no
 launder_page:		no	yes
+swapon			no
+swapoff			no
+swap_out		no	yes, unlocks
+swap_in			no	yes, unlocks
 
 	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 may be called from the request handler (/dev/loop).
@@ -292,6 +300,20 @@ cleaned, or an error value if not. Note
 getting mapped back in and redirtied, it needs to be kept locked
 across the entire operation.
 
+	->swapon() will be called with a non-zero argument on files backing
+(non block device backed) swapfiles. A return value of zero indicates success,
+in which case this file can be used for backing swapspace. The swapspace
+operations will be proxied to the address space operations.
+
+	->swapoff() will be called in the sys_swapoff() path when ->swapon()
+returned success.
+
+	->swap_out() when swapon() returned success, this method is used to
+write the swap page.
+
+	->swap_in() when swapon() returned success, this method is used to
+read the swap page.
+
 	Note: currently almost all instances of address_space methods are
 using BKL for internal serialization and that's one of the worst sources
 of contention. Normally they are calling library functions (in fs/buffer.c)
Index: mmotm/Documentation/filesystems/vfs.txt
===================================================================
--- mmotm.orig/Documentation/filesystems/vfs.txt
+++ mmotm/Documentation/filesystems/vfs.txt
@@ -536,6 +536,11 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
+	int (*swapon)(struct file *);
+	int (*swapoff)(struct file *);
+	int (*swap_out)(struct file *file, struct page *page,
+			struct writeback_control *wbc);
+	int (*swap_in)(struct file *file, struct page *page);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +699,19 @@ struct address_space_operations {
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
 
+  swapon: Called when swapon is used on a file. A
+	return value of zero indicates success, in which case this
+	file can be used to back swapspace. The swapspace operations
+	will be proxied to this address space's ->swap_{out,in} methods.
+
+  swapoff: Called during swapoff on files where swapon was successfull.
+
+  swap_out: Called to write a swapcache page to a backing store, similar to
+	writepage.
+
+  swap_in: Called to read a swapcache page from a backing store, similar to
+	readpage.
+
 The File Object
 ===============
 
Index: mmotm/include/linux/buffer_head.h
===================================================================
--- mmotm.orig/include/linux/buffer_head.h
+++ mmotm/include/linux/buffer_head.h
@@ -339,6 +339,7 @@ static inline int inode_has_buffers(stru
 static inline void invalidate_inode_buffers(struct inode *inode) {}
 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void block_sync_page(struct page *) { }
 
 #endif /* CONFIG_BLOCK */
 #endif /* _LINUX_BUFFER_HEAD_H */
Index: mmotm/include/linux/fs.h
===================================================================
--- mmotm.orig/include/linux/fs.h
+++ mmotm/include/linux/fs.h
@@ -595,6 +595,15 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+
+	/*
+	 * swapfile support
+	 */
+	int (*swapon)(struct file *file);
+	int (*swapoff)(struct file *file);
+	int (*swap_out)(struct file *file, struct page *page,
+			struct writeback_control *wbc);
+	int (*swap_in)(struct file *file, struct page *page);
 };
 
 /*
Index: mmotm/include/linux/swap.h
===================================================================
--- mmotm.orig/include/linux/swap.h
+++ mmotm/include/linux/swap.h
@@ -120,6 +120,7 @@ struct swap_extent {
 enum {
 	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
 	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
+	SWP_FILE	= (1 << 2),	/* file swap area */
 	SWP_DISCARDABLE = (1 << 2),	/* blkdev supports discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
@@ -258,6 +259,8 @@ extern void swap_unplug_io_fn(struct bac
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void swap_sync_page(struct page *page);
+extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);
 
 /* linux/mm/swap_state.c */
@@ -293,6 +296,7 @@ extern unsigned int count_swap_pages(int
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
+extern struct swap_info_struct *page_swap_info(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
Index: mmotm/mm/page_io.c
===================================================================
--- mmotm.orig/mm/page_io.c
+++ mmotm/mm/page_io.c
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/swapops.h>
 #include <linux/writeback.h>
+#include <linux/buffer_head.h>
 #include <asm/pgtable.h>
 
 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -97,11 +98,23 @@ int swap_writepage(struct page *page, st
 {
 	struct bio *bio;
 	int ret = 0, rw = WRITE;
+	struct swap_info_struct *sis = page_swap_info(page);
 
 	if (try_to_free_swap(page)) {
 		unlock_page(page);
 		goto out;
 	}
+
+	if (sis->flags & SWP_FILE) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		ret = mapping->a_ops->swap_out(swap_file, page, wbc);
+		if (!ret)
+			count_vm_event(PSWPOUT);
+		return ret;
+	}
+
 	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
 				end_swap_bio_write);
 	if (bio == NULL) {
@@ -120,13 +133,51 @@ out:
 	return ret;
 }
 
+void swap_sync_page(struct page *page)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+
+	if (sis->flags & SWP_FILE) {
+		struct address_space *mapping = sis->swap_file->f_mapping;
+
+		if (mapping->a_ops->sync_page)
+			mapping->a_ops->sync_page(page);
+	} else {
+		block_sync_page(page);
+	}
+}
+
+int swap_set_page_dirty(struct page *page)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+
+	if (sis->flags & SWP_FILE) {
+		struct address_space *mapping = sis->swap_file->f_mapping;
+
+		return mapping->a_ops->set_page_dirty(page);
+	} else {
+		return __set_page_dirty_nobuffers(page);
+	}
+}
+
 int swap_readpage(struct page *page)
 {
 	struct bio *bio;
 	int ret = 0;
+	struct swap_info_struct *sis = page_swap_info(page);
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageUptodate(page));
+
+	if (sis->flags & SWP_FILE) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		ret = mapping->a_ops->swap_in(swap_file, page);
+		if (!ret)
+			count_vm_event(PSWPIN);
+		return ret;
+	}
 	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
 				end_swap_bio_read);
 	if (bio == NULL) {
Index: mmotm/mm/swap_state.c
===================================================================
--- mmotm.orig/mm/swap_state.c
+++ mmotm/mm/swap_state.c
@@ -28,8 +28,8 @@
  */
 static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
-	.sync_page	= block_sync_page,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.sync_page	= swap_sync_page,
+	.set_page_dirty	= swap_set_page_dirty,
 	.migratepage	= migrate_page,
 };
 
Index: mmotm/mm/swapfile.c
===================================================================
--- mmotm.orig/mm/swapfile.c
+++ mmotm/mm/swapfile.c
@@ -1335,6 +1335,14 @@ static void destroy_swap_extents(struct
 		list_del(&se->list);
 		kfree(se);
 	}
+
+	if (sis->flags & SWP_FILE) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		sis->flags &= ~SWP_FILE;
+		mapping->a_ops->swapoff(swap_file);
+	}
 }
 
 /*
@@ -1409,7 +1417,9 @@ add_swap_extent(struct swap_info_struct
  */
 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 {
-	struct inode *inode;
+	struct file *swap_file = sis->swap_file;
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
 	unsigned blocks_per_page;
 	unsigned long page_no;
 	unsigned blkbits;
@@ -1420,13 +1430,22 @@ static int setup_swap_extents(struct swa
 	int nr_extents = 0;
 	int ret;
 
-	inode = sis->swap_file->f_mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
 		goto done;
 	}
 
+	if (mapping->a_ops->swapon) {
+		ret = mapping->a_ops->swapon(swap_file);
+		if (!ret) {
+			sis->flags |= SWP_FILE;
+			ret = add_swap_extent(sis, 0, sis->max, 0);
+			*span = sis->pages;
+		}
+		goto done;
+	}
+
 	blkbits = inode->i_blkbits;
 	blocks_per_page = PAGE_SIZE >> blkbits;
 
@@ -2163,6 +2182,13 @@ get_swap_info_struct(unsigned type)
 	return &swap_info[type];
 }
 
+struct swap_info_struct *page_swap_info(struct page *page)
+{
+	swp_entry_t swap = { .val = page_private(page) };
+	BUG_ON(!PageSwapCache(page));
+	return &swap_info[swp_type(swap)];
+}
+
 /*
  * swap_lock prevents swap_map being freed. Don't grab an extra
  * reference on the swaphandle, it doesn't matter if it becomes unused.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 24/31] mm: methods for teaching filesystems about PG_swapcache pages
From: Suresh Jayaraman @ 2009-10-01 14:09 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

In order to teach filesystems to handle swap cache pages, three new page
functions are introduced:

  pgoff_t page_file_index(struct page *);
  loff_t page_file_offset(struct page *);
  struct address_space *page_file_mapping(struct page *);

page_file_index() - gives the offset of this page in the file in
PAGE_CACHE_SIZE blocks. Like page->index is for mapped pages, this function
also gives the correct index for PG_swapcache pages.

page_file_offset() - uses page_file_index(), so that it will give the expected
result, even for PG_swapcache pages.

page_file_mapping() - gives the mapping backing the actual page; that is for
swap cache pages it will give swap_file->f_mapping.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/linux/mm.h      |   25 +++++++++++++++++++++++++
 include/linux/pagemap.h |    5 +++++
 mm/swapfile.c           |   19 +++++++++++++++++++
 3 files changed, 49 insertions(+)

Index: mmotm/include/linux/mm.h
===================================================================
--- mmotm.orig/include/linux/mm.h
+++ mmotm/include/linux/mm.h
@@ -634,6 +634,17 @@ static inline struct address_space *page
 	return mapping;
 }
 
+extern struct address_space *__page_file_mapping(struct page *);
+
+static inline
+struct address_space *page_file_mapping(struct page *page)
+{
+	if (unlikely(PageSwapCache(page)))
+		return __page_file_mapping(page);
+
+	return page->mapping;
+}
+
 static inline int PageAnon(struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
@@ -650,6 +661,20 @@ static inline pgoff_t page_index(struct
 	return page->index;
 }
 
+extern pgoff_t __page_file_index(struct page *page);
+
+/*
+ * Return the file index of the page. Regular pagecache pages use ->index
+ * whereas swapcache pages use swp_offset(->private)
+ */
+static inline pgoff_t page_file_index(struct page *page)
+{
+	if (unlikely(PageSwapCache(page)))
+		return __page_file_index(page);
+
+	return page->index;
+}
+
 /*
  * The atomic page->_mapcount, like _count, starts from -1:
  * so that transitions both from it and to it can be tracked,
Index: mmotm/include/linux/pagemap.h
===================================================================
--- mmotm.orig/include/linux/pagemap.h
+++ mmotm/include/linux/pagemap.h
@@ -279,6 +279,11 @@ static inline loff_t page_offset(struct
 	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
 }
 
+static inline loff_t page_file_offset(struct page *page)
+{
+	return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
+}
+
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
Index: mmotm/mm/swapfile.c
===================================================================
--- mmotm.orig/mm/swapfile.c
+++ mmotm/mm/swapfile.c
@@ -2190,6 +2190,25 @@ struct swap_info_struct *page_swap_info(
 }
 
 /*
+ * out-of-line __page_file_ methods to avoid include hell.
+ */
+
+struct address_space *__page_file_mapping(struct page *page)
+{
+	VM_BUG_ON(!PageSwapCache(page));
+	return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
+
+pgoff_t __page_file_index(struct page *page)
+{
+	swp_entry_t swap = { .val = page_private(page) };
+	VM_BUG_ON(!PageSwapCache(page));
+	return swp_offset(swap);
+}
+EXPORT_SYMBOL_GPL(__page_file_index);
+
+/*
  * swap_lock prevents swap_map being freed. Don't grab an extra
  * reference on the swaphandle, it doesn't matter if it becomes unused.
  */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 25/31] nfs: teach the NFS client how to treat PG_swapcache pages
From: Suresh Jayaraman @ 2009-10-01 14:09 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Replace all relevant occurences of page->index and page->mapping in the NFS
client with the new page_file_index() and page_file_mapping() functions.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 fs/nfs/file.c     |    6 ++---
 fs/nfs/internal.h |    7 +++---
 fs/nfs/pagelist.c |    6 ++---
 fs/nfs/read.c     |    6 ++---
 fs/nfs/write.c    |   56 ++++++++++++++++++++++++++++--------------------------
 5 files changed, 43 insertions(+), 38 deletions(-)

Index: mmotm/fs/nfs/file.c
===================================================================
--- mmotm.orig/fs/nfs/file.c
+++ mmotm/fs/nfs/file.c
@@ -427,7 +427,7 @@ static void nfs_invalidate_page(struct p
 	if (offset != 0)
 		return;
 	/* Cancel any unstarted writes on this page */
-	nfs_wb_page_cancel(page->mapping->host, page);
+	nfs_wb_page_cancel(page_file_mapping(page)->host, page);
 
 	nfs_fscache_invalidate_page(page, page->mapping->host);
 }
@@ -458,7 +458,7 @@ static int nfs_release_page(struct page
  */
 static int nfs_launder_page(struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
@@ -505,7 +505,7 @@ static int nfs_vm_page_mkwrite(struct vm
 	nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page);
 
 	lock_page(page);
-	mapping = page->mapping;
+	mapping = page_file_mapping(page);
 	if (mapping != dentry->d_inode->i_mapping)
 		goto out_unlock;
 
Index: mmotm/fs/nfs/internal.h
===================================================================
--- mmotm.orig/fs/nfs/internal.h
+++ mmotm/fs/nfs/internal.h
@@ -346,13 +346,14 @@ void nfs_super_set_maxbytes(struct super
 static inline
 unsigned int nfs_page_length(struct page *page)
 {
-	loff_t i_size = i_size_read(page->mapping->host);
+	loff_t i_size = i_size_read(page_file_mapping(page)->host);
 
 	if (i_size > 0) {
+		pgoff_t page_index = page_file_index(page);
 		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-		if (page->index < end_index)
+		if (page_index < end_index)
 			return PAGE_CACHE_SIZE;
-		if (page->index == end_index)
+		if (page_index == end_index)
 			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
 	}
 	return 0;
Index: mmotm/fs/nfs/pagelist.c
===================================================================
--- mmotm.orig/fs/nfs/pagelist.c
+++ mmotm/fs/nfs/pagelist.c
@@ -76,11 +76,11 @@ nfs_create_request(struct nfs_open_conte
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
 	atomic_set(&req->wb_complete, 0);
-	req->wb_index	= page->index;
+	req->wb_index	= page_file_index(page);
 	page_cache_get(page);
 	BUG_ON(PagePrivate(page));
 	BUG_ON(!PageLocked(page));
-	BUG_ON(page->mapping->host != inode);
+	BUG_ON(page_file_mapping(page)->host != inode);
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
@@ -365,7 +365,7 @@ void nfs_pageio_cond_complete(struct nfs
  * nfs_scan_list - Scan a list for matching requests
  * @nfsi: NFS inode
  * @dst: Destination list
- * @idx_start: lower bound of page->index to scan
+ * @idx_start: lower bound of page_file_index(page) to scan
  * @npages: idx_start + npages sets the upper bound to scan.
  * @tag: tag to scan for
  *
Index: mmotm/fs/nfs/read.c
===================================================================
--- mmotm.orig/fs/nfs/read.c
+++ mmotm/fs/nfs/read.c
@@ -507,11 +507,11 @@ static const struct rpc_call_ops nfs_rea
 int nfs_readpage(struct file *file, struct page *page)
 {
 	struct nfs_open_context *ctx;
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	int		error;
 
 	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
-		page, PAGE_CACHE_SIZE, page->index);
+		page, PAGE_CACHE_SIZE, page_file_index(page));
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
@@ -565,7 +565,7 @@ static int
 readpage_async_filler(void *data, struct page *page)
 {
 	struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *new;
 	unsigned int len;
 	int error;
Index: mmotm/fs/nfs/write.c
===================================================================
--- mmotm.orig/fs/nfs/write.c
+++ mmotm/fs/nfs/write.c
@@ -121,7 +121,7 @@ static struct nfs_page *nfs_page_find_re
 
 static struct nfs_page *nfs_page_find_request(struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req = NULL;
 
 	spin_lock(&inode->i_lock);
@@ -133,16 +133,16 @@ static struct nfs_page *nfs_page_find_re
 /* Adjust the file length if we're writing beyond the end */
 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	loff_t end, i_size;
 	pgoff_t end_index;
 
 	spin_lock(&inode->i_lock);
 	i_size = i_size_read(inode);
 	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-	if (i_size > 0 && page->index < end_index)
+	if (i_size > 0 && page_file_index(page) < end_index)
 		goto out;
-	end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
+	end = page_file_offset(page) + ((loff_t)offset+count);
 	if (i_size >= end)
 		goto out;
 	i_size_write(inode, end);
@@ -155,7 +155,7 @@ out:
 static void nfs_set_pageerror(struct page *page)
 {
 	SetPageError(page);
-	nfs_zap_mapping(page->mapping->host, page->mapping);
+	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
 }
 
 /* We can set the PG_uptodate flag if we see that a write request
@@ -196,7 +196,7 @@ static int nfs_set_page_writeback(struct
 	int ret = test_set_page_writeback(page);
 
 	if (!ret) {
-		struct inode *inode = page->mapping->host;
+		struct inode *inode = page_file_mapping(page)->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
 		if (atomic_long_inc_return(&nfss->writeback) >
@@ -210,7 +210,7 @@ static int nfs_set_page_writeback(struct
 
 static void nfs_end_page_writeback(struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
@@ -225,7 +225,7 @@ static void nfs_end_page_writeback(struc
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 				struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req;
 	int ret;
 
@@ -268,12 +268,12 @@ static int nfs_page_async_flush(struct n
 
 static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	nfs_pageio_cond_complete(pgio, page->index);
+	nfs_pageio_cond_complete(pgio, page_file_index(page));
 	return nfs_page_async_flush(pgio, page);
 }
 
@@ -285,7 +285,8 @@ static int nfs_writepage_locked(struct p
 	struct nfs_pageio_descriptor pgio;
 	int err;
 
-	nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc));
+	nfs_pageio_init_write(&pgio, page_file_mapping(page)->host,
+			wb_priority(wbc));
 	err = nfs_do_writepage(page, wbc, &pgio);
 	nfs_pageio_complete(&pgio);
 	if (err < 0)
@@ -428,7 +429,8 @@ nfs_mark_request_commit(struct nfs_page
 			NFS_PAGE_TAG_COMMIT);
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+	inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
+			BDI_RECLAIMABLE);
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
@@ -439,7 +441,8 @@ nfs_clear_request_commit(struct nfs_page
 
 	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
 		dec_zone_page_state(page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(page_file_mapping(page)->backing_dev_info,
+				BDI_RECLAIMABLE);
 		return 1;
 	}
 	return 0;
@@ -551,7 +554,7 @@ nfs_need_commit(struct nfs_inode *nfsi)
  * nfs_scan_commit - Scan an inode for commit requests
  * @inode: NFS inode to scan
  * @dst: destination list
- * @idx_start: lower bound of page->index to scan.
+ * @idx_start: lower bound of page_file_index(page) to scan.
  * @npages: idx_start + npages sets the upper bound to scan.
  *
  * Moves requests from the inode's 'commit' request list.
@@ -664,7 +667,7 @@ out_err:
 static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
 		struct page *page, unsigned int offset, unsigned int bytes)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page	*req;
 	int error;
 
@@ -719,7 +722,7 @@ int nfs_flush_incompatible(struct file *
 		nfs_release_request(req);
 		if (!do_flush)
 			return 0;
-		status = nfs_wb_page(page->mapping->host, page);
+		status = nfs_wb_page(page_file_mapping(page)->host, page);
 	} while (status == 0);
 	return status;
 }
@@ -745,7 +748,7 @@ int nfs_updatepage(struct file *file, st
 		unsigned int offset, unsigned int count)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
-	struct inode	*inode = page->mapping->host;
+	struct inode	*inode = page_file_mapping(page)->host;
 	int		status = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -753,7 +756,7 @@ int nfs_updatepage(struct file *file, st
 	dprintk("NFS:       nfs_updatepage(%s/%s %d@%lld)\n",
 		file->f_path.dentry->d_parent->d_name.name,
 		file->f_path.dentry->d_name.name, count,
-		(long long)(page_offset(page) + offset));
+		(long long)(page_file_offset(page) + offset));
 
 	/* If we're not using byte range locks, and we know the page
 	 * is up to date, it may be more efficient to extend the write
@@ -1028,7 +1031,7 @@ static void nfs_writeback_release_partia
 	}
 
 	if (nfs_write_need_commit(data)) {
-		struct inode *inode = page->mapping->host;
+		struct inode *inode = page_file_mapping(page)->host;
 
 		spin_lock(&inode->i_lock);
 		if (test_bit(PG_NEED_RESCHED, &req->wb_flags)) {
@@ -1310,7 +1313,7 @@ nfs_commit_list(struct inode *inode, str
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
+		dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
 				BDI_RECLAIMABLE);
 		nfs_clear_page_tag_locked(req);
 	}
@@ -1500,10 +1503,10 @@ int nfs_wb_nocommit(struct inode *inode)
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 {
 	struct nfs_page *req;
-	loff_t range_start = page_offset(page);
+	loff_t range_start = page_file_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
+		.bdi = page_file_mapping(page)->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.range_start = range_start,
@@ -1536,7 +1539,8 @@ int nfs_wb_page_cancel(struct inode *ino
 	}
 	if (!PagePrivate(page))
 		return 0;
-	ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE);
+	ret = nfs_sync_mapping_wait(page_file_mapping(page), &wbc,
+			FLUSH_INVALIDATE);
 out:
 	return ret;
 }
@@ -1544,10 +1548,10 @@ out:
 static int nfs_wb_page_priority(struct inode *inode, struct page *page,
 				int how)
 {
-	loff_t range_start = page_offset(page);
+	loff_t range_start = page_file_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 	struct writeback_control wbc = {
-		.bdi = page->mapping->backing_dev_info,
+		.bdi = page_file_mapping(page)->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.range_start = range_start,
@@ -1562,7 +1566,7 @@ static int nfs_wb_page_priority(struct i
 				goto out_error;
 		} else if (!PagePrivate(page))
 			break;
-		ret = nfs_sync_mapping_wait(page->mapping, &wbc, how);
+		ret = nfs_sync_mapping_wait(page_file_mapping(page), &wbc, how);
 		if (ret < 0)
 			goto out_error;
 	} while (PagePrivate(page));

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 26/31] nfs: disable data cache revalidation for swapfiles
From: Suresh Jayaraman @ 2009-10-01 14:10 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Do as Trond suggested:
  http://lkml.org/lkml/2006/8/25/348

Disable NFS data cache revalidation on swap files since it doesn't really 
make sense to have other clients change the file while you are using it.

Thereby we can stop setting PG_private on swap pages, since there ought to
be no further races with invalidate_inode_pages2() to deal with.

And since we cannot set PG_private we cannot use page->private (which is
already used by PG_swapcache pages anyway) to store the nfs_page. Thus
augment the new nfs_page_find_request logic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 fs/nfs/inode.c |    6 ++++
 fs/nfs/write.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 13 deletions(-)

Index: mmotm/fs/nfs/inode.c
===================================================================
--- mmotm.orig/fs/nfs/inode.c
+++ mmotm/fs/nfs/inode.c
@@ -820,6 +820,12 @@ int nfs_revalidate_mapping_nolock(struct
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret = 0;
 
+	/*
+	 * swapfiles are not supposed to be shared.
+	 */
+	if (IS_SWAPFILE(inode))
+		goto out;
+
 	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
 			|| nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
 		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
Index: mmotm/fs/nfs/write.c
===================================================================
--- mmotm.orig/fs/nfs/write.c
+++ mmotm/fs/nfs/write.c
@@ -107,25 +107,64 @@ static void nfs_context_set_write_error(
 	set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 }
 
-static struct nfs_page *nfs_page_find_request_locked(struct page *page)
+static struct nfs_page *
+__nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page,
+		int get)
 {
 	struct nfs_page *req = NULL;
 
-	if (PagePrivate(page)) {
+	if (PagePrivate(page))
 		req = (struct nfs_page *)page_private(page);
-		if (req != NULL)
-			kref_get(&req->wb_kref);
-	}
+	else if (unlikely(PageSwapCache(page)))
+		req = radix_tree_lookup(&nfsi->nfs_page_tree,
+				page_file_index(page));
+
+	if (get && req)
+		kref_get(&req->wb_kref);
+
 	return req;
 }
 
+static inline struct nfs_page *
+nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
+{
+	return __nfs_page_find_request_locked(nfsi, page, 1);
+}
+
+static int __nfs_page_has_request(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_page *req = NULL;
+
+	spin_lock(&inode->i_lock);
+	req = __nfs_page_find_request_locked(NFS_I(inode), page, 0);
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * hole here plugged by the caller holding onto PG_locked
+	 */
+
+	return req != NULL;
+}
+
+static inline int nfs_page_has_request(struct page *page)
+{
+	if (PagePrivate(page))
+		return 1;
+
+	if (unlikely(PageSwapCache(page)))
+		return __nfs_page_has_request(page);
+
+	return 0;
+}
+
 static struct nfs_page *nfs_page_find_request(struct page *page)
 {
 	struct inode *inode = page_file_mapping(page)->host;
 	struct nfs_page *req = NULL;
 
 	spin_lock(&inode->i_lock);
-	req = nfs_page_find_request_locked(page);
+	req = nfs_page_find_request_locked(NFS_I(inode), page);
 	spin_unlock(&inode->i_lock);
 	return req;
 }
@@ -231,7 +270,7 @@ static int nfs_page_async_flush(struct n
 
 	spin_lock(&inode->i_lock);
 	for(;;) {
-		req = nfs_page_find_request_locked(page);
+		req = nfs_page_find_request_locked(NFS_I(inode), page);
 		if (req == NULL) {
 			spin_unlock(&inode->i_lock);
 			return 0;
@@ -370,8 +409,14 @@ static int nfs_inode_add_request(struct
 		if (nfs_have_delegation(inode, FMODE_WRITE))
 			nfsi->change_attr++;
 	}
-	SetPagePrivate(req->wb_page);
-	set_page_private(req->wb_page, (unsigned long)req);
+	/*
+	 * Swap-space should not get truncated. Hence no need to plug the race
+	 * with invalidate/truncate.
+	 */
+	if (likely(!PageSwapCache(req->wb_page))) {
+		SetPagePrivate(req->wb_page);
+		set_page_private(req->wb_page, (unsigned long)req);
+	}
 	nfsi->npages++;
 	kref_get(&req->wb_kref);
 	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
@@ -393,8 +438,10 @@ static void nfs_inode_remove_request(str
 	BUG_ON (!NFS_WBACK_BUSY(req));
 
 	spin_lock(&inode->i_lock);
-	set_page_private(req->wb_page, 0);
-	ClearPagePrivate(req->wb_page);
+	if (likely(!PageSwapCache(req->wb_page))) {
+		set_page_private(req->wb_page, 0);
+		ClearPagePrivate(req->wb_page);
+	}
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
 	if (!nfsi->npages) {
@@ -606,7 +653,7 @@ static struct nfs_page *nfs_try_to_updat
 	spin_lock(&inode->i_lock);
 
 	for (;;) {
-		req = nfs_page_find_request_locked(page);
+		req = nfs_page_find_request_locked(NFS_I(inode), page);
 		if (req == NULL)
 			goto out_unlock;
 
@@ -1537,7 +1584,7 @@ int nfs_wb_page_cancel(struct inode *ino
 		if (ret < 0)
 			goto out;
 	}
-	if (!PagePrivate(page))
+	if (!nfs_page_has_request(page))
 		return 0;
 	ret = nfs_sync_mapping_wait(page_file_mapping(page), &wbc,
 			FLUSH_INVALIDATE);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* nfs: [PATCH 27/31] enable swap on NFS
From: Suresh Jayaraman @ 2009-10-01 14:10 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Implement all the new swapfile a_ops for NFS. This will set the NFS socket to
SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC as well as reset
SOCK_MEMALLOC before engaging the protocol ->connect() method.

PF_MEMALLOC should allow the allocation of struct socket and related objects
and the early (re)setting of SOCK_MEMALLOC should allow us to receive the
packets required for the TCP connection buildup.

(swapping continues over a server reset during heavy network traffic)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 fs/nfs/Kconfig              |   10 ++++++
 fs/nfs/file.c               |   18 +++++++++++
 fs/nfs/write.c              |   22 +++++++++++++
 include/linux/nfs_fs.h      |    2 +
 include/linux/sunrpc/xprt.h |    5 ++-
 net/sunrpc/Kconfig          |    5 +++
 net/sunrpc/sched.c          |    9 ++++-
 net/sunrpc/xprtsock.c       |   70 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 138 insertions(+), 3 deletions(-)

Index: mmotm/fs/nfs/file.c
===================================================================
--- mmotm.orig/fs/nfs/file.c
+++ mmotm/fs/nfs/file.c
@@ -468,6 +468,18 @@ static int nfs_launder_page(struct page
 	return nfs_wb_page(inode, page);
 }
 
+#ifdef CONFIG_NFS_SWAP
+static int nfs_swapon(struct file *file)
+{
+	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+}
+
+static int nfs_swapoff(struct file *file)
+{
+	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+}
+#endif
+
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
@@ -480,6 +492,12 @@ const struct address_space_operations nf
 	.releasepage = nfs_release_page,
 	.direct_IO = nfs_direct_IO,
 	.launder_page = nfs_launder_page,
+#ifdef CONFIG_NFS_SWAP
+	.swapon = nfs_swapon,
+	.swapoff = nfs_swapoff,
+	.swap_out = nfs_swap_out,
+	.swap_in = nfs_readpage,
+#endif
 };
 
 /*
Index: mmotm/fs/nfs/write.c
===================================================================
--- mmotm.orig/fs/nfs/write.c
+++ mmotm/fs/nfs/write.c
@@ -344,6 +344,28 @@ int nfs_writepage(struct page *page, str
 	return ret;
 }
 
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+		unsigned int offset, unsigned int count);
+
+int nfs_swap_out(struct file *file, struct page *page,
+		 struct writeback_control *wbc)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	int status;
+
+	status = nfs_writepage_setup(ctx, page, 0, nfs_page_length(page));
+	if (status < 0) {
+		nfs_set_pageerror(page);
+		goto out;
+	}
+
+	status = nfs_writepage_locked(page, wbc);
+
+out:
+	unlock_page(page);
+	return status;
+}
+
 static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data)
 {
 	int ret;
Index: mmotm/include/linux/nfs_fs.h
===================================================================
--- mmotm.orig/include/linux/nfs_fs.h
+++ mmotm/include/linux/nfs_fs.h
@@ -473,6 +473,8 @@ extern int  nfs_writepages(struct addres
 extern int  nfs_flush_incompatible(struct file *file, struct page *page);
 extern int  nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern int  nfs_swap_out(struct file *file, struct page *page,
+			 struct writeback_control *wbc);
 
 /*
  * Try to write back everything synchronously (but check the
Index: mmotm/include/linux/sunrpc/xprt.h
===================================================================
--- mmotm.orig/include/linux/sunrpc/xprt.h
+++ mmotm/include/linux/sunrpc/xprt.h
@@ -153,7 +153,9 @@ struct rpc_xprt {
 	unsigned int		max_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
 	unsigned char		shutdown   : 1,	/* being shut down */
-				resvport   : 1; /* use a reserved port */
+				resvport   : 1, /* use a reserved port */
+				swapper    : 1; /* we're swapping over this
+						   transport */
 	unsigned int		bind_index;	/* bind function index */
 
 	/*
@@ -285,6 +287,7 @@ void			xprt_release_rqst_cong(struct rpc
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
+int			xs_swapper(struct rpc_xprt *xprt, int enable);
 
 /*
  * Reserved bit positions in xprt->state
Index: mmotm/net/sunrpc/sched.c
===================================================================
--- mmotm.orig/net/sunrpc/sched.c
+++ mmotm/net/sunrpc/sched.c
@@ -735,7 +735,10 @@ struct rpc_buffer {
 void *rpc_malloc(struct rpc_task *task, size_t size)
 {
 	struct rpc_buffer *buf;
-	gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+	gfp_t gfp = GFP_NOWAIT;
+
+	if (RPC_IS_SWAPPER(task))
+		gfp |= __GFP_MEMALLOC;
 
 	size += sizeof(struct rpc_buffer);
 	if (size <= RPC_BUFFER_MAXSIZE)
@@ -806,6 +809,8 @@ static void rpc_init_task(struct rpc_tas
 		kref_get(&task->tk_client->cl_kref);
 		if (task->tk_client->cl_softrtry)
 			task->tk_flags |= RPC_TASK_SOFT;
+		if (task->tk_client->cl_xprt->swapper)
+			task->tk_flags |= RPC_TASK_SWAPPER;
 	}
 
 	if (task->tk_ops->rpc_call_prepare != NULL)
@@ -831,7 +836,7 @@ static void rpc_init_task(struct rpc_tas
 static struct rpc_task *
 rpc_alloc_task(void)
 {
-	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
 }
 
 /*
Index: mmotm/net/sunrpc/xprtsock.c
===================================================================
--- mmotm.orig/net/sunrpc/xprtsock.c
+++ mmotm/net/sunrpc/xprtsock.c
@@ -1719,6 +1719,57 @@ static inline void xs_reclassify_socket6
 }
 #endif
 
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+
+	if (xprt->swapper)
+		sk_set_memalloc(transport->inet);
+}
+
+#define RPC_BUF_RESERVE_PAGES \
+	kmalloc_estimate_objs(sizeof(struct rpc_rqst), GFP_KERNEL, RPC_MAX_SLOT_TABLE)
+#define RPC_RESERVE_PAGES	(RPC_BUF_RESERVE_PAGES + TX_RESERVE_PAGES)
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+	int err = 0;
+
+	if (enable) {
+		/*
+		 * keep one extra sock reference so the reserve won't dip
+		 * when the socket gets reconnected.
+		 */
+		err = sk_adjust_memalloc(1, RPC_RESERVE_PAGES);
+		if (!err) {
+			xprt->swapper = 1;
+			xs_set_memalloc(xprt);
+		}
+	} else if (xprt->swapper) {
+		xprt->swapper = 0;
+		sk_clear_memalloc(transport->inet);
+		sk_adjust_memalloc(-1, -RPC_RESERVE_PAGES);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1743,6 +1794,8 @@ static void xs_udp_finish_connecting(str
 		transport->sock = sock;
 		transport->inet = sk;
 
+		xs_set_memalloc(xprt);
+
 		write_unlock_bh(&sk->sk_callback_lock);
 	}
 	xs_udp_do_set_buffer_size(xprt);
@@ -1760,11 +1813,15 @@ static void xs_udp_connect_worker4(struc
 		container_of(work, struct sock_xprt, connect_worker.work);
 	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock = transport->sock;
+	unsigned long pflags = current->flags;
 	int err, status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
+	if (xprt->swapper)
+		current->flags |= PF_MEMALLOC;
+
 	/* Start by resetting any existing state */
 	xs_reset_transport(transport);
 
@@ -1788,6 +1845,7 @@ static void xs_udp_connect_worker4(struc
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 /**
@@ -1802,11 +1860,15 @@ static void xs_udp_connect_worker6(struc
 		container_of(work, struct sock_xprt, connect_worker.work);
 	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock = transport->sock;
+	unsigned long pflags = current->flags;
 	int err, status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
+	if (xprt->swapper)
+		current->flags |= PF_MEMALLOC;
+
 	/* Start by resetting any existing state */
 	xs_reset_transport(transport);
 
@@ -1830,6 +1892,7 @@ static void xs_udp_connect_worker6(struc
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 /*
@@ -1904,6 +1967,8 @@ static int xs_tcp_finish_connecting(stru
 	if (!xprt_bound(xprt))
 		return -ENOTCONN;
 
+	xs_set_memalloc(xprt);
+
 	/* Tell the socket layer to start connecting... */
 	xprt->stat.connect_count++;
 	xprt->stat.connect_start = jiffies;
@@ -1924,11 +1989,15 @@ static void xs_tcp_setup_socket(struct r
 			struct sock_xprt *))
 {
 	struct socket *sock = transport->sock;
+	unsigned long pflags = current->flags;
 	int status = -EIO;
 
 	if (xprt->shutdown)
 		goto out;
 
+	if (xprt->swapper)
+		current->flags |= PF_MEMALLOC;
+
 	if (!sock) {
 		clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
 		sock = create_sock(xprt, transport);
@@ -1981,6 +2050,7 @@ out_eagain:
 out:
 	xprt_clear_connecting(xprt);
 	xprt_wake_pending_tasks(xprt, status);
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
Index: mmotm/fs/nfs/Kconfig
===================================================================
--- mmotm.orig/fs/nfs/Kconfig
+++ mmotm/fs/nfs/Kconfig
@@ -74,6 +74,16 @@ config NFS_V4
 
 	  If unsure, say N.
 
+config NFS_SWAP
+	bool "Provide swap over NFS support"
+	default n
+	depends on NFS_FS
+	select SUNRPC_SWAP
+	help
+	  This option enables swapon to work on files located on NFS mounts.
+
+	  For more details, see Documentation/network-swap.txt
+
 config NFS_V4_1
 	bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
 	depends on NFS_V4 && EXPERIMENTAL
Index: mmotm/net/sunrpc/Kconfig
===================================================================
--- mmotm.orig/net/sunrpc/Kconfig
+++ mmotm/net/sunrpc/Kconfig
@@ -17,6 +17,11 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
+config SUNRPC_SWAP
+	def_bool n
+	depends on SUNRPC
+	select NETVM
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 28/31] nfs: fix various memory recursions possible with swap over NFS.
From: Suresh Jayaraman @ 2009-10-01 14:10 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Peter Zijlstra <a.p.zijlstra@chello.nl> 

GFP_NOFS is _more_ permissive than GFP_NOIO in that it will initiate IO,
just not of any filesystem data.

The problem is that previuosly NOFS was correct because that avoids
recursion into the NFS code, it now is not, because also IO (swap) can
lead to this recursion.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 fs/nfs/pagelist.c |    2 +-
 fs/nfs/write.c    |    7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

Index: mmotm/fs/nfs/write.c
===================================================================
--- mmotm.orig/fs/nfs/write.c
+++ mmotm/fs/nfs/write.c
@@ -48,7 +48,7 @@ static mempool_t *nfs_commit_mempool;
 
 struct nfs_write_data *nfs_commitdata_alloc(void)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
+	struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -67,7 +67,7 @@ void nfs_commit_free(struct nfs_write_da
 
 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
-	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
+	struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -77,7 +77,8 @@ struct nfs_write_data *nfs_writedata_all
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *),
+					GFP_NOIO);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
Index: mmotm/fs/nfs/pagelist.c
===================================================================
--- mmotm.orig/fs/nfs/pagelist.c
+++ mmotm/fs/nfs/pagelist.c
@@ -27,7 +27,7 @@ static inline struct nfs_page *
 nfs_page_alloc(void)
 {
 	struct nfs_page	*p;
-	p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
+	p = kmem_cache_alloc(nfs_page_cachep, GFP_NOIO);
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->wb_list);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 29/31] Cope with racy nature of sync_page in swap_sync_page
From: Suresh Jayaraman @ 2009-10-01 14:10 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: NeilBrown <neilb@suse.de>

sync_page is called without that PageLock held.  This means that,
for example, PageSwapCache can be cleared at any time.
We need to be careful not to put much trust any any part of the page.

So allow page_swap_info to return NULL of the page is no longer
in a SwapCache, and handle the NULL gracefully in swap_sync_page.

No other calls need to handle the NULL as that all hold PageLock,
so PageSwapCache cannot be cleared by surprise.  Add a WARN_ON to 
document this fact and help find out if I am wrong.

Acked-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 mm/page_io.c  |    2 ++
 mm/swapfile.c |    8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

Index: mmotm/mm/page_io.c
===================================================================
--- mmotm.orig/mm/page_io.c
+++ mmotm/mm/page_io.c
@@ -137,6 +137,8 @@ void swap_sync_page(struct page *page)
 {
 	struct swap_info_struct *sis = page_swap_info(page);
 
+	if (!sis)
+		return;
 	if (sis->flags & SWP_FILE) {
 		struct address_space *mapping = sis->swap_file->f_mapping;
 
Index: mmotm/mm/swapfile.c
===================================================================
--- mmotm.orig/mm/swapfile.c
+++ mmotm/mm/swapfile.c
@@ -2185,7 +2185,13 @@ get_swap_info_struct(unsigned type)
 struct swap_info_struct *page_swap_info(struct page *page)
 {
 	swp_entry_t swap = { .val = page_private(page) };
-	BUG_ON(!PageSwapCache(page));
+	if (!PageSwapCache(page) || !swap.val) {
+		/* This should only happen from sync_page.
+		 * In other cases the page should be locked and
+		 * should be in a SwapCache
+		 */
+		return NULL;
+	}
 	return &swap_info[swp_type(swap)];
 }
 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 30/31] Fix use of uninitialized variable in cache_grow()
From: Suresh Jayaraman @ 2009-10-01 14:10 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

From: Miklos Szeredi <mszeredi@suse.cz>

This fixes a bug in reserve-slub.patch.

If cache_grow() was called with objp != NULL then the 'reserve' local
variable wasn't initialized. This resulted in ac->reserve being set to
a rubbish value.  Due to this in some circumstances huge amounts of
slab pages were allocated (due to slab_force_alloc() returning true),
which caused atomic page allocation failures and slowdown of the
system.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 mm/slab.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Index: mmotm/mm/slab.c
===================================================================
--- mmotm.orig/mm/slab.c
+++ mmotm/mm/slab.c
@@ -2760,7 +2760,7 @@ static int cache_grow(struct kmem_cache
 	size_t offset;
 	gfp_t local_flags;
 	struct kmem_list3 *l3;
-	int reserve;
+	int reserve = -1;
 
 	/*
 	 * Be lazy and only check for valid flags here,  keeping it out of the
@@ -2816,7 +2816,8 @@ static int cache_grow(struct kmem_cache
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
-	slab_set_reserve(cachep, reserve);
+	if (reserve != -1)
+		slab_set_reserve(cachep, reserve);
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH 31/31] swapfile: avoid NULL pointer dereference in swapon when s_bdev is NULL
From: Suresh Jayaraman @ 2009-10-01 14:11 UTC (permalink / raw)
  To: Linus Torvalds, Andrew Morton, linux-kernel, linux-mm
  Cc: netdev, Neil Brown, Miklos Szeredi, Wouter Verhelst,
	Peter Zijlstra, trond.myklebust, Suresh Jayaraman

While testing Swap over NFS patchset, I noticed an oops that was triggered
during swapon. Investigating further, the NULL pointer deference is due to the
SSD device check/optimization in the swapon code that assumes s_bdev is not
NULL.

inode->i_sb->s_bdev could be NULL in a few cases. For e.g. one such case is
loopback NFS mount, there could be others as well. Fix this by ensuring s_bdev
is not NULL before we try to deference s_bdev.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 mm/swapfile.c |   26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

Index: mmotm/mm/swapfile.c
===================================================================
--- mmotm.orig/mm/swapfile.c
+++ mmotm/mm/swapfile.c
@@ -160,10 +160,12 @@ static int discard_swap(struct swap_info
 				continue;
 		}
 
-		err = blkdev_issue_discard(si->bdev, start_block,
+		if (si->bdev) {
+			err = blkdev_issue_discard(si->bdev, start_block,
 						nr_blocks, GFP_KERNEL);
-		if (err)
-			break;
+			if (err)
+				break;
+		}
 
 		cond_resched();
 	}
@@ -199,9 +201,11 @@ static void discard_swap_cluster(struct
 
 			start_block <<= PAGE_SHIFT - 9;
 			nr_blocks <<= PAGE_SHIFT - 9;
-			if (blkdev_issue_discard(si->bdev, start_block,
+			if (si->bdev) {
+				if (blkdev_issue_discard(si->bdev, start_block,
 							nr_blocks, GFP_NOIO))
-				break;
+					break;
+			}
 		}
 
 		lh = se->list.next;
@@ -1991,12 +1995,14 @@ SYSCALL_DEFINE2(swapon, const char __use
 		goto bad_swap;
 	}
 
-	if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
-		p->flags |= SWP_SOLIDSTATE;
-		p->cluster_next = 1 + (random32() % p->highest_bit);
+	if (p->bdev) {
+		if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+			p->flags |= SWP_SOLIDSTATE;
+			p->cluster_next = 1 + (random32() % p->highest_bit);
+		}
+		if (discard_swap(p) == 0)
+			p->flags |= SWP_DISCARDABLE;
 	}
-	if (discard_swap(p) == 0)
-		p->flags |= SWP_DISCARDABLE;
 
 	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 0/2] cfg80211: firmware and hardware version
From: Kalle Valo @ 2009-10-01 14:18 UTC (permalink / raw)
  To: John W. Linville; +Cc: Luis R. Rodriguez, linux-wireless, netdev
In-Reply-To: <20091001011340.GA3123@tuxdriver.com>

"John W. Linville" <linville@tuxdriver.com> writes:

> On Fri, Sep 25, 2009 at 09:53:35AM -0700, Luis R. Rodriguez wrote:
>
>> So for Wake-on-Wireless I ran into the same, ethtool just did not
>> offer the same wake up events needed for wireless. I could have
>> technically used ethtool and expanded it to support wireless but it
>> just seemed dirty.
>> 
>> I agree that using ethtool seems overkill compared to the patches
>> you posted.
>
> I think you either overestimate the amount of trouble for implementing
> (minimal) ethtool support or you underestimate the amount of
> functionality available through that interface.

I'm not worried about the implementation complexity, and as your
patches show it was easy. My concern is the overall design for
wireless devices. Instead of using nl80211 for everything, with some
features we would use nl80211/iw and with some ethtool. That's just
confusing and I don't like that. I would prefer that nl80211 provides
everything, it makes things so much easier.

> That, or you just don't like using something named "eth"tool for
> wireless -- but hey, let's be honest about the frames we
> send/receive to/from the kernel... :-)

I don't have a problem with the name :) But ethernet is still so much
different from 802.11 that there isn't that much to share and we in
wireless will need different features.

One example is the hw version, ethtool only provides u32 to userspace
and moves the burden of translating hw id to the user. For us a string
is much better choise because when debuggin we need to often (or
always?) know the chip version.

But this is not something I will start fighting about. If you still
think that ethtool is the way to go, I'm perfectly fine with it.

>> The ethtool interface provides functionality for viewing and modifying
> eeprom contents, dumping registers, trigger self-tests, basic driver
> info, getting and setting message reporting levels, external card
> identification (hey, _could_ be useful!), and some other bits like
> checksum offload that might(?) be useful in the future.  I understand
> regarding the WoW vs. WoL issue but probably the answer is just to
> add a new method for WoW...?

I took a look at ethtool help output from debian unstable and I think
this is the set of features we can use in wireless:

        ethtool -i|--driver DEVNAME     Show driver information
        ethtool -d|--register-dump DEVNAME      Do a register dump
                [ raw on|off ]
                [ file FILENAME ]
        ethtool -e|--eeprom-dump DEVNAME        Do a EEPROM dump
                [ raw on|off ]
                [ offset N ]
                [ length N ]
        ethtool -E|--change-eeprom DEVNAME      Change bytes in device
        EEPROM
                [ magic N ]
                [ offset N ]
                [ value N ]
        ethtool -p|--identify DEVNAME   Show visible port
        identification (e.g. blinking)
               [ TIME-IN-SECONDS ]
        ethtool -t|--test DEVNAME       Execute adapter self test
               [ online | offline ]

But here are the features which I doubt we will ever use:

        ethtool -s|--change DEVNAME     Change generic options
                [ speed %%d ]
                [ duplex half|full ]
                [ port tp|aui|bnc|mii|fibre ]
                [ autoneg on|off ]
                [ advertise %%x ]
                [ phyad %%d ]
                [ xcvr internal|external ]
                [ wol p|u|m|b|a|g|s|d... ]
                [ sopass %%x:%%x:%%x:%%x:%%x:%%x ]
                [ msglvl %%d ] 
        ethtool -a|--show-pause DEVNAME Show pause options
        ethtool -A|--pause DEVNAME      Set pause options
                [ autoneg on|off ]
                [ rx on|off ]
                [ tx on|off ]
        ethtool -c|--show-coalesce DEVNAME      Show coalesce options
        ethtool -C|--coalesce DEVNAME   Set coalesce options
                [adaptive-rx on|off]
                [adaptive-tx on|off]
                [rx-usecs N]
                [rx-frames N]
                [rx-usecs-irq N]
                [rx-frames-irq N]
                [tx-usecs N]
                [tx-frames N]
                [tx-usecs-irq N]
                [tx-frames-irq N]
                [stats-block-usecs N]
                [pkt-rate-low N]
                [rx-usecs-low N]
                [rx-frames-low N]
                [tx-usecs-low N]
                [tx-frames-low N]
                [pkt-rate-high N]
                [rx-usecs-high N]
                [rx-frames-high N]
                [tx-usecs-high N]
                [tx-frames-high N]
                [sample-interval N]
        ethtool -g|--show-ring DEVNAME  Query RX/TX ring parameters
        ethtool -G|--set-ring DEVNAME   Set RX/TX ring parameters
                [ rx N ]
                [ rx-mini N ]
                [ rx-jumbo N ]
                [ tx N ]
        ethtool -k|--show-offload DEVNAME       Get protocol offload
                information
        ethtool -K|--offload DEVNAME    Set protocol offload
                [ rx on|off ]
                [ tx on|off ]
                [ sg on|off ]
                [ tso on|off ]
                [ ufo on|off ]
                [ gso on|off ]
                [ gro on|off ]
                [ lro on|off ]
        ethtool -r|--negotiate DEVNAME  Restart N-WAY negotation
        ethtool -n|--show-nfc DEVNAME   Show Rx network flow
                classificationoptions
                [ rx-flow-hash
                tcp4|udp4|ah4|sctp4|tcp6|udp6|ah6|sctp6 ]
        ethtool -N|--config-nfc DEVNAME Configure Rx network flow
                classification options
                [ rx-flow-hash tcp4|udp4|ah4|sctp4|tcp6|udp6|ah6|sctp6
                m|v|t|s|d|f|n|r... ]

-- 
Kalle Valo

^ permalink raw reply

* Re: [PATCH] net: fix NOHZ: local_softirq_pending 08
From: Kalle Valo @ 2009-10-01 14:24 UTC (permalink / raw)
  To: Michael Buesch
  Cc: David Miller, oliver-fJ+pQTUTwRTk1uMJSBkQmQ,
	johannes-cdvu00un1VgdHxzADdlk8Q, linville-2XuSBdqkA4R54TAoqtyWWQ,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <200910011604.42916.mb-fseUSCV1ubazQB+pC5nmwQ@public.gmane.org>

Michael Buesch <mb-fseUSCV1ubazQB+pC5nmwQ@public.gmane.org> writes:

> On Thursday 01 October 2009 01:33:33 David Miller wrote:
>
>> I'm not applying this until all of these details are sorted out 
>
> John, please apply my fix to wireless-testing to get rid of the
> regression. You can revert it later, if there's a better fix
> available.

I agree, please take Michael's patch. It's trivial to change mac80211
part whenever there's better support available.

But I don't think this is a regression because I see the bug also with
2.6.28, most probably it has been in mac80211 forever. But it's still
a bug which needs to be fixed.

-- 
Kalle Valo
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 3/3] at76c50x-usb: set firmware and hardware version in wiphy
From: Kalle Valo @ 2009-10-01 14:27 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: John W. Linville, linux-wireless, netdev, Luis R. Rodriguez
In-Reply-To: <1254360747.23350.2.camel@localhost>

Ben Hutchings <bhutchings@solarflare.com> writes:

> On Wed, 2009-09-30 at 21:19 -0400, John W. Linville wrote:
> [...]
>> +	len = sizeof(wiphy->fw_version);
>> +	snprintf(wiphy->fw_version, len, "%d.%d.%d-%d",
>> +		 priv->fw_version.major, priv->fw_version.minor,
>> +		 priv->fw_version.patch, priv->fw_version.build);
>> +	/* null terminate the strings in case they were truncated */
>> +	wiphy->fw_version[len - 1] = '\0';
> [...]
>
> This last statement is unnecessary; snprintf() always null-terminates
> (unless the length is zero).

Yes, the extra null termination is unnecessary. This was my mistake in
the first patchset I sent.

-- 
Kalle Valo

^ permalink raw reply

* [PATCH] net/ppp: fix comments - ppp_{sync,asynctty}_receive() may sleep
From: Tilman Schmidt @ 2009-10-01 14:28 UTC (permalink / raw)
  To: Alan Cox, Alan Cox, Paul Mackerras, linux-ppp
  Cc: David Miller, netdev, Jarek Poplawski, linux-kernel

The receive_buf methods of the N_PPP and N_SYNC_PPP line disciplines,
ppp_asynctty_receive() and ppp_sync_receive(), call tty_unthrottle()
which may sleep. Fix the comments claiming otherwise.

Impact: documentation
Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/net/ppp_async.c   |    5 +----
 drivers/net/ppp_synctty.c |    5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index 6de8399..30b1b33 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -337,10 +337,7 @@ ppp_asynctty_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 	return 0;
 }
 
-/*
- * This can now be called from hard interrupt level as well
- * as soft interrupt level or mainline.
- */
+/* May sleep, don't call from interrupt level or with interrupts disabled */
 static void
 ppp_asynctty_receive(struct tty_struct *tty, const unsigned char *buf,
 		  char *cflags, int count)
diff --git a/drivers/net/ppp_synctty.c b/drivers/net/ppp_synctty.c
index d2fa2db..c908b08 100644
--- a/drivers/net/ppp_synctty.c
+++ b/drivers/net/ppp_synctty.c
@@ -378,10 +378,7 @@ ppp_sync_poll(struct tty_struct *tty, struct file *file, poll_table *wait)
 	return 0;
 }
 
-/*
- * This can now be called from hard interrupt level as well
- * as soft interrupt level or mainline.
- */
+/* May sleep, don't call from interrupt level or with interrupts disabled */
 static void
 ppp_sync_receive(struct tty_struct *tty, const unsigned char *buf,
 		  char *cflags, int count)
-- 
1.6.2.1.214.ge986c

^ permalink raw reply related

* Re: [PATCHv2] IPv4 TCP fails to send window scale option when window scale is zero
From: Eric Dumazet @ 2009-10-01 14:30 UTC (permalink / raw)
  To: Gilad Ben-Yossef; +Cc: Netdev, Ori Finkalman, Ilpo Järvinen
In-Reply-To: <4AC478E9.5050605@codefidence.com>

Gilad Ben-Yossef a écrit :
> From: Ori Finkelman <ori@comsleep.com>
> 
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 5200aab..fcd278a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -361,6 +361,7 @@ static inline int tcp_urg_mode(const struct tcp_sock
> *tp)
> #define OPTION_SACK_ADVERTISE  (1 << 0)
> #define OPTION_TS              (1 << 1)
> #define OPTION_MD5             (1 << 2)
> +#define OPTION_WSCALE          (1 << 3)

I manually applied your patch and tested it.

So far so good, it works well.

But you'll need to find correct way to submit a patch so that your mailer doesnt
mangle the content.

File Documentation/email-clients.txt contains useful tips.


^ permalink raw reply

* RE: [PATCH 2.6.31-rc9] drivers/net: ks8851_mll ethernet network driver
From: Choi, David @ 2009-10-01 14:51 UTC (permalink / raw)
  To: David Miller; +Cc: greg, netdev, Li, Charles, Choi, jgarzik, shemminger
In-Reply-To: <20090930.200535.217323985.davem@davemloft.net>

Hello all,

I really appreciate all of you, especially for your guidance, patience
and professionalism.


Regards,
David J. Choi


-----Original Message-----
From: David Miller [mailto:davem@davemloft.net] 
Sent: Wednesday, September 30, 2009 8:06 PM
To: Choi, David
Cc: greg@kroah.com; netdev@vger.kernel.org; Li, Charles; Choi@kroah.com;
jgarzik@redhat.com; shemminger@vyatta.com
Subject: Re: [PATCH 2.6.31-rc9] drivers/net: ks8851_mll ethernet network
driver 

From: "Choi, David" <David.Choi@Micrel.Com>
Date: Fri, 25 Sep 2009 17:42:12 -0700

> Hello David Miller,
> 
> First of all, thank you for your feedback.  Here is my new patch.
> 
>>From : David J. Choi <david.choi@micrel.com>
> 
> This is the first registration of ks8851 network driver with 
> MLL(address/data multiplexed) interface.
> 
> Signed-off-by : David J. Choi <david.choi@micrel.com>

Applied, thanks.

^ permalink raw reply

* Re: [RFCv4 PATCH 2/2] net: Allow protocols to provide an unlocked_recvmsg socket method
From: Arnaldo Carvalho de Melo @ 2009-10-01 15:03 UTC (permalink / raw)
  To: Nir Tzachar; +Cc: Linux Networking Development Mailing List, Ziv Ayalon
In-Reply-To: <9b2db90b0910010249h182bf5d4sc7fdaea9e1345720@mail.gmail.com>

Em Thu, Oct 01, 2009 at 11:49:39AM +0200, Nir Tzachar escreveu:
> Hi Arnaldo
> 
> I have repeated the tests using net-next on top of linus' git tree (I
> hope I got it right..) and the patches you sent me. Things did not get
> better, and in most cases were even worse; the recvmmsg parts
> distinctly showed better throughput, but the latency has more than
> doubled.

Interesting... Now the socket lock is held in recvmmsg over all of
udp_recvmmsg for the batch size while when not using unlocked_recvmmsg,
so I think one needs to carefully set the timeout parameter. Perhaps
we'll need to do something like tcp_rcvmmsg does, that is, to call
release_sock + lock_sock to process the backlog in the middle of a
recvmmsg call.
 
> The simplest test of using a batch size of 1 results with recvmmsg's
> latency over 1000 micro, while regular recvmsg is around 450 micro.
> (note that to use 1 packet there is a small bug in the reg_recv which
> needs to be fixed. Namely, change ret = -1 to ret = 0). On the
> previous system config -- part 0001 of the patch, on top of 2.6.31 --
> the latency of a single packet batch is 370 micro.
> 
> So, there seems to be a regression with the kernel tree I am using, or
> with part 0002 of the path. I'll try running the net-next with only
> part 1 of the patch and report.

Yeah, trying with only part 0001 should get you back to the previous
results, but try using it in nonblocking mode and tweaking the timeout
parameter in recvmmsg.

- Arnaldo

^ permalink raw reply

* [PATCH] Use sk_mark for routing lookup in more places
From: Atis Elsts @ 2009-10-01 15:14 UTC (permalink / raw)
  To: Laszlo Attila Toth; +Cc: David S. Miller, netdev

This patch against v2.6.31 adds support for route lookup using sk_mark in some 
more places. The benefits from this patch are the following.
First, SO_MARK option now has effect on UDP sockets too.
Second, ip_queue_xmit() and inet_sk_rebuild_header() could fail to do routing 
lookup correctly if TCP sockets with SO_MARK were used.

Signed-off-by: Atis Elsts <atis@mikrotik.com>
---
 net/ipv4/af_inet.c   |    1 +
 net/ipv4/ip_output.c |    1 +
 net/ipv4/udp.c       |    1 +
 3 files changed, 3 insertions(+)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 566ea6c..7917963 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1103,6 +1103,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 {
        struct flowi fl = {
                .oif = sk->sk_bound_dev_if,
+               .mark = sk->sk_mark,
                .nl_u = {
                        .ip4_u = {
                                .daddr  = daddr,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7ffcd96..e088a97 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -335,6 +335,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
+                                           .mark = sk->sk_mark,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 80e3812..f90cdcc 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -688,6 +688,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,

        if (rt == NULL) {
                struct flowi fl = { .oif = ipc.oif,
+                                   .mark = sk->sk_mark,
                                    .nl_u = { .ip4_u =
                                              { .daddr = faddr,
                                                .saddr = saddr,

^ permalink raw reply related

* Re: [PATCH 0/2] cfg80211: firmware and hardware version
From: John W. Linville @ 2009-10-01 15:18 UTC (permalink / raw)
  To: Kalle Valo; +Cc: Luis R. Rodriguez, linux-wireless, netdev
In-Reply-To: <87fxa3qjt2.fsf@purkki.valot.fi>

On Thu, Oct 01, 2009 at 05:18:33PM +0300, Kalle Valo wrote:
> "John W. Linville" <linville@tuxdriver.com> writes:
> 
> > On Fri, Sep 25, 2009 at 09:53:35AM -0700, Luis R. Rodriguez wrote:
> >
> >> So for Wake-on-Wireless I ran into the same, ethtool just did not
> >> offer the same wake up events needed for wireless. I could have
> >> technically used ethtool and expanded it to support wireless but it
> >> just seemed dirty.
> >> 
> >> I agree that using ethtool seems overkill compared to the patches
> >> you posted.
> >
> > I think you either overestimate the amount of trouble for implementing
> > (minimal) ethtool support or you underestimate the amount of
> > functionality available through that interface.
> 
> I'm not worried about the implementation complexity, and as your
> patches show it was easy. My concern is the overall design for
> wireless devices. Instead of using nl80211 for everything, with some
> features we would use nl80211/iw and with some ethtool. That's just
> confusing and I don't like that. I would prefer that nl80211 provides
> everything, it makes things so much easier.

Well, if the hw/fw version numbers were the only thing then I'd
probably say it's not a big deal.  But having ethtool support is nice
in that it makes a familiar tool work for us.  Among other things,
this probably helps with some distro scripts that don't work quite
right without it.  Plus, there is lots of debugging stuff that could
be turned-on without having to write new tools.

I suppose I understand the 'one API' idea, but why duplicate
functionality?  Anyway, adding a couple of ioctl calls isn't a
big deal.  And don't forget, we are still network drivers too...

> > That, or you just don't like using something named "eth"tool for
> > wireless -- but hey, let's be honest about the frames we
> > send/receive to/from the kernel... :-)
> 
> I don't have a problem with the name :) But ethernet is still so much
> different from 802.11 that there isn't that much to share and we in
> wireless will need different features.
> 
> One example is the hw version, ethtool only provides u32 to userspace
> and moves the burden of translating hw id to the user. For us a string
> is much better choise because when debuggin we need to often (or
> always?) know the chip version.

Look at the way most drivers set the version (using each byte as a
field).  If you want prettier output, adding a parser to the userland
ethtool is fairly trivial.  It looks something like the patch below...

> But this is not something I will start fighting about. If you still
> think that ethtool is the way to go, I'm perfectly fine with it.
> 
> >> The ethtool interface provides functionality for viewing and modifying
> > eeprom contents, dumping registers, trigger self-tests, basic driver
> > info, getting and setting message reporting levels, external card
> > identification (hey, _could_ be useful!), and some other bits like
> > checksum offload that might(?) be useful in the future.  I understand
> > regarding the WoW vs. WoL issue but probably the answer is just to
> > add a new method for WoW...?
> 
> I took a look at ethtool help output from debian unstable and I think
> this is the set of features we can use in wireless:
> 
>         ethtool -i|--driver DEVNAME     Show driver information
>         ethtool -d|--register-dump DEVNAME      Do a register dump
>                 [ raw on|off ]
>                 [ file FILENAME ]
>         ethtool -e|--eeprom-dump DEVNAME        Do a EEPROM dump
>                 [ raw on|off ]
>                 [ offset N ]
>                 [ length N ]
>         ethtool -E|--change-eeprom DEVNAME      Change bytes in device
>         EEPROM
>                 [ magic N ]
>                 [ offset N ]
>                 [ value N ]
>         ethtool -p|--identify DEVNAME   Show visible port
>         identification (e.g. blinking)
>                [ TIME-IN-SECONDS ]
>         ethtool -t|--test DEVNAME       Execute adapter self test
>                [ online | offline ]
 
I agree with the above.

> But here are the features which I doubt we will ever use:
> 
>         ethtool -s|--change DEVNAME     Change generic options
>                 [ speed %%d ]
>                 [ duplex half|full ]
>                 [ port tp|aui|bnc|mii|fibre ]
>                 [ autoneg on|off ]
>                 [ advertise %%x ]
>                 [ phyad %%d ]
>                 [ xcvr internal|external ]
>                 [ wol p|u|m|b|a|g|s|d... ]
>                 [ sopass %%x:%%x:%%x:%%x:%%x:%%x ]
>                 [ msglvl %%d ] 
>         ethtool -a|--show-pause DEVNAME Show pause options
>         ethtool -A|--pause DEVNAME      Set pause options
>                 [ autoneg on|off ]
>                 [ rx on|off ]
>                 [ tx on|off ]

I agree that the above are ethernet-specific.

>         ethtool -c|--show-coalesce DEVNAME      Show coalesce options
>         ethtool -C|--coalesce DEVNAME   Set coalesce options
>                 [adaptive-rx on|off]
>                 [adaptive-tx on|off]
>                 [rx-usecs N]
>                 [rx-frames N]
>                 [rx-usecs-irq N]
>                 [rx-frames-irq N]
>                 [tx-usecs N]
>                 [tx-frames N]
>                 [tx-usecs-irq N]
>                 [tx-frames-irq N]
>                 [stats-block-usecs N]
>                 [pkt-rate-low N]
>                 [rx-usecs-low N]
>                 [rx-frames-low N]
>                 [tx-usecs-low N]
>                 [tx-frames-low N]
>                 [pkt-rate-high N]
>                 [rx-usecs-high N]
>                 [rx-frames-high N]
>                 [tx-usecs-high N]
>                 [tx-frames-high N]
>                 [sample-interval N]

These _could_ be useful if wireless becomes more
performance-oriented...

>         ethtool -g|--show-ring DEVNAME  Query RX/TX ring parameters
>         ethtool -G|--set-ring DEVNAME   Set RX/TX ring parameters
>                 [ rx N ]
>                 [ rx-mini N ]
>                 [ rx-jumbo N ]
>                 [ tx N ]

Wireless devices have ring buffers, no?

>         ethtool -k|--show-offload DEVNAME       Get protocol offload
>                 information
>         ethtool -K|--offload DEVNAME    Set protocol offload
>                 [ rx on|off ]
>                 [ tx on|off ]
>                 [ sg on|off ]
>                 [ tso on|off ]
>                 [ ufo on|off ]
>                 [ gso on|off ]
>                 [ gro on|off ]
>                 [ lro on|off ]

Again, if wireless devices become performance-oriented...

>         ethtool -r|--negotiate DEVNAME  Restart N-WAY negotation

Ethernet-specific...might could be overloaded for wireless to trigger
reassoc...?

>         ethtool -n|--show-nfc DEVNAME   Show Rx network flow
>                 classificationoptions
>                 [ rx-flow-hash
>                 tcp4|udp4|ah4|sctp4|tcp6|udp6|ah6|sctp6 ]
>         ethtool -N|--config-nfc DEVNAME Configure Rx network flow
>                 classification options
>                 [ rx-flow-hash tcp4|udp4|ah4|sctp4|tcp6|udp6|ah6|sctp6
>                 m|v|t|s|d|f|n|r... ]

Long-shot, but no reason it couldn't be used in wireless... :-)

Anyway, it doesn't really matter if we don't use the whole API -- many
older ethernet devices don't support all these features.  The point
is that the API exists and has some overlap with our needs.  It is a
driver-oriented API, with nitty-gritty stuff that need not clutter a
configuraiton API like cfg80211.  There is even the potential of us
adding our own extensions (e.g. WoW) that are also device-oriented.

Anyway, between the link detection and making distro scripts work
plus enabling a familiar tool for basic driver info I think this is
a win.  So much the better if some drivers move to ethtool for register
dumping, setting message verbosity, querying/changing eeprom values,
etc, etc...

John

P.S.  The aforementioned path for userland ethtool...(theorhetical,
not even compiled...)

>From aa92d32ac1cca57bdd3439013b0c7777bdf1217c Mon Sep 17 00:00:00 2001
From: John W. Linville <linville@tuxdriver.com>
Date: Thu, 1 Oct 2009 11:01:32 -0400
Subject: [PATCH] add support for at76c50x-usb driver.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
---
 Makefile.am    |    2 +-
 at76c50x-usb.c |   32 ++++++++++++++++++++++++++++++++
 ethtool.c      |    1 +
 3 files changed, 34 insertions(+), 1 deletions(-)
 create mode 100644 at76c50x-usb.c

diff --git a/Makefile.am b/Makefile.am
index eac65fe..a384949 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -8,7 +8,7 @@ ethtool_SOURCES = ethtool.c ethtool-copy.h ethtool-util.h	\
 		  amd8111e.c de2104x.c e100.c e1000.c igb.c	\
 		  fec_8xx.c ibm_emac.c ixgb.c ixgbe.c natsemi.c	\
 		  pcnet32.c realtek.c tg3.c marvell.c vioc.c	\
-		  smsc911x.c
+		  smsc911x.c at76c50x-usb.c
 
 dist-hook:
 	cp $(top_srcdir)/ethtool.spec $(distdir)
diff --git a/at76c50x-usb.c b/at76c50x-usb.c
new file mode 100644
index 0000000..295d1cb
--- /dev/null
+++ b/at76c50x-usb.c
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include "ethtool-util.h"
+
+static char hw_versions[] = {
+        "503_ISL3861",
+        "503_ISL3863",
+        "        503",
+        "    503_ACC",
+        "        505",
+        "   505_2958",
+        "       505A",
+        "     505AMX",
+};
+
+int
+at76c50x_usb_dump_regs(struct ethtool_drvinfo *info, struct ethtool_regs *regs)
+{
+	u8 version = (u8)(regs->version >> 24);
+	u8 rev_id = (u8)(regs->version);
+	char *ver_string;
+
+	if(version != 0)
+		return -1;
+
+	ver_string = hw_versions[rev_id];
+	fprintf(stdout,
+		"Hardware Version                    %s\n",
+		ver_string);
+
+	return 0;
+}
+
diff --git a/ethtool.c b/ethtool.c
index 0110682..7608750 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -1189,6 +1189,7 @@ static struct {
 	{ "sky2", sky2_dump_regs },
         { "vioc", vioc_dump_regs },
         { "smsc911x", smsc911x_dump_regs },
+        { "at76c50x-usb", at76c50x_usb_dump_regs },
 };
 
 static int dump_regs(struct ethtool_drvinfo *info, struct ethtool_regs *regs)
-- 
1.6.2.5
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox