From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: Linus Torvalds <torvalds@linux-foundation.org>,
Andrew Morton <akpm@linux-foundation.org>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
netdev@vger.kernel.org, trond.myklebust@fys.uio.no
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 18/33] netvm: INET reserves.
Date: Tue, 30 Oct 2007 17:04:19 +0100 [thread overview]
Message-ID: <20071030160914.070739000@chello.nl> (raw)
In-Reply-To: 20071030160401.296770000@chello.nl
[-- Attachment #1: netvm-reserve-inet.patch --]
[-- Type: text/plain, Size: 12290 bytes --]
Add reserves for INET.
The two big users seem to be the route cache and ip-fragment cache.
Reserve the route cache under generic RX reserve, its usage is bounded by
the high reclaim watermark, and thus does not need further accounting.
Reserve the ip-fragment caches under SKB data reserve, these add to the
SKB RX limit. By ensuring we can at least receive as much data as fits in
the reassembly line we avoid fragment attack deadlocks.
Use proc conv() routines to update these limits and return -ENOMEM to user
space.
Adds to the reserve tree:
total network reserve
network TX reserve
protocol TX pages
network RX reserve
+ IPv6 route cache
+ IPv4 route cache
SKB data reserve
+ IPv6 fragment cache
+ IPv4 fragment cache
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sysctl.h | 11 +++++++++++
kernel/sysctl.c | 8 ++++++--
net/ipv4/ip_fragment.c | 7 +++++++
net/ipv4/route.c | 30 +++++++++++++++++++++++++++++-
net/ipv4/sysctl_net_ipv4.c | 24 +++++++++++++++++++++++-
net/ipv6/reassembly.c | 7 +++++++
net/ipv6/route.c | 31 ++++++++++++++++++++++++++++++-
net/ipv6/sysctl_net_ipv6.c | 24 +++++++++++++++++++++++-
8 files changed, 136 insertions(+), 6 deletions(-)
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -18,6 +18,7 @@
#include <net/route.h>
#include <net/tcp.h>
#include <net/cipso_ipv4.h>
+#include <linux/reserve.h>
/* From af_inet.c */
extern int sysctl_ip_nonlocal_bind;
@@ -186,6 +187,27 @@ static int strategy_allowed_congestion_c
}
+extern struct mem_reserve ipv4_frag_reserve;
+
+static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ if (write) {
+ long value = *negp ? -*lvalp : *lvalp;
+ int err = mem_reserve_kmalloc_set(&ipv4_frag_reserve, value);
+ if (err)
+ return err;
+ }
+ return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_fragment_conv, NULL);
+}
+
ctl_table ipv4_table[] = {
{
.ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -291,7 +313,7 @@ ctl_table ipv4_table[] = {
.data = &sysctl_ipfrag_high_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = &proc_dointvec_fragment
},
{
.ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
Index: linux-2.6/net/ipv6/sysctl_net_ipv6.c
===================================================================
--- linux-2.6.orig/net/ipv6/sysctl_net_ipv6.c
+++ linux-2.6/net/ipv6/sysctl_net_ipv6.c
@@ -12,9 +12,31 @@
#include <net/ndisc.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
+#include <linux/reserve.h>
#ifdef CONFIG_SYSCTL
+extern struct mem_reserve ipv6_frag_reserve;
+
+static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ if (write) {
+ long value = *negp ? -*lvalp : *lvalp;
+ int err = mem_reserve_kmalloc_set(&ipv6_frag_reserve, value);
+ if (err)
+ return err;
+ }
+ return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_fragment_conv, NULL);
+}
+
static ctl_table ipv6_table[] = {
{
.ctl_name = NET_IPV6_ROUTE,
@@ -44,7 +66,7 @@ static ctl_table ipv6_table[] = {
.data = &sysctl_ip6frag_high_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec
+ .proc_handler = &proc_dointvec_fragment
},
{
.ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH,
Index: linux-2.6/net/ipv4/ip_fragment.c
===================================================================
--- linux-2.6.orig/net/ipv4/ip_fragment.c
+++ linux-2.6/net/ipv4/ip_fragment.c
@@ -43,6 +43,7 @@
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/reserve.h>
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
@@ -733,6 +734,8 @@ struct sk_buff *ip_defrag(struct sk_buff
return NULL;
}
+struct mem_reserve ipv4_frag_reserve;
+
void __init ipfrag_init(void)
{
ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
@@ -742,6 +745,10 @@ void __init ipfrag_init(void)
ipfrag_secret_timer.function = ipfrag_secret_rebuild;
ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
add_timer(&ipfrag_secret_timer);
+
+ mem_reserve_init(&ipv4_frag_reserve, "IPv4 fragment cache",
+ &net_skb_reserve);
+ mem_reserve_kmalloc_set(&ipv4_frag_reserve, sysctl_ipfrag_high_thresh);
}
EXPORT_SYMBOL(ip_defrag);
Index: linux-2.6/net/ipv6/reassembly.c
===================================================================
--- linux-2.6.orig/net/ipv6/reassembly.c
+++ linux-2.6/net/ipv6/reassembly.c
@@ -42,6 +42,7 @@
#include <linux/icmpv6.h>
#include <linux/random.h>
#include <linux/jhash.h>
+#include <linux/reserve.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -770,6 +771,8 @@ static struct inet6_protocol frag_protoc
.flags = INET6_PROTO_NOPOLICY,
};
+struct mem_reserve ipv6_frag_reserve;
+
void __init ipv6_frag_init(void)
{
if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0)
@@ -782,4 +785,8 @@ void __init ipv6_frag_init(void)
ip6_frag_secret_timer.function = ip6_frag_secret_rebuild;
ip6_frag_secret_timer.expires = jiffies + sysctl_ip6frag_secret_interval;
add_timer(&ip6_frag_secret_timer);
+
+ mem_reserve_init(&ipv6_frag_reserve, "IPv6 fragment cache",
+ &net_skb_reserve);
+ mem_reserve_kmalloc_set(&ipv6_frag_reserve, sysctl_ip6frag_high_thresh);
}
Index: linux-2.6/net/ipv4/route.c
===================================================================
--- linux-2.6.orig/net/ipv4/route.c
+++ linux-2.6/net/ipv4/route.c
@@ -108,6 +108,7 @@
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+#include <linux/reserve.h>
#define RT_FL_TOS(oldflp) \
((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
@@ -2698,6 +2699,28 @@ static int ipv4_sysctl_rtcache_flush_str
return 0;
}
+static struct mem_reserve ipv4_route_reserve;
+
+static int do_proc_dointvec_route_conv(int *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ if (write) {
+ long value = *negp ? -*lvalp : *lvalp;
+ int err = mem_reserve_kmem_cache_set(&ipv4_route_reserve,
+ ipv4_dst_ops.kmem_cachep, value);
+ if (err)
+ return err;
+ }
+ return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_route(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_route_conv, NULL);
+}
+
ctl_table ipv4_route_table[] = {
{
.ctl_name = NET_IPV4_ROUTE_FLUSH,
@@ -2740,7 +2763,7 @@ ctl_table ipv4_route_table[] = {
.data = &ip_rt_max_size,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_route,
},
{
/* Deprecated. Use gc_min_interval_ms */
@@ -2970,6 +2993,11 @@ int __init ip_rt_init(void)
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
+ mem_reserve_init(&ipv4_route_reserve, "IPv4 route cache",
+ &net_rx_reserve);
+ mem_reserve_kmem_cache_set(&ipv4_route_reserve,
+ ipv4_dst_ops.kmem_cachep, ip_rt_max_size);
+
devinet_init();
ip_fib_init();
Index: linux-2.6/net/ipv6/route.c
===================================================================
--- linux-2.6.orig/net/ipv6/route.c
+++ linux-2.6/net/ipv6/route.c
@@ -38,6 +38,7 @@
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
+#include <linux/reserve.h>
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
@@ -2454,6 +2455,28 @@ int ipv6_sysctl_rtcache_flush(ctl_table
return -EINVAL;
}
+static struct mem_reserve ipv6_route_reserve;
+
+static int do_proc_dointvec_route6_conv(int *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ if (write) {
+ long value = *negp ? -*lvalp : *lvalp;
+ int err = mem_reserve_kmem_cache_set(&ipv6_route_reserve,
+ ip6_dst_ops.kmem_cachep, value);
+ if (err)
+ return err;
+ }
+ return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_route6(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+ do_proc_dointvec_route6_conv, NULL);
+}
+
ctl_table ipv6_route_table[] = {
{
.procname = "flush",
@@ -2476,7 +2499,7 @@ ctl_table ipv6_route_table[] = {
.data = &ip6_rt_max_size,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_route6,
},
{
.ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL,
@@ -2564,6 +2587,12 @@ void __init ip6_route_init(void)
proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
+
+ mem_reserve_init(&ipv6_route_reserve, "IPv6 route cache",
+ &net_rx_reserve);
+ mem_reserve_kmem_cache_set(&ipv6_route_reserve,
+ ip6_dst_ops.kmem_cachep, ip6_rt_max_size);
+
#ifdef CONFIG_XFRM
xfrm6_init();
#endif
Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h
+++ linux-2.6/include/linux/sysctl.h
@@ -966,6 +966,17 @@ typedef int proc_handler (struct ctl_tab
extern int proc_dostring(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
+
+extern int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data);
+
+extern int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+ int write, void *data),
+ void *data);
+
extern int proc_dointvec(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
extern int proc_dointvec_bset(struct ctl_table *, int, struct file *,
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -1702,7 +1702,7 @@ int proc_dostring(struct ctl_table *tabl
}
-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
@@ -1721,6 +1721,8 @@ static int do_proc_dointvec_conv(int *ne
return 0;
}
+EXPORT_SYMBOL(do_proc_dointvec_conv);
+
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
int write, struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos,
@@ -1832,7 +1834,7 @@ static int __do_proc_dointvec(void *tbl_
#undef TMPBUFLEN
}
-static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
+int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
void __user *buffer, size_t *lenp, loff_t *ppos,
int (*conv)(int *negp, unsigned long *lvalp, int *valp,
int write, void *data),
@@ -1842,6 +1844,8 @@ static int do_proc_dointvec(struct ctl_t
buffer, lenp, ppos, conv, data);
}
+EXPORT_SYMBOL(do_proc_dointvec);
+
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
--
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2007-10-30 16:04 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-10-30 16:04 [PATCH 00/33] Swap over NFS -v14 Peter Zijlstra
2007-10-30 16:04 ` [PATCH 01/33] mm: gfp_to_alloc_flags() Peter Zijlstra
2007-10-30 16:04 ` [PATCH 02/33] mm: tag reseve pages Peter Zijlstra
2007-10-30 16:04 ` [PATCH 03/33] mm: slub: add knowledge of reserve pages Peter Zijlstra
2007-10-31 3:37 ` Nick Piggin
2007-10-31 10:42 ` Peter Zijlstra
2007-10-31 10:46 ` Nick Piggin
2007-10-31 12:17 ` Peter Zijlstra
2007-10-31 11:25 ` Nick Piggin
2007-10-31 12:54 ` Peter Zijlstra
2007-10-31 13:08 ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 04/33] mm: allow mempool to fall back to memalloc reserves Peter Zijlstra
2007-10-31 3:40 ` Nick Piggin
2007-10-30 16:04 ` [PATCH 05/33] mm: kmem_estimate_pages() Peter Zijlstra
2007-10-31 3:43 ` Nick Piggin
2007-10-31 10:42 ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context Peter Zijlstra
2007-10-31 3:51 ` Nick Piggin
2007-10-31 10:42 ` Peter Zijlstra
2007-10-31 10:49 ` Nick Piggin
2007-10-31 13:06 ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 07/33] mm: serialize access to min_free_kbytes Peter Zijlstra
2007-10-30 16:04 ` [PATCH 08/33] mm: emergency pool Peter Zijlstra
2007-10-30 16:04 ` [PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK Peter Zijlstra
2007-10-31 3:52 ` Nick Piggin
2007-10-31 10:45 ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 10/33] mm: __GFP_MEMALLOC Peter Zijlstra
2007-10-30 16:04 ` [PATCH 11/33] mm: memory reserve management Peter Zijlstra
2007-10-30 16:04 ` [PATCH 12/33] selinux: tag avc cache alloc as non-critical Peter Zijlstra
2007-10-30 16:04 ` [PATCH 13/33] net: wrap sk->sk_backlog_rcv() Peter Zijlstra
2007-10-30 16:04 ` [PATCH 14/33] net: packet split receive api Peter Zijlstra
2007-10-30 16:04 ` [PATCH 15/33] net: sk_allocation() - concentrate socket related allocations Peter Zijlstra
2007-10-30 16:04 ` [PATCH 16/33] netvm: network reserve infrastructure Peter Zijlstra
2007-10-30 16:04 ` [PATCH 17/33] sysctl: propagate conv errors Peter Zijlstra
2007-10-30 16:04 ` Peter Zijlstra [this message]
2007-10-30 16:04 ` [PATCH 19/33] netvm: hook skb allocation to reserves Peter Zijlstra
2007-10-30 16:04 ` [PATCH 20/33] netvm: filter emergency skbs Peter Zijlstra
2007-10-30 16:04 ` [PATCH 21/33] netvm: prevent a TCP specific deadlock Peter Zijlstra
2007-10-30 16:04 ` [PATCH 22/33] netfilter: NF_QUEUE vs emergency skbs Peter Zijlstra
2007-10-30 16:04 ` [PATCH 23/33] netvm: skb processing Peter Zijlstra
2007-10-30 21:26 ` Stephen Hemminger
2007-10-30 21:44 ` Peter Zijlstra
2007-10-30 21:26 ` Stephen Hemminger
2007-10-30 16:04 ` [PATCH 24/33] mm: prepare swap entry methods for use in page methods Peter Zijlstra
2007-10-30 16:04 ` [PATCH 25/33] mm: add support for non block device backed swap files Peter Zijlstra
2007-10-30 16:04 ` [PATCH 26/33] mm: methods for teaching filesystems about PG_swapcache pages Peter Zijlstra
2007-10-30 16:04 ` [PATCH 27/33] nfs: remove mempools Peter Zijlstra
2007-10-30 16:04 ` [PATCH 28/33] nfs: teach the NFS client how to treat PG_swapcache pages Peter Zijlstra
2007-10-31 8:52 ` Christoph Hellwig
2007-10-30 16:04 ` [PATCH 29/33] nfs: disable data cache revalidation for swapfiles Peter Zijlstra
2007-10-30 16:04 ` [PATCH 30/33] nfs: swap vs nfs_writepage Peter Zijlstra
2007-10-30 16:04 ` [PATCH 31/33] nfs: enable swap on NFS Peter Zijlstra
2007-10-30 16:04 ` [PATCH 32/33] nfs: fix various memory recursions possible with swap over NFS Peter Zijlstra
2007-10-30 16:04 ` [PATCH 33/33] nfs: do not warn on radix tree node allocation failures Peter Zijlstra
2007-10-31 3:26 ` [PATCH 00/33] Swap over NFS -v14 Nick Piggin
2007-10-31 4:37 ` David Miller, Nick Piggin
2007-10-31 4:04 ` Nick Piggin
2007-10-31 14:03 ` Byron Stanoszek
2007-10-31 8:50 ` Christoph Hellwig
2007-10-31 10:56 ` Peter Zijlstra
2007-10-31 11:18 ` NBD was " Pavel Machek
2007-10-31 11:24 ` Peter Zijlstra
2007-10-31 14:54 ` Mike Snitzer
2007-10-31 16:31 ` Evgeniy Polyakov
2007-10-31 9:53 ` Peter Zijlstra
2007-10-31 11:27 ` Peter Zijlstra
2007-10-31 12:16 ` Jeff Garzik
2007-10-31 12:56 ` Peter Zijlstra
2007-10-31 13:18 ` Arnaldo Carvalho de Melo
2007-10-31 13:44 ` Gregory Haskins
2007-11-02 8:54 ` Pavel Machek
2007-11-18 18:09 ` Robin Humble
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20071030160914.070739000@chello.nl \
--to=a.p.zijlstra@chello.nl \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=netdev@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=trond.myklebust@fys.uio.no \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).