Re: [RFC][PATCH 6/6] net: vm deadlock avoidance core

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Re: [RFC][PATCH 6/6] net: vm deadlock avoidance core
       [not found] ` <20061130101922.328418000@chello.nl>
@ 2006-11-30 12:04   ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 12:04 UTC (permalink / raw)
  To: netdev; +Cc: linux-mm, David Miller

Oops, it seems I missed some chunks.
New patch attached.

---
Subject: net: vm deadlock avoidance core

In order to provide robust networked block devices there must be a guarantee
of progress. That is, the block device must never stall because of (physical)
OOM, because the device itself might be needed to get out of it (reclaim).

This means that the device queue must always be unpluggable; this in turn means
that it must always find enough memory to build/send packets over the network
_and_ receive (level 7) ACKs for those packets.

The network stack has a huge capacity for buffering packets; waiting for 
user-space to read them. There is a practical limit imposed to avoid DoS 
scenarios. These two things make for a deadlock; what if the receive limit is
reached and all packets are buffered in non-critical sockets (those not serving
the network block device waiting for an ACK to free a page). 

Memory pressure will add to that; what if there is simply no memory left to
receive packets in.

This patch provides a service to register sockets as critical; SOCK_VMIO
is a promise the socket will never block on receive. Along with a memory
reserve that will service a limited number of packets this can guarantee a
limited service to these critical sockets.

When we make sure that packets allocated from the reserve will only service
critical sockets we will not lose the memory and can guarantee progress.

(Note on the name SOCK_VMIO; the basic problem is a circular dependency between
the network and virtual memory subsystems which needs to be broken. This does
make VM network IO - and only VM network IO - special, it does not generalize)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/skbuff.h     |   13 +++-
 include/net/sock.h         |   36 +++++++++++++
 net/core/dev.c             |   40 +++++++++++++-
 net/core/skbuff.c          |   51 ++++++++++++++++--
 net/core/sock.c            |  121 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_fragment.c     |    1 
 net/ipv4/ipmr.c            |    4 +
 net/ipv4/route.c           |   15 +++++
 net/ipv4/sysctl_net_ipv4.c |   14 ++++-
 net/ipv4/tcp_ipv4.c        |   27 +++++++++-
 net/ipv6/reassembly.c      |    1 
 net/ipv6/route.c           |   15 +++++
 net/ipv6/sysctl_net_ipv6.c |    6 +-
 net/ipv6/tcp_ipv6.c        |   27 +++++++++-
 net/netfilter/core.c       |    5 +
 security/selinux/avc.c     |    2 
 16 files changed, 355 insertions(+), 23 deletions(-)

Index: linux-2.6-git/include/linux/skbuff.h
===================================================================
--- linux-2.6-git.orig/include/linux/skbuff.h	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/include/linux/skbuff.h	2006-11-30 11:37:51.000000000 +0100
@@ -283,7 +283,8 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1;
+				ipvs_property:1,
+				emergency:1;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
@@ -328,10 +329,13 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+#define SKB_ALLOC_FCLONE	0x01
+#define SKB_ALLOC_RX		0x02
+
 extern void kfree_skb(struct sk_buff *skb);
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   gfp_t priority, int fclone);
+				   gfp_t priority, int flags);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
@@ -341,7 +345,7 @@ static inline struct sk_buff *alloc_skb(
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
 					       gfp_t priority)
 {
-	return __alloc_skb(size, priority, 1);
+	return __alloc_skb(size, priority, SKB_ALLOC_FCLONE);
 }
 
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
@@ -1102,7 +1106,8 @@ static inline void __skb_queue_purge(str
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
 					      gfp_t gfp_mask)
 {
-	struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	struct sk_buff *skb =
+		__alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX);
 	if (likely(skb))
 		skb_reserve(skb, NET_SKB_PAD);
 	return skb;
Index: linux-2.6-git/include/net/sock.h
===================================================================
--- linux-2.6-git.orig/include/net/sock.h	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/include/net/sock.h	2006-11-30 11:37:51.000000000 +0100
@@ -391,6 +391,7 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_VMIO, /* the VM depends on us - make sure we're serviced */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -413,6 +414,40 @@ static inline int sock_flag(struct sock 
 	return test_bit(flag, &sk->sk_flags);
 }
 
+static inline int sk_has_vmio(struct sock *sk)
+{
+	return sock_flag(sk, SOCK_VMIO);
+}
+
+#define MAX_PAGES_PER_SKB 3
+#define MAX_FRAGMENTS ((65536 + 1500 - 1) / 1500)
+/*
+ * Guestimate the per request queue TX upper bound.
+ */
+#define TX_RESERVE_PAGES \
+	(4 * MAX_FRAGMENTS * MAX_PAGES_PER_SKB)
+
+extern atomic_t vmio_socks;
+extern atomic_t emergency_rx_skbs;
+
+static inline int sk_vmio_socks(void)
+{
+	return atomic_read(&vmio_socks);
+}
+
+extern int sk_emergency_skb_get(void);
+
+static inline void sk_emergency_skb_put(void)
+{
+	return atomic_dec(&emergency_rx_skbs);
+}
+
+extern void sk_adjust_memalloc(int socks, int tx_reserve_pages);
+extern void ipfrag_add_memory(int ipfrag_reserve);
+extern void iprt_add_memory(int rt_reserve);
+extern int sk_set_vmio(struct sock *sk);
+extern int sk_clear_vmio(struct sock *sk);
+
 static inline void sk_acceptq_removed(struct sock *sk)
 {
 	sk->sk_ack_backlog--;
@@ -720,6 +755,7 @@ static inline void sk_stream_writequeue_
 
 static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
+	// XXX: skb->emergency stuff
 	return (int)skb->truesize <= sk->sk_forward_alloc ||
 		sk_stream_mem_schedule(sk, skb->truesize, 1);
 }
Index: linux-2.6-git/net/core/dev.c
===================================================================
--- linux-2.6-git.orig/net/core/dev.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/core/dev.c	2006-11-30 11:37:51.000000000 +0100
@@ -1768,10 +1768,23 @@ int netif_receive_skb(struct sk_buff *sk
 	struct net_device *orig_dev;
 	int ret = NET_RX_DROP;
 	unsigned short type;
+	unsigned long pflags = current->flags;
+
+	/* Emergency skb are special, they should
+	 *  - be delivered to SOCK_VMIO sockets only
+	 *  - stay away from userspace
+	 *  - have bounded memory usage
+	 *
+	 * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
+	 * This saves us from propagating the allocation context down to all
+	 * allocation sites.
+	 */
+	if (unlikely(skb->emergency))
+		current->flags |= PF_MEMALLOC;
 
 	/* if we've gotten here through NAPI, check netpoll */
 	if (skb->dev->poll && netpoll_rx(skb))
-		return NET_RX_DROP;
+		goto out;
 
 	if (!skb->tstamp.off_sec)
 		net_timestamp(skb);
@@ -1782,7 +1795,7 @@ int netif_receive_skb(struct sk_buff *sk
 	orig_dev = skb_bond(skb);
 
 	if (!orig_dev)
-		return NET_RX_DROP;
+		goto out;
 
 	__get_cpu_var(netdev_rx_stat).total++;
 
@@ -1799,6 +1812,8 @@ int netif_receive_skb(struct sk_buff *sk
 		goto ncls;
 	}
 #endif
+	if (unlikely(skb->emergency))
+		goto skip_taps;
 
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (!ptype->dev || ptype->dev == skb->dev) {
@@ -1808,6 +1823,7 @@ int netif_receive_skb(struct sk_buff *sk
 		}
 	}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	if (pt_prev) {
 		ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -1820,17 +1836,28 @@ int netif_receive_skb(struct sk_buff *sk
 
 	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
 		kfree_skb(skb);
-		goto out;
+		goto unlock;
 	}
 
 	skb->tc_verd = 0;
 ncls:
 #endif
 
+	if (unlikely(skb->emergency))
+		switch(skb->protocol) {
+			case __constant_htons(ETH_P_ARP):
+			case __constant_htons(ETH_P_IP):
+			case __constant_htons(ETH_P_IPV6):
+				break;
+
+			default:
+				goto drop;
+		}
+
 	handle_diverter(skb);
 
 	if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
-		goto out;
+		goto unlock;
 
 	type = skb->protocol;
 	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
@@ -1845,6 +1872,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -1852,8 +1880,10 @@ ncls:
 		ret = NET_RX_DROP;
 	}
 
-out:
+unlock:
 	rcu_read_unlock();
+out:
+	current->flags = pflags;
 	return ret;
 }
 
Index: linux-2.6-git/net/core/skbuff.c
===================================================================
--- linux-2.6-git.orig/net/core/skbuff.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/core/skbuff.c	2006-11-30 11:37:51.000000000 +0100
@@ -139,29 +139,34 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone)
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags)
 {
 	kmem_cache_t *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	int emergency = 0;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	size = SKB_DATA_ALIGN(size);
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+	if (flags & SKB_ALLOC_RX)
+		gfp_mask |= __GFP_NOMEMALLOC|__GFP_NOWARN;
 
+retry_alloc:
 	/* Get the HEAD */
 	skb = kmem_cache_alloc(cache, gfp_mask & ~__GFP_DMA);
 	if (!skb)
-		goto out;
+		goto noskb;
 
 	/* Get the DATA. Size must match skb_add_mtu(). */
-	size = SKB_DATA_ALIGN(size);
 	data = kmalloc_track_caller(size + sizeof(struct skb_shared_info),
 			gfp_mask);
 	if (!data)
 		goto nodata;
 
 	memset(skb, 0, offsetof(struct sk_buff, truesize));
+	skb->emergency = emergency;
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -178,7 +183,7 @@ struct sk_buff *__alloc_skb(unsigned int
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;
 
-	if (fclone) {
+	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
 
@@ -186,12 +191,29 @@ struct sk_buff *__alloc_skb(unsigned int
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->emergency = skb->emergency;
 	}
 out:
 	return skb;
+
 nodata:
 	kmem_cache_free(cache, skb);
 	skb = NULL;
+noskb:
+	/* Attempt emergency allocation when RX skb. */
+	if (likely(!(flags & SKB_ALLOC_RX) || !sk_vmio_socks()))
+		goto out;
+
+	if (!emergency) {
+		if (sk_emergency_skb_get()) {
+			gfp_mask &= ~(__GFP_NOMEMALLOC|__GFP_NOWARN);
+			gfp_mask |= __GFP_EMERGENCY;
+			emergency = 1;
+			goto retry_alloc;
+		}
+	} else
+		sk_emergency_skb_put();
+
 	goto out;
 }
 
@@ -268,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struc
 {
 	struct sk_buff *skb;
 
-	skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX);
 	if (likely(skb)) {
 		skb_reserve(skb, NET_SKB_PAD);
 		skb->dev = dev;
@@ -317,6 +339,8 @@ static void skb_release_data(struct sk_b
 			skb_drop_fraglist(skb);
 
 		kfree(skb->head);
+		if (unlikely(skb->emergency))
+			sk_emergency_skb_put();
 	}
 }
 
@@ -437,6 +461,9 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		if (unlikely(skb->emergency))
+			gfp_mask |= __GFP_EMERGENCY;
+
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
@@ -471,6 +498,7 @@ struct sk_buff *skb_clone(struct sk_buff
 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
 	C(ipvs_property);
 #endif
+	C(emergency);
 	C(protocol);
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
@@ -686,12 +714,19 @@ int pskb_expand_head(struct sk_buff *skb
 	u8 *data;
 	int size = nhead + (skb->end - skb->head) + ntail;
 	long off;
+	int emergency = 0;
 
 	if (skb_shared(skb))
 		BUG();
 
 	size = SKB_DATA_ALIGN(size);
 
+	if (unlikely(skb->emergency) && sk_emergency_skb_get()) {
+		gfp_mask |= __GFP_EMERGENCY;
+		emergency = 1;
+	} else
+		gfp_mask |= __GFP_NOMEMALLOC;
+
 	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
 	if (!data)
 		goto nodata;
@@ -724,6 +759,8 @@ int pskb_expand_head(struct sk_buff *skb
 	return 0;
 
 nodata:
+	if (unlikely(emergency))
+		sk_emergency_skb_put();
 	return -ENOMEM;
 }
 
Index: linux-2.6-git/net/core/sock.c
===================================================================
--- linux-2.6-git.orig/net/core/sock.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/core/sock.c	2006-11-30 11:37:51.000000000 +0100
@@ -195,6 +195,120 @@ __u32 sysctl_rmem_default __read_mostly 
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+static DEFINE_SPINLOCK(memalloc_lock);
+static int rx_net_reserve;
+
+atomic_t vmio_socks;
+atomic_t emergency_rx_skbs;
+
+static int ipfrag_threshold;
+
+#define ipfrag_mtu()	(1500) /* XXX: should be smallest mtu system wide */
+#define ipfrag_skbs()	(ipfrag_threshold / ipfrag_mtu())
+#define ipfrag_pages()	(ipfrag_threshold / (ipfrag_mtu() * (PAGE_SIZE / ipfrag_mtu())))
+
+static int iprt_pages;
+
+/*
+ * is there room for another emergency skb.
+ */
+int sk_emergency_skb_get(void)
+{
+	int nr = atomic_add_return(1, &emergency_rx_skbs);
+	int thresh = (3 * ipfrag_skbs()) / 2;
+	if (nr < thresh)
+		return 1;
+
+	atomic_dec(&emergency_rx_skbs);
+	return 0;
+}
+
+/**
+ *	sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
+ *	@socks: number of new %SOCK_VMIO sockets
+ *	@tx_reserve_pages: number of pages to (un)reserve for TX
+ *
+ *	This function adjusts the memalloc reserve based on system demand.
+ *	The RX reserve is a limit, and only added once, not for each socket.
+ *
+ *	NOTE:
+ *	   @tx_reserve_pages is an upper-bound of memory used for TX hence
+ *	   we need not account the pages like we do for RX pages.
+ */
+void sk_adjust_memalloc(int socks, int tx_reserve_pages)
+{
+	unsigned long flags;
+	int reserve = tx_reserve_pages;
+	int nr_socks;
+
+	spin_lock_irqsave(&memalloc_lock, flags);
+	nr_socks = atomic_add_return(socks, &vmio_socks);
+	BUG_ON(nr_socks < 0);
+
+	if (nr_socks) {
+		int rx_pages = 2 * ipfrag_pages() + iprt_pages;
+		reserve += rx_pages - rx_net_reserve;
+		rx_net_reserve = rx_pages;
+	} else {
+		reserve -= rx_net_reserve;
+		rx_net_reserve = 0;
+	}
+
+	if (reserve)
+		adjust_memalloc_reserve(reserve);
+	spin_unlock_irqrestore(&memalloc_lock, flags);
+}
+EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
+
+/*
+ * tiny helper function to track the total ipfragment memory
+ * needed because of modular ipv6
+ */
+void ipfrag_add_memory(int frags)
+{
+	ipfrag_threshold += frags;
+	sk_adjust_memalloc(0, 0);
+}
+EXPORT_SYMBOL_GPL(ipfrag_add_memory);
+
+void iprt_add_memory(int pages)
+{
+	iprt_pages += pages;
+	sk_adjust_memalloc(0, 0);
+}
+EXPORT_SYMBOL_GPL(iprt_add_memory);
+
+/**
+ *	sk_set_vmio - sets %SOCK_VMIO
+ *	@sk: socket to set it on
+ *
+ *	Set %SOCK_VMIO on a socket and increase the memalloc reserve
+ *	accordingly.
+ */
+int sk_set_vmio(struct sock *sk)
+{
+	int set = sock_flag(sk, SOCK_VMIO);
+	if (!set) {
+		sk_adjust_memalloc(1, 0);
+		sock_set_flag(sk, SOCK_VMIO);
+		sk->sk_allocation |= __GFP_EMERGENCY;
+	}
+	return !set;
+}
+EXPORT_SYMBOL_GPL(sk_set_vmio);
+
+int sk_clear_vmio(struct sock *sk)
+{
+	int set = sock_flag(sk, SOCK_VMIO);
+	if (set) {
+		sk_adjust_memalloc(-1, 0);
+		sock_reset_flag(sk, SOCK_VMIO);
+		sk->sk_allocation &= ~__GFP_EMERGENCY;
+	}
+	return set;
+}
+EXPORT_SYMBOL_GPL(sk_clear_vmio);
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -238,6 +352,12 @@ int sock_queue_rcv_skb(struct sock *sk, 
 	int err = 0;
 	int skb_len;
 
+	if (unlikely(skb->emergency)) {
+		if (!sk_has_vmio(sk)) {
+			err = -ENOMEM;
+			goto out;
+		}
+	} else
 	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
 	 */
@@ -877,6 +997,7 @@ void sk_free(struct sock *sk)
 	struct sk_filter *filter;
 	struct module *owner = sk->sk_prot_creator->owner;
 
+	sk_clear_vmio(sk);
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
 
Index: linux-2.6-git/net/ipv4/ipmr.c
===================================================================
--- linux-2.6-git.orig/net/ipv4/ipmr.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv4/ipmr.c	2006-11-30 11:37:51.000000000 +0100
@@ -1340,6 +1340,9 @@ int ip_mr_input(struct sk_buff *skb)
 	struct mfc_cache *cache;
 	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
 
+	if (unlikely(skb->emergency))
+		goto drop;
+
 	/* Packet is looped back after forward, it should not be
 	   forwarded second time, but still can be delivered locally.
 	 */
@@ -1411,6 +1414,7 @@ int ip_mr_input(struct sk_buff *skb)
 dont_forward:
 	if (local)
 		return ip_local_deliver(skb);
+drop:
 	kfree_skb(skb);
 	return 0;
 }
Index: linux-2.6-git/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6-git.orig/net/ipv4/sysctl_net_ipv4.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv4/sysctl_net_ipv4.c	2006-11-30 11:37:51.000000000 +0100
@@ -18,6 +18,7 @@
 #include <net/route.h>
 #include <net/tcp.h>
 #include <net/cipso_ipv4.h>
+#include <net/sock.h>
 
 /* From af_inet.c */
 extern int sysctl_ip_nonlocal_bind;
@@ -129,6 +130,17 @@ static int sysctl_tcp_congestion_control
 	return ret;
 }
 
+int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int old_thresh = *(int *)table->data;
+	ret = proc_dointvec(table,write,filp,buffer,lenp,ppos);
+	ipfrag_add_memory(*(int *)table->data - old_thresh);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(proc_dointvec_fragment);
+
 ctl_table ipv4_table[] = {
         {
 		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
@@ -234,7 +246,7 @@ ctl_table ipv4_table[] = {
 		.data		= &sysctl_ipfrag_high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_fragment
 	},
 	{
 		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
Index: linux-2.6-git/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6-git.orig/net/ipv4/tcp_ipv4.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv4/tcp_ipv4.c	2006-11-30 12:13:29.000000000 +0100
@@ -1046,6 +1046,22 @@ csum_err:
 	goto discard;
 }
 
+static int tcp_v4_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	unsigned long pflags = current->flags;
+	if (unlikely(skb->emergency)) {
+		BUG_ON(!sk_has_vmio(sk)); /* we dropped those before queueing */
+		if (!(pflags & PF_MEMALLOC))
+			current->flags |= PF_MEMALLOC;
+	}
+
+	ret = tcp_v4_do_rcv(sk, skb);
+
+	current->flags = pflags;
+	return ret;
+}
+
 /*
  *	From tcp_input.c
  */
@@ -1096,6 +1112,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	if (!sk)
 		goto no_tcp_socket;
 
+	if (unlikely(skb->emergency)) {
+	       	if (!sk_has_vmio(sk))
+			goto discard_and_relse;
+		/*
+		   decrease window size..
+		   tcp_enter_quickack_mode(sk);
+		*/
+	}
+
 process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
@@ -1848,7 +1873,7 @@ struct proto tcp_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
-	.backlog_rcv		= tcp_v4_do_rcv,
+	.backlog_rcv		= tcp_v4_backlog_rcv,
 	.hash			= tcp_v4_hash,
 	.unhash			= tcp_unhash,
 	.get_port		= tcp_v4_get_port,
Index: linux-2.6-git/net/ipv6/sysctl_net_ipv6.c
===================================================================
--- linux-2.6-git.orig/net/ipv6/sysctl_net_ipv6.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv6/sysctl_net_ipv6.c	2006-11-30 11:37:51.000000000 +0100
@@ -15,6 +15,10 @@
 
 #ifdef CONFIG_SYSCTL
 
+extern int proc_dointvec_fragment(ctl_table *table, int write,
+	       	struct file *filp, void __user *buffer, size_t *lenp,
+	       	loff_t *ppos);
+
 static ctl_table ipv6_table[] = {
 	{
 		.ctl_name	= NET_IPV6_ROUTE,
@@ -44,7 +48,7 @@ static ctl_table ipv6_table[] = {
 		.data		= &sysctl_ip6frag_high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
+		.proc_handler	= &proc_dointvec_fragment
 	},
 	{
 		.ctl_name	= NET_IPV6_IP6FRAG_LOW_THRESH,
Index: linux-2.6-git/net/netfilter/core.c
===================================================================
--- linux-2.6-git.orig/net/netfilter/core.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/netfilter/core.c	2006-11-30 11:37:51.000000000 +0100
@@ -181,6 +181,11 @@ next_hook:
 		kfree_skb(*pskb);
 		ret = -EPERM;
 	} else if ((verdict & NF_VERDICT_MASK)  == NF_QUEUE) {
+		if (unlikely((*pskb)->emergency)) {
+			printk(KERN_ERR "nf_hook: NF_QUEUE encountered for "
+					"emergency skb - skipping rule.\n");
+			goto next_hook;
+		}
 		NFDEBUG("nf_hook: Verdict = QUEUE.\n");
 		if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn,
 			      verdict >> NF_VERDICT_BITS))
Index: linux-2.6-git/security/selinux/avc.c
===================================================================
--- linux-2.6-git.orig/security/selinux/avc.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/security/selinux/avc.c	2006-11-30 11:37:51.000000000 +0100
@@ -333,7 +333,7 @@ static struct avc_node *avc_alloc_node(v
 {
 	struct avc_node *node;
 
-	node = kmem_cache_alloc(avc_node_cachep, SLAB_ATOMIC);
+	node = kmem_cache_alloc(avc_node_cachep, SLAB_ATOMIC | __GFP_NOMEMALLOC);
 	if (!node)
 		goto out;
 
Index: linux-2.6-git/net/ipv4/ip_fragment.c
===================================================================
--- linux-2.6-git.orig/net/ipv4/ip_fragment.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv4/ip_fragment.c	2006-11-30 11:37:51.000000000 +0100
@@ -743,6 +743,7 @@ void ipfrag_init(void)
 	ipfrag_secret_timer.function = ipfrag_secret_rebuild;
 	ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
 	add_timer(&ipfrag_secret_timer);
+	ipfrag_add_memory(sysctl_ipfrag_high_thresh);
 }
 
 EXPORT_SYMBOL(ip_defrag);
Index: linux-2.6-git/net/ipv6/reassembly.c
===================================================================
--- linux-2.6-git.orig/net/ipv6/reassembly.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv6/reassembly.c	2006-11-30 11:37:51.000000000 +0100
@@ -759,4 +759,5 @@ void __init ipv6_frag_init(void)
 	ip6_frag_secret_timer.function = ip6_frag_secret_rebuild;
 	ip6_frag_secret_timer.expires = jiffies + sysctl_ip6frag_secret_interval;
 	add_timer(&ip6_frag_secret_timer);
+	ipfrag_add_memory(sysctl_ip6frag_high_thresh);
 }
Index: linux-2.6-git/net/ipv4/route.c
===================================================================
--- linux-2.6-git.orig/net/ipv4/route.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv4/route.c	2006-11-30 11:37:51.000000000 +0100
@@ -2906,6 +2906,17 @@ static int ipv4_sysctl_rtcache_flush_str
 	return 0;
 }
 
+static int proc_dointvec_rt_size(ctl_table *table, int write, struct file *filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int old = *(int *)table->data;
+	ret = proc_dointvec(table,write,filp,buffer,lenp,ppos);
+	iprt_add_memory(kmem_cache_objs_to_pages(ipv4_dst_ops.kmem_cachep,
+				*(int *)table->data - old));
+	return ret;
+}
+
 ctl_table ipv4_route_table[] = {
         {
 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
@@ -2948,7 +2959,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_max_size,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_rt_size,
 	},
 	{
 		/*  Deprecated. Use gc_min_interval_ms */
@@ -3175,6 +3186,8 @@ int __init ip_rt_init(void)
 
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	iprt_add_memory(kmem_cache_objs_to_pages(ipv4_dst_ops.kmem_cachep,
+				ip_rt_max_size));
 
 	devinet_init();
 	ip_fib_init();
Index: linux-2.6-git/net/ipv6/route.c
===================================================================
--- linux-2.6-git.orig/net/ipv6/route.c	2006-11-30 10:56:33.000000000 +0100
+++ linux-2.6-git/net/ipv6/route.c	2006-11-30 11:37:51.000000000 +0100
@@ -2365,6 +2365,17 @@ int ipv6_sysctl_rtcache_flush(ctl_table 
 		return -EINVAL;
 }
 
+static int proc_dointvec_rt_size(ctl_table *table, int write, struct file *filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+	int old = *(int *)table->data;
+	ret = proc_dointvec(table,write,filp,buffer,lenp,ppos);
+	iprt_add_memory(kmem_cache_objs_to_pages(ip6_dst_ops.kmem_cachep,
+				*(int *)table->data - old));
+	return ret;
+}
+
 ctl_table ipv6_route_table[] = {
         {
 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH, 
@@ -2388,7 +2399,7 @@ ctl_table ipv6_route_table[] = {
          	.data		=	&ip6_rt_max_size,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-         	.proc_handler	=	&proc_dointvec,
+         	.proc_handler	=	&proc_dointvec_rt_size,
 	},
 	{
 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
@@ -2473,6 +2484,8 @@ void __init ip6_route_init(void)
 
 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
 #endif
+	iprt_add_memory(kmem_cache_objs_to_pages(ip6_dst_ops.kmem_cachep,
+				ip6_rt_max_size));
 #ifdef CONFIG_XFRM
 	xfrm6_init();
 #endif
Index: linux-2.6-git/net/ipv6/tcp_ipv6.c
===================================================================
--- linux-2.6-git.orig/net/ipv6/tcp_ipv6.c	2006-11-30 11:58:57.000000000 +0100
+++ linux-2.6-git/net/ipv6/tcp_ipv6.c	2006-11-30 12:13:29.000000000 +0100
@@ -1180,6 +1180,22 @@ ipv6_pktoptions:
 	return 0;
 }
 
+static int tcp_v6_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+	unsigned long pflags = current->flags;
+	if (unlikely(skb->emergency)) {
+		BUG_ON(!sk_has_vmio(sk)); /* we dropped those before queueing */
+		if (!(pflags & PF_MEMALLOC))
+			current->flags |= PF_MEMALLOC;
+	}
+
+	ret = tcp_v6_do_rcv(sk, skb);
+
+	current->flags = pflags;
+	return ret;
+}
+
 static int tcp_v6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
@@ -1225,6 +1241,15 @@ static int tcp_v6_rcv(struct sk_buff **p
 	if (!sk)
 		goto no_tcp_socket;
 
+	if (unlikely(skb->emergency)) {
+	       	if (!sk_has_vmio(sk))
+			goto discard_and_relse;
+		/*
+		   decrease window size..
+		   tcp_enter_quickack_mode(sk);
+		*/
+	}
+
 process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
@@ -1602,7 +1627,7 @@ struct proto tcpv6_prot = {
 	.getsockopt		= tcp_getsockopt,
 	.sendmsg		= tcp_sendmsg,
 	.recvmsg		= tcp_recvmsg,
-	.backlog_rcv		= tcp_v6_do_rcv,
+	.backlog_rcv		= tcp_v6_backlog_rcv,
 	.hash			= tcp_v6_hash,
 	.unhash			= tcp_unhash,
 	.get_port		= tcp_v6_get_port,



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
       [not found] ` <20061130101921.113055000@chello.nl>
@ 2006-11-30 18:52   ` Christoph Lameter
  2006-11-30 18:55     ` Peter Zijlstra
  2006-11-30 19:02     ` Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 18:52 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> The slab has some unfairness wrt gfp flags; when the slab is grown the gfp 
> flags are used to allocate more memory, however when there is slab space 
> available, gfp flags are ignored. Thus it is possible for less critical 
> slab allocations to succeed and gobble up precious memory.

The gfpflags are ignored if there are

1) objects in the per cpu, shared or alien caches

2) objects are in partial or free slabs in the per node queues.

> This patch avoids this by keeping track of the allocation hardness when 
> growing. This is then compared to the current slab alloc's gfp flags.

The approach is to force the allocation of additional slab to increase the 
number of free slabs? The next free will drop the number of free slabs 
back again to the allowed amount.

I would think that one would need a rank with each cached object and 
free slab in order to do this the right way.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 18:52   ` [RFC][PATCH 1/6] mm: slab allocation fairness Christoph Lameter
@ 2006-11-30 18:55     ` Peter Zijlstra
  2006-11-30 19:33       ` Christoph Lameter
  2006-11-30 19:02     ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 18:55 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 10:52 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > The slab has some unfairness wrt gfp flags; when the slab is grown the gfp 
> > flags are used to allocate more memory, however when there is slab space 
> > available, gfp flags are ignored. Thus it is possible for less critical 
> > slab allocations to succeed and gobble up precious memory.
> 
> The gfpflags are ignored if there are
> 
> 1) objects in the per cpu, shared or alien caches
> 
> 2) objects are in partial or free slabs in the per node queues.

Yeah, basically as long as free objects can be found. No matter how
'hard' it was to obtain these objects.

> > This patch avoids this by keeping track of the allocation hardness when 
> > growing. This is then compared to the current slab alloc's gfp flags.
> 
> The approach is to force the allocation of additional slab to increase the 
> number of free slabs? The next free will drop the number of free slabs 
> back again to the allowed amount.

No, the forced allocation is to test the allocation hardness at that
point in time. I could not think of another way to test that than to
actually do an allocation.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages()
  2006-11-30 18:55   ` [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages() Christoph Lameter
@ 2006-11-30 18:55     ` Peter Zijlstra
  2006-11-30 19:06       ` Christoph Lameter
  2006-12-01 12:14     ` Peter Zijlstra
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 18:55 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 10:55 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > +unsigned int kmem_cache_objs_to_pages(struct kmem_cache *cachep, int nr)
> > +{
> > +	return ((nr + cachep->num - 1) / cachep->num) << cachep->gfporder;
> 
> cachep->num refers to the number of objects in a slab of gfporder.

Ah, my bad, thanks!

> thus
> 
> return (nr + cachep->num - 1) / cachep->num;
> 
> But then this is very optimistic estimate that assumes a single node and 
> no free objects in between.

Right, perhaps my bad in wording the intent; the needed information is
how many more pages would I need to grow the slab with in order to store
so many new objects.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages()
       [not found] ` <20061130101922.175620000@chello.nl>
@ 2006-11-30 18:55   ` Christoph Lameter
  2006-11-30 18:55     ` Peter Zijlstra
  2006-12-01 12:14     ` Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 18:55 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> +unsigned int kmem_cache_objs_to_pages(struct kmem_cache *cachep, int nr)
> +{
> +	return ((nr + cachep->num - 1) / cachep->num) << cachep->gfporder;

cachep->num refers to the number of objects in a slab of gfporder.

thus

return (nr + cachep->num - 1) / cachep->num;

But then this is a very optimistic estimate that assumes a single node and
no free objects in between.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 18:52   ` [RFC][PATCH 1/6] mm: slab allocation fairness Christoph Lameter
  2006-11-30 18:55     ` Peter Zijlstra
@ 2006-11-30 19:02     ` Peter Zijlstra
  2006-11-30 19:37       ` Christoph Lameter
  1 sibling, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 19:02 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 10:52 -0800, Christoph Lameter wrote:

> I would think that one would need a rank with each cached object and 
> free slab in order to do this the right way.

Allocation hardness is a temporal attribute, ie. it changes over time.
Hence I do it per slab.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages()
  2006-11-30 19:06       ` Christoph Lameter
@ 2006-11-30 19:03         ` Peter Zijlstra
  0 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 19:03 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 11:06 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > Right, perhaps my bad in wording the intent; the needed information is
> > how many more pages would I need to grow the slab with in order to store
> > so many new objects.
> 
> Would you not have to take objects currently available in 
> caches into account? If you are short on memory then a flushing of all the 
> caches may give you the memory you need (especially on a system with a 
> large number of processors).

Sure, but this gives a safe upper bound.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages()
  2006-11-30 18:55     ` Peter Zijlstra
@ 2006-11-30 19:06       ` Christoph Lameter
  2006-11-30 19:03         ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 19:06 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> Right, perhaps my bad in wording the intent; the needed information is
> how many more pages would I need to grow the slab with in order to store
> so many new objects.

Would you not have to take objects currently available in 
caches into account? If you are short on memory then a flushing of all the 
caches may give you the memory you need (especially on a system with a 
large number of processors).

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 18:55     ` Peter Zijlstra
@ 2006-11-30 19:33       ` Christoph Lameter
  2006-11-30 19:33         ` Peter Zijlstra
  2006-12-01 11:28         ` Peter Zijlstra
  0 siblings, 2 replies; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 19:33 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> No, the forced allocation is to test the allocation hardness at that
> point in time. I could not think of another way to test that than to
> actually do an allocation.

Typically we do this by checking the number of free pages in a zone 
compared to the high low limits. See mmzone.h.
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 19:33       ` Christoph Lameter
@ 2006-11-30 19:33         ` Peter Zijlstra
  2006-12-01 11:28         ` Peter Zijlstra
  1 sibling, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 19:33 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 11:33 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > No, the forced allocation is to test the allocation hardness at that
> > point in time. I could not think of another way to test that than to
> > actually do an allocation.
> 
> Typically we do this by checking the number of free pages in a zone 
> compared to the high low limits. See mmzone.h.

True, I did think about that and started out that way but saw myself
duplicating a lot of the page allocation code. I'll give it another
try... see if I can factor out the common parts without too much
duplication.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 19:02     ` Peter Zijlstra
@ 2006-11-30 19:37       ` Christoph Lameter
  2006-11-30 19:40         ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 19:37 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> On Thu, 2006-11-30 at 10:52 -0800, Christoph Lameter wrote:
> 
> > I would think that one would need a rank with each cached object and 
> > free slab in order to do this the right way.
> 
> Allocation hardness is a temporal attribute, ie. it changes over time.
> Hence I do it per slab.

cached objects are also temporal and change over time.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 19:37       ` Christoph Lameter
@ 2006-11-30 19:40         ` Peter Zijlstra
  2006-11-30 20:11           ` Christoph Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 19:40 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 11:37 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > On Thu, 2006-11-30 at 10:52 -0800, Christoph Lameter wrote:
> > 
> > > I would think that one would need a rank with each cached object and 
> > > free slab in order to do this the right way.
> > 
> > Allocation hardness is a temporal attribute, ie. it changes over time.
> > Hence I do it per slab.
> 
> cached objects are also temporal and change over time.

Sure, but there is nothing wrong with using a slab page with a lower
allocation rank when there is memory aplenty. 

I'm just not seeing how keeping all individual page ranks would make
this better.

The only thing that matters is the actual free pages limit, not that of
a few allocations ago. The stored rank is a safe shortcut, for it allows
harder allocations to use easily obtainable free space, not the other way
around.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 19:40         ` Peter Zijlstra
@ 2006-11-30 20:11           ` Christoph Lameter
  2006-11-30 20:15             ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 20:11 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> Sure, but there is nothing wrong with using a slab page with a lower
> allocation rank when there is memory aplenty. 

What does "a slab page with a lower allocation rank" mean? Slab pages have 
no allocation ranks that I am aware of.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 20:11           ` Christoph Lameter
@ 2006-11-30 20:15             ` Peter Zijlstra
  2006-11-30 20:29               ` Christoph Lameter
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2006-11-30 20:15 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 12:11 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > Sure, but there is nothing wrong with using a slab page with a lower
> > allocation rank when there is memory aplenty. 
> 
> What does "a slab page with a lower allocation rank" mean? Slab pages have 
> no allocation ranks that I am aware of.

I just added allocation rank and didn't you suggest tracking it for all
slab pages instead of per slab?

The rank is an expression of how hard it was to get that page, with 0
being the hardest allocation (ALLOC_NO_WATERMARK) and 16 the easiest
(ALLOC_WMARK_HIGH).

I store the rank of the last allocated page and retest the rank when a
gfp flag indicates a higher rank, that is when the current slab
allocation would have failed to grow the slab under the conditions of
the previous allocation.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 20:15             ` Peter Zijlstra
@ 2006-11-30 20:29               ` Christoph Lameter
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2006-11-30 20:29 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: netdev, linux-mm, David Miller

On Thu, 30 Nov 2006, Peter Zijlstra wrote:

> > > Sure, but there is nothing wrong with using a slab page with a lower
> > > allocation rank when there is memory aplenty. 
> > What does "a slab page with a lower allocation rank" mean? Slab pages have 
> > no allocation ranks that I am aware of.
> I just added allocation rank and didn't you suggest tracking it for all
> slab pages instead of per slab?

Yes but that is not in place so I was wondering what you were talking 
about. It would help to have some longer text describing what you intend 
to do and how rank would work throughout the VM.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 1/6] mm: slab allocation fairness
  2006-11-30 19:33       ` Christoph Lameter
  2006-11-30 19:33         ` Peter Zijlstra
@ 2006-12-01 11:28         ` Peter Zijlstra
  1 sibling, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2006-12-01 11:28 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 11:33 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > No, the forced allocation is to test the allocation hardness at that
> > point in time. I could not think of another way to test that than to
> > actually do an allocation.
> 
> Typically we do this by checking the number of free pages in a zone 
> compared to the high low limits. See mmzone.h.

This doesn't work under high load because of direct reclaim. And if I go
run direct reclaim to test if I can raise the free pages level to an
acceptable level for the given gfp flags, I might as well do the whole
allocation.


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages()
  2006-11-30 18:55   ` [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages() Christoph Lameter
  2006-11-30 18:55     ` Peter Zijlstra
@ 2006-12-01 12:14     ` Peter Zijlstra
  1 sibling, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2006-12-01 12:14 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: netdev, linux-mm, David Miller

On Thu, 2006-11-30 at 10:55 -0800, Christoph Lameter wrote:
> On Thu, 30 Nov 2006, Peter Zijlstra wrote:
> 
> > +unsigned int kmem_cache_objs_to_pages(struct kmem_cache *cachep, int nr)
> > +{
> > +	return ((nr + cachep->num - 1) / cachep->num) << cachep->gfporder;
> 
> cachep->num refers to the number of objects in a slab of gfporder.
> 
> thus
> 
> return (nr + cachep->num - 1) / cachep->num;

No, that would give the number of slabs needed, I want pages.


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2006-12-01 12:21 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20061130101451.495412000@chello.nl>
     [not found] ` <20061130101922.328418000@chello.nl>
2006-11-30 12:04   ` [RFC][PATCH 6/6] net: vm deadlock avoidance core Peter Zijlstra
     [not found] ` <20061130101921.113055000@chello.nl>
2006-11-30 18:52   ` [RFC][PATCH 1/6] mm: slab allocation fairness Christoph Lameter
2006-11-30 18:55     ` Peter Zijlstra
2006-11-30 19:33       ` Christoph Lameter
2006-11-30 19:33         ` Peter Zijlstra
2006-12-01 11:28         ` Peter Zijlstra
2006-11-30 19:02     ` Peter Zijlstra
2006-11-30 19:37       ` Christoph Lameter
2006-11-30 19:40         ` Peter Zijlstra
2006-11-30 20:11           ` Christoph Lameter
2006-11-30 20:15             ` Peter Zijlstra
2006-11-30 20:29               ` Christoph Lameter
     [not found] ` <20061130101922.175620000@chello.nl>
2006-11-30 18:55   ` [RFC][PATCH 5/6] slab: kmem_cache_objs_to_pages() Christoph Lameter
2006-11-30 18:55     ` Peter Zijlstra
2006-11-30 19:06       ` Christoph Lameter
2006-11-30 19:03         ` Peter Zijlstra
2006-12-01 12:14     ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).