All of lore.kernel.org
 help / color / mirror / Atom feed
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-mm@kvack.org, David Miller <davem@davemloft.net>
Subject: Re: [PATCH 9/9] net: vm deadlock avoidance core
Date: Tue, 16 Jan 2007 16:25:03 +0300	[thread overview]
Message-ID: <20070116132503.GA23144@2ka.mipt.ru> (raw)
In-Reply-To: <20070116101816.115266000@taijtu.programming.kicks-ass.net>

On Tue, Jan 16, 2007 at 10:46:06AM +0100, Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> In order to provide robust networked storage there must be a guarantee
> of progress. That is, the storage device must never stall because of (physical)
> OOM, because the device itself might be needed to get out of it (reclaim).


>  /* Used by processes to "lock" a socket state, so that
> Index: linux-2.6-git/net/core/dev.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/dev.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/dev.c	2007-01-12 12:21:55.000000000 +0100
> @@ -1767,10 +1767,23 @@ int netif_receive_skb(struct sk_buff *sk
>  	struct net_device *orig_dev;
>  	int ret = NET_RX_DROP;
>  	__be16 type;
> +	unsigned long pflags = current->flags;
> +
> +	/* Emergency skb are special, they should
> +	 *  - be delivered to SOCK_VMIO sockets only
> +	 *  - stay away from userspace
> +	 *  - have bounded memory usage
> +	 *
> +	 * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
> +	 * This saves us from propagating the allocation context down to all
> +	 * allocation sites.
> +	 */
> +	if (unlikely(skb->emergency))
> +		current->flags |= PF_MEMALLOC;

Access to 'current' in netif_receive_skb()???
Why do you want to work with, for example, keventd?

>  	/* if we've gotten here through NAPI, check netpoll */
>  	if (skb->dev->poll && netpoll_rx(skb))
> -		return NET_RX_DROP;
> +		goto out;
>  
>  	if (!skb->tstamp.off_sec)
>  		net_timestamp(skb);
> @@ -1781,7 +1794,7 @@ int netif_receive_skb(struct sk_buff *sk
>  	orig_dev = skb_bond(skb);
>  
>  	if (!orig_dev)
> -		return NET_RX_DROP;
> +		goto out;
>  
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
> @@ -1798,6 +1811,8 @@ int netif_receive_skb(struct sk_buff *sk
>  		goto ncls;
>  	}
>  #endif
> +	if (unlikely(skb->emergency))
> +		goto skip_taps;
>  
>  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>  		if (!ptype->dev || ptype->dev == skb->dev) {
> @@ -1807,6 +1822,7 @@ int netif_receive_skb(struct sk_buff *sk
>  		}
>  	}
>  
> +skip_taps:

It is still a 'tap'.

>  #ifdef CONFIG_NET_CLS_ACT
>  	if (pt_prev) {
>  		ret = deliver_skb(skb, pt_prev, orig_dev);
> @@ -1819,15 +1835,26 @@ int netif_receive_skb(struct sk_buff *sk
>  
>  	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
>  		kfree_skb(skb);
> -		goto out;
> +		goto unlock;
>  	}
>  
>  	skb->tc_verd = 0;
>  ncls:
>  #endif
>  
> +	if (unlikely(skb->emergency))
> +		switch(skb->protocol) {
> +			case __constant_htons(ETH_P_ARP):
> +			case __constant_htons(ETH_P_IP):
> +			case __constant_htons(ETH_P_IPV6):
> +				break;

Poor vlans and appletalk.

> +			default:
> +				goto drop;
> +		}
> +
>  	if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
> -		goto out;
> +		goto unlock;
>  
>  	type = skb->protocol;
>  	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
> @@ -1842,6 +1869,7 @@ ncls:
>  	if (pt_prev) {
>  		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>  	} else {
> +drop:
>  		kfree_skb(skb);
>  		/* Jamal, now you will not able to escape explaining
>  		 * me how you were going to use this. :-)
> @@ -1849,8 +1877,10 @@ ncls:
>  		ret = NET_RX_DROP;
>  	}
>  
> -out:
> +unlock:
>  	rcu_read_unlock();
> +out:
> +	current->flags = pflags;
>  	return ret;
>  }
>  
> Index: linux-2.6-git/net/core/skbuff.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/skbuff.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/skbuff.c	2007-01-12 13:29:51.000000000 +0100
> @@ -142,28 +142,34 @@ EXPORT_SYMBOL(skb_truesize_bug);
>   *	%GFP_ATOMIC.
>   */
>  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> -			    int fclone, int node)
> +			    int flags, int node)
>  {
>  	struct kmem_cache *cache;
>  	struct skb_shared_info *shinfo;
>  	struct sk_buff *skb;
>  	u8 *data;
> +	int emergency = 0;
>  
> -	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
> +	size = SKB_DATA_ALIGN(size);
> +	cache = (flags & SKB_ALLOC_FCLONE)
> +		? skbuff_fclone_cache : skbuff_head_cache;
> +	if (flags & SKB_ALLOC_RX)
> +		gfp_mask |= __GFP_NOMEMALLOC|__GFP_NOWARN;
>  
> +retry_alloc:
>  	/* Get the HEAD */
>  	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
>  	if (!skb)
> -		goto out;
> +		goto noskb;
>  
>  	/* Get the DATA. Size must match skb_add_mtu(). */
> -	size = SKB_DATA_ALIGN(size);
>  	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
>  			gfp_mask, node);
>  	if (!data)
>  		goto nodata;
>  
>  	memset(skb, 0, offsetof(struct sk_buff, truesize));
> +	skb->emergency = emergency;
>  	skb->truesize = size + sizeof(struct sk_buff);
>  	atomic_set(&skb->users, 1);
>  	skb->head = data;
> @@ -180,7 +186,7 @@ struct sk_buff *__alloc_skb(unsigned int
>  	shinfo->ip6_frag_id = 0;
>  	shinfo->frag_list = NULL;
>  
> -	if (fclone) {
> +	if (flags & SKB_ALLOC_FCLONE) {
>  		struct sk_buff *child = skb + 1;
>  		atomic_t *fclone_ref = (atomic_t *) (child + 1);
>  
> @@ -188,12 +194,29 @@ struct sk_buff *__alloc_skb(unsigned int
>  		atomic_set(fclone_ref, 1);
>  
>  		child->fclone = SKB_FCLONE_UNAVAILABLE;
> +		child->emergency = skb->emergency;
>  	}
>  out:
>  	return skb;
> +
>  nodata:
>  	kmem_cache_free(cache, skb);
>  	skb = NULL;
> +noskb:
> +	/* Attempt emergency allocation when RX skb. */
> +	if (likely(!(flags & SKB_ALLOC_RX) || !sk_vmio_socks()))
> +		goto out;
> +
> +	if (!emergency) {
> +		if (sk_emergency_skb_get()) {
> +			gfp_mask &= ~(__GFP_NOMEMALLOC|__GFP_NOWARN);
> +			gfp_mask |= __GFP_EMERGENCY;
> +			emergency = 1;
> +			goto retry_alloc;
> +		}
> +	} else
> +		sk_emergency_skb_put();
> +
>  	goto out;
>  }
>  
> @@ -271,7 +294,7 @@ struct sk_buff *__netdev_alloc_skb(struc
>  	int node = dev->class_dev.dev ? dev_to_node(dev->class_dev.dev) : -1;
>  	struct sk_buff *skb;
>  
> - 	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
> + 	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
>  	if (likely(skb)) {
>  		skb_reserve(skb, NET_SKB_PAD);
>  		skb->dev = dev;
> @@ -320,6 +343,8 @@ static void skb_release_data(struct sk_b
>  			skb_drop_fraglist(skb);
>  
>  		kfree(skb->head);
> +		if (unlikely(skb->emergency))
> +			sk_emergency_skb_put();
>  	}
>  }
>  
> @@ -440,6 +465,9 @@ struct sk_buff *skb_clone(struct sk_buff
>  		n->fclone = SKB_FCLONE_CLONE;
>  		atomic_inc(fclone_ref);
>  	} else {
> +		if (unlikely(skb->emergency))
> +			gfp_mask |= __GFP_EMERGENCY;
> +
>  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>  		if (!n)
>  			return NULL;
> @@ -474,6 +502,7 @@ struct sk_buff *skb_clone(struct sk_buff
>  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
>  	C(ipvs_property);
>  #endif
> +	C(emergency);
>  	C(protocol);
>  	n->destructor = NULL;
>  	C(mark);
> @@ -689,12 +718,19 @@ int pskb_expand_head(struct sk_buff *skb
>  	u8 *data;
>  	int size = nhead + (skb->end - skb->head) + ntail;
>  	long off;
> +	int emergency = 0;
>  
>  	if (skb_shared(skb))
>  		BUG();
>  
>  	size = SKB_DATA_ALIGN(size);
>  
> +	if (unlikely(skb->emergency) && sk_emergency_skb_get()) {
> +		gfp_mask |= __GFP_EMERGENCY;
> +		emergency = 1;
> +	} else
> +		gfp_mask |= __GFP_NOMEMALLOC;
> +
>  	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
>  	if (!data)
>  		goto nodata;
> @@ -727,6 +763,8 @@ int pskb_expand_head(struct sk_buff *skb
>  	return 0;
>  
>  nodata:
> +	if (unlikely(emergency))
> +		sk_emergency_skb_put();
>  	return -ENOMEM;
>  }
>  
> Index: linux-2.6-git/net/core/sock.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/sock.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/sock.c	2007-01-12 12:21:14.000000000 +0100
> @@ -196,6 +196,120 @@ __u32 sysctl_rmem_default __read_mostly 
>  /* Maximal space eaten by iovec or ancilliary data plus some space */
>  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
>  
> +static DEFINE_SPINLOCK(memalloc_lock);
> +static int rx_net_reserve;
> +
> +atomic_t vmio_socks;
> +atomic_t emergency_rx_skbs;
> +
> +static int ipfrag_threshold;
> +
> +#define ipfrag_mtu()	(1500) /* XXX: should be smallest mtu system wide */
> +#define ipfrag_skbs()	(ipfrag_threshold / ipfrag_mtu())
> +#define ipfrag_pages()	(ipfrag_threshold / (ipfrag_mtu() * (PAGE_SIZE / ipfrag_mtu())))
> +
> +static int iprt_pages;
> +
> +/*
> + * is there room for another emergency skb.
> + */
> +int sk_emergency_skb_get(void)
> +{
> +	int nr = atomic_add_return(1, &emergency_rx_skbs);
> +	int thresh = (3 * ipfrag_skbs()) / 2;
> +	if (nr < thresh)
> +		return 1;
> +
> +	atomic_dec(&emergency_rx_skbs);
> +	return 0;
> +}
> +
> +/**
> + *	sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
> + *	@socks: number of new %SOCK_VMIO sockets
> + *	@tx_resserve_pages: number of pages to (un)reserve for TX
> + *
> + *	This function adjusts the memalloc reserve based on system demand.
> + *	The RX reserve is a limit, and only added once, not for each socket.
> + *
> + *	NOTE:
> + *	   @tx_reserve_pages is an upper-bound of memory used for TX hence
> + *	   we need not account the pages like we do for RX pages.
> + */
> +void sk_adjust_memalloc(int socks, int tx_reserve_pages)
> +{
> +	unsigned long flags;
> +	int reserve = tx_reserve_pages;
> +	int nr_socks;
> +
> +	spin_lock_irqsave(&memalloc_lock, flags);
> +	nr_socks = atomic_add_return(socks, &vmio_socks);
> +	BUG_ON(nr_socks < 0);
> +
> +	if (nr_socks) {
> +		int rx_pages = 2 * ipfrag_pages() + iprt_pages;
> +		reserve += rx_pages - rx_net_reserve;
> +		rx_net_reserve = rx_pages;
> +	} else {
> +		reserve -= rx_net_reserve;
> +		rx_net_reserve = 0;
> +	}
> +
> +	if (reserve)
> +		adjust_memalloc_reserve(reserve);
> +	spin_unlock_irqrestore(&memalloc_lock, flags);
> +}
> +EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
> +
> +/*
> + * tiny helper function to track the total ipfragment memory
> + * needed because of modular ipv6
> + */
> +void ipfrag_reserve_memory(int frags)
> +{
> +	ipfrag_threshold += frags;
> +	sk_adjust_memalloc(0, 0);
> +}
> +EXPORT_SYMBOL_GPL(ipfrag_reserve_memory);
> +
> +void iprt_reserve_memory(int pages)
> +{
> +	iprt_pages += pages;
> +	sk_adjust_memalloc(0, 0);
> +}
> +EXPORT_SYMBOL_GPL(iprt_reserve_memory);
> +
> +/**
> + *	sk_set_vmio - sets %SOCK_VMIO
> + *	@sk: socket to set it on
> + *
> + *	Set %SOCK_VMIO on a socket and increase the memalloc reserve
> + *	accordingly.
> + */
> +int sk_set_vmio(struct sock *sk)
> +{
> +	int set = sock_flag(sk, SOCK_VMIO);
> +	if (!set) {
> +		sk_adjust_memalloc(1, 0);
> +		sock_set_flag(sk, SOCK_VMIO);
> +		sk->sk_allocation |= __GFP_EMERGENCY;
> +	}
> +	return !set;
> +}
> +EXPORT_SYMBOL_GPL(sk_set_vmio);
> +
> +int sk_clear_vmio(struct sock *sk)
> +{
> +	int set = sock_flag(sk, SOCK_VMIO);
> +	if (set) {
> +		sk_adjust_memalloc(-1, 0);
> +		sock_reset_flag(sk, SOCK_VMIO);
> +		sk->sk_allocation &= ~__GFP_EMERGENCY;
> +	}
> +	return set;
> +}
> +EXPORT_SYMBOL_GPL(sk_clear_vmio);
> +
>  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
>  {
>  	struct timeval tv;
> @@ -239,6 +353,12 @@ int sock_queue_rcv_skb(struct sock *sk, 
>  	int err = 0;
>  	int skb_len;
>  
> +	if (unlikely(skb->emergency)) {
> +		if (!sk_has_vmio(sk)) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +	} else
>  	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
>  	   number of warnings when compiling with -W --ANK
>  	 */
> @@ -868,6 +988,7 @@ void sk_free(struct sock *sk)
>  	struct sk_filter *filter;
>  	struct module *owner = sk->sk_prot_creator->owner;
>  
> +	sk_clear_vmio(sk);
>  	if (sk->sk_destruct)
>  		sk->sk_destruct(sk);
>  
> Index: linux-2.6-git/net/ipv4/ipmr.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/ipmr.c	2007-01-12 12:20:08.000000000 +0100
> +++ linux-2.6-git/net/ipv4/ipmr.c	2007-01-12 12:21:14.000000000 +0100
> @@ -1340,6 +1340,9 @@ int ip_mr_input(struct sk_buff *skb)
>  	struct mfc_cache *cache;
>  	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
>  
> +	if (unlikely(skb->emergency))
> +		goto drop;
> +
>  	/* Packet is looped back after forward, it should not be
>  	   forwarded second time, but still can be delivered locally.
>  	 */
> @@ -1411,6 +1414,7 @@ int ip_mr_input(struct sk_buff *skb)
>  dont_forward:
>  	if (local)
>  		return ip_local_deliver(skb);
> +drop:
>  	kfree_skb(skb);
>  	return 0;
>  }
> Index: linux-2.6-git/net/ipv4/sysctl_net_ipv4.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/sysctl_net_ipv4.c	2007-01-12 12:20:08.000000000 +0100
> +++ linux-2.6-git/net/ipv4/sysctl_net_ipv4.c	2007-01-12 12:21:14.000000000 +0100
> @@ -18,6 +18,7 @@
>  #include <net/route.h>
>  #include <net/tcp.h>
>  #include <net/cipso_ipv4.h>
> +#include <net/sock.h>
>  
>  /* From af_inet.c */
>  extern int sysctl_ip_nonlocal_bind;
> @@ -186,6 +187,17 @@ static int strategy_allowed_congestion_c
>  
>  }
>  
> +int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp,
> +		     void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	int ret;
> +	int old_thresh = *(int *)table->data;
> +	ret = proc_dointvec(table,write,filp,buffer,lenp,ppos);
> +	ipfrag_reserve_memory(*(int *)table->data - old_thresh);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(proc_dointvec_fragment);
> +
>  ctl_table ipv4_table[] = {
>          {
>  		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
> @@ -291,7 +303,7 @@ ctl_table ipv4_table[] = {
>  		.data		= &sysctl_ipfrag_high_thresh,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= &proc_dointvec
> +		.proc_handler	= &proc_dointvec_fragment
>  	},
>  	{
>  		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
> Index: linux-2.6-git/net/ipv4/tcp_ipv4.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/tcp_ipv4.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/ipv4/tcp_ipv4.c	2007-01-12 12:21:14.000000000 +0100
> @@ -1604,6 +1604,22 @@ csum_err:
>  	goto discard;
>  }
>  
> +static int tcp_v4_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +	int ret;
> +	unsigned long pflags = current->flags;
> +	if (unlikely(skb->emergency)) {
> +		BUG_ON(!sk_has_vmio(sk)); /* we dropped those before queueing */
> +		if (!(pflags & PF_MEMALLOC))
> +			current->flags |= PF_MEMALLOC;
> +	}
> +
> +	ret = tcp_v4_do_rcv(sk, skb);
> +
> +	current->flags = pflags;
> +	return ret;

Why don't you want to just set up PF_MEMALLOC for the socket and all
related processes?

> +}
> +
>  /*
>   *	From tcp_input.c
>   */
> @@ -1654,6 +1670,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
>  	if (!sk)
>  		goto no_tcp_socket;
>  
> +	if (unlikely(skb->emergency)) {
> +	       	if (!sk_has_vmio(sk))
> +			goto discard_and_relse;
> +		/*
> +		   decrease window size..
> +		   tcp_enter_quickack_mode(sk);
> +		*/

How does this decrease window size?
Maybe ack scheduling would be better handled by inet_csk_schedule_ack(),
or by just directly sending an ack, which in turn requires an allocation
that can be bound to the processing of this received frame...


-- 
	Evgeniy Polyakov

WARNING: multiple messages have this Message-ID (diff)
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-mm@kvack.org, David Miller <davem@davemloft.net>
Subject: Re: [PATCH 9/9] net: vm deadlock avoidance core
Date: Tue, 16 Jan 2007 16:25:03 +0300	[thread overview]
Message-ID: <20070116132503.GA23144@2ka.mipt.ru> (raw)
In-Reply-To: <20070116101816.115266000@taijtu.programming.kicks-ass.net>

On Tue, Jan 16, 2007 at 10:46:06AM +0100, Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> In order to provide robust networked storage there must be a guarantee
> of progress. That is, the storage device must never stall because of (physical)
> OOM, because the device itself might be needed to get out of it (reclaim).


>  /* Used by processes to "lock" a socket state, so that
> Index: linux-2.6-git/net/core/dev.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/dev.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/dev.c	2007-01-12 12:21:55.000000000 +0100
> @@ -1767,10 +1767,23 @@ int netif_receive_skb(struct sk_buff *sk
>  	struct net_device *orig_dev;
>  	int ret = NET_RX_DROP;
>  	__be16 type;
> +	unsigned long pflags = current->flags;
> +
> +	/* Emergency skb are special, they should
> +	 *  - be delivered to SOCK_VMIO sockets only
> +	 *  - stay away from userspace
> +	 *  - have bounded memory usage
> +	 *
> +	 * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
> +	 * This saves us from propagating the allocation context down to all
> +	 * allocation sites.
> +	 */
> +	if (unlikely(skb->emergency))
> +		current->flags |= PF_MEMALLOC;

Access to 'current' in netif_receive_skb()???
Why do you want to work with, for example, keventd?

>  	/* if we've gotten here through NAPI, check netpoll */
>  	if (skb->dev->poll && netpoll_rx(skb))
> -		return NET_RX_DROP;
> +		goto out;
>  
>  	if (!skb->tstamp.off_sec)
>  		net_timestamp(skb);
> @@ -1781,7 +1794,7 @@ int netif_receive_skb(struct sk_buff *sk
>  	orig_dev = skb_bond(skb);
>  
>  	if (!orig_dev)
> -		return NET_RX_DROP;
> +		goto out;
>  
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
> @@ -1798,6 +1811,8 @@ int netif_receive_skb(struct sk_buff *sk
>  		goto ncls;
>  	}
>  #endif
> +	if (unlikely(skb->emergency))
> +		goto skip_taps;
>  
>  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>  		if (!ptype->dev || ptype->dev == skb->dev) {
> @@ -1807,6 +1822,7 @@ int netif_receive_skb(struct sk_buff *sk
>  		}
>  	}
>  
> +skip_taps:

It is still a 'tap'.

>  #ifdef CONFIG_NET_CLS_ACT
>  	if (pt_prev) {
>  		ret = deliver_skb(skb, pt_prev, orig_dev);
> @@ -1819,15 +1835,26 @@ int netif_receive_skb(struct sk_buff *sk
>  
>  	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
>  		kfree_skb(skb);
> -		goto out;
> +		goto unlock;
>  	}
>  
>  	skb->tc_verd = 0;
>  ncls:
>  #endif
>  
> +	if (unlikely(skb->emergency))
> +		switch(skb->protocol) {
> +			case __constant_htons(ETH_P_ARP):
> +			case __constant_htons(ETH_P_IP):
> +			case __constant_htons(ETH_P_IPV6):
> +				break;

Poor vlans and appletalk.

> +			default:
> +				goto drop;
> +		}
> +
>  	if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
> -		goto out;
> +		goto unlock;
>  
>  	type = skb->protocol;
>  	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
> @@ -1842,6 +1869,7 @@ ncls:
>  	if (pt_prev) {
>  		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>  	} else {
> +drop:
>  		kfree_skb(skb);
>  		/* Jamal, now you will not able to escape explaining
>  		 * me how you were going to use this. :-)
> @@ -1849,8 +1877,10 @@ ncls:
>  		ret = NET_RX_DROP;
>  	}
>  
> -out:
> +unlock:
>  	rcu_read_unlock();
> +out:
> +	current->flags = pflags;
>  	return ret;
>  }
>  
> Index: linux-2.6-git/net/core/skbuff.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/skbuff.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/skbuff.c	2007-01-12 13:29:51.000000000 +0100
> @@ -142,28 +142,34 @@ EXPORT_SYMBOL(skb_truesize_bug);
>   *	%GFP_ATOMIC.
>   */
>  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
> -			    int fclone, int node)
> +			    int flags, int node)
>  {
>  	struct kmem_cache *cache;
>  	struct skb_shared_info *shinfo;
>  	struct sk_buff *skb;
>  	u8 *data;
> +	int emergency = 0;
>  
> -	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
> +	size = SKB_DATA_ALIGN(size);
> +	cache = (flags & SKB_ALLOC_FCLONE)
> +		? skbuff_fclone_cache : skbuff_head_cache;
> +	if (flags & SKB_ALLOC_RX)
> +		gfp_mask |= __GFP_NOMEMALLOC|__GFP_NOWARN;
>  
> +retry_alloc:
>  	/* Get the HEAD */
>  	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
>  	if (!skb)
> -		goto out;
> +		goto noskb;
>  
>  	/* Get the DATA. Size must match skb_add_mtu(). */
> -	size = SKB_DATA_ALIGN(size);
>  	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
>  			gfp_mask, node);
>  	if (!data)
>  		goto nodata;
>  
>  	memset(skb, 0, offsetof(struct sk_buff, truesize));
> +	skb->emergency = emergency;
>  	skb->truesize = size + sizeof(struct sk_buff);
>  	atomic_set(&skb->users, 1);
>  	skb->head = data;
> @@ -180,7 +186,7 @@ struct sk_buff *__alloc_skb(unsigned int
>  	shinfo->ip6_frag_id = 0;
>  	shinfo->frag_list = NULL;
>  
> -	if (fclone) {
> +	if (flags & SKB_ALLOC_FCLONE) {
>  		struct sk_buff *child = skb + 1;
>  		atomic_t *fclone_ref = (atomic_t *) (child + 1);
>  
> @@ -188,12 +194,29 @@ struct sk_buff *__alloc_skb(unsigned int
>  		atomic_set(fclone_ref, 1);
>  
>  		child->fclone = SKB_FCLONE_UNAVAILABLE;
> +		child->emergency = skb->emergency;
>  	}
>  out:
>  	return skb;
> +
>  nodata:
>  	kmem_cache_free(cache, skb);
>  	skb = NULL;
> +noskb:
> +	/* Attempt emergency allocation when RX skb. */
> +	if (likely(!(flags & SKB_ALLOC_RX) || !sk_vmio_socks()))
> +		goto out;
> +
> +	if (!emergency) {
> +		if (sk_emergency_skb_get()) {
> +			gfp_mask &= ~(__GFP_NOMEMALLOC|__GFP_NOWARN);
> +			gfp_mask |= __GFP_EMERGENCY;
> +			emergency = 1;
> +			goto retry_alloc;
> +		}
> +	} else
> +		sk_emergency_skb_put();
> +
>  	goto out;
>  }
>  
> @@ -271,7 +294,7 @@ struct sk_buff *__netdev_alloc_skb(struc
>  	int node = dev->class_dev.dev ? dev_to_node(dev->class_dev.dev) : -1;
>  	struct sk_buff *skb;
>  
> - 	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node);
> + 	skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, node);
>  	if (likely(skb)) {
>  		skb_reserve(skb, NET_SKB_PAD);
>  		skb->dev = dev;
> @@ -320,6 +343,8 @@ static void skb_release_data(struct sk_b
>  			skb_drop_fraglist(skb);
>  
>  		kfree(skb->head);
> +		if (unlikely(skb->emergency))
> +			sk_emergency_skb_put();
>  	}
>  }
>  
> @@ -440,6 +465,9 @@ struct sk_buff *skb_clone(struct sk_buff
>  		n->fclone = SKB_FCLONE_CLONE;
>  		atomic_inc(fclone_ref);
>  	} else {
> +		if (unlikely(skb->emergency))
> +			gfp_mask |= __GFP_EMERGENCY;
> +
>  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>  		if (!n)
>  			return NULL;
> @@ -474,6 +502,7 @@ struct sk_buff *skb_clone(struct sk_buff
>  #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
>  	C(ipvs_property);
>  #endif
> +	C(emergency);
>  	C(protocol);
>  	n->destructor = NULL;
>  	C(mark);
> @@ -689,12 +718,19 @@ int pskb_expand_head(struct sk_buff *skb
>  	u8 *data;
>  	int size = nhead + (skb->end - skb->head) + ntail;
>  	long off;
> +	int emergency = 0;
>  
>  	if (skb_shared(skb))
>  		BUG();
>  
>  	size = SKB_DATA_ALIGN(size);
>  
> +	if (unlikely(skb->emergency) && sk_emergency_skb_get()) {
> +		gfp_mask |= __GFP_EMERGENCY;
> +		emergency = 1;
> +	} else
> +		gfp_mask |= __GFP_NOMEMALLOC;
> +
>  	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
>  	if (!data)
>  		goto nodata;
> @@ -727,6 +763,8 @@ int pskb_expand_head(struct sk_buff *skb
>  	return 0;
>  
>  nodata:
> +	if (unlikely(emergency))
> +		sk_emergency_skb_put();
>  	return -ENOMEM;
>  }
>  
> Index: linux-2.6-git/net/core/sock.c
> ===================================================================
> --- linux-2.6-git.orig/net/core/sock.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/core/sock.c	2007-01-12 12:21:14.000000000 +0100
> @@ -196,6 +196,120 @@ __u32 sysctl_rmem_default __read_mostly 
>  /* Maximal space eaten by iovec or ancilliary data plus some space */
>  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
>  
> +static DEFINE_SPINLOCK(memalloc_lock);
> +static int rx_net_reserve;
> +
> +atomic_t vmio_socks;
> +atomic_t emergency_rx_skbs;
> +
> +static int ipfrag_threshold;
> +
> +#define ipfrag_mtu()	(1500) /* XXX: should be smallest mtu system wide */
> +#define ipfrag_skbs()	(ipfrag_threshold / ipfrag_mtu())
> +#define ipfrag_pages()	(ipfrag_threshold / (ipfrag_mtu() * (PAGE_SIZE / ipfrag_mtu())))
> +
> +static int iprt_pages;
> +
> +/*
> + * is there room for another emergency skb.
> + */
> +int sk_emergency_skb_get(void)
> +{
> +	int nr = atomic_add_return(1, &emergency_rx_skbs);
> +	int thresh = (3 * ipfrag_skbs()) / 2;
> +	if (nr < thresh)
> +		return 1;
> +
> +	atomic_dec(&emergency_rx_skbs);
> +	return 0;
> +}
> +
> +/**
> + *	sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
> + *	@socks: number of new %SOCK_VMIO sockets
> + *	@tx_resserve_pages: number of pages to (un)reserve for TX
> + *
> + *	This function adjusts the memalloc reserve based on system demand.
> + *	The RX reserve is a limit, and only added once, not for each socket.
> + *
> + *	NOTE:
> + *	   @tx_reserve_pages is an upper-bound of memory used for TX hence
> + *	   we need not account the pages like we do for RX pages.
> + */
> +void sk_adjust_memalloc(int socks, int tx_reserve_pages)
> +{
> +	unsigned long flags;
> +	int reserve = tx_reserve_pages;
> +	int nr_socks;
> +
> +	spin_lock_irqsave(&memalloc_lock, flags);
> +	nr_socks = atomic_add_return(socks, &vmio_socks);
> +	BUG_ON(nr_socks < 0);
> +
> +	if (nr_socks) {
> +		int rx_pages = 2 * ipfrag_pages() + iprt_pages;
> +		reserve += rx_pages - rx_net_reserve;
> +		rx_net_reserve = rx_pages;
> +	} else {
> +		reserve -= rx_net_reserve;
> +		rx_net_reserve = 0;
> +	}
> +
> +	if (reserve)
> +		adjust_memalloc_reserve(reserve);
> +	spin_unlock_irqrestore(&memalloc_lock, flags);
> +}
> +EXPORT_SYMBOL_GPL(sk_adjust_memalloc);
> +
> +/*
> + * tiny helper function to track the total ipfragment memory
> + * needed because of modular ipv6
> + */
> +void ipfrag_reserve_memory(int frags)
> +{
> +	ipfrag_threshold += frags;
> +	sk_adjust_memalloc(0, 0);
> +}
> +EXPORT_SYMBOL_GPL(ipfrag_reserve_memory);
> +
> +void iprt_reserve_memory(int pages)
> +{
> +	iprt_pages += pages;
> +	sk_adjust_memalloc(0, 0);
> +}
> +EXPORT_SYMBOL_GPL(iprt_reserve_memory);
> +
> +/**
> + *	sk_set_vmio - sets %SOCK_VMIO
> + *	@sk: socket to set it on
> + *
> + *	Set %SOCK_VMIO on a socket and increase the memalloc reserve
> + *	accordingly.
> + */
> +int sk_set_vmio(struct sock *sk)
> +{
> +	int set = sock_flag(sk, SOCK_VMIO);
> +	if (!set) {
> +		sk_adjust_memalloc(1, 0);
> +		sock_set_flag(sk, SOCK_VMIO);
> +		sk->sk_allocation |= __GFP_EMERGENCY;
> +	}
> +	return !set;
> +}
> +EXPORT_SYMBOL_GPL(sk_set_vmio);
> +
> +int sk_clear_vmio(struct sock *sk)
> +{
> +	int set = sock_flag(sk, SOCK_VMIO);
> +	if (set) {
> +		sk_adjust_memalloc(-1, 0);
> +		sock_reset_flag(sk, SOCK_VMIO);
> +		sk->sk_allocation &= ~__GFP_EMERGENCY;
> +	}
> +	return set;
> +}
> +EXPORT_SYMBOL_GPL(sk_clear_vmio);
> +
>  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
>  {
>  	struct timeval tv;
> @@ -239,6 +353,12 @@ int sock_queue_rcv_skb(struct sock *sk, 
>  	int err = 0;
>  	int skb_len;
>  
> +	if (unlikely(skb->emergency)) {
> +		if (!sk_has_vmio(sk)) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +	} else
>  	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
>  	   number of warnings when compiling with -W --ANK
>  	 */
> @@ -868,6 +988,7 @@ void sk_free(struct sock *sk)
>  	struct sk_filter *filter;
>  	struct module *owner = sk->sk_prot_creator->owner;
>  
> +	sk_clear_vmio(sk);
>  	if (sk->sk_destruct)
>  		sk->sk_destruct(sk);
>  
> Index: linux-2.6-git/net/ipv4/ipmr.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/ipmr.c	2007-01-12 12:20:08.000000000 +0100
> +++ linux-2.6-git/net/ipv4/ipmr.c	2007-01-12 12:21:14.000000000 +0100
> @@ -1340,6 +1340,9 @@ int ip_mr_input(struct sk_buff *skb)
>  	struct mfc_cache *cache;
>  	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
>  
> +	if (unlikely(skb->emergency))
> +		goto drop;
> +
>  	/* Packet is looped back after forward, it should not be
>  	   forwarded second time, but still can be delivered locally.
>  	 */
> @@ -1411,6 +1414,7 @@ int ip_mr_input(struct sk_buff *skb)
>  dont_forward:
>  	if (local)
>  		return ip_local_deliver(skb);
> +drop:
>  	kfree_skb(skb);
>  	return 0;
>  }
> Index: linux-2.6-git/net/ipv4/sysctl_net_ipv4.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/sysctl_net_ipv4.c	2007-01-12 12:20:08.000000000 +0100
> +++ linux-2.6-git/net/ipv4/sysctl_net_ipv4.c	2007-01-12 12:21:14.000000000 +0100
> @@ -18,6 +18,7 @@
>  #include <net/route.h>
>  #include <net/tcp.h>
>  #include <net/cipso_ipv4.h>
> +#include <net/sock.h>
>  
>  /* From af_inet.c */
>  extern int sysctl_ip_nonlocal_bind;
> @@ -186,6 +187,17 @@ static int strategy_allowed_congestion_c
>  
>  }
>  
> +int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp,
> +		     void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> +	int ret;
> +	int old_thresh = *(int *)table->data;
> +	ret = proc_dointvec(table,write,filp,buffer,lenp,ppos);
> +	ipfrag_reserve_memory(*(int *)table->data - old_thresh);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(proc_dointvec_fragment);
> +
>  ctl_table ipv4_table[] = {
>          {
>  		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
> @@ -291,7 +303,7 @@ ctl_table ipv4_table[] = {
>  		.data		= &sysctl_ipfrag_high_thresh,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= &proc_dointvec
> +		.proc_handler	= &proc_dointvec_fragment
>  	},
>  	{
>  		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
> Index: linux-2.6-git/net/ipv4/tcp_ipv4.c
> ===================================================================
> --- linux-2.6-git.orig/net/ipv4/tcp_ipv4.c	2007-01-12 12:20:07.000000000 +0100
> +++ linux-2.6-git/net/ipv4/tcp_ipv4.c	2007-01-12 12:21:14.000000000 +0100
> @@ -1604,6 +1604,22 @@ csum_err:
>  	goto discard;
>  }
>  
> +static int tcp_v4_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +	int ret;
> +	unsigned long pflags = current->flags;
> +	if (unlikely(skb->emergency)) {
> +		BUG_ON(!sk_has_vmio(sk)); /* we dropped those before queueing */
> +		if (!(pflags & PF_MEMALLOC))
> +			current->flags |= PF_MEMALLOC;
> +	}
> +
> +	ret = tcp_v4_do_rcv(sk, skb);
> +
> +	current->flags = pflags;
> +	return ret;

Why don't you want to just set up PF_MEMALLOC for the socket and all
related processes?

> +}
> +
>  /*
>   *	From tcp_input.c
>   */
> @@ -1654,6 +1670,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
>  	if (!sk)
>  		goto no_tcp_socket;
>  
> +	if (unlikely(skb->emergency)) {
> +	       	if (!sk_has_vmio(sk))
> +			goto discard_and_relse;
> +		/*
> +		   decrease window size..
> +		   tcp_enter_quickack_mode(sk);
> +		*/

How does this decrease window size?
Maybe ack scheduling would be better handled by inet_csk_schedule_ack(),
or by just directly sending an ack, which in turn requires an allocation
that can be bound to the processing of this received frame...


-- 
	Evgeniy Polyakov

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2007-01-16 13:25 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-01-16  9:45 [PATCH 0/9] VM deadlock avoidance -v10 Peter Zijlstra
2007-01-16  9:45 ` Peter Zijlstra
2007-01-16  9:45 ` [PATCH 1/9] mm: page allocation rank Peter Zijlstra
2007-01-16  9:45   ` Peter Zijlstra
2007-01-16  9:45 ` [PATCH 2/9] mm: slab allocation fairness Peter Zijlstra
2007-01-16  9:45   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 3/9] mm: allow PF_MEMALLOC from softirq context Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 4/9] mm: serialize access to min_free_kbytes Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 5/9] mm: emergency pool Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 6/9] mm: __GFP_EMERGENCY Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 7/9] mm: allow mempool to fall back to memalloc reserves Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 8/9] slab: kmem_cache_objs_to_pages() Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16  9:46 ` [PATCH 9/9] net: vm deadlock avoidance core Peter Zijlstra
2007-01-16  9:46   ` Peter Zijlstra
2007-01-16 13:25   ` Evgeniy Polyakov [this message]
2007-01-16 13:25     ` Evgeniy Polyakov
2007-01-16 13:47     ` Peter Zijlstra
2007-01-16 13:47       ` Peter Zijlstra
2007-01-16 15:33       ` Evgeniy Polyakov
2007-01-16 15:33         ` Evgeniy Polyakov
2007-01-16 16:08         ` Peter Zijlstra
2007-01-16 16:08           ` Peter Zijlstra
2007-01-17  4:54           ` Evgeniy Polyakov
2007-01-17  4:54             ` Evgeniy Polyakov
2007-01-17  9:07             ` Peter Zijlstra
2007-01-17  9:07               ` Peter Zijlstra
2007-01-18 10:41               ` Evgeniy Polyakov
2007-01-18 10:41                 ` Evgeniy Polyakov
2007-01-18 12:18                 ` Peter Zijlstra
2007-01-18 12:18                   ` Peter Zijlstra
2007-01-18 13:58                   ` Possible ways of dealing with OOM conditions Evgeniy Polyakov
2007-01-18 13:58                     ` Evgeniy Polyakov
2007-01-18 15:10                     ` Peter Zijlstra
2007-01-18 15:10                       ` Peter Zijlstra
2007-01-18 15:50                       ` Evgeniy Polyakov
2007-01-18 15:50                         ` Evgeniy Polyakov
2007-01-18 17:31                         ` Peter Zijlstra
2007-01-18 17:31                           ` Peter Zijlstra
2007-01-18 18:34                           ` Evgeniy Polyakov
2007-01-18 18:34                             ` Evgeniy Polyakov
2007-01-19 12:53                             ` Peter Zijlstra
2007-01-19 12:53                               ` Peter Zijlstra
2007-01-19 22:56                               ` Evgeniy Polyakov
2007-01-19 22:56                                 ` Evgeniy Polyakov
2007-01-20 22:36                                 ` Rik van Riel
2007-01-20 22:36                                   ` Rik van Riel
2007-01-21  1:46                                   ` Evgeniy Polyakov
2007-01-21  1:46                                     ` Evgeniy Polyakov
2007-01-21  2:14                                     ` Evgeniy Polyakov
2007-01-21  2:14                                       ` Evgeniy Polyakov
2007-01-21 16:30                                     ` Rik van Riel
2007-01-21 16:30                                       ` Rik van Riel
2007-01-19 17:54                           ` Christoph Lameter
2007-01-19 17:54                             ` Christoph Lameter
2007-01-17  9:12 ` [PATCH 0/9] VM deadlock avoidance -v10 Pavel Machek
2007-01-17  9:12   ` Pavel Machek
2007-01-17  9:20   ` Peter Zijlstra
2007-01-17  9:20     ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070116132503.GA23144@2ka.mipt.ru \
    --to=johnpol@2ka.mipt.ru \
    --cc=a.p.zijlstra@chello.nl \
    --cc=davem@davemloft.net \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.