linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Stephen Hemminger <shemminger@linux-foundation.org>
To: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Subject: Re: [PATCH 23/33] netvm: skb processing
Date: Tue, 30 Oct 2007 14:26:34 -0700	[thread overview]
Message-ID: <20071030142634.0f00b492@freepuppy.rosehill> (raw)
In-Reply-To: <20071030160914.749995000@chello.nl>

On Tue, 30 Oct 2007 17:04:24 +0100
Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:

> In order to make sure emergency packets receive all memory needed to proceed
> ensure processing of emergency SKBs happens under PF_MEMALLOC.
> 
> Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing.
> 
> Skip taps, since those are user-space again.
> 
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
>  include/net/sock.h |    5 +++++
>  net/core/dev.c     |   44 ++++++++++++++++++++++++++++++++++++++------
>  net/core/sock.c    |   18 ++++++++++++++++++
>  3 files changed, 61 insertions(+), 6 deletions(-)
> 
> Index: linux-2.6/net/core/dev.c
> ===================================================================
> --- linux-2.6.orig/net/core/dev.c
> +++ linux-2.6/net/core/dev.c
> @@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk
>  	struct net_device *orig_dev;
>  	int ret = NET_RX_DROP;
>  	__be16 type;
> +	unsigned long pflags = current->flags;
> +
> +	/* Emergency skb are special, they should
> +	 *  - be delivered to SOCK_MEMALLOC sockets only
> +	 *  - stay away from userspace
> +	 *  - have bounded memory usage
> +	 *
> +	 * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
> +	 * This saves us from propagating the allocation context down to all
> +	 * allocation sites.
> +	 */
> +	if (skb_emergency(skb))
> +		current->flags |= PF_MEMALLOC;
>  
>  	/* if we've gotten here through NAPI, check netpoll */
>  	if (netpoll_receive_skb(skb))
> -		return NET_RX_DROP;
> +		goto out;

Why the change? doesn't gcc optimize the common exit case anyway?

>  
>  	if (!skb->tstamp.tv64)
>  		net_timestamp(skb);
> @@ -1990,7 +2003,7 @@ int netif_receive_skb(struct sk_buff *sk
>  	orig_dev = skb_bond(skb);
>  
>  	if (!orig_dev)
> -		return NET_RX_DROP;
> +		goto out;
>  
>  	__get_cpu_var(netdev_rx_stat).total++;
>  
> @@ -2009,6 +2022,9 @@ int netif_receive_skb(struct sk_buff *sk
>  	}
>  #endif
>  
> +	if (skb_emergency(skb))
> +		goto skip_taps;
> +
>  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
>  		if (!ptype->dev || ptype->dev == skb->dev) {
>  			if (pt_prev)
> @@ -2017,6 +2033,7 @@ int netif_receive_skb(struct sk_buff *sk
>  		}
>  	}
>  
> +skip_taps:
>  #ifdef CONFIG_NET_CLS_ACT
>  	if (pt_prev) {
>  		ret = deliver_skb(skb, pt_prev, orig_dev);
> @@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk
>  
>  	if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
>  		kfree_skb(skb);
> -		goto out;
> +		goto unlock;
>  	}
>  
>  	skb->tc_verd = 0;
>  ncls:
>  #endif
>  
> +	if (skb_emergency(skb))
> +		switch(skb->protocol) {
> +			case __constant_htons(ETH_P_ARP):
> +			case __constant_htons(ETH_P_IP):
> +			case __constant_htons(ETH_P_IPV6):
> +			case __constant_htons(ETH_P_8021Q):
> +				break;

Indentation is wrong, and hard coding protocol values as spcial case
seems bad here. What about vlan's, etc?

> +			default:
> +				goto drop;
> +		}
> +
>  	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>  	if (!skb)
> -		goto out;
> +		goto unlock;
>  	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
>  	if (!skb)
> -		goto out;
> +		goto unlock;
>  
>  	type = skb->protocol;
>  	list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
> @@ -2056,6 +2085,7 @@ ncls:
>  	if (pt_prev) {
>  		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>  	} else {
> +drop:
>  		kfree_skb(skb);
>  		/* Jamal, now you will not able to escape explaining
>  		 * me how you were going to use this. :-)
> @@ -2063,8 +2093,10 @@ ncls:
>  		ret = NET_RX_DROP;
>  	}
>  
> -out:
> +unlock:
>  	rcu_read_unlock();
> +out:
> +	tsk_restore_flags(current, pflags, PF_MEMALLOC);
>  	return ret;
>  }
>  
> Index: linux-2.6/include/net/sock.h
> ===================================================================
> --- linux-2.6.orig/include/net/sock.h
> +++ linux-2.6/include/net/sock.h
> @@ -523,8 +523,13 @@ static inline void sk_add_backlog(struct
>  	skb->next = NULL;
>  }
>  
> +extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
> +
>  static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>  {
> +	if (skb_emergency(skb))
> +		return __sk_backlog_rcv(sk, skb);
> +
>  	return sk->sk_backlog_rcv(sk, skb);
>  }
>  
> Index: linux-2.6/net/core/sock.c
> ===================================================================
> --- linux-2.6.orig/net/core/sock.c
> +++ linux-2.6/net/core/sock.c
> @@ -319,6 +319,24 @@ int sk_clear_memalloc(struct sock *sk)
>  }
>  EXPORT_SYMBOL_GPL(sk_clear_memalloc);
>  
> +#ifdef CONFIG_NETVM
> +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +	int ret;
> +	unsigned long pflags = current->flags;
> +
> +	/* these should have been dropped before queueing */
> +	BUG_ON(!sk_has_memalloc(sk));
> +
> +	current->flags |= PF_MEMALLOC;
> +	ret = sk->sk_backlog_rcv(sk, skb);
> +	tsk_restore_flags(current, pflags, PF_MEMALLOC);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(__sk_backlog_rcv);
> +#endif
> +
>  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
>  {
>  	struct timeval tv;


I am still not convinced that this solves the problem well enough
to be useful.  Can you really survive a heavy memory overcommit?
In other words, can you prove that the added complexity causes the system
to survive a real test where otherwise it would not?


-- 
Stephen Hemminger <shemminger@linux-foundation.org>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2007-10-30 21:30 UTC|newest]

Thread overview: 72+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-10-30 16:04 [PATCH 00/33] Swap over NFS -v14 Peter Zijlstra
2007-10-30 16:04 ` [PATCH 01/33] mm: gfp_to_alloc_flags() Peter Zijlstra
2007-10-30 16:04 ` [PATCH 02/33] mm: tag reseve pages Peter Zijlstra
2007-10-30 16:04 ` [PATCH 03/33] mm: slub: add knowledge of reserve pages Peter Zijlstra
2007-10-31  3:37   ` Nick Piggin
2007-10-31 10:42     ` Peter Zijlstra
2007-10-31 10:46       ` Nick Piggin
2007-10-31 12:17         ` Peter Zijlstra
2007-10-31 11:25           ` Nick Piggin
2007-10-31 12:54             ` Peter Zijlstra
2007-10-31 13:08               ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 04/33] mm: allow mempool to fall back to memalloc reserves Peter Zijlstra
2007-10-31  3:40   ` Nick Piggin
2007-10-30 16:04 ` [PATCH 05/33] mm: kmem_estimate_pages() Peter Zijlstra
2007-10-31  3:43   ` Nick Piggin
2007-10-31 10:42     ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context Peter Zijlstra
2007-10-31  3:51   ` Nick Piggin
2007-10-31 10:42     ` Peter Zijlstra
2007-10-31 10:49       ` Nick Piggin
2007-10-31 13:06         ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 07/33] mm: serialize access to min_free_kbytes Peter Zijlstra
2007-10-30 16:04 ` [PATCH 08/33] mm: emergency pool Peter Zijlstra
2007-10-30 16:04 ` [PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK Peter Zijlstra
2007-10-31  3:52   ` Nick Piggin
2007-10-31 10:45     ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 10/33] mm: __GFP_MEMALLOC Peter Zijlstra
2007-10-30 16:04 ` [PATCH 11/33] mm: memory reserve management Peter Zijlstra
2007-10-30 16:04 ` [PATCH 12/33] selinux: tag avc cache alloc as non-critical Peter Zijlstra
2007-10-30 16:04 ` [PATCH 13/33] net: wrap sk->sk_backlog_rcv() Peter Zijlstra
2007-10-30 16:04 ` [PATCH 14/33] net: packet split receive api Peter Zijlstra
2007-10-30 16:04 ` [PATCH 15/33] net: sk_allocation() - concentrate socket related allocations Peter Zijlstra
2007-10-30 16:04 ` [PATCH 16/33] netvm: network reserve infrastructure Peter Zijlstra
2007-10-30 16:04 ` [PATCH 17/33] sysctl: propagate conv errors Peter Zijlstra
2007-10-30 16:04 ` [PATCH 18/33] netvm: INET reserves Peter Zijlstra
2007-10-30 16:04 ` [PATCH 19/33] netvm: hook skb allocation to reserves Peter Zijlstra
2007-10-30 16:04 ` [PATCH 20/33] netvm: filter emergency skbs Peter Zijlstra
2007-10-30 16:04 ` [PATCH 21/33] netvm: prevent a TCP specific deadlock Peter Zijlstra
2007-10-30 16:04 ` [PATCH 22/33] netfilter: NF_QUEUE vs emergency skbs Peter Zijlstra
2007-10-30 16:04 ` [PATCH 23/33] netvm: skb processing Peter Zijlstra
2007-10-30 21:26   ` Stephen Hemminger
2007-10-30 21:26   ` Stephen Hemminger [this message]
2007-10-30 21:44     ` Peter Zijlstra
2007-10-30 16:04 ` [PATCH 24/33] mm: prepare swap entry methods for use in page methods Peter Zijlstra
2007-10-30 16:04 ` [PATCH 25/33] mm: add support for non block device backed swap files Peter Zijlstra
2007-10-30 16:04 ` [PATCH 26/33] mm: methods for teaching filesystems about PG_swapcache pages Peter Zijlstra
2007-10-30 16:04 ` [PATCH 27/33] nfs: remove mempools Peter Zijlstra
2007-10-30 16:04 ` [PATCH 28/33] nfs: teach the NFS client how to treat PG_swapcache pages Peter Zijlstra
2007-10-31  8:52   ` Christoph Hellwig
2007-10-30 16:04 ` [PATCH 29/33] nfs: disable data cache revalidation for swapfiles Peter Zijlstra
2007-10-30 16:04 ` [PATCH 30/33] nfs: swap vs nfs_writepage Peter Zijlstra
2007-10-30 16:04 ` [PATCH 31/33] nfs: enable swap on NFS Peter Zijlstra
2007-10-30 16:04 ` [PATCH 32/33] nfs: fix various memory recursions possible with swap over NFS Peter Zijlstra
2007-10-30 16:04 ` [PATCH 33/33] nfs: do not warn on radix tree node allocation failures Peter Zijlstra
2007-10-31  3:26 ` [PATCH 00/33] Swap over NFS -v14 Nick Piggin
2007-10-31  4:37   ` David Miller, Nick Piggin
2007-10-31  4:04     ` Nick Piggin
2007-10-31 14:03       ` Byron Stanoszek
2007-10-31  8:50     ` Christoph Hellwig
2007-10-31 10:56       ` Peter Zijlstra
2007-10-31 11:18         ` NBD was " Pavel Machek
2007-10-31 11:24           ` Peter Zijlstra
2007-10-31 14:54         ` Mike Snitzer
2007-10-31 16:31           ` Evgeniy Polyakov
2007-10-31  9:53     ` Peter Zijlstra
2007-10-31 11:27   ` Peter Zijlstra
2007-10-31 12:16     ` Jeff Garzik
2007-10-31 12:56       ` Peter Zijlstra
2007-10-31 13:18         ` Arnaldo Carvalho de Melo
2007-10-31 13:44         ` Gregory Haskins
2007-11-02  8:54         ` Pavel Machek
2007-11-18 18:09         ` Robin Humble

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20071030142634.0f00b492@freepuppy.rosehill \
    --to=shemminger@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).