All of lore.kernel.org
 help / color / mirror / Atom feed
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	linux-mm <linux-mm@kvack.org>, netdev <netdev@vger.kernel.org>,
	Daniel Phillips <phillips@google.com>,
	David Miller <davem@davemloft.net>, Thomas Graf <tgraf@suug.ch>,
	Indan Zupancic <indan@nul.nu>
Subject: Re: [RFC][PATCH] VM deadlock prevention core -v3
Date: Thu, 10 Aug 2006 18:02:40 +0400	[thread overview]
Message-ID: <20060810140240.GA28989@2ka.mipt.ru> (raw)
In-Reply-To: <1155216769.5696.23.camel@twins>

On Thu, Aug 10, 2006 at 03:32:49PM +0200, Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> Hi,

Hello, Peter.

> So I try again, please tell me if I'm still on crack and should go detox.
> However if you do so, I kindly request some words on the how and why of it.

I think you should talk with doctor in that case, but not with kernel
hackers :)

I have some comments about implementation, not overall design, since we
have slightly diametral points of view there.

...

> --- linux-2.6.orig/net/core/skbuff.c
> +++ linux-2.6/net/core/skbuff.c
> @@ -43,6 +43,7 @@
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/mm.h>
> +#include <linux/pagemap.h>
>  #include <linux/interrupt.h>
>  #include <linux/in.h>
>  #include <linux/inet.h>
> @@ -125,6 +126,8 @@ EXPORT_SYMBOL(skb_truesize_bug);
>   *
>   */
>  
> +#define ceiling_log2(x)	fls((x) - 1)
> +
>  /**
>   *	__alloc_skb	-	allocate a network buffer
>   *	@size: size to allocate
> @@ -147,6 +150,59 @@ struct sk_buff *__alloc_skb(unsigned int
>  	struct sk_buff *skb;
>  	u8 *data;
>  
> +	size = SKB_DATA_ALIGN(size);
> +
> +	if (gfp_mask & __GFP_MEMALLOC) {
> +		/*
> +		 * Fallback allocation for memalloc reserves.
> +		 *
> +		 * the page is populated like so:
> +		 *
> +		 *   struct sk_buff
> +		 *   [ struct sk_buff ]
> +		 *   [ atomic_t ]
> +		 *   unsigned int
> +		 *   struct skb_shared_info
> +		 *   char []
> +		 *
> +		 * We have to do higher order allocations for icky jumbo
> +		 * frame drivers :-(. They really should be migrated to
> +		 * scather/gather DMA and use skb fragments.
> +		 */
> +		unsigned int data_offset =
> +			sizeof(struct sk_buff) + sizeof(unsigned int);
> +		unsigned long length = size + data_offset +
> +			sizeof(struct skb_shared_info);
> +		unsigned int pages;
> +		unsigned int order;
> +		struct page *page;
> +		void *kaddr;
> +
> +		/*
> +		 * Force fclone alloc in order to fudge a lacking in skb_clone().
> +		 */
> +		fclone = 1;
> +		if (fclone) {
> +			data_offset += sizeof(struct sk_buff) + sizeof(atomic_t);
> +			length += sizeof(struct sk_buff) + sizeof(atomic_t);
> +		}
> +		pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +		order = ceiling_log2(pages);
> +		skb = NULL;
> +		if (!(page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order)))
> +			goto out;
> +
> +		kaddr = pfn_to_kaddr(page_to_pfn(page));
> +		skb = (struct sk_buff *)kaddr;
> +
> +		*((unsigned int *)(kaddr + data_offset -
> +					sizeof(unsigned int))) = order;
> +		data = (u8 *)(kaddr + data_offset);
> +

Tricky, but since you are using own allocator here, you could change it to
be not so aggressive - i.e. do not round size to number of pages.

> +		goto allocated;
> +	}
> +
>  	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>  
>  	/* Get the HEAD */
> @@ -155,12 +211,13 @@ struct sk_buff *__alloc_skb(unsigned int
>  		goto out;
>  
>  	/* Get the DATA. Size must match skb_add_mtu(). */
> -	size = SKB_DATA_ALIGN(size);

Bad sign.

>  	data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
>  	if (!data)
>  		goto nodata;
>  
> +struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
> +		unsigned length, gfp_t gfp_mask)
> +{
> +	struct sk_buff *skb;
> +
> +	WARN_ON(gfp_mask & (__GFP_NOMEMALLOC | __GFP_MEMALLOC));
> +	gfp_mask &= ~(__GFP_NOMEMALLOC | __GFP_MEMALLOC);
> +
> +	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_NOMEMALLOC);
> +	if (skb)
> +		goto done;
> +
> +	if (atomic_read(&dev->rx_reserve_used) >=
> +			dev->rx_reserve * dev->memalloc_socks)
> +		goto out;
> +
> +	/*
> +	 * pre-inc guards against a race with netdev_wait_memalloc()
> +	 */
> +	atomic_inc(&dev->rx_reserve_used);
> +	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
> +	if (unlikely(!skb)) {
> +		atomic_dec(&dev->rx_reserve_used);
> +		goto out;
> +	}

Since you have added atomic operation in that path, you can use device's
reference counter instead and do not care that it can dissapear.

> +done:
> +	skb->dev = dev;
> +out:
> +	return skb;
> +}
> +
>  static void skb_drop_list(struct sk_buff **listp)
>  {
>  	struct sk_buff *list = *listp;
> @@ -313,10 +417,35 @@ static void skb_release_data(struct sk_b
>  		if (skb_shinfo(skb)->frag_list)
>  			skb_drop_fraglist(skb);
>  
> -		kfree(skb->head);
> +		if (!skb->memalloc)
> +			kfree(skb->head);
> +		skb->head = NULL;
>  	}
>  }
>  
> +/**
> + *	free_skb_pages - frees a memalloced skbuff
> + *	@cache: fake %kmem_cache argument
> + *	@objp: %sk_buff pointer
> + *
> + *	Function is made to look like %kmem_cache_free so we can easily
> + *	substitue the free function in %kfree_skbmem.
> + */
> +static void free_skb_pages(struct kmem_cache *cache, void *objp)
> +{
> +	struct sk_buff *skb = (struct sk_buff *)objp;
> +	/*
> +	 * The input_dev is the initial input device;
> +	 * we have it pinned by virtue of rx_reserve_used not being zero.
> +	 */
> +	struct net_device *dev = skb->input_dev ?: skb->dev;
> +	unsigned int order =
> +		*(unsigned int *)(skb->head - sizeof(unsigned int));
> +	if (!skb->head)
> +		atomic_dec(&dev->rx_reserve_used);
> +	free_pages((unsigned long)skb, order);
> +}
> +
>  /*
>   *	Free an skbuff by memory without cleaning the state.
>   */
> @@ -324,17 +453,21 @@ void kfree_skbmem(struct sk_buff *skb)
>  {
>  	struct sk_buff *other;
>  	atomic_t *fclone_ref;
> +	void (*free_skb)(struct kmem_cache *, void *);
>  
>  	skb_release_data(skb);
> +
> +	free_skb = skb->memalloc ? free_skb_pages : kmem_cache_free;
> +
>  	switch (skb->fclone) {
>  	case SKB_FCLONE_UNAVAILABLE:
> -		kmem_cache_free(skbuff_head_cache, skb);
> +		free_skb(skbuff_head_cache, skb);
>  		break;
>  
>  	case SKB_FCLONE_ORIG:
>  		fclone_ref = (atomic_t *) (skb + 2);
>  		if (atomic_dec_and_test(fclone_ref))
> -			kmem_cache_free(skbuff_fclone_cache, skb);
> +			free_skb(skbuff_fclone_cache, skb);
>  		break;
>  
>  	case SKB_FCLONE_CLONE:
> @@ -347,7 +480,7 @@ void kfree_skbmem(struct sk_buff *skb)
>  		skb->fclone = SKB_FCLONE_UNAVAILABLE;
>  
>  		if (atomic_dec_and_test(fclone_ref))
> -			kmem_cache_free(skbuff_fclone_cache, other);
> +			free_skb(skbuff_fclone_cache, other);
>  		break;
>  	};
>  }
> @@ -434,6 +567,12 @@ struct sk_buff *skb_clone(struct sk_buff
>  		n->fclone = SKB_FCLONE_CLONE;
>  		atomic_inc(fclone_ref);
>  	} else {
> +		/*
> +		 * should we special-case skb->memalloc cloning?
> +		 * for now fudge it by forcing fast-clone alloc.
> +		 */
> +		BUG_ON(skb->memalloc);
> +
>  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>  		if (!n)
>  			return NULL;

Ugh... cloning is a one of the shoulders of giant where Linux network
stack is staying...

> @@ -686,6 +825,8 @@ int pskb_expand_head(struct sk_buff *skb
>  	if (skb_shared(skb))
>  		BUG();
>  
> +	BUG_ON(skb->memalloc);
> +
>  	size = SKB_DATA_ALIGN(size);
>  
>  	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);

And that is a bug.
That operation can happen even with usual receiving processing.

> +/**
> + *	dev_adjust_memalloc - adjust the global memalloc reserve for this device
> + *	@dev: device that has memalloc demands
> + *	@nr_socks: number of new %SOCK_MEMALLOC sockets
> + *
> + *	This function adjusts the memalloc reserve based on device
> + *	demand. For each %SOCK_MEMALLOC socket this device will reserve
> + *	2 * %MAX_PHYS_SEGMENTS pages for outbound traffic (assumption:
> + *	each %SOCK_MEMALLOC socket will have a %request_queue associated)
> + *	and @dev->rx_reserve mtu pages.
> + */
> +int dev_adjust_memalloc(struct net_device *dev, int nr_socks)
> +{
> +	unsigned long flags;
> +	unsigned long reserve;
> +	int err;
> +
> +	spin_lock_irqsave(&dev->memalloc_lock, flags);
> +
> +	dev->memalloc_socks += nr_socks;
> +	BUG_ON(dev->memalloc_socks < 0);
> +
> +	reserve = dev->memalloc_socks *
> +		(2 * MAX_PHYS_SEGMENTS +		 /* outbound */
> +		 dev->rx_reserve * mtu_pages(dev->mtu)); /* inbound */
> +
> +	err = adjust_memalloc_reserve(reserve - dev->memalloc_reserve);
> +	if (err) {
> +		printk(KERN_WARNING
> +			"%s: Unable to change RX reserve to: %lu, error: %d\n",
> +			dev->name, reserve, err);
> +		goto unlock;
> +	}
> +	dev->memalloc_reserve = reserve;
> +
> +unlock:
> +	spin_unlock_irqrestore(&dev->memalloc_lock, flags);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(dev_adjust_memalloc);
>  
>  /*
>   *	Device change register/unregister. These are not inline or static
> @@ -2464,6 +2527,9 @@ int dev_set_mtu(struct net_device *dev, 
>  		err = dev->change_mtu(dev, new_mtu);
>  	else
>  		dev->mtu = new_mtu;
> +
> +	dev_adjust_memalloc(dev, 0);
> +
>  	if (!err && dev->flags & IFF_UP)
>  		raw_notifier_call_chain(&netdev_chain,
>  				NETDEV_CHANGEMTU, dev);
> @@ -2900,6 +2966,7 @@ int register_netdevice(struct net_device
>  #ifdef CONFIG_NET_CLS_ACT
>  	spin_lock_init(&dev->ingress_lock);
>  #endif
> +	spin_lock_init(&dev->memalloc_lock);
>  
>  	ret = alloc_divert_blk(dev);
>  	if (ret)
> @@ -3106,6 +3173,28 @@ static void netdev_wait_allrefs(struct n
>  	}
>  }
>  
> +/* netdev_wait_memalloc - wait for all outstanding memalloc skbs
> + *
> + * This is called when unregistering network devices.
> + *
> + * Better make sure the skb -> dev mapping is correct, if we leak
> + * some this will stall forever.
> + */
> +static void netdev_wait_memalloc(struct net_device *dev)
> +{
> +	unsigned long warning_time = jiffies;
> +	while (atomic_read(&dev->rx_reserve_used) != 0) {
> +		msleep(250);
> +		if (time_after(jiffies, warning_time + 10 * HZ)) {
> +			printk(KERN_EMERG "netdev_wait_memalloc: "
> +			       "waiting for %s to become free. SKB "
> +			       "count = %d\n",
> +			       dev->name, atomic_read(&dev->rx_reserve_used));
> +			warning_time = jiffies;
> +		}
> +	}
> +}
> +
>  /* The sequence is:
>   *
>   *	rtnl_lock();
> @@ -3165,6 +3254,14 @@ void netdev_run_todo(void)
>  
>  		netdev_wait_allrefs(dev);
>  
> +		netdev_wait_memalloc(dev);
> +
> +		/* Get rid of any SOCK_MEMALLOC reserves. */
> +		if (dev->memalloc_reserve) {
> +			BUG_ON(!dev->memalloc_socks);
> +			dev_adjust_memalloc(dev, -dev->memalloc_socks);
> +		}
> +
>  		/* paranoia */
>  		BUG_ON(atomic_read(&dev->refcnt));
>  		BUG_TRAP(!dev->ip_ptr);
> Index: linux-2.6/net/ipv4/af_inet.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/af_inet.c
> +++ linux-2.6/net/ipv4/af_inet.c
> @@ -132,6 +132,14 @@ void inet_sock_destruct(struct sock *sk)
>  {
>  	struct inet_sock *inet = inet_sk(sk);
>  
> +	if (sk_is_memalloc(sk)) {
> +		struct net_device *dev = ip_dev_find(inet->rcv_saddr);
> +		if (dev) {
> +			dev_adjust_memalloc(dev, -1);
> +			dev_put(dev);
> +		}
> +	}
> +

This looks very strange - you decrement reference counter both in socket
destruction code and in netdevice destruction code.

-- 
	Evgeniy Polyakov

WARNING: multiple messages have this Message-ID (diff)
From: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-kernel <linux-kernel@vger.kernel.org>,
	linux-mm <linux-mm@kvack.org>, netdev <netdev@vger.kernel.org>,
	Daniel Phillips <phillips@google.com>,
	David Miller <davem@davemloft.net>, Thomas Graf <tgraf@suug.ch>,
	Indan Zupancic <indan@nul.nu>
Subject: Re: [RFC][PATCH] VM deadlock prevention core -v3
Date: Thu, 10 Aug 2006 18:02:40 +0400	[thread overview]
Message-ID: <20060810140240.GA28989@2ka.mipt.ru> (raw)
In-Reply-To: <1155216769.5696.23.camel@twins>

On Thu, Aug 10, 2006 at 03:32:49PM +0200, Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> Hi,

Hello, Peter.

> So I try again, please tell me if I'm still on crack and should go detox.
> However if you do so, I kindly request some words on the how and why of it.

I think you should talk with doctor in that case, but not with kernel
hackers :)

I have some comments about implementation, not overall design, since we
have slightly diametral points of view there.

...

> --- linux-2.6.orig/net/core/skbuff.c
> +++ linux-2.6/net/core/skbuff.c
> @@ -43,6 +43,7 @@
>  #include <linux/kernel.h>
>  #include <linux/sched.h>
>  #include <linux/mm.h>
> +#include <linux/pagemap.h>
>  #include <linux/interrupt.h>
>  #include <linux/in.h>
>  #include <linux/inet.h>
> @@ -125,6 +126,8 @@ EXPORT_SYMBOL(skb_truesize_bug);
>   *
>   */
>  
> +#define ceiling_log2(x)	fls((x) - 1)
> +
>  /**
>   *	__alloc_skb	-	allocate a network buffer
>   *	@size: size to allocate
> @@ -147,6 +150,59 @@ struct sk_buff *__alloc_skb(unsigned int
>  	struct sk_buff *skb;
>  	u8 *data;
>  
> +	size = SKB_DATA_ALIGN(size);
> +
> +	if (gfp_mask & __GFP_MEMALLOC) {
> +		/*
> +		 * Fallback allocation for memalloc reserves.
> +		 *
> +		 * the page is populated like so:
> +		 *
> +		 *   struct sk_buff
> +		 *   [ struct sk_buff ]
> +		 *   [ atomic_t ]
> +		 *   unsigned int
> +		 *   struct skb_shared_info
> +		 *   char []
> +		 *
> +		 * We have to do higher order allocations for icky jumbo
> +		 * frame drivers :-(. They really should be migrated to
> +		 * scather/gather DMA and use skb fragments.
> +		 */
> +		unsigned int data_offset =
> +			sizeof(struct sk_buff) + sizeof(unsigned int);
> +		unsigned long length = size + data_offset +
> +			sizeof(struct skb_shared_info);
> +		unsigned int pages;
> +		unsigned int order;
> +		struct page *page;
> +		void *kaddr;
> +
> +		/*
> +		 * Force fclone alloc in order to fudge a lacking in skb_clone().
> +		 */
> +		fclone = 1;
> +		if (fclone) {
> +			data_offset += sizeof(struct sk_buff) + sizeof(atomic_t);
> +			length += sizeof(struct sk_buff) + sizeof(atomic_t);
> +		}
> +		pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +		order = ceiling_log2(pages);
> +		skb = NULL;
> +		if (!(page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order)))
> +			goto out;
> +
> +		kaddr = pfn_to_kaddr(page_to_pfn(page));
> +		skb = (struct sk_buff *)kaddr;
> +
> +		*((unsigned int *)(kaddr + data_offset -
> +					sizeof(unsigned int))) = order;
> +		data = (u8 *)(kaddr + data_offset);
> +

Tricky, but since you are using own allocator here, you could change it to
be not so aggressive - i.e. do not round size to number of pages.

> +		goto allocated;
> +	}
> +
>  	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
>  
>  	/* Get the HEAD */
> @@ -155,12 +211,13 @@ struct sk_buff *__alloc_skb(unsigned int
>  		goto out;
>  
>  	/* Get the DATA. Size must match skb_add_mtu(). */
> -	size = SKB_DATA_ALIGN(size);

Bad sign.

>  	data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
>  	if (!data)
>  		goto nodata;
>  
> +struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
> +		unsigned length, gfp_t gfp_mask)
> +{
> +	struct sk_buff *skb;
> +
> +	WARN_ON(gfp_mask & (__GFP_NOMEMALLOC | __GFP_MEMALLOC));
> +	gfp_mask &= ~(__GFP_NOMEMALLOC | __GFP_MEMALLOC);
> +
> +	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_NOMEMALLOC);
> +	if (skb)
> +		goto done;
> +
> +	if (atomic_read(&dev->rx_reserve_used) >=
> +			dev->rx_reserve * dev->memalloc_socks)
> +		goto out;
> +
> +	/*
> +	 * pre-inc guards against a race with netdev_wait_memalloc()
> +	 */
> +	atomic_inc(&dev->rx_reserve_used);
> +	skb = ___netdev_alloc_skb(dev, length, gfp_mask | __GFP_MEMALLOC);
> +	if (unlikely(!skb)) {
> +		atomic_dec(&dev->rx_reserve_used);
> +		goto out;
> +	}

Since you have added atomic operation in that path, you can use device's
reference counter instead and do not care that it can dissapear.

> +done:
> +	skb->dev = dev;
> +out:
> +	return skb;
> +}
> +
>  static void skb_drop_list(struct sk_buff **listp)
>  {
>  	struct sk_buff *list = *listp;
> @@ -313,10 +417,35 @@ static void skb_release_data(struct sk_b
>  		if (skb_shinfo(skb)->frag_list)
>  			skb_drop_fraglist(skb);
>  
> -		kfree(skb->head);
> +		if (!skb->memalloc)
> +			kfree(skb->head);
> +		skb->head = NULL;
>  	}
>  }
>  
> +/**
> + *	free_skb_pages - frees a memalloced skbuff
> + *	@cache: fake %kmem_cache argument
> + *	@objp: %sk_buff pointer
> + *
> + *	Function is made to look like %kmem_cache_free so we can easily
> + *	substitue the free function in %kfree_skbmem.
> + */
> +static void free_skb_pages(struct kmem_cache *cache, void *objp)
> +{
> +	struct sk_buff *skb = (struct sk_buff *)objp;
> +	/*
> +	 * The input_dev is the initial input device;
> +	 * we have it pinned by virtue of rx_reserve_used not being zero.
> +	 */
> +	struct net_device *dev = skb->input_dev ?: skb->dev;
> +	unsigned int order =
> +		*(unsigned int *)(skb->head - sizeof(unsigned int));
> +	if (!skb->head)
> +		atomic_dec(&dev->rx_reserve_used);
> +	free_pages((unsigned long)skb, order);
> +}
> +
>  /*
>   *	Free an skbuff by memory without cleaning the state.
>   */
> @@ -324,17 +453,21 @@ void kfree_skbmem(struct sk_buff *skb)
>  {
>  	struct sk_buff *other;
>  	atomic_t *fclone_ref;
> +	void (*free_skb)(struct kmem_cache *, void *);
>  
>  	skb_release_data(skb);
> +
> +	free_skb = skb->memalloc ? free_skb_pages : kmem_cache_free;
> +
>  	switch (skb->fclone) {
>  	case SKB_FCLONE_UNAVAILABLE:
> -		kmem_cache_free(skbuff_head_cache, skb);
> +		free_skb(skbuff_head_cache, skb);
>  		break;
>  
>  	case SKB_FCLONE_ORIG:
>  		fclone_ref = (atomic_t *) (skb + 2);
>  		if (atomic_dec_and_test(fclone_ref))
> -			kmem_cache_free(skbuff_fclone_cache, skb);
> +			free_skb(skbuff_fclone_cache, skb);
>  		break;
>  
>  	case SKB_FCLONE_CLONE:
> @@ -347,7 +480,7 @@ void kfree_skbmem(struct sk_buff *skb)
>  		skb->fclone = SKB_FCLONE_UNAVAILABLE;
>  
>  		if (atomic_dec_and_test(fclone_ref))
> -			kmem_cache_free(skbuff_fclone_cache, other);
> +			free_skb(skbuff_fclone_cache, other);
>  		break;
>  	};
>  }
> @@ -434,6 +567,12 @@ struct sk_buff *skb_clone(struct sk_buff
>  		n->fclone = SKB_FCLONE_CLONE;
>  		atomic_inc(fclone_ref);
>  	} else {
> +		/*
> +		 * should we special-case skb->memalloc cloning?
> +		 * for now fudge it by forcing fast-clone alloc.
> +		 */
> +		BUG_ON(skb->memalloc);
> +
>  		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
>  		if (!n)
>  			return NULL;

Ugh... cloning is a one of the shoulders of giant where Linux network
stack is staying...

> @@ -686,6 +825,8 @@ int pskb_expand_head(struct sk_buff *skb
>  	if (skb_shared(skb))
>  		BUG();
>  
> +	BUG_ON(skb->memalloc);
> +
>  	size = SKB_DATA_ALIGN(size);
>  
>  	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);

And that is a bug.
That operation can happen even with usual receiving processing.

> +/**
> + *	dev_adjust_memalloc - adjust the global memalloc reserve for this device
> + *	@dev: device that has memalloc demands
> + *	@nr_socks: number of new %SOCK_MEMALLOC sockets
> + *
> + *	This function adjusts the memalloc reserve based on device
> + *	demand. For each %SOCK_MEMALLOC socket this device will reserve
> + *	2 * %MAX_PHYS_SEGMENTS pages for outbound traffic (assumption:
> + *	each %SOCK_MEMALLOC socket will have a %request_queue associated)
> + *	and @dev->rx_reserve mtu pages.
> + */
> +int dev_adjust_memalloc(struct net_device *dev, int nr_socks)
> +{
> +	unsigned long flags;
> +	unsigned long reserve;
> +	int err;
> +
> +	spin_lock_irqsave(&dev->memalloc_lock, flags);
> +
> +	dev->memalloc_socks += nr_socks;
> +	BUG_ON(dev->memalloc_socks < 0);
> +
> +	reserve = dev->memalloc_socks *
> +		(2 * MAX_PHYS_SEGMENTS +		 /* outbound */
> +		 dev->rx_reserve * mtu_pages(dev->mtu)); /* inbound */
> +
> +	err = adjust_memalloc_reserve(reserve - dev->memalloc_reserve);
> +	if (err) {
> +		printk(KERN_WARNING
> +			"%s: Unable to change RX reserve to: %lu, error: %d\n",
> +			dev->name, reserve, err);
> +		goto unlock;
> +	}
> +	dev->memalloc_reserve = reserve;
> +
> +unlock:
> +	spin_unlock_irqrestore(&dev->memalloc_lock, flags);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(dev_adjust_memalloc);
>  
>  /*
>   *	Device change register/unregister. These are not inline or static
> @@ -2464,6 +2527,9 @@ int dev_set_mtu(struct net_device *dev, 
>  		err = dev->change_mtu(dev, new_mtu);
>  	else
>  		dev->mtu = new_mtu;
> +
> +	dev_adjust_memalloc(dev, 0);
> +
>  	if (!err && dev->flags & IFF_UP)
>  		raw_notifier_call_chain(&netdev_chain,
>  				NETDEV_CHANGEMTU, dev);
> @@ -2900,6 +2966,7 @@ int register_netdevice(struct net_device
>  #ifdef CONFIG_NET_CLS_ACT
>  	spin_lock_init(&dev->ingress_lock);
>  #endif
> +	spin_lock_init(&dev->memalloc_lock);
>  
>  	ret = alloc_divert_blk(dev);
>  	if (ret)
> @@ -3106,6 +3173,28 @@ static void netdev_wait_allrefs(struct n
>  	}
>  }
>  
> +/* netdev_wait_memalloc - wait for all outstanding memalloc skbs
> + *
> + * This is called when unregistering network devices.
> + *
> + * Better make sure the skb -> dev mapping is correct, if we leak
> + * some this will stall forever.
> + */
> +static void netdev_wait_memalloc(struct net_device *dev)
> +{
> +	unsigned long warning_time = jiffies;
> +	while (atomic_read(&dev->rx_reserve_used) != 0) {
> +		msleep(250);
> +		if (time_after(jiffies, warning_time + 10 * HZ)) {
> +			printk(KERN_EMERG "netdev_wait_memalloc: "
> +			       "waiting for %s to become free. SKB "
> +			       "count = %d\n",
> +			       dev->name, atomic_read(&dev->rx_reserve_used));
> +			warning_time = jiffies;
> +		}
> +	}
> +}
> +
>  /* The sequence is:
>   *
>   *	rtnl_lock();
> @@ -3165,6 +3254,14 @@ void netdev_run_todo(void)
>  
>  		netdev_wait_allrefs(dev);
>  
> +		netdev_wait_memalloc(dev);
> +
> +		/* Get rid of any SOCK_MEMALLOC reserves. */
> +		if (dev->memalloc_reserve) {
> +			BUG_ON(!dev->memalloc_socks);
> +			dev_adjust_memalloc(dev, -dev->memalloc_socks);
> +		}
> +
>  		/* paranoia */
>  		BUG_ON(atomic_read(&dev->refcnt));
>  		BUG_TRAP(!dev->ip_ptr);
> Index: linux-2.6/net/ipv4/af_inet.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/af_inet.c
> +++ linux-2.6/net/ipv4/af_inet.c
> @@ -132,6 +132,14 @@ void inet_sock_destruct(struct sock *sk)
>  {
>  	struct inet_sock *inet = inet_sk(sk);
>  
> +	if (sk_is_memalloc(sk)) {
> +		struct net_device *dev = ip_dev_find(inet->rcv_saddr);
> +		if (dev) {
> +			dev_adjust_memalloc(dev, -1);
> +			dev_put(dev);
> +		}
> +	}
> +

This looks very strange - you decrement reference counter both in socket
destruction code and in netdevice destruction code.

-- 
	Evgeniy Polyakov

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2006-08-10 14:03 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-08-10 13:32 [RFC][PATCH] VM deadlock prevention core -v3 Peter Zijlstra
2006-08-10 13:32 ` Peter Zijlstra
2006-08-10 14:02 ` Evgeniy Polyakov [this message]
2006-08-10 14:02   ` Evgeniy Polyakov
2006-08-10 14:46   ` Peter Zijlstra
2006-08-10 14:46     ` Peter Zijlstra
2006-08-10 16:22     ` Evgeniy Polyakov
2006-08-10 16:22       ` Evgeniy Polyakov
2006-08-10 16:50       ` Peter Zijlstra
2006-08-10 16:50         ` Peter Zijlstra
2006-08-11  0:45         ` Indan Zupancic
2006-08-11  0:45           ` Indan Zupancic

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20060810140240.GA28989@2ka.mipt.ru \
    --to=johnpol@2ka.mipt.ru \
    --cc=a.p.zijlstra@chello.nl \
    --cc=davem@davemloft.net \
    --cc=indan@nul.nu \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=netdev@vger.kernel.org \
    --cc=phillips@google.com \
    --cc=tgraf@suug.ch \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.