Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 3/3] pm_qos: get rid of the allocation in pm_qos_add_request()
From: mark gross @ 2010-06-29  4:39 UTC (permalink / raw)
  To: James Bottomley; +Cc: Linux PM, markgross, netdev, Takashi Iwai
In-Reply-To: <1277747088.10879.201.camel@mulgrave.site>

On Mon, Jun 28, 2010 at 12:44:48PM -0500, James Bottomley wrote:
> Since every caller has to squirrel away the returned pointer anyway,
> they might as well supply the memory area.  This fixes a bug in a few of
> the call sites where the returned pointer was dereferenced without
> checking it for NULL (which gets returned if the kzalloc failed).
> 
> I'd like to hear how sound and netdev feels about this: it will add
> about two more pointers worth of data to struct netdev and struct
> snd_pcm_substream .. but I think it's worth it.  If you're OK, I'll add
> your acks and send through the pm tree.
> 
> This also looks to me like an android independent clean up (even though
> it renders the request_add atomically callable).  I also added include
> guards to include/linux/pm_qos_params.h
> 
> cc: netdev@vger.kernel.org
> cc: Takashi Iwai <tiwai@suse.de>
> Signed-off-by: James Bottomley <James.Bottomley@suse.de>
Thank you for doing this! I'll integrate it into some testing targets
in the morning!

Signed-off-by: mark gross <markgross@thegnar.org>

--mgross



> ---
>  drivers/net/e1000e/netdev.c            |   17 +++-----
>  drivers/net/igbvf/netdev.c             |    9 ++--
>  drivers/net/wireless/ipw2x00/ipw2100.c |   12 +++---
>  include/linux/netdevice.h              |    2 +-
>  include/linux/pm_qos_params.h          |   13 +++++-
>  include/sound/pcm.h                    |    2 +-
>  kernel/pm_qos_params.c                 |   67 +++++++++++++++++++-------------
>  sound/core/pcm_native.c                |   13 ++----
>  8 files changed, 74 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
> index 24507f3..47ea62f 100644
> --- a/drivers/net/e1000e/netdev.c
> +++ b/drivers/net/e1000e/netdev.c
> @@ -2901,10 +2901,10 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
>  			 * dropped transactions.
>  			 */
>  			pm_qos_update_request(
> -				adapter->netdev->pm_qos_req, 55);
> +				&adapter->netdev->pm_qos_req, 55);
>  		} else {
>  			pm_qos_update_request(
> -				adapter->netdev->pm_qos_req,
> +				&adapter->netdev->pm_qos_req,
>  				PM_QOS_DEFAULT_VALUE);
>  		}
>  	}
> @@ -3196,9 +3196,9 @@ int e1000e_up(struct e1000_adapter *adapter)
>  
>  	/* DMA latency requirement to workaround early-receive/jumbo issue */
>  	if (adapter->flags & FLAG_HAS_ERT)
> -		adapter->netdev->pm_qos_req =
> -			pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> -				       PM_QOS_DEFAULT_VALUE);
> +		pm_qos_add_request(&adapter->netdev->pm_qos_req,
> +				   PM_QOS_CPU_DMA_LATENCY,
> +				   PM_QOS_DEFAULT_VALUE);
>  
>  	/* hardware has been reset, we need to reload some things */
>  	e1000_configure(adapter);
> @@ -3263,11 +3263,8 @@ void e1000e_down(struct e1000_adapter *adapter)
>  	e1000_clean_tx_ring(adapter);
>  	e1000_clean_rx_ring(adapter);
>  
> -	if (adapter->flags & FLAG_HAS_ERT) {
> -		pm_qos_remove_request(
> -			      adapter->netdev->pm_qos_req);
> -		adapter->netdev->pm_qos_req = NULL;
> -	}
> +	if (adapter->flags & FLAG_HAS_ERT)
> +		pm_qos_remove_request(&adapter->netdev->pm_qos_req);
>  
>  	/*
>  	 * TODO: for power management, we could drop the link and
> diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c
> index 5e2b2a8..add6197 100644
> --- a/drivers/net/igbvf/netdev.c
> +++ b/drivers/net/igbvf/netdev.c
> @@ -48,7 +48,7 @@
>  #define DRV_VERSION "1.0.0-k0"
>  char igbvf_driver_name[] = "igbvf";
>  const char igbvf_driver_version[] = DRV_VERSION;
> -struct pm_qos_request_list *igbvf_driver_pm_qos_req;
> +static struct pm_qos_request_list igbvf_driver_pm_qos_req;
>  static const char igbvf_driver_string[] =
>  				"Intel(R) Virtual Function Network Driver";
>  static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation.";
> @@ -2902,8 +2902,8 @@ static int __init igbvf_init_module(void)
>  	printk(KERN_INFO "%s\n", igbvf_copyright);
>  
>  	ret = pci_register_driver(&igbvf_driver);
> -	igbvf_driver_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> -	                       PM_QOS_DEFAULT_VALUE);
> +	pm_qos_add_request(&igbvf_driver_pm_qos_req, PM_QOS_CPU_DMA_LATENCY,
> +			   PM_QOS_DEFAULT_VALUE);
>  
>  	return ret;
>  }
> @@ -2918,8 +2918,7 @@ module_init(igbvf_init_module);
>  static void __exit igbvf_exit_module(void)
>  {
>  	pci_unregister_driver(&igbvf_driver);
> -	pm_qos_remove_request(igbvf_driver_pm_qos_req);
> -	igbvf_driver_pm_qos_req = NULL;
> +	pm_qos_remove_request(&igbvf_driver_pm_qos_req);
>  }
>  module_exit(igbvf_exit_module);
>  
> diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
> index 0bd4dfa..7f0d98b 100644
> --- a/drivers/net/wireless/ipw2x00/ipw2100.c
> +++ b/drivers/net/wireless/ipw2x00/ipw2100.c
> @@ -174,7 +174,7 @@ that only one external action is invoked at a time.
>  #define DRV_DESCRIPTION	"Intel(R) PRO/Wireless 2100 Network Driver"
>  #define DRV_COPYRIGHT	"Copyright(c) 2003-2006 Intel Corporation"
>  
> -struct pm_qos_request_list *ipw2100_pm_qos_req;
> +struct pm_qos_request_list ipw2100_pm_qos_req;
>  
>  /* Debugging stuff */
>  #ifdef CONFIG_IPW2100_DEBUG
> @@ -1741,7 +1741,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
>  	/* the ipw2100 hardware really doesn't want power management delays
>  	 * longer than 175usec
>  	 */
> -	pm_qos_update_request(ipw2100_pm_qos_req, 175);
> +	pm_qos_update_request(&ipw2100_pm_qos_req, 175);
>  
>  	/* If the interrupt is enabled, turn it off... */
>  	spin_lock_irqsave(&priv->low_lock, flags);
> @@ -1889,7 +1889,7 @@ static void ipw2100_down(struct ipw2100_priv *priv)
>  	ipw2100_disable_interrupts(priv);
>  	spin_unlock_irqrestore(&priv->low_lock, flags);
>  
> -	pm_qos_update_request(ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
> +	pm_qos_update_request(&ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
>  
>  	/* We have to signal any supplicant if we are disassociating */
>  	if (associated)
> @@ -6669,8 +6669,8 @@ static int __init ipw2100_init(void)
>  	if (ret)
>  		goto out;
>  
> -	ipw2100_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> -			PM_QOS_DEFAULT_VALUE);
> +	pm_qos_add_request(&ipw2100_pm_qos_req, PM_QOS_CPU_DMA_LATENCY,
> +			   PM_QOS_DEFAULT_VALUE);
>  #ifdef CONFIG_IPW2100_DEBUG
>  	ipw2100_debug_level = debug;
>  	ret = driver_create_file(&ipw2100_pci_driver.driver,
> @@ -6692,7 +6692,7 @@ static void __exit ipw2100_exit(void)
>  			   &driver_attr_debug_level);
>  #endif
>  	pci_unregister_driver(&ipw2100_pci_driver);
> -	pm_qos_remove_request(ipw2100_pm_qos_req);
> +	pm_qos_remove_request(&ipw2100_pm_qos_req);
>  }
>  
>  module_init(ipw2100_init);
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 40291f3..393555a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -779,7 +779,7 @@ struct net_device {
>  	 */
>  	char			name[IFNAMSIZ];
>  
> -	struct pm_qos_request_list *pm_qos_req;
> +	struct pm_qos_request_list pm_qos_req;
>  
>  	/* device name hash chain */
>  	struct hlist_node	name_hlist;
> diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
> index 8ba440e..77cbddb 100644
> --- a/include/linux/pm_qos_params.h
> +++ b/include/linux/pm_qos_params.h
> @@ -1,8 +1,10 @@
> +#ifndef _LINUX_PM_QOS_PARAMS_H
> +#define _LINUX_PM_QOS_PARAMS_H
>  /* interface for the pm_qos_power infrastructure of the linux kernel.
>   *
>   * Mark Gross <mgross@linux.intel.com>
>   */
> -#include <linux/list.h>
> +#include <linux/plist.h>
>  #include <linux/notifier.h>
>  #include <linux/miscdevice.h>
>  
> @@ -14,9 +16,12 @@
>  #define PM_QOS_NUM_CLASSES 4
>  #define PM_QOS_DEFAULT_VALUE -1
>  
> -struct pm_qos_request_list;
> +struct pm_qos_request_list {
> +	struct plist_node list;
> +	int pm_qos_class;
> +};
>  
> -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value);
> +void pm_qos_add_request(struct pm_qos_request_list *l, int pm_qos_class, s32 value);
>  void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
>  		s32 new_value);
>  void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
> @@ -24,4 +29,6 @@ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
>  int pm_qos_request(int pm_qos_class);
>  int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier);
>  int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier);
> +int pm_qos_request_active(struct pm_qos_request_list *req);
>  
> +#endif
> diff --git a/include/sound/pcm.h b/include/sound/pcm.h
> index dd76cde..6e3a297 100644
> --- a/include/sound/pcm.h
> +++ b/include/sound/pcm.h
> @@ -366,7 +366,7 @@ struct snd_pcm_substream {
>  	int number;
>  	char name[32];			/* substream name */
>  	int stream;			/* stream (direction) */
> -	struct pm_qos_request_list *latency_pm_qos_req; /* pm_qos request */
> +	struct pm_qos_request_list latency_pm_qos_req; /* pm_qos request */
>  	size_t buffer_bytes_max;	/* limit ring buffer size */
>  	struct snd_dma_buffer dma_buffer;
>  	unsigned int dma_buf_id;
> diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
> index b130b97..bff4053 100644
> --- a/kernel/pm_qos_params.c
> +++ b/kernel/pm_qos_params.c
> @@ -30,7 +30,6 @@
>  /*#define DEBUG*/
>  
>  #include <linux/pm_qos_params.h>
> -#include <linux/plist.h>
>  #include <linux/sched.h>
>  #include <linux/spinlock.h>
>  #include <linux/slab.h>
> @@ -49,11 +48,6 @@
>   * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
>   * held, taken with _irqsave.  One lock to rule them all
>   */
> -struct pm_qos_request_list {
> -	struct plist_node list;
> -	int pm_qos_class;
> -};
> -
>  enum pm_qos_type {
>  	PM_QOS_MAX,		/* return the largest value */
>  	PM_QOS_MIN		/* return the smallest value */
> @@ -210,6 +204,12 @@ int pm_qos_request(int pm_qos_class)
>  }
>  EXPORT_SYMBOL_GPL(pm_qos_request);
>  
> +int pm_qos_request_active(struct pm_qos_request_list *req)
> +{
> +	return req->pm_qos_class != 0;
> +}
> +EXPORT_SYMBOL_GPL(pm_qos_request_active);
> +
>  /**
>   * pm_qos_add_request - inserts new qos request into the list
>   * @pm_qos_class: identifies which list of qos request to us
> @@ -221,25 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request);
>   * element as a handle for use in updating and removal.  Call needs to save
>   * this handle for later use.
>   */
> -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
> +void pm_qos_add_request(struct pm_qos_request_list *dep,
> +			int pm_qos_class, s32 value)
>  {
> -	struct pm_qos_request_list *dep;
> -
> -	dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
> -	if (dep) {
> -		struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
> -		int new_value;
> -
> -		if (value == PM_QOS_DEFAULT_VALUE)
> -			new_value = o->default_value;
> -		else
> -			new_value = value;
> -		plist_node_init(&dep->list, new_value);
> -		dep->pm_qos_class = pm_qos_class;
> -		update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
> -	}
> +	struct pm_qos_object *o =  pm_qos_array[pm_qos_class];
> +	int new_value;
>  
> -	return dep;
> +	if (pm_qos_request_active(dep)) {
> +		WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
> +		return;
> +	}
> +	if (value == PM_QOS_DEFAULT_VALUE)
> +		new_value = o->default_value;
> +	else
> +		new_value = value;
> +	plist_node_init(&dep->list, new_value);
> +	dep->pm_qos_class = pm_qos_class;
> +	update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
>  }
>  EXPORT_SYMBOL_GPL(pm_qos_add_request);
>  
> @@ -262,6 +260,11 @@ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
>  	if (!pm_qos_req) /*guard against callers passing in null */
>  		return;
>  
> +	if (pm_qos_request_active(pm_qos_req)) {
> +		WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
> +		return;
> +	}
> +
>  	o = pm_qos_array[pm_qos_req->pm_qos_class];
>  
>  	if (new_value == PM_QOS_DEFAULT_VALUE)
> @@ -290,9 +293,14 @@ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
>  		return;
>  		/* silent return to keep pcm code cleaner */
>  
> +	if (!pm_qos_request_active(pm_qos_req)) {
> +		WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
> +		return;
> +	}
> +
>  	o = pm_qos_array[pm_qos_req->pm_qos_class];
>  	update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
> -	kfree(pm_qos_req);
> +	memset(pm_qos_req, 0, sizeof(*pm_qos_req));
>  }
>  EXPORT_SYMBOL_GPL(pm_qos_remove_request);
>  
> @@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
>  
>  	pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
>  	if (pm_qos_class >= 0) {
> -		filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
> -				PM_QOS_DEFAULT_VALUE);
> +		struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req));
> +		if (!req)
> +			return -ENOMEM;
> +
> +		pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
> +		filp->private_data = req;
>  
>  		if (filp->private_data)
>  			return 0;
> @@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
>  {
>  	struct pm_qos_request_list *req;
>  
> -	req = (struct pm_qos_request_list *)filp->private_data;
> +	req = filp->private_data;
>  	pm_qos_remove_request(req);
> +	kfree(req);
>  
>  	return 0;
>  }
> diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
> index 303ac04..a3b2a64 100644
> --- a/sound/core/pcm_native.c
> +++ b/sound/core/pcm_native.c
> @@ -451,13 +451,11 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
>  	snd_pcm_timer_resolution_change(substream);
>  	runtime->status->state = SNDRV_PCM_STATE_SETUP;
>  
> -	if (substream->latency_pm_qos_req) {
> -		pm_qos_remove_request(substream->latency_pm_qos_req);
> -		substream->latency_pm_qos_req = NULL;
> -	}
> +	if (pm_qos_request_active(&substream->latency_pm_qos_req))
> +		pm_qos_remove_request(&substream->latency_pm_qos_req);
>  	if ((usecs = period_to_usecs(runtime)) >= 0)
> -		substream->latency_pm_qos_req = pm_qos_add_request(
> -					PM_QOS_CPU_DMA_LATENCY, usecs);
> +		pm_qos_add_request(&substream->latency_pm_qos_req,
> +				   PM_QOS_CPU_DMA_LATENCY, usecs);
>  	return 0;
>   _error:
>  	/* hardware might be unuseable from this time,
> @@ -512,8 +510,7 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
>  	if (substream->ops->hw_free)
>  		result = substream->ops->hw_free(substream);
>  	runtime->status->state = SNDRV_PCM_STATE_OPEN;
> -	pm_qos_remove_request(substream->latency_pm_qos_req);
> -	substream->latency_pm_qos_req = NULL;
> +	pm_qos_remove_request(&substream->latency_pm_qos_req);
>  	return result;
>  }
>  
> -- 
> 1.6.4.2
> 
> 
> 

^ permalink raw reply

* [PATCH net-next-2.6] xfrm: remove export of xfrm4_rcv_encap.
From: Rami Rosen @ 2010-06-29  5:05 UTC (permalink / raw)
  To: davem, netdev

[-- Attachment #1: Type: text/plain, Size: 155 bytes --]

Hi,
 The patch removes EXPORT_SYMBOL of xfrm4_rcv_encap() method
 as it is unneeded.

Regards,
Rami Rosen


Signed-off-by: Rami Rosen <ramirose@gmail.com>

[-- Attachment #2: patch.txt --]
[-- Type: text/plain, Size: 464 bytes --]

diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index ad8fbb8..d85336c 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -44,7 +44,6 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
 	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
 	return xfrm_input(skb, nexthdr, spi, encap_type);
 }
-EXPORT_SYMBOL(xfrm4_rcv_encap);
 
 int xfrm4_transport_finish(struct sk_buff *skb, int async)
 {

^ permalink raw reply related

* Re: b44: Reset due to FIFO overflow.
From: Eric Dumazet @ 2010-06-29  5:17 UTC (permalink / raw)
  To: Mitchell Erblich; +Cc: James Courtier-Dutton, netdev
In-Reply-To: <ED315045-4A5D-4ECA-99C8-06B4714D8FA0@earthlink.net>

Le lundi 28 juin 2010 à 14:21 -0700, Mitchell Erblich a écrit :
> On Jun 28, 2010, at 4:09 AM, Eric Dumazet wrote:
> 
> > Le lundi 28 juin 2010 à 11:17 +0100, James Courtier-Dutton a écrit :
> >> On 28 June 2010 11:00, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >>> 
> >>> Problem is we receive a spike of RX network frames (possibly UDP or some
> >>> other RX only traffic), and chip raises an RX fifo overflow _error_
> >>> indication.
> >>> 
> 
> IMO, spikes are a normal behaviour.

Yes, this is why I said NIC is buggy, if it requires a reset (lasting a
_very_ long time) on a normal condition.

> 
> >> 
> >> The cause of the RX overflow in my case is TCP.
> >> It is reproducible in mythtv.
> >> While watching LiveTV, press "s" for the program guide.
> >> The program guide is implemented into mythtv by a SQL query that
> >> results in a large response.
> >> The kernel is probably not servicing the RX FIFO quickly enough due to
> >> it being busy doing something else. In this case, probably a video
> >> mode switch.
> >> 
> > 
> > That's strange, b44 has a big RX ring... and tcp sender should wait for
> > ACK...
> > 
> 
> Slow start, etc SHOULD/CAN  double the number of in-flight segments in each
> next round-trip, placing them back to back.
> 

rx ring buffer is about 200 frames on b44. One single tcp flow should
fit.

Limit is 511. James, did you try to increase rx ring ?

ethtool -G eth0 rx 511

> IMO,  a stress test, would be a large number/wirespeed set of pings?
> 

Better is to use frames that are going to slow down receiver.
Say multicast traffic with 100 receivers on same multicast group.
Send 1000 consecutive frames, last ones will trigger RX overflow,
because softirq handler cannot be fast enough.

Ping is answered by kernel, it's pretty fast.

> >>> Some hardware are buggy enough that such error indication is fatal and
> >>> _require_ hardware reset. Thats life. I suspect b44 driver doing a full
> >>> reset is not a random guess from driver author, but to avoid a complete
> >>> NIC lockup.
> >>> 
> >> 
> >> Interesting, which hardware, apart from the b44, is it that "requires"
> >> a hardware reset after a RX FIFO overflow.
> > 
> > Just take a look at some net drivers and you'll see some of them have
> > this requirement.
> > 
> > rtl8169_rx_interrupt()
> > ...
> > 	if (status & RxFOVF) {
> > 		rtl8169_schedule_work(dev, rtl8169_reset_task);
> > 		dev->stats.rx_fifo_errors++;
> > 	}
> > 
> > 
> > 
> > 
> 
> 
> If they can reset in say X frame loss units, then why not reset if
> X is an acceptable number?
> 

Because a reset is an exception. While card is reset, we lose many tx
and rx frames and this should be the very last thing to consider.

Why not a complete reboot of the host while we are at it ?

> And a hammer may fix the dent, while I may be more
> interested in preventing the dent in the first place.

So? Please submit an alternative firmware for this NIC, or provide
another NIC for the thousands of machines that are stuck with it.




^ permalink raw reply

* Re: [PATCH net-next-2.6] xfrm: remove export of xfrm4_rcv_encap.
From: Eric Dumazet @ 2010-06-29  5:37 UTC (permalink / raw)
  To: Rami Rosen; +Cc: davem, netdev
In-Reply-To: <AANLkTimIJ_vhe7wwCqTrfjj4HUdrbpyrlAwnQGrtAiOk@mail.gmail.com>

Le mardi 29 juin 2010 à 08:05 +0300, Rami Rosen a écrit :
> Hi,
>  The patch removes EXPORT_SYMBOL of xfrm4_rcv_encap() method
>  as it is unneeded.
> 
> Regards,
> Rami Rosen
> 

This claim seems wrong. I wonder how you came to it.

CONFIG_INET_XFRM_TUNNEL=m
CONFIG_XFRM_IPCOMP=m
CONFIG_INET_IPCOMP=m

ERROR: "xfrm4_rcv_encap" [net/ipv4/xfrm4_tunnel.ko] undefined!




^ permalink raw reply

* [iproute2] iproute2:  Allow 'ip addr flush' to loop more than 10 times.
From: greearb @ 2010-06-29  5:55 UTC (permalink / raw)
  To: netdev; +Cc: Ben Greear

From: Ben Greear <greearb@candelatech.com>

The default remains at 10 for backwards compatibility.

For instance:
 # ip addr flush dev eth2
 *** Flush remains incomplete after 10 rounds. ***
 # ip -l 20 addr flush dev eth2
 *** Flush remains incomplete after 20 rounds. ***
 # ip -loops 0 addr flush dev eth2
 #

This is useful for getting rid of large numbers of IP
addresses in scripts.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---
:100644 100644 f7ef939... 3da6998... M	include/utils.h
:100644 100644 9f29533... b127d57... M	ip/ip.c
:100644 100644 3a411b1... 5f0789c... M	ip/ipaddress.c
:100644 100644 1a73efa... d0146a5... M	man/man8/ip.8
 include/utils.h |    1 +
 ip/ip.c         |   11 ++++++++++-
 ip/ipaddress.c  |    6 +++---
 man/man8/ip.8   |    6 ++++++
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index f7ef939..3da6998 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -17,6 +17,7 @@ extern int resolve_hosts;
 extern int oneline;
 extern int timestamp;
 extern char * _SL_;
+extern int max_flush_loops;
 
 #ifndef IPPROTO_ESP
 #define IPPROTO_ESP	50
diff --git a/ip/ip.c b/ip/ip.c
index 9f29533..b127d57 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -32,6 +32,8 @@ int timestamp = 0;
 char * _SL_ = NULL;
 char *batch_file = NULL;
 int force = 0;
+int max_flush_loops = 10;
+
 struct rtnl_handle rth = { .fd = -1 };
 
 static void usage(void) __attribute__((noreturn));
@@ -45,6 +47,7 @@ static void usage(void)
 "                   tunnel | tuntap | maddr | mroute | mrule | monitor | xfrm }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -f[amily] { inet | inet6 | ipx | dnet | link } |\n"
+"                    -l[oops] { maximum-addr-flush-attempts } |\n"
 "                    -o[neline] | -t[imestamp] | -b[atch] [filename] |\n"
 "                    -rc[vbuf] [size]}\n");
 	exit(-1);
@@ -157,7 +160,13 @@ int main(int argc, char **argv)
 			break;
 		if (opt[1] == '-')
 			opt++;
-		if (matches(opt, "-family") == 0) {
+		if (matches(opt, "-loops") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+                        max_flush_loops = atoi(argv[1]);
+                } else if (matches(opt, "-family") == 0) {
 			argc--;
 			argv++;
 			if (argc <= 1)
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 3a411b1..5f0789c 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -33,7 +33,6 @@
 #include "ll_map.h"
 #include "ip_common.h"
 
-#define MAX_ROUNDS 10
 
 static struct
 {
@@ -818,7 +817,7 @@ static int ipaddr_list_or_flush(int argc, char **argv, int flush)
 		filter.flushp = 0;
 		filter.flushe = sizeof(flushb);
 
-		while (round < MAX_ROUNDS) {
+		while ((max_flush_loops == 0) || (round < max_flush_loops)) {
 			const struct rtnl_dump_filter_arg a[3] = {
 				{
 					.filter = print_addrinfo_secondary,
@@ -867,7 +866,8 @@ static int ipaddr_list_or_flush(int argc, char **argv, int flush)
 				fflush(stdout);
 			}
 		}
-		fprintf(stderr, "*** Flush remains incomplete after %d rounds. ***\n", MAX_ROUNDS); fflush(stderr);
+		fprintf(stderr, "*** Flush remains incomplete after %d rounds. ***\n", max_flush_loops);
+		fflush(stderr);
 		return 1;
 	}
 
diff --git a/man/man8/ip.8 b/man/man8/ip.8
index 1a73efa..d0146a5 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8
@@ -730,6 +730,12 @@ appears twice or more, the amount of information increases.
 As a rule, the information is statistics or some time values.
 
 .TP
+.BR "\-l" , " \-loops"
+Specify maximum number of loops the 'ip addr flush' logic
+will attempt before giving up.  The default is 10.
+Zero (0) means loop until all addresses are removed.
+
+.TP
 .BR "\-f" , " \-family"
 followed by protocol family identifier:
 .BR "inet" , " inet6"
-- 
1.7.0.1


^ permalink raw reply related

* Re: [PATCH] s2io: add dynamic LRO disable support
From: David Miller @ 2010-06-29  6:04 UTC (permalink / raw)
  To: jon.mason
  Cc: amwang, netdev, nhorman, sgruszka, herbert.xu, bhutchings,
	Ramkrishna.Vepa
In-Reply-To: <20100625044510.GC2739@exar.com>

From: Jon Mason <jon.mason@exar.com>
Date: Thu, 24 Jun 2010 23:45:10 -0500

> This patch adds dynamic LRO disable support for s2io net driver,
> enables LRO by default, increases the driver version number, and
> corrects the name of the LRO modparm.
> 
> This is mostly Wang's patch based on Neil's initial work, heavily
> modified based on Ramkrishna's suggestions.  This has been tested on
> a Neterion Xframe adapter and verified via adapter LRO statistics.
> 
> Signed-off-by: Jon Mason <jon.mason@exar.com>
> Signed-off-by: WANG Cong <amwang@redhat.com>
> Signed-off-by: Neil Horman <nhorman@redhat.com>
> Acked-by: Neil Horman <nhorman@redhat.com>
> Reviewed-by: Stanislaw Gruszka <sgruszka@redhat.com>
> Cc: Ramkrishna Vepa <Ramkrishna.Vepa@exar.com>

Applied to net-next-2.6

^ permalink raw reply

* Re: [v4 Patch 2/2] mlx4: add dynamic LRO disable support
From: David Miller @ 2010-06-29  6:04 UTC (permalink / raw)
  To: amwang; +Cc: netdev, nhorman, sgruszka, herbert.xu, bhutchings,
	Ramkrishna.Vepa
In-Reply-To: <20100622085426.5566.51436.sendpatchset@localhost.localdomain>

From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 22 Jun 2010 04:50:17 -0400

> 
> This patch adds dynamic LRO disable support for mlx4 net driver.
> It also fixes a bug of mlx4, which checks NETIF_F_LRO flag in rx
> path without rtnl lock.
> 
> (I don't have mlx4 card, so only did compiling test. Anyone who wants
> to test this is more than welcome.)
> 
> This is based on Neil's initial work too, and heavily modified based
> on Stanislaw's suggestions.
> 
> Signed-off-by: WANG Cong <amwang@redhat.com>
> Signed-off-by: Neil Horman <nhorman@redhat.com>
> Acked-by: Neil Horman <nhorman@redhat.com>
> Reviewed-by: Stanislaw Gruszka <sgruszka@redhat.com>
> Cc: Ben Hutchings <bhutchings@solarflare.com>

Applied to net-next-2.6

I'd really like the module options to just die as the dynamic
ethtool mechanism should be the only knob for this for consistency
with the rest of the drivers.

^ permalink raw reply

* Re: [iproute2] iproute2: Allow 'ip addr flush' to loop more than 10 times.
From: David Miller @ 2010-06-29  6:12 UTC (permalink / raw)
  To: greearb; +Cc: netdev, greearb
In-Reply-To: <1277790959-28075-1-git-send-email-greearb@candelatech.com>

From: greearb@gmail.com
Date: Mon, 28 Jun 2010 22:55:59 -0700

> From: Ben Greear <greearb@candelatech.com>
> 
> The default remains at 10 for backwards compatibility.
> 
> For instance:
>  # ip addr flush dev eth2
>  *** Flush remains incomplete after 10 rounds. ***
>  # ip -l 20 addr flush dev eth2
>  *** Flush remains incomplete after 20 rounds. ***
>  # ip -loops 0 addr flush dev eth2
>  #
> 
> This is useful for getting rid of large numbers of IP
> addresses in scripts.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>

I would suggest to instead add some logic to this code to detect that
forward progress is being made.

I really don't see any value in having a hard limit that triggers on a
bulk delete when no other address changing activity is happening in
the system.

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: David Miller @ 2010-06-29  6:18 UTC (permalink / raw)
  To: steffen.klassert; +Cc: ben, netdev, chase.douglas, nordmark
In-Reply-To: <20100625082447.GK5570@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 25 Jun 2010 10:24:47 +0200

> These locks are not needed; why do you want to have them around?

Steffen I think you are being overly picky of Ben's changes.

I'd rather have too much locking during device probe and
initialization than a subtle bug that occurs because later on someone
decides to move IRQ enabling earlier in the chip init path and now
we get strange hangs that take forever to diagnose.

I mean, extra locking in probe/init paths... ugh, there are so many
more important things to worry about!

Once Ben posts a new version of this second patch with the
proper spin_lock_init() calls added I am going to apply both
of his changes.

^ permalink raw reply

* Re: [PATCH net-next-2.6 1/2] 3c59x: Specify window explicitly for access to windowed registers
From: David Miller @ 2010-06-29  6:20 UTC (permalink / raw)
  To: ben; +Cc: netdev, chase.douglas, nordmark
In-Reply-To: <1277337271.26161.17.camel@localhost>

From: Ben Hutchings <ben@decadent.org.uk>
Date: Thu, 24 Jun 2010 00:54:31 +0100

> Currently much of the code assumes that a specific window has been
> selected, while a few functions save and restore the window.  This
> makes it impossible to introduce fine-grained locking.
> 
> Make those assumptions explicit by introducing wrapper functions
> to set the window and read/write a register.  Use these everywhere
> except vortex_interrupt(), vortex_start_xmit() and vortex_rx().
> These set the window just once, or not at all in the case of
> vortex_rx() as it should always be called from vortex_interrupt().
> 
> Cache the current window in struct vortex_private to avoid
> unnecessary hardware writes.
> 
> Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
> Tested-by: Arne Nordmark <nordmark@mech.kth.se> [against 2.6.32]

Applied.

^ permalink raw reply

* Re: [iproute2] iproute2: Allow 'ip addr flush' to loop more than 10 times.
From: Ben Greear @ 2010-06-29  6:27 UTC (permalink / raw)
  To: David Miller; +Cc: greearb, netdev
In-Reply-To: <20100628.231204.229752207.davem@davemloft.net>

On 06/28/2010 11:12 PM, David Miller wrote:
> From: greearb@gmail.com
> Date: Mon, 28 Jun 2010 22:55:59 -0700
>
>> From: Ben Greear<greearb@candelatech.com>
>>
>> The default remains at 10 for backwards compatibility.
>>
>> For instance:
>>   # ip addr flush dev eth2
>>   *** Flush remains incomplete after 10 rounds. ***
>>   # ip -l 20 addr flush dev eth2
>>   *** Flush remains incomplete after 20 rounds. ***
>>   # ip -loops 0 addr flush dev eth2
>>   #
>>
>> This is useful for getting rid of large numbers of IP
>> addresses in scripts.
>>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>
> I would suggest to instead add some logic to this code to detect that
> forward progress is being made.
>
> I really don't see any value in having a hard limit that triggers on a
> bulk delete when no other address changing activity is happening in
> the system.

I'm not sure I understand how this loop could have run forever
anyway, unless some other process(es) was constantly adding addresses at
the same time?  Or maybe some ipv6 auto config thing?

It appears there is already code to detect when the loop
is done (flushing ~70 IPv4 addresses with -l 0 was one of my
test cases, and worked as expected).

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: [iproute2] iproute2: Allow 'ip addr flush' to loop more than 10 times.
From: David Miller @ 2010-06-29  6:36 UTC (permalink / raw)
  To: greearb; +Cc: greearb, netdev
In-Reply-To: <4C29925B.9090008@candelatech.com>

From: Ben Greear <greearb@candelatech.com>
Date: Mon, 28 Jun 2010 23:27:39 -0700

> I'm not sure I understand how this loop could have run forever
> anyway, unless some other process(es) was constantly adding
> addresses at the same time?  Or maybe some ipv6 auto config thing?
> 
> It appears there is already code to detect when the loop
> is done (flushing ~70 IPv4 addresses with -l 0 was one of my
> test cases, and worked as expected).

What happens is that we are simply limited by how many addresses
we can delete in one go, and that limit is 4096 bytes of netlink
message size.

So we have to iterate, reusing that buffer each time, to get them all
done.

The limit exists because meanwhile it is possible that some other
entity could add addresses and thus cause us to loop forever and
never actually delete all of the addresses because every time we
delete a bunch the other entity adds more.

I can understand the reasoning behind the limit, because if this is
run by something automated it's not like someone is at the command
line and can hit Ctrl-C to break out of a looping instance.

But practically speaking I bet this never happens.

So what makes sense to me is:

1) Loop forever by default.

2) When the number of loops exceeds a threshold (calculated by the
   number of addresses we see in the first dump, divided by the number
   of deletes we can squeeze into the 4096 byte message), we emit
   a warning.

3) A hard limit, off by default, is available via your new "-l" option.

But seriously we can determine forward progress quite easily I think.

Each loop, we see if the dump returns a smaller number of addresses
than the last iteration.  If so, we just keep going.

If the number of addresses increases, I think we can bail in this
case.

This logic would only ever trigger iff another entity is adding a
large number of addresses simultaneously with our flush.  And frankly
speaking the person doing the flush probably doesn't expect that to be
happening.  You're flushing all of the addresses so you can start with
a clean slate and then add specific addresses back, or whatever.


^ permalink raw reply

* Re: [PATCH net-next-2.6] net: u64_stats_sync improvements
From: David Miller @ 2010-06-29  6:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1277373878.2816.177.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 12:04:38 +0200

> - Add a comment about interrupts:
> 
> 6) If counter might be written by an interrupt, readers should block
> interrupts.
> 
> - Fix a typo in sample of use.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6] net: use this_cpu_ptr()
From: David Miller @ 2010-06-29  6:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1277376757.2816.272.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 12:52:37 +0200

> use this_cpu_ptr(p) instead of per_cpu_ptr(p, smp_processor_id())
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/4] net: u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh()
From: David Miller @ 2010-06-29  6:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1277376846.2816.283.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 12:54:06 +0200

> - Must disable preemption in case of 32bit UP in u64_stats_fetch_begin()
> and u64_stats_fetch_retry()
> 
> - Add new u64_stats_fetch_begin_bh() and u64_stats_fetch_retry_bh() for
> network usage, disabling BH on 32bit UP only.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6 3/4] macvlan: 64 bit rx counters
From: David Miller @ 2010-06-29  6:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: kaber, netdev
In-Reply-To: <1277376861.2816.284.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 12:54:21 +0200

> Use u64_stats_sync infrastructure to implement 64bit stats.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6 4/4] vlan: 64 bit rx counters
From: David Miller @ 2010-06-29  6:37 UTC (permalink / raw)
  To: eric.dumazet; +Cc: kaber, netdev
In-Reply-To: <1277376906.2816.287.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 12:55:06 +0200

> Use u64_stats_sync infrastructure to implement 64bit rx stats.
> 
> (tx stats are addressed later)
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next-2.6] tcp: tso_fragment() might avoid GFP_ATOMIC
From: David Miller @ 2010-06-29  6:38 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1277377222.2816.296.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 24 Jun 2010 13:00:22 +0200

> We can pass a gfp argument to tso_fragment() and avoid GFP_ATOMIC
> allocations sometimes.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

Slow down Eric, you're on fire :-)

^ permalink raw reply

* Re: [PATCH net-next-2.6] netfilter: allow nf_tproxy_core module to be removed
From: David Miller @ 2010-06-29  6:38 UTC (permalink / raw)
  To: kaber; +Cc: fw, jpirko, netdev, bazsi, hidden
In-Reply-To: <4C2379D9.2080608@trash.net>

From: Patrick McHardy <kaber@trash.net>
Date: Thu, 24 Jun 2010 17:29:29 +0200

> David Miller wrote:
>> From: Florian Westphal <fw@strlen.de>
>> Date: Wed, 23 Jun 2010 20:46:11 +0200
>>
>>   
>>> tproxy assigns skb->destructor, what prevents module unload while such
>>> skbs may
>>> still be around?
>>>     
>>
>> The only reference to nf_tproxy_core.ko is for the symbol,
>> "nf_tproxy_assign_sock".
>> xt_TPROXY.c, which references this symbol, thus creates a symbol
>> dependency on this
>> module, so xt_TPROXY.o needs to unload before nf_tproxy_core.ko can
>> unload, and
>> xt_TPROXY.o has its own manner for handling module references
>> properly.
>>   
> 
> I don't see anything waiting for skbs in flight using the tproxy
> destructor in either xt_TPROXY or nf_tproxy_core though, so I think
> Florian is correct.

Ok.

^ permalink raw reply

* Re: [PATCH net-next-2.6 2/2] 3c59x: Use fine-grained locks for MII and windowed register access
From: Steffen Klassert @ 2010-06-29  6:39 UTC (permalink / raw)
  To: David Miller; +Cc: ben, netdev, chase.douglas, nordmark
In-Reply-To: <20100628.231812.35040625.davem@davemloft.net>

On Mon, Jun 28, 2010 at 11:18:12PM -0700, David Miller wrote:
> 
> Once Ben posts a new version of this second patch with the
> proper spin_lock_init() calls added I am going to apply both
> of his changes.

Yes, of course apply them. It was just a recommendation to avoid the locks
in the cases they are not needed. These patches are a real improvement,
so I'm fine with them.

Steffen

^ permalink raw reply

* Re: [PATCH v3] act_mirred: don't clone skb when skb isn't shared
From: David Miller @ 2010-06-29  6:38 UTC (permalink / raw)
  To: hadi; +Cc: xiaosuo, netdev
In-Reply-To: <1277469551.5438.0.camel@bigi>

From: jamal <hadi@cyberus.ca>
Date: Fri, 25 Jun 2010 08:39:11 -0400

> On Fri, 2010-06-25 at 10:25 +0800, Changli Gao wrote:
>> don't clone skb when skb isn't shared
>> 
>> When the tcf_action is TC_ACT_STOLEN, and the skb isn't shared, we don't need
>> to clone a new skb. As the skb will be freed after this function returns, we
>> can use it freely once we get a reference to it.
>> 
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> 
> Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>

Applied.

^ permalink raw reply

* Re: [RFC][BUG-FIX] the problem of checksum checking in UDP protocol
From: Shan Wei @ 2010-06-29  6:39 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Ronciak, John, netdev
In-Reply-To: <1277757512.4235.750.camel@edumazet-laptop>

Eric Dumazet wrote, at 06/29/2010 04:38 AM:
> If a change is needed, I would vote for a change in NIC firmware,

Eric, thanks for your explanation very much.

My solution is ugly. Modifying the UDP protocol stack is just a workaround.
What most needs modification is the NIC's firmware. However, cards such as
the e1000e have been in use for years, and no serious problems have been reported.

So, I think this thread can be closed now.

-- 
Best Regards
-----
Shan Wei


^ permalink raw reply

* Re: [PATCH] s2io: read rx_packets count from the hardware stats
From: David Miller @ 2010-06-29  6:52 UTC (permalink / raw)
  To: mschmidt; +Cc: netdev, Ramkrishna.Vepa, sivakumar.subramani, sreenivasa.honnur
In-Reply-To: <20100624233230.5864.67401.stgit@leela.lan>

From: Michal Schmidt <mschmidt@redhat.com>
Date: Fri, 25 Jun 2010 01:32:32 +0200

> Most of the statistics the s2io driver provides in /proc/net/dev
> it reads directly from the hardware counters. For some reason it does
> not do that for rx_packets. It counts rx_packets purely in software.
> 
> A customer reported a bug where in /proc/net/dev the 'multicast' counter
> was increasing faster than 'packets' ( = rx_packets in the source code).
> This confuses userspace, especially snmpd.
> 
> The hardware provides a counter for the total number of received
> frames (RMAC_VLD_FRMS) which the driver can use for the rx_packets
> statistic. By reading both statistics from the hardware it makes sure
> that all multicast frames are included in the total.
> 
> The customer tested a patch like this (only modified for RHEL5) with
> S2io Inc. Xframe II 10Gbps Ethernet (rev 02)
> and it fixed the problem.
> 
> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>

Please also use the rmac_data_octets HW statistic for rx_bytes
otherwise rx_bytes will be out of sync with the other stats too.

^ permalink raw reply

* Re: [PATCH 2.6.35] bonding: prevent netpoll over bonded interfaces
From: David Miller @ 2010-06-29  6:54 UTC (permalink / raw)
  To: andy; +Cc: netdev, amwang, fubar
In-Reply-To: <20100625195044.GQ7497@gospo.rdu.redhat.com>

From: Andy Gospodarek <andy@greyhouse.net>
Date: Fri, 25 Jun 2010 15:50:44 -0400

> 
> Support for netpoll over bonded interfaces was added here:
> 
> 	commit f6dc31a85cd46a959bdd987adad14c3b645e03c1
> 	Author: WANG Cong <amwang@redhat.com>
> 	Date:   Thu May 6 00:48:51 2010 -0700
> 
> 	    bonding: make bonding support netpoll
> 
> but it is bad enough that we should probably just disable netpoll over
> bonding until some of the locking logic in the bonding driver is changed
> or converted completely to RCU.  Simple actions like changing the active
> slave in active-backup mode will hang the box if a high enough printk
> debugging level is enabled.
> 
> Keeping the old code around will be good for anyone that wants to work
> on it (and for after the RCU conversion), so I propose this small patch
> rather than ripping it all out.
> 
> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>

Applied, thanks a lot Andy.

^ permalink raw reply

* Re: [PATCHv2] vhost-net: add dhclient work-around from userspace
From: Michael S. Tsirkin @ 2010-06-29  6:55 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Aristeu Rozanski, Herbert Xu, Juan Quintela, David S. Miller, kvm,
	virtualization, netdev, linux-kernel, ykaul, markmc
In-Reply-To: <1277763581.23755.16.camel@w-sridhar.beaverton.ibm.com>

On Mon, Jun 28, 2010 at 03:19:41PM -0700, Sridhar Samudrala wrote:
> On Mon, 2010-06-28 at 13:08 +0300, Michael S. Tsirkin wrote:
> > Userspace virtio server has the following hack
> > so guests rely on it, and we have to replicate it, too:
> > 
> > Use port number to detect incoming IPv4 DHCP response packets,
> > and fill in the checksum for these.
> > 
> > The issue we are solving is that on linux guests, some apps
> > that use recvmsg with AF_PACKET sockets, don't know how to
> > handle CHECKSUM_PARTIAL;
> > The interface to return the relevant information was added
> > in 8dc4194474159660d7f37c495e3fc3f10d0db8cc,
> > and older userspace does not use it.
> > One important user of recvmsg with AF_PACKET is dhclient,
> > so we add a work-around just for DHCP.
> > 
> > Don't bother applying the hack to IPv6 as userspace virtio does not
> > have a work-around for that - let's hope guests will do the right
> > thing wrt IPv6.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > 
> > Dave, I'm going to put this patch on the vhost tree,
> > no need for you to bother merging it - you'll get
> > it with a pull request.
> > 
> > 
> >  drivers/vhost/net.c |   44 +++++++++++++++++++++++++++++++++++++++++++-
> >  1 files changed, 43 insertions(+), 1 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index cc19595..03bba6a 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -24,6 +24,10 @@
> >  #include <linux/if_tun.h>
> >  #include <linux/if_macvlan.h>
> > 
> > +#include <linux/ip.h>
> > +#include <linux/udp.h>
> > +#include <linux/netdevice.h>
> > +
> >  #include <net/sock.h>
> > 
> >  #include "vhost.h"
> > @@ -186,6 +190,44 @@ static void handle_tx(struct vhost_net *net)
> >  	unuse_mm(net->dev.mm);
> >  }
> > 
> > +static int peek_head(struct sock *sk)
> 
> This routine is doing more than just peeking the head of sk's receive
> queue. Maybe this should be named similarly to what qemu calls
> 'work_around_broken_dhclient()'
> > +{
> > +	struct sk_buff *skb;
> > +
> > +	lock_sock(sk);
> > +	skb = skb_peek(&sk->sk_receive_queue);
> > +	if (unlikely(!skb)) {
> > +		release_sock(sk);
> > +		return 0;
> > +	}
> > +	/* Userspace virtio server has the following hack so
> > +	 * guests rely on it, and we have to replicate it, too: */
> > +	/* Use port number to detect incoming IPv4 DHCP response packets,
> > +	 * and fill in the checksum. */
> > +
> > +	/* The issue we are solving is that on linux guests, some apps
> > +	 * that use recvmsg with AF_PACKET sockets, don't know how to
> > +	 * handle CHECKSUM_PARTIAL;
> > +	 * The interface to return the relevant information was added in
> > +	 * 8dc4194474159660d7f37c495e3fc3f10d0db8cc,
> > +	 * and older userspace does not use it.
> > +	 * One important user of recvmsg with AF_PACKET is dhclient,
> > +	 * so we add a work-around just for DHCP. */
> > +	if (skb->ip_summed == CHECKSUM_PARTIAL &&
> > +	    skb_headlen(skb) >= skb_transport_offset(skb) +
> > +				sizeof(struct udphdr) &&
> > +	    udp_hdr(skb)->dest == htons(68) &&
> > +	    skb_network_header_len(skb) >= sizeof(struct iphdr) &&
> > +	    ip_hdr(skb)->protocol == IPPROTO_UDP &&
> > +	    skb->protocol == htons(ETH_P_IP)) {
> 
> Isn't it more logical to check for skb->protocol, followed by ip_hdr and
> then udp_hdr?


Yes, but then we'll only exit after checking them all.
My way we'll almost always exit after port check.

> > +		skb_checksum_help(skb);
> > +		/* Restore ip_summed value: tun passes it to user. */
> > +		skb->ip_summed = CHECKSUM_PARTIAL;
> > +	}
> > +	release_sock(sk);
> > +	return 1;
> > +}
> > +
> >  /* Expects to be always run from workqueue - which acts as
> >   * read-size critical section for our kind of RCU. */
> >  static void handle_rx(struct vhost_net *net)
> > @@ -222,7 +264,7 @@ static void handle_rx(struct vhost_net *net)
> >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> >  		vq->log : NULL;
> > 
> > -	for (;;) {
> > +	while (peek_head(sock->sk)) {
> >  		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> >  					 ARRAY_SIZE(vq->iov),
> >  					 &out, &in,

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox