Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH RFC 3/5] net: macb: Use devm_ioremap()
From: Nicolas Ferre @ 2013-10-15  7:45 UTC (permalink / raw)
  To: Soren Brinkmann, netdev, David Miller; +Cc: linux-kernel, Michal Simek
In-Reply-To: <1381795140-10792-4-git-send-email-soren.brinkmann@xilinx.com>

On 15/10/2013 01:58, Soren Brinkmann :
> Use the device managed version of ioremap to remap IO memory,
> simplifying error paths.
>
> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>

Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>

> ---
>   drivers/net/ethernet/cadence/macb.c | 8 +++-----
>   1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
> index 62aa136889a4..436aecc31732 100644
> --- a/drivers/net/ethernet/cadence/macb.c
> +++ b/drivers/net/ethernet/cadence/macb.c
> @@ -17,6 +17,7 @@
>   #include <linux/circ_buf.h>
>   #include <linux/slab.h>
>   #include <linux/init.h>
> +#include <linux/io.h>
>   #include <linux/gpio.h>
>   #include <linux/interrupt.h>
>   #include <linux/netdevice.h>
> @@ -1816,7 +1817,7 @@ static int __init macb_probe(struct platform_device *pdev)
>   		goto err_out_disable_pclk;
>   	}
>
> -	bp->regs = ioremap(regs->start, resource_size(regs));
> +	bp->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
>   	if (!bp->regs) {
>   		dev_err(&pdev->dev, "failed to map registers, aborting.\n");
>   		err = -ENOMEM;
> @@ -1828,7 +1829,7 @@ static int __init macb_probe(struct platform_device *pdev)
>   	if (err) {
>   		dev_err(&pdev->dev, "Unable to request IRQ %d (error %d)\n",
>   			dev->irq, err);
> -		goto err_out_iounmap;
> +		goto err_out_disable_clocks;
>   	}
>
>   	dev->netdev_ops = &macb_netdev_ops;
> @@ -1916,8 +1917,6 @@ err_out_unregister_netdev:
>   	unregister_netdev(dev);
>   err_out_free_irq:
>   	free_irq(dev->irq, dev);
> -err_out_iounmap:
> -	iounmap(bp->regs);
>   err_out_disable_clocks:
>   	clk_disable_unprepare(bp->hclk);
>   err_out_disable_pclk:
> @@ -1944,7 +1943,6 @@ static int __exit macb_remove(struct platform_device *pdev)
>   		mdiobus_free(bp->mii_bus);
>   		unregister_netdev(dev);
>   		free_irq(dev->irq, dev);
> -		iounmap(bp->regs);
>   		clk_disable_unprepare(bp->hclk);
>   		clk_disable_unprepare(bp->pclk);
>   		free_netdev(dev);
>


-- 
Nicolas Ferre

^ permalink raw reply

* Re: [PATCH RFC 4/5] net: macb: Use devm_request_irq()
From: Nicolas Ferre @ 2013-10-15  7:46 UTC (permalink / raw)
  To: Soren Brinkmann, netdev, David Miller; +Cc: linux-kernel, Michal Simek
In-Reply-To: <1381795140-10792-5-git-send-email-soren.brinkmann@xilinx.com>

On 15/10/2013 01:58, Soren Brinkmann :
> Use the device managed interface to request the IRQ, simplifying error
> paths.
>
> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>

Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>

> ---
>   drivers/net/ethernet/cadence/macb.c | 8 +++-----
>   1 file changed, 3 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
> index 436aecc31732..603844b1d483 100644
> --- a/drivers/net/ethernet/cadence/macb.c
> +++ b/drivers/net/ethernet/cadence/macb.c
> @@ -1825,7 +1825,8 @@ static int __init macb_probe(struct platform_device *pdev)
>   	}
>
>   	dev->irq = platform_get_irq(pdev, 0);
> -	err = request_irq(dev->irq, macb_interrupt, 0, dev->name, dev);
> +	err = devm_request_irq(&pdev->dev, dev->irq, macb_interrupt, 0,
> +			dev->name, dev);
>   	if (err) {
>   		dev_err(&pdev->dev, "Unable to request IRQ %d (error %d)\n",
>   			dev->irq, err);
> @@ -1892,7 +1893,7 @@ static int __init macb_probe(struct platform_device *pdev)
>   	err = register_netdev(dev);
>   	if (err) {
>   		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
> -		goto err_out_free_irq;
> +		goto err_out_disable_clocks;
>   	}
>
>   	err = macb_mii_init(bp);
> @@ -1915,8 +1916,6 @@ static int __init macb_probe(struct platform_device *pdev)
>
>   err_out_unregister_netdev:
>   	unregister_netdev(dev);
> -err_out_free_irq:
> -	free_irq(dev->irq, dev);
>   err_out_disable_clocks:
>   	clk_disable_unprepare(bp->hclk);
>   err_out_disable_pclk:
> @@ -1942,7 +1941,6 @@ static int __exit macb_remove(struct platform_device *pdev)
>   		kfree(bp->mii_bus->irq);
>   		mdiobus_free(bp->mii_bus);
>   		unregister_netdev(dev);
> -		free_irq(dev->irq, dev);
>   		clk_disable_unprepare(bp->hclk);
>   		clk_disable_unprepare(bp->pclk);
>   		free_netdev(dev);
>


-- 
Nicolas Ferre

^ permalink raw reply

* Re: [PATCH RFC 5/5] net: macb: Adjust tx_clk when link speed changes
From: Nicolas Ferre @ 2013-10-15  7:54 UTC (permalink / raw)
  To: Soren Brinkmann, netdev, David Miller; +Cc: linux-kernel, Michal Simek
In-Reply-To: <1381795140-10792-6-git-send-email-soren.brinkmann@xilinx.com>

On 15/10/2013 01:59, Soren Brinkmann :
> Adjust the ethernet clock according to the negotiated link speed.
>
> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>

I will need more time to study this one.

Moreover, I will have to add the "tx_clk" to every user of this driver
before switching to the addition of this clock.

Best regards,

> ---
>   drivers/net/ethernet/cadence/macb.c | 66 +++++++++++++++++++++++++++++++++++++
>   drivers/net/ethernet/cadence/macb.h |  1 +
>   2 files changed, 67 insertions(+)
>
> diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
> index 603844b1d483..beb9fa863811 100644
> --- a/drivers/net/ethernet/cadence/macb.c
> +++ b/drivers/net/ethernet/cadence/macb.c
> @@ -204,6 +204,49 @@ static int macb_mdio_reset(struct mii_bus *bus)
>   	return 0;
>   }
>
> +/**
> + * macb_set_tx_clk() - Set a clock to a new frequency
> + * @clk		Pointer to the clock to change
> + * @rate	New frequency in Hz
> + * @dev		Pointer to the struct net_device
> + */
> +static void macb_set_tx_clk(struct clk *clk, int speed, struct net_device *dev)
> +{
> +	long ferr;
> +	long rate;
> +	long rate_rounded;
> +
> +	switch (speed) {
> +	case SPEED_10:
> +		rate = 2500000;
> +		break;
> +	case SPEED_100:
> +		rate = 25000000;
> +		break;
> +	case SPEED_1000:
> +		rate = 125000000;
> +		break;
> +	default:
> +		break;
> +	}
> +
> +	rate_rounded = clk_round_rate(clk, rate);
> +	if (rate_rounded < 0)
> +		return;
> +
> +	/* RGMII allows 50 ppm frequency error. Test and warn if this limit
> +	 * are not satisfied.
> +	 */
> +	ferr = abs(rate_rounded - rate);
> +	ferr = DIV_ROUND_UP(ferr, rate / 100000);
> +	if (ferr > 5)
> +		netdev_warn(dev, "unable to generate target frequency: %ld Hz\n",
> +				rate);
> +
> +	if (clk_set_rate(clk, rate_rounded))
> +		netdev_err(dev, "adjusting tx_clk failed.\n");
> +}
> +
>   static void macb_handle_link_change(struct net_device *dev)
>   {
>   	struct macb *bp = netdev_priv(dev);
> @@ -251,6 +294,9 @@ static void macb_handle_link_change(struct net_device *dev)
>
>   	spin_unlock_irqrestore(&bp->lock, flags);
>
> +	if (!IS_ERR(bp->tx_clk))
> +		macb_set_tx_clk(bp->tx_clk, phydev->speed, dev);
> +
>   	if (status_change) {
>   		if (phydev->link) {
>   			netif_carrier_on(dev);
> @@ -1805,6 +1851,8 @@ static int __init macb_probe(struct platform_device *pdev)
>   		goto err_out_free_dev;
>   	}
>
> +	bp->tx_clk = devm_clk_get(&pdev->dev, "tx_clk");
> +
>   	err = clk_prepare_enable(bp->pclk);
>   	if (err) {
>   		dev_err(&pdev->dev, "failed to enable pclk (%u)\n", err);
> @@ -1817,6 +1865,15 @@ static int __init macb_probe(struct platform_device *pdev)
>   		goto err_out_disable_pclk;
>   	}
>
> +	if (!IS_ERR(bp->tx_clk)) {
> +		err = clk_prepare_enable(bp->tx_clk);
> +		if (err) {
> +			dev_err(&pdev->dev, "failed to enable tx_clk (%u)\n",
> +					err);
> +			goto err_out_disable_hclk;
> +		}
> +	}
> +
>   	bp->regs = devm_ioremap(&pdev->dev, regs->start, resource_size(regs));
>   	if (!bp->regs) {
>   		dev_err(&pdev->dev, "failed to map registers, aborting.\n");
> @@ -1917,6 +1974,9 @@ static int __init macb_probe(struct platform_device *pdev)
>   err_out_unregister_netdev:
>   	unregister_netdev(dev);
>   err_out_disable_clocks:
> +	if (!IS_ERR(bp->tx_clk))
> +		clk_disable_unprepare(bp->tx_clk);
> +err_out_disable_hclk:
>   	clk_disable_unprepare(bp->hclk);
>   err_out_disable_pclk:
>   	clk_disable_unprepare(bp->pclk);
> @@ -1941,6 +2001,8 @@ static int __exit macb_remove(struct platform_device *pdev)
>   		kfree(bp->mii_bus->irq);
>   		mdiobus_free(bp->mii_bus);
>   		unregister_netdev(dev);
> +		if (!IS_ERR(bp->tx_clk))
> +			clk_disable_unprepare(bp->tx_clk);
>   		clk_disable_unprepare(bp->hclk);
>   		clk_disable_unprepare(bp->pclk);
>   		free_netdev(dev);
> @@ -1959,6 +2021,8 @@ static int macb_suspend(struct device *dev)
>   	netif_carrier_off(netdev);
>   	netif_device_detach(netdev);
>
> +	if (!IS_ERR(bp->tx_clk))
> +		clk_disable_unprepare(bp->tx_clk);
>   	clk_disable_unprepare(bp->hclk);
>   	clk_disable_unprepare(bp->pclk);
>
> @@ -1973,6 +2037,8 @@ static int macb_resume(struct device *dev)
>
>   	clk_prepare_enable(bp->pclk);
>   	clk_prepare_enable(bp->hclk);
> +	if (!IS_ERR(bp->tx_clk))
> +		clk_prepare_enable(bp->tx_clk);
>
>   	netif_device_attach(netdev);
>
> diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
> index f4076155bed7..51c02442160a 100644
> --- a/drivers/net/ethernet/cadence/macb.h
> +++ b/drivers/net/ethernet/cadence/macb.h
> @@ -572,6 +572,7 @@ struct macb {
>   	struct platform_device	*pdev;
>   	struct clk		*pclk;
>   	struct clk		*hclk;
> +	struct clk		*tx_clk;
>   	struct net_device	*dev;
>   	struct napi_struct	napi;
>   	struct work_struct	tx_error_task;
>


-- 
Nicolas Ferre

^ permalink raw reply

* Re: [PATCH RFC 5/5] net: macb: Adjust tx_clk when link speed changes
From: Michal Simek @ 2013-10-15  7:58 UTC (permalink / raw)
  To: Nicolas Ferre
  Cc: Soren Brinkmann, netdev, David Miller, linux-kernel, Michal Simek
In-Reply-To: <525CF4AD.1070304@atmel.com>

On 10/15/2013 09:54 AM, Nicolas Ferre wrote:
> On 15/10/2013 01:59, Soren Brinkmann :
>> Adjust the ethernet clock according to the negotiated link speed.
>>
>> Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
> 
> I will need more time to study this one.
> 
> Moreover, I will have to add the "tx_clk" to every user of this driver before switching to the addition of this clock.

As I read this patch, Soren has already handled this case:
if this clk is not specified, then it is not used.

But anyway feel free to take more time to study it.

If there is a device-tree binding then it should be extended
with this optional value.

Thanks,
Michal

^ permalink raw reply

* [PATCH net-next v2 3/3] bonding: add rtnl lock and remove read lock for bond sysfs
From: Ding Tianhong @ 2013-10-15  8:28 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

bond_for_each_slave() will no longer be protected by read_lock(),
only by rtnl_lock(), so read_lock() needs to be replaced
with rtnl_lock().

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
---
 drivers/net/bonding/bond_sysfs.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index e06c644..2ba1114 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -179,7 +179,9 @@ static ssize_t bonding_show_slaves(struct device *d,
 	struct slave *slave;
 	int res = 0;
 
-	read_lock(&bond->lock);
+	if (!rtnl_trylock())
+		return restart_syscall();
+
 	bond_for_each_slave(bond, slave, iter) {
 		if (res > (PAGE_SIZE - IFNAMSIZ)) {
 			/* not enough space for another interface name */
@@ -190,7 +192,9 @@ static ssize_t bonding_show_slaves(struct device *d,
 		}
 		res += sprintf(buf + res, "%s ", slave->dev->name);
 	}
-	read_unlock(&bond->lock);
+
+	rtnl_unlock();
+
 	if (res)
 		buf[res-1] = '\n'; /* eat the leftover space */
 
@@ -628,6 +632,9 @@ static ssize_t bonding_store_arp_targets(struct device *d,
 	unsigned long *targets_rx;
 	int ind, i, j, ret = -EINVAL;
 
+	if (!rtnl_trylock())
+		return restart_syscall();
+
 	targets = bond->params.arp_targets;
 	newtarget = in_aton(buf + 1);
 	/* look for adds */
@@ -701,6 +708,7 @@ static ssize_t bonding_store_arp_targets(struct device *d,
 
 	ret = count;
 out:
+	rtnl_unlock();
 	return ret;
 }
 static DEVICE_ATTR(arp_ip_target, S_IRUGO | S_IWUSR , bonding_show_arp_targets, bonding_store_arp_targets);
@@ -1469,7 +1477,6 @@ static ssize_t bonding_show_queue_id(struct device *d,
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	read_lock(&bond->lock);
 	bond_for_each_slave(bond, slave, iter) {
 		if (res > (PAGE_SIZE - IFNAMSIZ - 6)) {
 			/* not enough space for another interface_name:queue_id pair */
@@ -1481,9 +1488,9 @@ static ssize_t bonding_show_queue_id(struct device *d,
 		res += sprintf(buf + res, "%s:%d ",
 			       slave->dev->name, slave->queue_id);
 	}
-	read_unlock(&bond->lock);
 	if (res)
 		buf[res-1] = '\n'; /* eat the leftover space */
+
 	rtnl_unlock();
 
 	return res;
@@ -1532,8 +1539,6 @@ static ssize_t bonding_store_queue_id(struct device *d,
 	if (!sdev)
 		goto err_no_cmd;
 
-	read_lock(&bond->lock);
-
 	/* Search for thes slave and check for duplicate qids */
 	update_slave = NULL;
 	bond_for_each_slave(bond, slave, iter) {
@@ -1544,23 +1549,20 @@ static ssize_t bonding_store_queue_id(struct device *d,
 			 */
 			update_slave = slave;
 		else if (qid && qid == slave->queue_id) {
-			goto err_no_cmd_unlock;
+			goto err_no_cmd;
 		}
 	}
 
 	if (!update_slave)
-		goto err_no_cmd_unlock;
+		goto err_no_cmd;
 
 	/* Actually set the qids for the slave */
 	update_slave->queue_id = qid;
 
-	read_unlock(&bond->lock);
 out:
 	rtnl_unlock();
 	return ret;
 
-err_no_cmd_unlock:
-	read_unlock(&bond->lock);
 err_no_cmd:
 	pr_info("invalid input for queue_id set for %s.\n",
 		bond->dev->name);
@@ -1593,6 +1595,9 @@ static ssize_t bonding_store_slaves_active(struct device *d,
 	struct list_head *iter;
 	struct slave *slave;
 
+	if (!rtnl_trylock())
+		return restart_syscall();
+
 	if (sscanf(buf, "%d", &new_value) != 1) {
 		pr_err("%s: no all_slaves_active value specified.\n",
 		       bond->dev->name);
@@ -1612,7 +1617,6 @@ static ssize_t bonding_store_slaves_active(struct device *d,
 		goto out;
 	}
 
-	read_lock(&bond->lock);
 	bond_for_each_slave(bond, slave, iter) {
 		if (!bond_is_active_slave(slave)) {
 			if (new_value)
@@ -1621,8 +1625,8 @@ static ssize_t bonding_store_slaves_active(struct device *d,
 				slave->inactive = 1;
 		}
 	}
-	read_unlock(&bond->lock);
 out:
+	rtnl_unlock();
 	return ret;
 }
 static DEVICE_ATTR(all_slaves_active, S_IRUGO | S_IWUSR,
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH net-next v2 0/3] bonding: patchset for rcu use in bonding
From: Ding Tianhong @ 2013-10-15  8:28 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

Hi:

The patch set converts the xmit path of the 3ad and alb modes to use the RCU lock.
Add rtnl lock and remove read lock for bond sysfs.

v2: because bond_for_each_slave_rcu without rcu_read_lock() triggers a warning,
add a new function for the alb xmit path to avoid the warning.

Ding Tianhong (3):
Wang Yufen (1):
Yang Yingliang (1):
  bonding: use RCU protection for 3ad xmit path
  bonding: use RCU protection for alb xmit path
  bonding: add rtnl lock and remove read lock for bond sysfs

 drivers/net/bonding/bond_3ad.c   | 10 +++----
 drivers/net/bonding/bond_alb.c   | 58 +++++++++++++++++++++++++++++-----------
 drivers/net/bonding/bond_sysfs.c | 30 ++++++++++++---------
 drivers/net/bonding/bonding.h    | 14 ++++++++++
 4 files changed, 78 insertions(+), 34 deletions(-)

-- 
1.8.2.1

^ permalink raw reply

* [PATCH net-next v2 1/3] bonding: use RCU protection for 3ad xmit path
From: Ding Tianhong @ 2013-10-15  8:28 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

The commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
(bonding: initial RCU conversion) has converted the roundrobin,
active-backup, broadcast and xor xmit paths to RCU protection;
the performance will be better for these modes, so this time,
convert the xmit path for 3ad mode.

Suggested-by: Nikolay Aleksandrov <nikolay@redhat.com>
Suggested-by: Veaceslav Falico <vfalico@redhat.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
---
 drivers/net/bonding/bond_3ad.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index ea3e64e..187b1b7 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2344,7 +2344,7 @@ int __bond_3ad_get_active_agg_info(struct bonding *bond,
 	struct slave *slave;
 	struct port *port;
 
-	bond_for_each_slave(bond, slave, iter) {
+	bond_for_each_slave_rcu(bond, slave, iter) {
 		port = &(SLAVE_AD_INFO(slave).port);
 		if (port->aggregator && port->aggregator->is_active) {
 			aggregator = port->aggregator;
@@ -2369,9 +2369,9 @@ int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info)
 {
 	int ret;
 
-	read_lock(&bond->lock);
+	rcu_read_lock();
 	ret = __bond_3ad_get_active_agg_info(bond, ad_info);
-	read_unlock(&bond->lock);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -2388,7 +2388,6 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 	int res = 1;
 	int agg_id;
 
-	read_lock(&bond->lock);
 	if (__bond_3ad_get_active_agg_info(bond, &ad_info)) {
 		pr_debug("%s: Error: __bond_3ad_get_active_agg_info failed\n",
 			 dev->name);
@@ -2406,7 +2405,7 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 	slave_agg_no = bond_xmit_hash(bond, skb, slaves_in_agg);
 	first_ok_slave = NULL;
 
-	bond_for_each_slave(bond, slave, iter) {
+	bond_for_each_slave_rcu(bond, slave, iter) {
 		agg = SLAVE_AD_INFO(slave).port.aggregator;
 		if (!agg || agg->aggregator_identifier != agg_id)
 			continue;
@@ -2436,7 +2435,6 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 		res = bond_dev_queue_xmit(bond, skb, first_ok_slave->dev);
 
 out:
-	read_unlock(&bond->lock);
 	if (res) {
 		/* no suitable interface, frame not sent */
 		kfree_skb(skb);
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH net-next v2 2/3] bonding: use RCU protection for alb xmit path
From: Ding Tianhong @ 2013-10-15  8:28 UTC (permalink / raw)
  To: Jay Vosburgh, Andy Gospodarek, David S. Miller,
	Nikolay Aleksandrov, Veaceslav Falico, Netdev

The commit 278b20837511776dc9d5f6ee1c7fabd5479838bb
(bonding: initial RCU conversion) has converted the roundrobin,
active-backup, broadcast and xor xmit paths to RCU protection;
the performance will be better for these modes, so this time,
convert the xmit path for alb mode.

Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
Cc: Veaceslav Falico <vfalico@redhat.com>
---
 drivers/net/bonding/bond_alb.c | 58 +++++++++++++++++++++++++++++++-----------
 drivers/net/bonding/bonding.h  | 14 ++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 576ccea..0287240 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -230,7 +230,7 @@ static struct slave *tlb_get_least_loaded_slave(struct bonding *bond)
 	max_gap = LLONG_MIN;
 
 	/* Find the slave with the largest gap */
-	bond_for_each_slave(bond, slave, iter) {
+	bond_for_each_slave_rcu(bond, slave, iter) {
 		if (SLAVE_IS_OK(slave)) {
 			long long gap = compute_gap(slave);
 
@@ -412,6 +412,39 @@ static struct slave *rlb_next_rx_slave(struct bonding *bond)
 	return rx_slave;
 }
 
+/* Caller must hold rcu_read_lock() for read */
+static struct slave *__rlb_next_rx_slave(struct bonding *bond)
+{
+	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
+	struct slave *before = NULL, *rx_slave = NULL, *slave;
+	struct list_head *iter;
+	bool found = false;
+
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		if (!SLAVE_IS_OK(slave))
+			continue;
+		if (!found) {
+			if (!before || before->speed < slave->speed)
+				before = slave;
+		} else {
+			if (!rx_slave || rx_slave->speed < slave->speed)
+				rx_slave = slave;
+		}
+		if (slave == bond_info->rx_slave)
+			found = true;
+	}
+	/* we didn't find anything after the current or we have something
+	 * better before and up to the current slave
+	 */
+	if (!rx_slave || (before && rx_slave->speed < before->speed))
+		rx_slave = before;
+
+	if (rx_slave)
+		bond_info->rx_slave = rx_slave;
+
+	return rx_slave;
+}
+
 /* teach the switch the mac of a disabled slave
  * on the primary for fault tolerance
  *
@@ -628,12 +661,14 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct arp_pkt *arp = arp_pkt(skb);
-	struct slave *assigned_slave;
+	struct slave *assigned_slave, *curr_active_slave;
 	struct rlb_client_info *client_info;
 	u32 hash_index = 0;
 
 	_lock_rx_hashtbl(bond);
 
+	curr_active_slave = rcu_dereference(bond->curr_active_slave);
+
 	hash_index = _simple_hash((u8 *)&arp->ip_dst, sizeof(arp->ip_dst));
 	client_info = &(bond_info->rx_hashtbl[hash_index]);
 
@@ -658,14 +693,14 @@ static struct slave *rlb_choose_channel(struct sk_buff *skb, struct bonding *bon
 			 * that the new client can be assigned to this entry.
 			 */
 			if (bond->curr_active_slave &&
-			    client_info->slave != bond->curr_active_slave) {
-				client_info->slave = bond->curr_active_slave;
+			    client_info->slave != curr_active_slave) {
+				client_info->slave = curr_active_slave;
 				rlb_update_client(client_info);
 			}
 		}
 	}
 	/* assign a new slave */
-	assigned_slave = rlb_next_rx_slave(bond);
+	assigned_slave = __rlb_next_rx_slave(bond);
 
 	if (assigned_slave) {
 		if (!(client_info->assigned &&
@@ -728,7 +763,7 @@ static struct slave *rlb_arp_xmit(struct sk_buff *skb, struct bonding *bond)
 	/* Don't modify or load balance ARPs that do not originate locally
 	 * (e.g.,arrive via a bridge).
 	 */
-	if (!bond_slave_has_mac(bond, arp->mac_src))
+	if (!bond_slave_has_mac_rcu(bond, arp->mac_src))
 		return NULL;
 
 	if (arp->op_code == htons(ARPOP_REPLY)) {
@@ -1343,11 +1378,6 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	skb_reset_mac_header(skb);
 	eth_data = eth_hdr(skb);
 
-	/* make sure that the curr_active_slave do not change during tx
-	 */
-	read_lock(&bond->lock);
-	read_lock(&bond->curr_slave_lock);
-
 	switch (ntohs(skb->protocol)) {
 	case ETH_P_IP: {
 		const struct iphdr *iph = ip_hdr(skb);
@@ -1429,12 +1459,12 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 
 	if (!tx_slave) {
 		/* unbalanced or unassigned, send through primary */
-		tx_slave = bond->curr_active_slave;
+		tx_slave = rcu_dereference(bond->curr_active_slave);
 		bond_info->unbalanced_load += skb->len;
 	}
 
 	if (tx_slave && SLAVE_IS_OK(tx_slave)) {
-		if (tx_slave != bond->curr_active_slave) {
+		if (tx_slave != rcu_dereference(bond->curr_active_slave)) {
 			memcpy(eth_data->h_source,
 			       tx_slave->dev->dev_addr,
 			       ETH_ALEN);
@@ -1449,8 +1479,6 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		}
 	}
 
-	read_unlock(&bond->curr_slave_lock);
-	read_unlock(&bond->lock);
 	if (res) {
 		/* no suitable interface, frame not sent */
 		kfree_skb(skb);
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 0bd04fb..3c3076e 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -464,6 +464,20 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond,
 	return NULL;
 }
 
+/* Caller must hold rcu_read_lock() for read */
+static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond,
+					       const u8 *mac)
+{
+	struct list_head *iter;
+	struct slave *tmp;
+
+	bond_for_each_slave_rcu(bond, tmp, iter)
+		if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
+			return tmp;
+
+	return NULL;
+}
+
 /* Check if the ip is present in arp ip list, or first free slot if ip == 0
  * Returns -1 if not found, index if found
  */
-- 
1.8.2.1

^ permalink raw reply related

* Re: [PATCH ipsec] xfrm: prevent ipcomp scratch buffer race condition
From: Steffen Klassert @ 2013-10-15  8:33 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: Herbert Xu, David S. Miller, netdev
In-Reply-To: <20131014160334.BCCDDE8A31@unicorn.suse.cz>

On Mon, Oct 14, 2013 at 06:03:34PM +0200, Michal Kubecek wrote:
> In ipcomp_compress(), softirq is enabled too early, allowing the
> per-cpu scratch buffer to be rewritten by ipcomp_decompress()
> (called on the same CPU in softirq context) between populating
> the buffer and copying the compressed data to the skb.
> 
> Add similar protection into ipcomp_decompress() as it can be
> called from process context as well (even if such scenario seems
> a bit artificial).
> 
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> ---
>  net/xfrm/xfrm_ipcomp.c | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
> index 2906d52..96946fb 100644
> --- a/net/xfrm/xfrm_ipcomp.c
> +++ b/net/xfrm/xfrm_ipcomp.c
> @@ -48,9 +48,11 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
>  	const int cpu = get_cpu();
>  	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
>  	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
> -	int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
> +	int err;
>  	int len;
>  
> +	local_bh_disable();

Maybe we could disable the BHs before we fetch the percpu pointers.
Then we can use smp_processor_id() to get the cpu. With that we
could get rid of a (now useless) preempt_disable()/preempt_enable()
pair. Same could be done in ipcomp_compress().

Looks ok otherwise. Thanks!

^ permalink raw reply

* Re: DomU's network interface will hung when Dom0 running 32bit
From: Ian Campbell @ 2013-10-15  8:43 UTC (permalink / raw)
  To: jianhai luan; +Cc: Wei Liu, xen-devel, netdev
In-Reply-To: <525CAC21.5040202@oracle.com>

On Tue, 2013-10-15 at 10:44 +0800, jianhai luan wrote:
> On 2013-10-14 19:19, Wei Liu wrote:
> > On Sat, Oct 12, 2013 at 04:53:18PM +0800, jianhai luan wrote:
> >> Hi Ian,
> >>    I meet the DomU's network interface hung issue recently, and have
> >> been working on the issue from that time. I find that DomU's network
> >> interface, which send lesser package, will hung if Dom0 running
> >> 32bit and DomU's up-time is very long.  I think that one jiffies
> >> overflow bug exist in the function tx_credit_exceeded().
> >>    I know the inline function time_after_eq(a,b) will process jiffies
> >> overflow, but the function have one limit a should little that (b +
> >> MAX_SIGNAL_LONG). If a large than the value, time_after_eq will
> >> return false. The MAX_SINGNAL_LONG should be 0x7fffffff at 32-bit
> >> machine.
> >>    If DomU's network interface send lesser package (<0.5k/s if
> >> jiffies=250 and credit_bytes=ULONG_MAX), jiffies will beyond out
> >> (credit_timeout.expires + MAX_SIGNAL_LONG) and time_after_eq(now,
> >> next_credit) will failure (should be true). So one timer which will
> >> not be trigger in short time, and later process will be aborted when
> >> timer_pending(&vif->credit_timeout) is true. The result will be
> >> DomU's network interface will be hung in long time (> 40days).
> >>    Please think about the below scenario:
> >>    Condition:
> >>      Dom0 running 32-bit and HZ = 1000
> >>      vif->credit_timeout->expire = 0xffffffff, vif->remaining_credit
> >> = 0xffffffff, vif->credit_usec=0 jiffies=0
> >>      vif receive lesser package (DomU send lesser package). If the
> >> value is litter than 2K/s, consume 4G(0xffffffff) will need 582.55
> >> hours. jiffies will large than 0x7ffffff. we guess jiffies =
> >> 0x800000ff, time_after_eq(0x800000ff, 0xffffffff) will failure, and
> >> one time which expire is 0xfffffff will be pended into system. So
> >> the interface will hung until jiffies recount 0xffffffff (that will
> >> need very long time).
> > If I'm not mistaken you meant time_after_eq(now, next_credit) in
> > netback. How does next_credit become 0xffffffff?
> 
> I only assume the value is 0xfffffff, and the value of next_credit 
> isn't  point. If the delta between now and next_credit larger than 
> ULONG_MAX, time_after_eq will do wrong judge.

So it sounds like we need a timer which is independent of the traffic
being sent to keep credit_timeout.expires rolling over.

Can you propose a patch?

Ian.

> >
> > Wei.
> >
> >>    If some error exist in above explain, please help me point it out.
> >>
> >> Thanks,
> >> Jason
> 

^ permalink raw reply

* Re: kernel policy routing table src ip not respected since 2.6.37 and commit 9fc3bbb4a752
From: Julian Anastasov @ 2013-10-15  8:51 UTC (permalink / raw)
  To: Vincent Li; +Cc: netdev@vger.kernel.org, jsing
In-Reply-To: <CAK3+h2zGFnbC-hpPB8fPX7TC3rKCWbPYVY4WW4HDYbN+LXNXXw@mail.gmail.com>


	Hello,

On Mon, 14 Oct 2013, Vincent Li wrote:

> I had a simple bash script to test if the policy routing table src ip
> is respected or not, git bisect found the  commit 9fc3bbb4a752 to
> change the policy routing table source ip behavior.
> 
> commit 9fc3bbb4a752f108cf096d96640f3b548bbbce6c
> Author: Joel Sing <jsing@google.com>
> Date:   Mon Jan 3 20:24:20 2011 +0000
> 
>     ipv4/route.c: respect prefsrc for local routes
> 
>     The preferred source address is currently ignored for local routes,
>     which results in all local connections having a src address that is the
>     same as the local dst address. Fix this by respecting the preferred source
>     address when it is provided for local routes.
> 
> test script:
> 
> #!/bin/bash
> ip addr add 10.1.1.1/24 dev eth0
> ip addr add 10.1.1.2/24 dev eth0
> ip rule add priority 245 table 245
> ip route add 10.1.1.0/24 dev eth0  proto kernel  scope link  src
> 10.1.1.2 table 245 <===source ip 10.1.1.2 to be preferred
> 
> ip addr show dev eth0
> ip route list table main
> ip route list table 245
> 
> 
> tcpdump -nn -i eth0 host 10.1.1.9 and icmp &
> 
> ping 10.1.1.9
> 
> 
> 
> --before commit 9fc3bbb4a752
> 
> the source is from ip 10.1.1.2 as expected
> 
> --after commit 9fc3bbb4a752
> 
> the source is from ip 10.1.1.1 which not expected since I have high
> priority table 245 with source ip 10.1.1.2
> 
> is this regression of commit 9fc3bbb4a752 ?

	Hm, it works here on 3.11.3. The ARP request uses
10.1.1.2 and the ICMP packet has that source. Maybe it is something with
the ping tool you are using? Check 'strace ping -c 1 10.1.1.9'; maybe
it binds to the first device IP?

Regards

--
Julian Anastasov <ja@ssi.bg>

^ permalink raw reply

* Re: [PATCH ipsec] xfrm: prevent ipcomp scratch buffer race condition
From: Fan Du @ 2013-10-15  8:59 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Michal Kubecek, Herbert Xu, David S. Miller, netdev
In-Reply-To: <20131015083348.GW7660@secunet.com>



On 2013年10月15日 16:33, Steffen Klassert wrote:
> On Mon, Oct 14, 2013 at 06:03:34PM +0200, Michal Kubecek wrote:
>> In ipcomp_compress(), softirq is enabled too early, allowing the
>> per-cpu scratch buffer to be rewritten by ipcomp_decompress()
>> (called on the same CPU in softirq context) between populating
>> the buffer and copying the compressed data to the skb.
>>
>> Add similar protection into ipcomp_decompress() as it can be
>> called from process context as well (even if such scenario seems
>> a bit artificial).
>>
>> Signed-off-by: Michal Kubecek<mkubecek@suse.cz>
>> ---
>>   net/xfrm/xfrm_ipcomp.c | 8 ++++++--
>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
>> index 2906d52..96946fb 100644
>> --- a/net/xfrm/xfrm_ipcomp.c
>> +++ b/net/xfrm/xfrm_ipcomp.c
>> @@ -48,9 +48,11 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
>>   	const int cpu = get_cpu();
>>   	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
>>   	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
>> -	int err = crypto_comp_decompress(tfm, start, plen, scratch,&dlen);
>> +	int err;
>>   	int len;
>>
>> +	local_bh_disable();
>
> Maybe we could disable the BHs before we fetch the percpu pointers.
> Then we can use smp_processor_id() to get the cpu. With that we
> could get rid of a (now useless) preempt_disable()/preempt_enable()
> pair. Same could be done in ipcomp_compress().

Is it possible for two tasks to race on the scratch buffer when both of them are trying to
compress data without preemption disabled? For example, while task A is working on compression,
task B with higher priority preempts task A and touches the scratch buffer, which leaves stale
data for task A afterwards.

I think we need preemption disabled for such a case, unless I have overlooked code somewhere else.

> Looks ok otherwise. Thanks!
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply

* Re: [PATCHv2 RESEND] {xfrm, sctp} Stick to software crc32 even if hardware is capable of that
From: Fan Du @ 2013-10-15  9:17 UTC (permalink / raw)
  To: Vlad Yasevich; +Cc: Daniel Borkmann, nhorman, steffen.klassert, davem, netdev
In-Reply-To: <fed034cb-be28-488b-883f-627e8e7499b9@email.android.com>



On 2013年10月14日 22:16, Vlad Yasevich wrote:
>
>
> Fan Du<fan.du@windriver.com>  wrote:
>
>>
>>
>> On 2013年10月14日 16:07, Daniel Borkmann wrote:
>>> On 10/14/2013 09:27 AM, Fan Du wrote:
>>>> igb/ixgbe have hardware sctp checksum support, when this feature is
>> enabled
>>>> and also IPsec is armed to protect sctp traffic, ugly things
>> happened as
>>>> xfrm_output checks CHECKSUM_PARTIAL to do check sum operation(sum
>> every thing
>>>> up and pack the 16bits result in the checksum field). The result is
>> fail
>>>> establishment of sctp communication.
>>>>
>>>> Signed-off-by: Fan Du<fan.du@windriver.com>
>>>> Cc: Vlad Yasevich<vyasevich@gmail.com>
>>>> Cc: Neil Horman<nhorman@tuxdriver.com>
>>>> Cc: Steffen Klassert<steffen.klassert@secunet.com>
>>>> Acked-by: Vlad Yasevich<vyasevich@gmail.com>
>>>> ---
>>>>    net/sctp/output.c |   14 +++++++++++++-
>>>>    1 file changed, 13 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/net/sctp/output.c b/net/sctp/output.c
>>>> index 0ac3a65..6de6402 100644
>>>> --- a/net/sctp/output.c
>>>> +++ b/net/sctp/output.c
>>>> @@ -372,6 +372,16 @@ static void sctp_packet_set_owner_w(struct
>> sk_buff *skb, struct sock *sk)
>>>>        atomic_inc(&sk->sk_wmem_alloc);
>>>>    }
>>>>
>>>> +static int is_xfrm_armed(struct dst_entry *dst)
>>>> +{
>>>> +#ifdef CONFIG_XFRM
>>>> +    /* If dst->xfrm is valid, this skb needs to be transformed */
>>>> +    return dst->xfrm != NULL;
>>>> +#else
>>>> +    return 0;
>>>> +#endif
>>>> +}
>>>
>>> Instead of putting this into SCTP code, isn't the above rather a
>> candidate for
>>> include/net/xfrm.h, e.g. as ... bool xfrm_is_armed(...) ?
>>
>> Should be in such style in terms of its name, but this is truly SCTP
>> specific in this scenario.
>> No one elsewhere barely need this as far as I can tell...
>
> It almost begs for dst_xfrm() function that returns NULL or dst->xfrm.
> Thar can live in dst code.

Ok, I will show my love in such style in v3.

> -vlad
>

-- 
浮沉随浪只记今朝笑

--fan

^ permalink raw reply

* [PATCHv3 net] {xfrm, sctp} Stick to software crc32 even if hardware is capable of that
From: Fan Du @ 2013-10-15  9:19 UTC (permalink / raw)
  To: vyasevich, nhorman; +Cc: steffen.klassert, davem, netdev

igb/ixgbe have hardware sctp checksum support, when this feature is enabled
and also IPsec is armed to protect sctp traffic, ugly things happened as
xfrm_output checks CHECKSUM_PARTIAL to do check sum operation(sum every thing
up and pack the 16bits result in the checksum field). The result is fail
establishment of sctp communication.

Signed-off-by: Fan Du <fan.du@windriver.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
---
v3:
  - Rename is_xfrm_armed by dst_xfrm
  - Move this function into include/net/dst.h

v2:
  - Split v1 into two separate patches.

---
 include/net/dst.h |   12 ++++++++++++
 net/sctp/output.c |    3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 211dcf1..44995c1 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -478,10 +478,22 @@ static inline struct dst_entry *xfrm_lookup(struct net *net,
 {
 	return dst_orig;
 } 
+
+static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
+{
+	return NULL;
+}
+
 #else
 struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 			      const struct flowi *fl, struct sock *sk,
 			      int flags);
+
+/* skb attached with this dst needs transformation if dst->xfrm is valid */
+static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
+{
+	return dst->xfrm;
+}
 #endif
 
 #endif /* _NET_DST_H */
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 0ac3a65..24b3718 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -536,7 +536,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
 	 */
 	if (!sctp_checksum_disable) {
-		if (!(dst->dev->features & NETIF_F_SCTP_CSUM)) {
+		if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
+			(dst_xfrm(dst) != NULL)) {
 			__u32 crc32 = sctp_start_cksum((__u8 *)sh, cksum_buf_len);
 
 			/* 3) Put the resultant value into the checksum field in the
-- 
1.7.9.5

^ permalink raw reply related

* Re: DomU's network interface will hung when Dom0 running 32bit
From: jianhai luan @ 2013-10-15  9:34 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Wei Liu, xen-devel, netdev, ANNIE LI
In-Reply-To: <1381826609.24708.135.camel@kazak.uk.xensource.com>

[-- Attachment #1: Type: text/plain, Size: 2838 bytes --]


On 2013-10-15 16:43, Ian Campbell wrote:
> On Tue, 2013-10-15 at 10:44 +0800, jianhai luan wrote:
>> On 2013-10-14 19:19, Wei Liu wrote:
>>> On Sat, Oct 12, 2013 at 04:53:18PM +0800, jianhai luan wrote:
>>>> Hi Ian,
>>>>     I meet the DomU's network interface hung issue recently, and have
>>>> been working on the issue from that time. I find that DomU's network
>>>> interface, which send lesser package, will hung if Dom0 running
>>>> 32bit and DomU's up-time is very long.  I think that one jiffies
>>>> overflow bug exist in the function tx_credit_exceeded().
>>>>     I know the inline function time_after_eq(a,b) will process jiffies
>>>> overflow, but the function have one limit a should little that (b +
>>>> MAX_SIGNAL_LONG). If a large than the value, time_after_eq will
>>>> return false. The MAX_SINGNAL_LONG should be 0x7fffffff at 32-bit
>>>> machine.
>>>>     If DomU's network interface send lesser package (<0.5k/s if
>>>> jiffies=250 and credit_bytes=ULONG_MAX), jiffies will beyond out
>>>> (credit_timeout.expires + MAX_SIGNAL_LONG) and time_after_eq(now,
>>>> next_credit) will failure (should be true). So one timer which will
>>>> not be trigger in short time, and later process will be aborted when
>>>> timer_pending(&vif->credit_timeout) is true. The result will be
>>>> DomU's network interface will be hung in long time (> 40days).
>>>>     Please think about the below scenario:
>>>>     Condition:
>>>>       Dom0 running 32-bit and HZ = 1000
>>>>       vif->credit_timeout->expire = 0xffffffff, vif->remaining_credit
>>>> = 0xffffffff, vif->credit_usec=0 jiffies=0
>>>>       vif receive lesser package (DomU send lesser package). If the
>>>> value is litter than 2K/s, consume 4G(0xffffffff) will need 582.55
>>>> hours. jiffies will large than 0x7ffffff. we guess jiffies =
>>>> 0x800000ff, time_after_eq(0x800000ff, 0xffffffff) will failure, and
>>>> one time which expire is 0xfffffff will be pended into system. So
>>>> the interface will hung until jiffies recount 0xffffffff (that will
>>>> need very long time).
>>> If I'm not mistaken you meant time_after_eq(now, next_credit) in
>>> netback. How does next_credit become 0xffffffff?
>> I only assume the value is 0xfffffff, and the value of next_credit
>> isn't  point. If the delta between now and next_credit larger than
>> ULONG_MAX, time_after_eq will do wrong judge.
> So it sounds like we need a timer which is independent of the traffic
> being sent to keep credit_timeout.expires rolling over.
>
> Can you propose a patch?

Because credit_timeout.expire always after jiffies, i judge the value 
over the range of time_after_eq() by time_before(now, 
vif->credit_timeout.expires). please check the patch.
>
> Ian.
>
>>> Wei.
>>>
>>>>     If some error exist in above explain, please help me point it out.
>>>>
>>>> Thanks,
>>>> Jason
>


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-Process-the-wrong-judge-of-time_after_eq.patch --]
[-- Type: text/plain; charset=gb18030; name="0001-Process-the-wrong-judge-of-time_after_eq.patch", Size: 1206 bytes --]

From f08c584ca1f393f6559b58b6b4c9e259c313259e Mon Sep 17 00:00:00 2001
From: Jason Luan <jianhai.luan@oracle.com>
Date: Tue, 15 Oct 2013 17:07:49 +0800
Subject: [PATCH] Process the wrong judge of time_after_eq().

If netfront sends few packets, the delta between now and next_credit will be out of the range of time_after_eq() and the function will make the wrong judgement. Because expires is always after jiffies, we detect this condition with time_before(now, vif->credit_timeout.expires).

Signed-off-by: Jason Luan <jianhai.luan@oracle.com>
---
 drivers/net/xen-netback/netback.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index f3e591c..8036ce6 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1195,7 +1195,8 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size)
 		return true;
 
 	/* Passed the point where we can replenish credit? */
-	if (time_after_eq(now, next_credit)) {
+	if (time_after_eq(now, next_credit) ||
+		unlikely(time_before(now, vif->credit_timeout.expires))) {
 		vif->credit_timeout.expires = now;
 		tx_add_credit(vif);
 	}
-- 
1.7.6.5


^ permalink raw reply related

* Re: [PATCH ipsec] xfrm: prevent ipcomp scratch buffer race condition
From: Steffen Klassert @ 2013-10-15  9:46 UTC (permalink / raw)
  To: Fan Du; +Cc: Michal Kubecek, Herbert Xu, David S. Miller, netdev
In-Reply-To: <525D03D8.7060802@windriver.com>

On Tue, Oct 15, 2013 at 04:59:04PM +0800, Fan Du wrote:
> 
> 
> On 2013年10月15日 16:33, Steffen Klassert wrote:
> >
> >Maybe we could disable the BHs before we fetch the percpu pointers.
> >Then we can use smp_processor_id() to get the cpu. With that we
> >could get rid of a (now useless) preempt_disable()/preempt_enable()
> >pair. Same could be done in ipcomp_compress().
> 
> Is it possible that two tasks race scratch buffer when both of them trying to compress data
> without preempt disabled? for example, when task A working on compression, then task B
> with higher priority preempts task A, and try to touch scratch buffer, which leaves stale
> data for task A after then.
> 
> I think we needs preempt disabled for such case, otherwise I overlook codes in somewhere else.
> 

You overlook that preemption is disabled if the BHs are disabled.

^ permalink raw reply

* Re: [PATCH] staging: octeon-ethernet: trivial: Avoid OOPS if phydev is not set
From: Dan Carpenter @ 2013-10-15  9:47 UTC (permalink / raw)
  To: Aaro Koskinen
  Cc: support, David Daney, Greg KH, driverdev-devel,
	Sebastian Pöhn, netdev
In-Reply-To: <20131014194937.GD4260@blackmetal.musicnaut.iki.fi>

On Mon, Oct 14, 2013 at 10:49:37PM +0300, Aaro Koskinen wrote:
> On Mon, Oct 14, 2013 at 10:16:49PM +0300, Dan Carpenter wrote:
> > On Mon, Oct 14, 2013 at 09:39:06PM +0300, Aaro Koskinen wrote:
> > > It's initialized in cvm_oct_phy_setup_device():
> > > 
> > > 	priv->phydev = of_phy_connect(dev, phy_node, cvm_oct_adjust_link, 0,
> >       ^^^^^^^^^^^^                                 ^^^^^^^^^^^^^^^^^^^
> > 
> > Sorry I should have explained better.
> > 
> > We use cvm_oct_adjust_link() to initialize priv->phydev but
> > cvm_oct_adjust_link() depends on priv->phydev.  It seems like we would
> > hit the NULL dereference every time.  Weird huh?
> 
> It doesn't happen on my system (EdgeRouter Lite). I think you need to
> explain even more better. :-)
> 
> What you mean by "We use cvm_oct_adjust_link() to initialize
> priv->phydev..."? Sorry, maybe I'm just missing something really
> obvious...

Hm...  I didn't follow the state machine all the way through so I'm
not actually sure how this is called.  But it's weird that priv->phydev
is initialized on the left side of the assignment but dereferenced in
the function mentioned on the right side of the assignment.

Also this patch should not be marked as "trivial" in the subject.
"trivial" is only for spelling mistakes in comments etc.

regards,
dan carpenter

^ permalink raw reply

* transmit lockup using smsc95xx ethernet on usb3
From: David Laight @ 2013-10-15  9:59 UTC (permalink / raw)
  To: netdev, linux-usb

We are seeing complete lockups of the transmit side when using
the smsc95xx driver connected to a USB3 port on an i7 (Ivybridge) cpu.
These errors are very intermittent - less than once a day, and
it isn't actually clear that they are related to traffic load.

Most of the systems are running the 3.2 kernel from Ubuntu 12.04
but I've seen the same problem when running a 3.4 kernel.
Looking at the changelog for xhci-ring.c I can see that some
'nasty' bugs were fixed between 3.2 and 3.4 (and possibly since)
but the usbmon trace I've now got doesn't seem to match any
of the changelog entries.

We are also seeing similar problems if we connect to a USB2
header.

Since we can't reproduce the problem quickly it is difficult to
do any analysis. Any suggestions for increasing the error rate
would be welcome.

Below is an annotated extract from a usbmon trace while running
a netperf test that was sending 8192 byte TCP packets (nagle off).
I've deleted the Bi entries (packets are received throughout)
and numbered all the others (modulo 10000) so it is easier to
see when the requests complete, I've also calculated the elapsed
time and the number of Setup entries between the S and C traces.

The USB ring seems to have 60 outstanding transmits in it,
each time one completes another is sent. There are a few 10000
traces of that then:

  start:9870         ffff88020ea16000 293811125 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff11
   done:9811:6969:60 ffff88020c7c8000 293811236 C Bo:3:003:2 0 1090 >
  start:9871         ffff88020ea16a80 293811242 S Bo:3:003:2 -115 1090 =
                        3a340000 3a440000 22003200 00224d98
                        d8460002 1f0057d7 08004500 0428ff12
...
  start:9929         ffff88020ea16780 293817964 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff4c
Last successful completion.
   done:9870:6968:60 ffff88020ea16000 293818093 C Bo:3:003:2 0 1514 >
  start:9930         ffff88020ea16000 293818099 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff4d

At this point something (untraced) seems to have gone horribly
wrong in the transmit ring. dmesg shows nothing.
Two Bo 'fail -71', 6 succeed, one fails -32 the rest fail -104.
   done:9871:6913:60 ffff88020ea16a80 293818155 C Bo:3:003:2 -71 512 >
   done:9872:6927:59 ffff88020ea16f00 293818235 C Bo:3:003:2 -71 0
   done:9873:6875:58 ffff88020ea16480 293818313 C Bo:3:003:2 0 1514 >
   done:9874:6786:57 ffff88020c7c83c0 293818353 C Bo:3:003:2 0 1514 >
   done:9875:6794:56 ffff88020c7c80c0 293818470 C Bo:3:003:2 0 1514 >
   done:9876:6789:55 ffff88020c7c8e40 293818589 C Bo:3:003:2 0 1514 >
   done:9877:6775:54 ffff88020c7c8240 293818702 C Bo:3:003:2 0 1090 >
   done:9878:6751:53 ffff88020c7c8180 293818803 C Bo:3:003:2 0 1514 >
   done:9879:6735:52 ffff88020c7c89c0 293818885 C Bo:3:003:2 -32 0
   done:9880:6671:51 ffff88020c7c8900 293818925 C Bo:3:003:2 -104 0
...
    done:9927:1292:4 ffff88020cf0c480 293819015 C Bo:3:003:2 -104 0
    done:9928:1170:3 ffff88020ea160c0 293819016 C Bo:3:003:2 -104 0
Something is known to be wrong...
  start:9931         ffff88020ea160c0 293819037 S Co:3:003:0
                         s 02 01 0000 0002 0000 0
    done:9929:1080:3 ffff88020ea16780 293819044 C Bo:3:003:2 -104 0
     done:9930:945:2 ffff88020ea16000 293819044 C Bo:3:003:2 -104 0
      done:9931:48:1 ffff88020ea160c0 293819085 C Co:3:003:0 0 0

These 10 transmits never finish:
  start:9932         ffff88020ea160c0 293819098 S Bo:3:003:2 -115 1090 =
                        3a340000 3a440000 22003200 00224d98
                        d8460002 1f0057d7 08004500 0428ff4e
... 9933 to 9940 deleted
  start:9941         ffff88020ea16b40 293819111 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff57

All further transmits fail immediately E -12 and generate the
    'xhci_hcd 0000:00:14.0: ERROR no room on ep ring' message.
(There are 1070 'E' traces and 1070 'no room' messages.)
Receives are still working.
  start:9942         ffff88020ea16240 293819113 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff58
    done:9942:1550:1 ffff88020ea16240 293820663 E Bo:3:003:2 -12 0
  start:9943         ffff88020ea16240 293820675 S Bo:3:003:2 -115 1514 =
                        e2350000 e2450000 22003200 00224d98
                        d8460002 1f0057d7 08004500 05d0ff59
    done:9943:1507:1 ffff88020ea16240 293822182 E Bo:3:003:2 -12 0

Eventually something causes a device remove and insert - everything re-initialises.
This is over 12 hours later.
      done:unknown   ffff88020c8570c0 3637139297 C Ii:3:001:1 0:2048 1 = 02
  start:1015         ffff88020c8570c0 3637139302 S Ii:3:001:1 -115:2048 4 <
  start:1016         ffff88020cbeb300 3637139323 S Ci:3:001:0
                         s a3 00 0000 0001 0004 4 <
                     ffff88020ea16240 3637139331 C Bi:3:003:1 -71 0
       done:1016:9:1 ffff88020cbeb300 3637139332 C Ci:3:001:0 0 4 = 00010100
  start:1017         ffff88020cbeb300 3637139334 S Co:3:001:0
                         s 23 01 0010 0001 0000 0
       done:1017:4:1 ffff88020cbeb300 3637139338 C Co:3:001:0 0 0
      done:unknown   ffff88020ca9ae40 3637139423 C Ii:3:003:3 -71:1 0
                     ffff88020c9db540 3637139428 C Bi:3:003:1 -108 0
                     ffff88020c9db780 3637139430 C Bi:3:003:1 -108 0
                     ffff88020d8bb540 3637139431 C Bi:3:003:1 -108 0

The last 10 transmits then terminate with error -108:
 done:9932:xxxx      ffff88020ea160c0 3637139462 C Bo:3:003:2 -108 0
... 9933 to 9940 deleted
 done:9941:xxxx      ffff88020ea16b40 3637139482 C Bo:3:003:2 -108 0
   done:1015:21090:3 ffff88020c8570c0 3637160392 C Ii:3:001:1 0:2048 1 = 02
  start:1018         ffff88020c8570c0 3637160396 S Ii:3:001:1 -115:2048 4 <
      done:unknown   ffff88020cbf26c0 3637176790 C Ii:3:005:1 -108:8 0
      done:unknown   ffff88020c68aa80 3637622497 C Ii:3:002:1 -108:2048 0

Followed by lots of Ci/Co and eventually it all starts working again.

I've not yet tried to look up the control transfers.

These aren't the only errors we are seeing, we also see (separately):
[21549.917529] hub 3-2:1.0: port 1 disabled by hub (EMI?), re-enabling...
[ 5822.629579] NETDEV WATCHDOG: eth0 (smsc95xx): transmit queue 0 timed out
[ 7263.834404] hid-generic 0003:413C:2005.0002: can't reset device, 0000:00:1a.0-1.4.3/input0, status -71 (connected to a USB2 header).
These all cause a USB bus reset and everything recovers within a couple of seconds.

	David

^ permalink raw reply

* Re: DomU's network interface will hung when Dom0 running 32bit
From: Wei Liu @ 2013-10-15 10:06 UTC (permalink / raw)
  To: jianhai luan; +Cc: Ian Campbell, Wei Liu, xen-devel, netdev, ANNIE LI
In-Reply-To: <525D0C41.2080407@oracle.com>

On Tue, Oct 15, 2013 at 05:34:57PM +0800, jianhai luan wrote:
> 
> On 2013-10-15 16:43, Ian Campbell wrote:
> >On Tue, 2013-10-15 at 10:44 +0800, jianhai luan wrote:
> >>On 2013-10-14 19:19, Wei Liu wrote:
> >>>On Sat, Oct 12, 2013 at 04:53:18PM +0800, jianhai luan wrote:
> >>>>Hi Ian,
> >>>>    I meet the DomU's network interface hung issue recently, and have
> >>>>been working on the issue from that time. I find that DomU's network
> >>>>interface, which send lesser package, will hung if Dom0 running
> >>>>32bit and DomU's up-time is very long.  I think that one jiffies
> >>>>overflow bug exist in the function tx_credit_exceeded().
> >>>>    I know the inline function time_after_eq(a,b) will process jiffies
> >>>>overflow, but the function have one limit a should little that (b +
> >>>>MAX_SIGNAL_LONG). If a large than the value, time_after_eq will
> >>>>return false. The MAX_SINGNAL_LONG should be 0x7fffffff at 32-bit
> >>>>machine.
> >>>>    If DomU's network interface send lesser package (<0.5k/s if
> >>>>jiffies=250 and credit_bytes=ULONG_MAX), jiffies will beyond out
> >>>>(credit_timeout.expires + MAX_SIGNAL_LONG) and time_after_eq(now,
> >>>>next_credit) will failure (should be true). So one timer which will
> >>>>not be trigger in short time, and later process will be aborted when
> >>>>timer_pending(&vif->credit_timeout) is true. The result will be
> >>>>DomU's network interface will be hung in long time (> 40days).
> >>>>    Please think about the below scenario:
> >>>>    Condition:
> >>>>      Dom0 running 32-bit and HZ = 1000
> >>>>      vif->credit_timeout->expire = 0xffffffff, vif->remaining_credit
> >>>>= 0xffffffff, vif->credit_usec=0 jiffies=0
> >>>>      vif receive lesser package (DomU send lesser package). If the
> >>>>value is litter than 2K/s, consume 4G(0xffffffff) will need 582.55
> >>>>hours. jiffies will large than 0x7ffffff. we guess jiffies =
> >>>>0x800000ff, time_after_eq(0x800000ff, 0xffffffff) will failure, and
> >>>>one time which expire is 0xfffffff will be pended into system. So
> >>>>the interface will hung until jiffies recount 0xffffffff (that will
> >>>>need very long time).
> >>>If I'm not mistaken you meant time_after_eq(now, next_credit) in
> >>>netback. How does next_credit become 0xffffffff?
> >>I only assume the value is 0xfffffff, and the value of next_credit
> >>isn't  point. If the delta between now and next_credit larger than
> >>ULONG_MAX, time_after_eq will do wrong judge.
> >So it sounds like we need a timer which is independent of the traffic
> >being sent to keep credit_timeout.expires rolling over.
> >
> >Can you propose a patch?
> 
> Because credit_timeout.expire always after jiffies, i judge the
> value over the range of time_after_eq() by time_before(now,
> vif->credit_timeout.expires). please check the patch.

I don't think this really fix the issue for you. You still have chance
that now wraps around and falls between expires and next_credit. In that
case it's stalled again.

Wei.

^ permalink raw reply

* drivers/net/wireless/iwlwifi/dvm/tx.c:456 iwlagn_tx_skb+0x6c5/0x883()
From: Sander Eikelenboom @ 2013-10-15 10:39 UTC (permalink / raw)
  To: John W. Linville, johannes.berg, ilw; +Cc: ilw, linux-wireless, netdev

Hi,

I'm having a:

02:00.0 Network controller: Intel Corporation Centrino Advanced-N 6235 (rev 24)

And i'm running into this warning on boot with a 3.11.2 and 3.12-rc5 kernel.

[   23.904950] ------------[ cut here ]------------
[   23.904957] WARNING: CPU: 0 PID: 2531 at drivers/net/wireless/iwlwifi/dvm/tx.c:456 iwlagn_tx_skb+0x6c5/0x883()
[   23.904959] Modules linked in:
[   23.904962] CPU: 0 PID: 2531 Comm: hostapd Not tainted 3.12.0-rc5+ #1
[   23.904963] Hardware name:                  /D53427RKE, BIOS RKPPT10H.86A.0017.2013.0425.1251 04/25/2013
[   23.904966]  0000000000000000 0000000000000009 ffffffff8189aa62 0000000000000000
[   23.904968]  ffffffff8105a4f2 ffff880058339a48 ffffffff815f8a04 0000000000000000
[   23.904970]  ffff8800560097b0 0000000000000208 0000000000000000 ffff8800561a9e5e
[   23.904971] Call Trace:
[   23.904977]  [<ffffffff8189aa62>] ? dump_stack+0x41/0x51
[   23.904981]  [<ffffffff8105a4f2>] ? warn_slowpath_common+0x78/0x90
[   23.904984]  [<ffffffff815f8a04>] ? iwlagn_tx_skb+0x6c5/0x883
[   23.904986]  [<ffffffff815f8a04>] ? iwlagn_tx_skb+0x6c5/0x883
[   23.904989]  [<ffffffff818a0040>] ? put_cred+0x15/0x15
[   23.904991]  [<ffffffff815f6db4>] ? iwlagn_mac_tx+0x19/0x2f
[   23.904995]  [<ffffffff8186cc45>] ? __ieee80211_tx+0x226/0x29b
[   23.904998]  [<ffffffff8186e6bd>] ? ieee80211_tx+0xa6/0xb5
[   23.905001]  [<ffffffff8186e98b>] ? ieee80211_monitor_start_xmit+0x1e9/0x204
[   23.905005]  [<ffffffff8171ce5f>] ? dev_hard_start_xmit+0x271/0x3ec
[   23.905008]  [<ffffffff817351ac>] ? sch_direct_xmit+0x66/0x164
[   23.905010]  [<ffffffff8171d1bf>] ? dev_queue_xmit+0x1e5/0x3c8
[   23.905013]  [<ffffffff817fac5a>] ? packet_sendmsg+0xac5/0xb3d
[   23.905017]  [<ffffffff81709a09>] ? sock_sendmsg+0x37/0x52
[   23.905020]  [<ffffffff810f9e0c>] ? __do_fault+0x338/0x36b
[   23.905023]  [<ffffffff81713820>] ? verify_iovec+0x44/0x94
[   23.905025]  [<ffffffff81709e63>] ? ___sys_sendmsg+0x1f1/0x283
[   23.905029]  [<ffffffff81140a73>] ? __inode_wait_for_writeback+0x67/0xae
[   23.905031]  [<ffffffff8111735e>] ? __cache_free.isra.46+0x178/0x187
[   23.905033]  [<ffffffff811173b1>] ? kmem_cache_free+0x44/0x84
[   23.905036]  [<ffffffff81132c22>] ? dentry_kill+0x13d/0x149
[   23.905038]  [<ffffffff81132f6f>] ? dput+0xe5/0xef
[   23.905041]  [<ffffffff81136e04>] ? fget_light+0x2e/0x7c
[   23.905043]  [<ffffffff8170ae62>] ? __sys_sendmsg+0x39/0x57
[   23.905046]  [<ffffffff818a7e39>] ? system_call_fastpath+0x16/0x1b
[   23.905047] ---[ end trace 1b3eb79359c1d1e6 ]---

--
Sander

^ permalink raw reply

* RE: [Ilw] drivers/net/wireless/iwlwifi/dvm/tx.c:456 iwlagn_tx_skb+0x6c5/0x883()
From: Grumbach, Emmanuel @ 2013-10-15 10:52 UTC (permalink / raw)
  To: Sander Eikelenboom, John W. Linville, Berg, Johannes,
	ilw-VuQAYsv1563Yd54FQh9/CA@public.gmane.org
  Cc: ilw-VuQAYsv1563Yd54FQh9/CA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <9010061839.20131015123940-6SM94LqRVpn6gRhOQ7JHfg@public.gmane.org>

> 
> Hi,
> 
> I'm having a:
> 
> 02:00.0 Network controller: Intel Corporation Centrino Advanced-N 6235 (rev
> 24)
> 
> And i'm running into this warning on boot with a 3.11.2 and 3.12-rc5 kernel.
> 
> [   23.904950] ------------[ cut here ]------------
> [   23.904957] WARNING: CPU: 0 PID: 2531 at
> drivers/net/wireless/iwlwifi/dvm/tx.c:456 iwlagn_tx_skb+0x6c5/0x883()

Can you reproduce easily?
If yes, please reproduce with debug parameters:
modprobe iwlwifi debug=0xC0800000

Also, please enable MAC80211_HT_DEBUG

Thanks

--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: DomU's network interface will hung when Dom0 running 32bit
From: jianhai luan @ 2013-10-15 11:26 UTC (permalink / raw)
  To: Wei Liu; +Cc: Ian Campbell, xen-devel, netdev, ANNIE LI
In-Reply-To: <20131015100624.GB29436@zion.uk.xensource.com>


On 2013-10-15 18:06, Wei Liu wrote:
> On Tue, Oct 15, 2013 at 05:34:57PM +0800, jianhai luan wrote:
>> On 2013-10-15 16:43, Ian Campbell wrote:
>>> On Tue, 2013-10-15 at 10:44 +0800, jianhai luan wrote:
>>>> On 2013-10-14 19:19, Wei Liu wrote:
>>>>> On Sat, Oct 12, 2013 at 04:53:18PM +0800, jianhai luan wrote:
>>>>>> Hi Ian,
>>>>>>     I meet the DomU's network interface hung issue recently, and have
>>>>>> been working on the issue from that time. I find that DomU's network
>>>>>> interface, which send lesser package, will hung if Dom0 running
>>>>>> 32bit and DomU's up-time is very long.  I think that one jiffies
>>>>>> overflow bug exist in the function tx_credit_exceeded().
>>>>>>     I know the inline function time_after_eq(a,b) will process jiffies
>>>>>> overflow, but the function have one limit a should little that (b +
>>>>>> MAX_SIGNAL_LONG). If a large than the value, time_after_eq will
>>>>>> return false. The MAX_SINGNAL_LONG should be 0x7fffffff at 32-bit
>>>>>> machine.
>>>>>>     If DomU's network interface send lesser package (<0.5k/s if
>>>>>> jiffies=250 and credit_bytes=ULONG_MAX), jiffies will beyond out
>>>>>> (credit_timeout.expires + MAX_SIGNAL_LONG) and time_after_eq(now,
>>>>>> next_credit) will failure (should be true). So one timer which will
>>>>>> not be trigger in short time, and later process will be aborted when
>>>>>> timer_pending(&vif->credit_timeout) is true. The result will be
>>>>>> DomU's network interface will be hung in long time (> 40days).
>>>>>>     Please think about the below scenario:
>>>>>>     Condition:
>>>>>>       Dom0 running 32-bit and HZ = 1000
>>>>>>       vif->credit_timeout->expire = 0xffffffff, vif->remaining_credit
>>>>>> = 0xffffffff, vif->credit_usec=0 jiffies=0
>>>>>>       vif receive lesser package (DomU send lesser package). If the
>>>>>> value is litter than 2K/s, consume 4G(0xffffffff) will need 582.55
>>>>>> hours. jiffies will large than 0x7ffffff. we guess jiffies =
>>>>>> 0x800000ff, time_after_eq(0x800000ff, 0xffffffff) will failure, and
>>>>>> one time which expire is 0xfffffff will be pended into system. So
>>>>>> the interface will hung until jiffies recount 0xffffffff (that will
>>>>>> need very long time).
>>>>> If I'm not mistaken you meant time_after_eq(now, next_credit) in
>>>>> netback. How does next_credit become 0xffffffff?
>>>> I only assume the value is 0xfffffff, and the value of next_credit
>>>> isn't  point. If the delta between now and next_credit larger than
>>>> ULONG_MAX, time_after_eq will do wrong judge.
>>> So it sounds like we need a timer which is independent of the traffic
>>> being sent to keep credit_timeout.expires rolling over.
>>>
>>> Can you propose a patch?
>> Because credit_timeout.expire always after jiffies, i judge the
>> value over the range of time_after_eq() by time_before(now,
>> vif->credit_timeout.expires). please check the patch.
> I don't think this really fix the issue for you. You still have chance
> that now wraps around and falls between expires and next_credit. In that
> case it's stalled again.

If time_before(now, vif->credit_timeout.expires) is true, time has wrapped
and we do the operation. Otherwise, if time_before(now,
vif->credit_timeout.expires) isn't true, now - vif->credit_timeout.expires
should be less than ULONG_MAX/2. Because next_credit is larger than
vif->credit_timeout.expires (next_credit = vif->credit_timeout.expires +
msecs_to_jiffies(vif->credit_usec/1000)), the delta between now and
next_credit should be in range of time_after_eq().  So time_after_eq()
judges correctly.

Jason
>
> Wei.

^ permalink raw reply

* [PATCH 00/18] cleanup: wrapper functions of net_ratelimit() called to simplify code
From: Kefeng Wang @ 2013-10-15 11:44 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, David S. Miller, Pablo Neira Ayuso,
	Stephen Hemminger, Johannes Berg, John W. Linville,
	Stanislaw Gruszka, Johannes Berg, Francois Romieu, Ben Hutchings,
	Chas Williams, Marc Kleine-Budde, Samuel Ortiz, Paul Mackerras,
	Oliver Neukum, Konrad Rzeszutek Wilk, Boris Ostrovsky,
	David Vrabel, Rusty Russell, Michael S. Tsirkin, netfilter,
	netdev

Macro PRINTR is called only once in nfnetlink_log.c, and it can be
replaced by a wrapper function of net_ratelimit(), so kill it.

Meanwhile, I found that many files could use the wrapper functions of
net_ratelimit() to simplify their code, so I converted them as well.


Kefeng Wang (18):
  netfilter: cleanup: delete Macro PRINTR
  net: use wrapper functions of net_ratelimit() to simplify code
  rtl8187se: use wrapper functions of net_ratelimit() to simplify code
  rtl8192e: use wrapper functions of net_ratelimit() to simplify code
  rtl8192u: use wrapper functions of net_ratelimit() to simplify code
  net: wireless: use wrapper functions of net_ratelimit() to simplify
    code
  net: ethernet: use wrapper functions of net_ratelimit() to simplify
    code
  atm: use wrapper functions of net_ratelimit() to simplify code
  block: aoe: use wrapper functions of net_ratelimit() to simplify code
  net: peak_usb: use wrapper functions of net_ratelimit() to simplify
    code
  net: hamradio: use wrapper functions of net_ratelimit() to simplify
    code
  net: irda: use wrapper functions of net_ratelimit() to simplify code
  net: ppp: use wrapper functions of net_ratelimit() to simplify code
  net: usb: use wrapper functions of net_ratelimit() to simplify code
  net: xen: use wrapper functions of net_ratelimit() to simplify code
  net: virtio: use wrapper functions of net_ratelimit() to simplify code
  net: vxlan: use wrapper functions of net_ratelimit() to simplify code
  net: wimax: use wrapper functions of net_ratelimit() to simplify code

 drivers/atm/solos-pci.c                            | 20 ++++-----
 drivers/block/aoe/aoenet.c                         |  4 +-
 drivers/net/can/usb/peak_usb/pcan_usb_core.c       | 11 ++---
 drivers/net/ethernet/aeroflex/greth.c              | 16 +++----
 drivers/net/ethernet/alteon/acenic.c               |  3 +-
 drivers/net/ethernet/arc/emac_main.c               |  7 ++--
 drivers/net/ethernet/broadcom/b44.c                |  4 +-
 drivers/net/ethernet/ethoc.c                       |  4 +-
 drivers/net/ethernet/faraday/ftgmac100.c           | 49 +++++++---------------
 drivers/net/ethernet/faraday/ftmac100.c            | 45 ++++++--------------
 drivers/net/ethernet/freescale/fec_mpc52xx.c       |  3 +-
 drivers/net/ethernet/ibm/emac/mal.c                | 15 ++-----
 drivers/net/ethernet/marvell/mv643xx_eth.c         |  6 +--
 drivers/net/ethernet/marvell/pxa168_eth.c          |  8 +---
 drivers/net/ethernet/marvell/sky2.c                | 42 +++++++------------
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c   |  5 +--
 drivers/net/ethernet/realtek/r8169.c               | 13 +++---
 drivers/net/ethernet/sfc/rx.c                      |  8 ++--
 drivers/net/ethernet/sfc/siena_sriov.c             | 27 ++++--------
 drivers/net/ethernet/ti/davinci_emac.c             |  4 +-
 drivers/net/ethernet/tundra/tsi108_eth.c           | 12 ++----
 drivers/net/hamradio/6pack.c                       |  3 +-
 drivers/net/hamradio/bpqether.c                    |  3 +-
 drivers/net/irda/stir4200.c                        |  3 +-
 drivers/net/ppp/ppp_generic.c                      | 11 +++--
 drivers/net/usb/usbnet.c                           |  4 +-
 drivers/net/virtio_net.c                           |  3 +-
 drivers/net/vxlan.c                                |  3 +-
 drivers/net/wimax/i2400m/netdev.c                  |  3 +-
 drivers/net/wireless/adm8211.c                     |  5 +--
 drivers/net/wireless/ath/carl9170/cmd.c            | 15 +++----
 drivers/net/wireless/ath/carl9170/phy.c            |  7 +---
 drivers/net/wireless/ath/carl9170/rx.c             | 45 ++++++--------------
 drivers/net/wireless/ath/carl9170/usb.c            | 12 ++----
 drivers/net/wireless/hostap/hostap_80211_rx.c      |  8 +---
 drivers/net/wireless/hostap/hostap_80211_tx.c      | 13 ++----
 drivers/net/wireless/hostap/hostap_ap.c            |  6 +--
 drivers/net/wireless/hostap/hostap_hw.c            | 17 +++-----
 drivers/net/wireless/iwlegacy/3945-mac.c           |  4 +-
 drivers/net/wireless/iwlegacy/4965-mac.c           |  6 +--
 drivers/net/wireless/iwlwifi/pcie/rx.c             |  7 ++--
 drivers/net/wireless/libertas_tf/cmd.c             |  6 +--
 drivers/net/wireless/mwl8k.c                       |  5 +--
 drivers/net/wireless/orinoco/hermes.c              |  6 +--
 drivers/net/wireless/orinoco/main.c                | 16 +++----
 drivers/net/wireless/orinoco/orinoco_usb.c         |  3 +-
 drivers/net/wireless/p54/p54pci.c                  |  6 +--
 drivers/net/xen-netback/netback.c                  | 21 ++++------
 drivers/net/xen-netfront.c                         | 27 ++++--------
 .../rtl8187se/ieee80211/ieee80211_crypt_ccmp.c     | 16 ++-----
 .../rtl8187se/ieee80211/ieee80211_crypt_tkip.c     | 27 ++++--------
 drivers/staging/rtl8187se/ieee80211/ieee80211_rx.c |  4 +-
 drivers/staging/rtl8187se/ieee80211/ieee80211_tx.c |  7 +---
 drivers/staging/rtl8192e/rtllib_crypt_ccmp.c       | 13 ++----
 drivers/staging/rtl8192e/rtllib_crypt_tkip.c       | 21 +++-------
 drivers/staging/rtl8192e/rtllib_rx.c               |  6 +--
 .../rtl8192u/ieee80211/ieee80211_crypt_ccmp.c      | 17 ++------
 .../rtl8192u/ieee80211/ieee80211_crypt_tkip.c      | 22 +++-------
 drivers/staging/rtl8192u/ieee80211/ieee80211_rx.c  |  7 +---
 drivers/staging/rtl8192u/ieee80211/ieee80211_tx.c  |  4 +-
 net/bridge/br_fdb.c                                |  6 +--
 net/bridge/br_multicast.c                          | 13 ++----
 net/bridge/br_stp_bpdu.c                           | 11 ++---
 net/mac80211/rx.c                                  |  6 +--
 net/netfilter/ipset/ip_set_hash_gen.h              |  3 +-
 net/netfilter/nfnetlink_log.c                      |  5 +--
 66 files changed, 246 insertions(+), 516 deletions(-)

-- 
1.8.2.1

^ permalink raw reply

* [PATCH 01/18] netfilter: cleanup: delete Macro PRINTR
From: Kefeng Wang @ 2013-10-15 11:44 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, David S. Miller, Pablo Neira Ayuso,
	Stephen Hemminger, Johannes Berg, John W. Linville,
	Stanislaw Gruszka, Johannes Berg, Francois Romieu, Ben Hutchings,
	Chas Williams, Marc Kleine-Budde, Samuel Ortiz, Paul Mackerras,
	Oliver Neukum, Konrad Rzeszutek Wilk, Boris Ostrovsky,
	David Vrabel, Rusty Russell, Michael S. Tsirkin, netfilter,
	netdev
In-Reply-To: <1381837514-50660-1-git-send-email-wangkefeng.wang@huawei.com>

Macro PRINTR is only used once in nfnetlink_log.c, so it can be
replaced by helper function net_err_ratelimited().

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 net/netfilter/nfnetlink_log.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index d92cc31..8713111 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -47,9 +47,6 @@
 #define NFULNL_QTHRESH_DEFAULT 	100	/* 100 packets */
 #define NFULNL_COPY_RANGE_MAX	0xFFFF	/* max packet size is limited by 16-bit struct nfattr nfa_len field */
 
-#define PRINTR(x, args...)	do { if (net_ratelimit()) \
-				     printk(x, ## args); } while (0);
-
 struct nfulnl_instance {
 	struct hlist_node hlist;	/* global list of instances */
 	spinlock_t lock;
@@ -587,7 +584,7 @@ __build_packet_message(struct nfnl_log_net *log,
 	return 0;
 
 nla_put_failure:
-	PRINTR(KERN_ERR "nfnetlink_log: error creating log nlmsg\n");
+	net_err_ratelimited("nfnetlink_log: error creating log nlmsg\n");
 	return -1;
 }
 
-- 
1.8.2.1

^ permalink raw reply related

* [PATCH 02/18] net: use wrapper functions of net_ratelimit() to simplify code
From: Kefeng Wang @ 2013-10-15 11:44 UTC (permalink / raw)
  To: linux-kernel
  Cc: Greg Kroah-Hartman, David S. Miller, Pablo Neira Ayuso,
	Stephen Hemminger, Johannes Berg, John W. Linville,
	Stanislaw Gruszka, Johannes Berg, Francois Romieu, Ben Hutchings,
	Chas Williams, Marc Kleine-Budde, Samuel Ortiz, Paul Mackerras,
	Oliver Neukum, Konrad Rzeszutek Wilk, Boris Ostrovsky,
	David Vrabel, Rusty Russell, Michael S. Tsirkin, netfilter,
	netdev
In-Reply-To: <1381837514-50660-1-git-send-email-wangkefeng.wang@huawei.com>

Wrapper functions net_ratelimited_function() and net_XXX_ratelimited()
are called to simplify code.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 net/bridge/br_fdb.c                   |  6 ++----
 net/bridge/br_multicast.c             | 13 ++++---------
 net/bridge/br_stp_bpdu.c              | 11 ++++-------
 net/mac80211/rx.c                     |  6 ++----
 net/netfilter/ipset/ip_set_hash_gen.h |  3 +--
 5 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index ffd5874..e47dfe5 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -465,10 +465,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 	if (likely(fdb)) {
 		/* attempt to update an entry for a local interface */
 		if (unlikely(fdb->is_local)) {
-			if (net_ratelimit())
-				br_warn(br, "received packet on %s with "
-					"own address as source address\n",
-					source->dev->name);
+			net_ratelimited_function(br_warn, br, "received packet on %s "
+				"with own address as source address\n", source->dev->name);
 		} else {
 			/* fastpath: update of existing entry */
 			fdb->dst = source;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d1c5786..c00d8a2 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -530,11 +530,8 @@ static struct net_bridge_mdb_entry *br_multicast_get_group(
 	max = mdb->max;
 
 	if (unlikely(count > br->hash_elasticity && count)) {
-		if (net_ratelimit())
-			br_info(br, "Multicast hash table "
-				"chain limit reached: %s\n",
-				port ? port->dev->name : br->dev->name);
-
+		net_ratelimited_function(br_info, br, "Multicast hash table "
+			"chain limit reached: %s\n", port ? port->dev->name : br->dev->name);
 		elasticity = br->hash_elasticity;
 	}
 
@@ -554,10 +551,8 @@ disable:
 
 	if (max > mdb->max || elasticity) {
 		if (mdb->old) {
-			if (net_ratelimit())
-				br_info(br, "Multicast hash table "
-					"on fire: %s\n",
-					port ? port->dev->name : br->dev->name);
+			net_ratelimited_function(br_info, br, "Multicast hash table "
+				"on fire: %s\n", port ? port->dev->name : br->dev->name);
 			err = -EEXIST;
 			goto err;
 		}
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 8660ea3..482ef11 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -220,13 +220,10 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
 		bpdu.forward_delay = br_get_ticks(buf+30);
 
 		if (bpdu.message_age > bpdu.max_age) {
-			if (net_ratelimit())
-				br_notice(p->br,
-					  "port %u config from %pM"
-					  " (message_age %ul > max_age %ul)\n",
-					  p->port_no,
-					  eth_hdr(skb)->h_source,
-					  bpdu.message_age, bpdu.max_age);
+			net_ratelimited_function(br_notice, p->br,
+				"port %u config from %pM (message_age %ul > max_age %ul)\n",
+				p->port_no, eth_hdr(skb)->h_source,
+				bpdu.message_age, bpdu.max_age);
 			goto out;
 		}
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 54395d7..35b8b1d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3156,10 +3156,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx,
 	if (!consume) {
 		skb = skb_copy(skb, GFP_ATOMIC);
 		if (!skb) {
-			if (net_ratelimit())
-				wiphy_debug(local->hw.wiphy,
-					"failed to copy skb for %s\n",
-					sdata->name);
+			net_ratelimited_function(wiphy_debug, local->hw.wiphy,
+				"failed to copy skb for %s\n", sdata->name);
 			return true;
 		}
 
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 707bc52..6179436 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -607,8 +607,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		mtype_expire(h, NETS_LENGTH(set->family), h->dsize);
 
 	if (h->elements >= h->maxelem) {
-		if (net_ratelimit())
-			pr_warning("Set %s is full, maxelem %u reached\n",
+		net_warn_ratelimited("Set %s is full, maxelem %u reached\n",
 				   set->name, h->maxelem);
 		return -IPSET_ERR_HASH_FULL;
 	}
-- 
1.8.2.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox