Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH net v3 8/9] forcedeth: 64-bit stats
From: Eric Dumazet @ 2011-11-05  7:28 UTC (permalink / raw)
  To: David Decotigny
  Cc: netdev, linux-kernel, David S. Miller, Ian Campbell, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc
In-Reply-To: <4c77142ad7c6019e908884723d3c299163a55e2e.1320457247.git.david.decotigny@google.com>

Le vendredi 04 novembre 2011 à 18:53 -0700, David Decotigny a écrit :
> This converts forcedeth stats to be 64-bits. It also improves
> accounting for dropped rx frames.
> 
> Tested:
>   16-way SMP x86_64 ->
>   RX bytes:7244556582 (7.2 GB)  TX bytes:181904254 (181.9 MB)
> 
> 

This changelog and patch title are misleading.

On a 32bit x86, stats are still 32bit wide after your patch.

On a 64bit x86_64, stats were already 64bit wide before your patch.

So the real thing is about not using the embedded netdevice dev->stats
structure, to reduce false sharing.

> 
> Signed-off-by: David Decotigny <david.decotigny@google.com>
> ---
>  drivers/net/ethernet/nvidia/forcedeth.c |   69 +++++++++++++++++++-----------
>  1 files changed, 44 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
> index 90cdf26..08c512b 100644
> --- a/drivers/net/ethernet/nvidia/forcedeth.c
> +++ b/drivers/net/ethernet/nvidia/forcedeth.c
> @@ -799,6 +799,8 @@ struct fe_priv {
>  	struct timer_list stats_poll;
>  	u32 nic_poll_irq;
>  	int rx_ring_size;
> +	unsigned long stats_rx_dropped;
> +	unsigned long stats_rx_missed_errors;
>  
>  	/* media detection workaround.
>  	 * Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
> @@ -821,6 +823,7 @@ struct fe_priv {
>  	struct nv_skb_map *tx_change_owner;
>  	struct nv_skb_map *tx_end_flip;
>  	int tx_stop;
> +	unsigned long stats_tx_dropped;
>  
>  	/* msi/msi-x fields */
>  	u32 msi_flags;
> @@ -1700,33 +1703,47 @@ static void nv_get_hw_stats(struct net_device *dev)
>  }
>  
>  /*
> - * nv_get_stats: dev->get_stats function
> + * nv_get_stats: dev->ndo_get_stats64 function
>   * Get latest stats value from the nic.
>   * Called with read_lock(&dev_base_lock) held for read -
>   * only synchronized against unregister_netdevice.
>   */
> -static struct net_device_stats *nv_get_stats(struct net_device *dev)
> +static struct rtnl_link_stats64*
> +nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
>  {
>  	struct fe_priv *np = netdev_priv(dev);
>  
>  	/* If the nic supports hw counters then retrieve latest values */
> -	if (np->driver_data & (DEV_HAS_STATISTICS_V1|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V3)) {
> +	if (np->driver_data & (DEV_HAS_STATISTICS_V1
> +			       | DEV_HAS_STATISTICS_V2
> +			       | DEV_HAS_STATISTICS_V3)) {
>  		nv_get_hw_stats(dev);
>  
> -		/* copy to net_device stats */
> -		dev->stats.tx_packets = np->estats.tx_packets;
> -		dev->stats.rx_bytes = np->estats.rx_bytes;
> -		dev->stats.tx_bytes = np->estats.tx_bytes;
> -		dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
> -		dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
> -		dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
> -		dev->stats.rx_over_errors = np->estats.rx_over_errors;
> -		dev->stats.rx_fifo_errors = np->estats.rx_drop_frame;
> -		dev->stats.rx_errors = np->estats.rx_errors_total;
> -		dev->stats.tx_errors = np->estats.tx_errors_total;
> -	}
> -
> -	return &dev->stats;
> +		/* generic stats */
> +		storage->rx_packets = np->estats.rx_packets;
> +		storage->tx_packets = np->estats.tx_packets;
> +		storage->rx_bytes   = np->estats.rx_bytes;
> +		storage->tx_bytes   = np->estats.tx_bytes;
> +		storage->rx_errors  = np->estats.rx_errors_total;
> +		storage->tx_errors  = np->estats.tx_errors_total;
> +		storage->rx_dropped = np->stats_rx_dropped;
> +		storage->tx_dropped = np->stats_tx_dropped;
> +		storage->multicast  = np->estats.rx_multicast;
> +
> +		/* detailed rx_errors */
> +		storage->rx_length_errors = np->estats.rx_length_error;
> +		storage->rx_over_errors   = np->estats.rx_over_errors;
> +		storage->rx_crc_errors    = np->estats.rx_crc_errors;
> +		storage->rx_frame_errors  = np->estats.rx_frame_align_error;
> +		storage->rx_fifo_errors   = np->estats.rx_drop_frame;
> +		storage->rx_missed_errors = np->stats_rx_missed_errors;
> +
> +		/* detailed tx_errors */
> +		storage->tx_carrier_errors = np->estats.tx_carrier_errors;
> +		storage->tx_fifo_errors    = np->estats.tx_fifo_errors;
> +	}
> +
> +	return storage;
>  }
>  
>  /*
> @@ -1759,8 +1776,10 @@ static int nv_alloc_rx(struct net_device *dev)
>  				np->put_rx.orig = np->first_rx.orig;
>  			if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
>  				np->put_rx_ctx = np->first_rx_ctx;
> -		} else
> +		} else {
> +			np->stats_rx_dropped++;
>  			return 1;
> +		}
>  	}
>  	return 0;
>  }
> @@ -1791,8 +1810,10 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
>  				np->put_rx.ex = np->first_rx.ex;
>  			if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
>  				np->put_rx_ctx = np->first_rx_ctx;
> -		} else
> +		} else {
> +			np->stats_rx_dropped++;
>  			return 1;
> +		}
>  	}
>  	return 0;
>  }
> @@ -1928,7 +1949,7 @@ static void nv_drain_tx(struct net_device *dev)
>  			np->tx_ring.ex[i].buflow = 0;
>  		}
>  		if (nv_release_txskb(np, &np->tx_skb[i]))
> -			dev->stats.tx_dropped++;
> +			np->stats_tx_dropped++;
>  		np->tx_skb[i].dma = 0;
>  		np->tx_skb[i].dma_len = 0;
>  		np->tx_skb[i].dma_single = 0;
> @@ -2651,7 +2672,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
>  					/* the rest are hard errors */
>  					else {
>  						if (flags & NV_RX_MISSEDFRAME)
> -							dev->stats.rx_missed_errors++;
> +							np->stats_rx_missed_errors++;
>  						dev_kfree_skb(skb);
>  						goto next_pkt;
>  					}
> @@ -2694,7 +2715,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
>  		skb_put(skb, len);
>  		skb->protocol = eth_type_trans(skb, dev);
>  		napi_gro_receive(&np->napi, skb);
> -		dev->stats.rx_packets++;
>  next_pkt:
>  		if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
>  			np->get_rx.orig = np->first_rx.orig;
> @@ -2777,7 +2797,6 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
>  				__vlan_hwaccel_put_tag(skb, vid);
>  			}
>  			napi_gro_receive(&np->napi, skb);
> -			dev->stats.rx_packets++;
>  		} else {
>  			dev_kfree_skb(skb);
>  		}
> @@ -5199,7 +5218,7 @@ static int nv_close(struct net_device *dev)
>  static const struct net_device_ops nv_netdev_ops = {
>  	.ndo_open		= nv_open,
>  	.ndo_stop		= nv_close,
> -	.ndo_get_stats		= nv_get_stats,
> +	.ndo_get_stats64	= nv_get_stats64,
>  	.ndo_start_xmit		= nv_start_xmit,
>  	.ndo_tx_timeout		= nv_tx_timeout,
>  	.ndo_change_mtu		= nv_change_mtu,
> @@ -5216,7 +5235,7 @@ static const struct net_device_ops nv_netdev_ops = {
>  static const struct net_device_ops nv_netdev_ops_optimized = {
>  	.ndo_open		= nv_open,
>  	.ndo_stop		= nv_close,
> -	.ndo_get_stats		= nv_get_stats,
> +	.ndo_get_stats64	= nv_get_stats64,
>  	.ndo_start_xmit		= nv_start_xmit_optimized,
>  	.ndo_tx_timeout		= nv_tx_timeout,
>  	.ndo_change_mtu		= nv_change_mtu,

^ permalink raw reply

* Dear Friend
From: Mr.Andrew Wong @ 2011-11-04 23:02 UTC (permalink / raw)




I need your assistance to transfer Abandoned sum of $8.5  Million Dollars in Alliance Bank in Kuala Lumpur Malaysia,
 Own by a billionaire Business Mogul Late Mr. Moises Saba Masri, a Jew
from Mexico who was a victim of a helicopter crash since 2010 year,
resulting to his death and his family members.
You can see more information about Saba Masri Mr.Moises unfortunate end
accident on the website-link below.

http://www.ynetnews.com/articles/0,7340,L-3832556,00.html.

We share in ratio 60%/40%. Reply if interested reply via email: mr.andrew.wong16@gmail.com

Best Regard

Mr.Andrew Wong

--

^ permalink raw reply

* [PATCH net v4 0/4] forcedeth: minor fixes for stats, rmmod, sparse
From: David Decotigny @ 2011-11-05  4:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc, David Decotigny

David,

Thanks for your feedback. Here is v4, with only the (minor) bug fixes
for net. I am setting the "new features" patches aside for later
(net-next).

Since I am still very much of a newbie with the way net & net-next
work, could you please redirect me to some pointer on how/when the
schedule works?

Regards,

Tested: 16-way x86_64 + forcedeth

Changes since v3:
 - removed feature additions, this leaves minor sparse and stats
   fixes. Feature additions shipped previously will go to net-next

Changes since v2:
 - removed "Fix a race during rmmod of forcedeth" from the series
   (will look at it separately with original author)
 - added "remove unneeded stats updates" and "64-bit stats"
 - reordered patches

Changes since v1:
 - rebased on top of netdev tip
 - do not repeat name of device in netdev_dbg
 - do not completely mute TX timeout messages when debug_tx_timeout is
   not set
 - make debug_tx_timeout writable in /sys/module
 Note: I am re-submitting "expose module parameters in /sys/module" as
       it can be useful in production and I was assured it doesn't add
       much memory overhead by the sysfs maintainers.

Tested:
  16-way x86_64 SMP, dual forcedeth ->
  RX bytes:7244556582 (7.2 GB)  TX bytes:181904254 (181.9 MB)


############################################
# Patch Set Summary:

David Decotigny (2):
  forcedeth: remove unneeded stats updates
  forcedeth: fix a few sparse warnings (variable shadowing)

Mandeep Baines (1):
  forcedeth: Improve stats counters

Mike Ditto (1):
  forcedeth: Acknowledge only interrupts that are being processed

 drivers/net/ethernet/nvidia/forcedeth.c |   86 +++++++++++--------------------
 1 files changed, 30 insertions(+), 56 deletions(-)

-- 
1.7.3.1

^ permalink raw reply

* [PATCH net v4 3/4] forcedeth: Improve stats counters
From: David Decotigny @ 2011-11-05  4:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc, Mandeep Baines,
	David Decotigny
In-Reply-To: <cover.1320463771.git.david.decotigny@google.com>

From: Mandeep Baines <msb@google.com>

Rx byte count was off; instead use the hardware's count.  Tx packet
count was counting pre-TSO packets; instead count on-the-wire packets.
Report hardware dropped frame count as rx_fifo_errors.

- The count of transmitted packets reported by the forcedeth driver
  reports pre-TSO (TCP Segmentation Offload) packet counts and not the
  count of the number of packets sent on the wire. This change fixes
  the forcedeth driver to report the correct count. Fixed the code by
  copying the count stored in the NIC H/W to the value reported by the
  driver.

- Count rx_drop_frame errors as rx_fifo_errors:
  We see a lot of rx_drop_frame errors if we disable the rx bottom-halves
  for too long.  Normally, rx_fifo_errors would be counted in this case.
  The rx_drop_frame error count is private to forcedeth and is not
  reported by ifconfig or sysfs.  The rx_fifo_errors count is currently
  unused in the forcedeth driver.  It is reported by ifconfig as overruns.
  This change reports rx_drop_frame errors as rx_fifo_errors.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 0bcb18e..fe2ba70 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -1682,6 +1682,7 @@ static void nv_get_hw_stats(struct net_device *dev)
 		np->estats.tx_pause += readl(base + NvRegTxPause);
 		np->estats.rx_pause += readl(base + NvRegRxPause);
 		np->estats.rx_drop_frame += readl(base + NvRegRxDropFrame);
+		np->estats.rx_errors_total += np->estats.rx_drop_frame;
 	}
 
 	if (np->driver_data & DEV_HAS_STATISTICS_V3) {
@@ -1706,11 +1707,14 @@ static struct net_device_stats *nv_get_stats(struct net_device *dev)
 		nv_get_hw_stats(dev);
 
 		/* copy to net_device stats */
+		dev->stats.tx_packets = np->estats.tx_packets;
+		dev->stats.rx_bytes = np->estats.rx_bytes;
 		dev->stats.tx_bytes = np->estats.tx_bytes;
 		dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
 		dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
 		dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
 		dev->stats.rx_over_errors = np->estats.rx_over_errors;
+		dev->stats.rx_fifo_errors = np->estats.rx_drop_frame;
 		dev->stats.rx_errors = np->estats.rx_errors_total;
 		dev->stats.tx_errors = np->estats.tx_errors_total;
 	}
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net v4 4/4] forcedeth: fix a few sparse warnings (variable shadowing)
From: David Decotigny @ 2011-11-05  4:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc, David Decotigny
In-Reply-To: <cover.1320463771.git.david.decotigny@google.com>

This fixes the following sparse warnings:
drivers/net/ethernet/nvidia/forcedeth.c:2113:7: warning: symbol 'size' shadows an earlier one
drivers/net/ethernet/nvidia/forcedeth.c:2102:6: originally declared here
drivers/net/ethernet/nvidia/forcedeth.c:2155:7: warning: symbol 'size' shadows an earlier one
drivers/net/ethernet/nvidia/forcedeth.c:2102:6: originally declared here
drivers/net/ethernet/nvidia/forcedeth.c:2227:7: warning: symbol 'size' shadows an earlier one
drivers/net/ethernet/nvidia/forcedeth.c:2215:6: originally declared here
drivers/net/ethernet/nvidia/forcedeth.c:2271:7: warning: symbol 'size' shadows an earlier one
drivers/net/ethernet/nvidia/forcedeth.c:2215:6: originally declared here
drivers/net/ethernet/nvidia/forcedeth.c:2986:20: warning: symbol 'addr' shadows an earlier one
drivers/net/ethernet/nvidia/forcedeth.c:2963:6: originally declared here



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |   34 +++++++++++++++---------------
 1 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index fe2ba70..d32cd4b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -2103,10 +2103,10 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/* add fragments to entries count */
 	for (i = 0; i < fragments; i++) {
-		u32 size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+		u32 frag_size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
 
-		entries += (size >> NV_TX2_TSO_MAX_SHIFT) +
-			   ((size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
+		entries += (frag_size >> NV_TX2_TSO_MAX_SHIFT) +
+			   ((frag_size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
 	}
 
 	spin_lock_irqsave(&np->lock, flags);
@@ -2145,13 +2145,13 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* setup the fragments */
 	for (i = 0; i < fragments; i++) {
 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-		u32 size = skb_frag_size(frag);
+		u32 frag_size = skb_frag_size(frag);
 		offset = 0;
 
 		do {
 			prev_tx = put_tx;
 			prev_tx_ctx = np->put_tx_ctx;
-			bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
+			bcnt = (frag_size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : frag_size;
 			np->put_tx_ctx->dma = skb_frag_dma_map(
 							&np->pci_dev->dev,
 							frag, offset,
@@ -2163,12 +2163,12 @@ static netdev_tx_t nv_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 
 			offset += bcnt;
-			size -= bcnt;
+			frag_size -= bcnt;
 			if (unlikely(put_tx++ == np->last_tx.orig))
 				put_tx = np->first_tx.orig;
 			if (unlikely(np->put_tx_ctx++ == np->last_tx_ctx))
 				np->put_tx_ctx = np->first_tx_ctx;
-		} while (size);
+		} while (frag_size);
 	}
 
 	/* set last fragment flag  */
@@ -2217,10 +2217,10 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 
 	/* add fragments to entries count */
 	for (i = 0; i < fragments; i++) {
-		u32 size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+		u32 frag_size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
 
-		entries += (size >> NV_TX2_TSO_MAX_SHIFT) +
-			   ((size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
+		entries += (frag_size >> NV_TX2_TSO_MAX_SHIFT) +
+			   ((frag_size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
 	}
 
 	spin_lock_irqsave(&np->lock, flags);
@@ -2261,13 +2261,13 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 	/* setup the fragments */
 	for (i = 0; i < fragments; i++) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-		u32 size = skb_frag_size(frag);
+		u32 frag_size = skb_frag_size(frag);
 		offset = 0;
 
 		do {
 			prev_tx = put_tx;
 			prev_tx_ctx = np->put_tx_ctx;
-			bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
+			bcnt = (frag_size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : frag_size;
 			np->put_tx_ctx->dma = skb_frag_dma_map(
 							&np->pci_dev->dev,
 							frag, offset,
@@ -2280,12 +2280,12 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb,
 			put_tx->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 
 			offset += bcnt;
-			size -= bcnt;
+			frag_size -= bcnt;
 			if (unlikely(put_tx++ == np->last_tx.ex))
 				put_tx = np->first_tx.ex;
 			if (unlikely(np->put_tx_ctx++ == np->last_tx_ctx))
 				np->put_tx_ctx = np->first_tx_ctx;
-		} while (size);
+		} while (frag_size);
 	}
 
 	/* set last fragment flag  */
@@ -2933,11 +2933,11 @@ static void nv_set_multicast(struct net_device *dev)
 				struct netdev_hw_addr *ha;
 
 				netdev_for_each_mc_addr(ha, dev) {
-					unsigned char *addr = ha->addr;
+					unsigned char *hw_addr = ha->addr;
 					u32 a, b;
 
-					a = le32_to_cpu(*(__le32 *) addr);
-					b = le16_to_cpu(*(__le16 *) (&addr[4]));
+					a = le32_to_cpu(*(__le32 *) hw_addr);
+					b = le16_to_cpu(*(__le16 *) (&hw_addr[4]));
 					alwaysOn[0] &= a;
 					alwaysOff[0] &= ~a;
 					alwaysOn[1] &= b;
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net v4 2/4] forcedeth: remove unneeded stats updates
From: David Decotigny @ 2011-11-05  4:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc, David Decotigny
In-Reply-To: <cover.1320463771.git.david.decotigny@google.com>

Function ndo_get_stats() updates most of the stats from hardware
registers, making the manual updates un-needed. This change removes
these manual updates. Main exception is rx_missed_errors which needs
manual update. Another exception is rx_packets, still updated manually
in this commit to make sure this patch doesn't change behavior of
driver (this is addressed by a later commit in this series).



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |   35 +------------------------------
 1 files changed, 1 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 056986b..0bcb18e 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -2374,16 +2374,8 @@ static int nv_tx_done(struct net_device *dev, int limit)
 		if (np->desc_ver == DESC_VER_1) {
 			if (flags & NV_TX_LASTPACKET) {
 				if (flags & NV_TX_ERROR) {
-					if (flags & NV_TX_UNDERFLOW)
-						dev->stats.tx_fifo_errors++;
-					if (flags & NV_TX_CARRIERLOST)
-						dev->stats.tx_carrier_errors++;
 					if ((flags & NV_TX_RETRYERROR) && !(flags & NV_TX_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
-					dev->stats.tx_errors++;
-				} else {
-					dev->stats.tx_packets++;
-					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2392,16 +2384,8 @@ static int nv_tx_done(struct net_device *dev, int limit)
 		} else {
 			if (flags & NV_TX2_LASTPACKET) {
 				if (flags & NV_TX2_ERROR) {
-					if (flags & NV_TX2_UNDERFLOW)
-						dev->stats.tx_fifo_errors++;
-					if (flags & NV_TX2_CARRIERLOST)
-						dev->stats.tx_carrier_errors++;
 					if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK))
 						nv_legacybackoff_reseed(dev);
-					dev->stats.tx_errors++;
-				} else {
-					dev->stats.tx_packets++;
-					dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
 				}
 				dev_kfree_skb_any(np->get_tx_ctx->skb);
 				np->get_tx_ctx->skb = NULL;
@@ -2434,9 +2418,7 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
 		nv_unmap_txskb(np, np->get_tx_ctx);
 
 		if (flags & NV_TX2_LASTPACKET) {
-			if (!(flags & NV_TX2_ERROR))
-				dev->stats.tx_packets++;
-			else {
+			if (flags & NV_TX2_ERROR) {
 				if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK)) {
 					if (np->driver_data & DEV_HAS_GEAR_MODE)
 						nv_gear_backoff_reseed(dev);
@@ -2636,7 +2618,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					if ((flags & NV_RX_ERROR_MASK) == NV_RX_ERROR4) {
 						len = nv_getlen(dev, skb->data, len);
 						if (len < 0) {
-							dev->stats.rx_errors++;
 							dev_kfree_skb(skb);
 							goto next_pkt;
 						}
@@ -2650,11 +2631,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					else {
 						if (flags & NV_RX_MISSEDFRAME)
 							dev->stats.rx_missed_errors++;
-						if (flags & NV_RX_CRCERR)
-							dev->stats.rx_crc_errors++;
-						if (flags & NV_RX_OVERFLOW)
-							dev->stats.rx_over_errors++;
-						dev->stats.rx_errors++;
 						dev_kfree_skb(skb);
 						goto next_pkt;
 					}
@@ -2670,7 +2646,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					if ((flags & NV_RX2_ERROR_MASK) == NV_RX2_ERROR4) {
 						len = nv_getlen(dev, skb->data, len);
 						if (len < 0) {
-							dev->stats.rx_errors++;
 							dev_kfree_skb(skb);
 							goto next_pkt;
 						}
@@ -2682,11 +2657,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
 					}
 					/* the rest are hard errors */
 					else {
-						if (flags & NV_RX2_CRCERR)
-							dev->stats.rx_crc_errors++;
-						if (flags & NV_RX2_OVERFLOW)
-							dev->stats.rx_over_errors++;
-						dev->stats.rx_errors++;
 						dev_kfree_skb(skb);
 						goto next_pkt;
 					}
@@ -2704,7 +2674,6 @@ static int nv_rx_process(struct net_device *dev, int limit)
 		skb->protocol = eth_type_trans(skb, dev);
 		napi_gro_receive(&np->napi, skb);
 		dev->stats.rx_packets++;
-		dev->stats.rx_bytes += len;
 next_pkt:
 		if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
 			np->get_rx.orig = np->first_rx.orig;
@@ -2787,9 +2756,7 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 				__vlan_hwaccel_put_tag(skb, vid);
 			}
 			napi_gro_receive(&np->napi, skb);
-
 			dev->stats.rx_packets++;
-			dev->stats.rx_bytes += len;
 		} else {
 			dev_kfree_skb(skb);
 		}
-- 
1.7.3.1

^ permalink raw reply related

* [PATCH net v4 1/4] forcedeth: Acknowledge only interrupts that are being processed
From: David Decotigny @ 2011-11-05  4:04 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
	Jiri Pirko, Joe Perches, Szymon Janc, Mike Ditto, David Decotigny
In-Reply-To: <cover.1320463771.git.david.decotigny@google.com>

From: Mike Ditto <mditto@google.com>

This is to avoid a race, accidentally acknowledging an interrupt that
we didn't notice and won't immediately process.  This is based solely
on code inspection; it is not known if there was an actual bug here.



Signed-off-by: David Decotigny <david.decotigny@google.com>
---
 drivers/net/ethernet/nvidia/forcedeth.c |   13 ++++++++-----
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 1e37eb9..056986b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -3398,7 +3398,8 @@ static irqreturn_t nv_nic_irq_tx(int foo, void *data)
 
 	for (i = 0;; i++) {
 		events = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQ_TX_ALL;
-		writel(NVREG_IRQ_TX_ALL, base + NvRegMSIXIrqStatus);
+		writel(events, base + NvRegMSIXIrqStatus);
+		netdev_dbg(dev, "tx irq events: %08x\n", events);
 		if (!(events & np->irqmask))
 			break;
 
@@ -3509,7 +3510,8 @@ static irqreturn_t nv_nic_irq_rx(int foo, void *data)
 
 	for (i = 0;; i++) {
 		events = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQ_RX_ALL;
-		writel(NVREG_IRQ_RX_ALL, base + NvRegMSIXIrqStatus);
+		writel(events, base + NvRegMSIXIrqStatus);
+		netdev_dbg(dev, "rx irq events: %08x\n", events);
 		if (!(events & np->irqmask))
 			break;
 
@@ -3553,7 +3555,8 @@ static irqreturn_t nv_nic_irq_other(int foo, void *data)
 
 	for (i = 0;; i++) {
 		events = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQ_OTHER;
-		writel(NVREG_IRQ_OTHER, base + NvRegMSIXIrqStatus);
+		writel(events, base + NvRegMSIXIrqStatus);
+		netdev_dbg(dev, "irq events: %08x\n", events);
 		if (!(events & np->irqmask))
 			break;
 
@@ -3617,10 +3620,10 @@ static irqreturn_t nv_nic_irq_test(int foo, void *data)
 
 	if (!(np->msi_flags & NV_MSI_X_ENABLED)) {
 		events = readl(base + NvRegIrqStatus) & NVREG_IRQSTAT_MASK;
-		writel(NVREG_IRQ_TIMER, base + NvRegIrqStatus);
+		writel(events & NVREG_IRQ_TIMER, base + NvRegIrqStatus);
 	} else {
 		events = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK;
-		writel(NVREG_IRQ_TIMER, base + NvRegMSIXIrqStatus);
+		writel(events & NVREG_IRQ_TIMER, base + NvRegMSIXIrqStatus);
 	}
 	pci_push(base);
 	if (!(events & NVREG_IRQ_TIMER))
-- 
1.7.3.1

^ permalink raw reply related

* Re: [PATCH/RFC 00/11] HFSC patches
From: Patrick McHardy @ 2011-11-05  3:48 UTC (permalink / raw)
  To: Michal Soltys; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

On 05.11.2011 03:32, Michal Soltys wrote:
> Those are most of the patches I've been sitting on for a while. For the most
> part they are either small corrections or simplifications (marked with s) - but
> there are also some new things (marked with !), the most interesting being #8
> probably.
> 
> All changes are richly commented in respective commit messages. I've been using
> them for a while - so unless I missed some subtlety, all should be fine.

Thanks Michal. It has been quite a while since I've last looked
at this and this is complicated stuff, please give me a few days
to review your patches.

> Apart from these, there's still one subtle thing to do w.r.t. cl_cvtmin (during
> init_vf(), as this value is lagged relatively to the situation at the time of
> enqueue).
> 
> On a side note, I was thinking about something like hfsc-strict or so - where
> [uplink] interface could be upperlimited on hfsc qdisc level, but all the class
> upperlimit would be otherwise gone. Not sure if anyone would be even interested
> in something like that at all.

So classes would just use link-sharing curves? That's
already possible, so I probably don't get your idea.

^ permalink raw reply

* dst->obsolete has become pointless
From: David Miller @ 2011-11-05  3:09 UTC (permalink / raw)
  To: netdev; +Cc: steffen.klassert, timo.teras


While researching the things unearthed by Steffen Klassert wrt. PMTU
handling in the current tree I went to do some research on what the
real story is wrt. dst->obsolete.

And sure enough EVERY SINGLE ipv4 and ipv6 route is created with
obsolete set to -1, so we unconditionally always invoke ->dst_check().

This makes it completely pointless as an optimization to avoid calling
the dst_ops->dst_check() method.  It never triggers.

This stems from Timo's change to make route expiry properly visible
to IPSEC stacked routes:

--
commit d11a4dc18bf41719c9f0d7ed494d295dd2973b92
Author: Timo Teräs <timo.teras@iki.fi>
Date:   Thu Mar 18 23:20:20 2010 +0000

    ipv4: check rt_genid in dst_check
...
--

Only DecNET creates routes with obsolete initially set to zero, and
therefore only hits ->dst_check() when dst_free is invoked on the route
during a flush of the decnet routing tables.

And actually this is how ipv4 operated before we started using
generation counts instead of flushing the entire table.  IPV6 seems to
always have used the FIB6 tree serial numbers for expiration checking
and therefore always set obsolete to -1 on new routes.

So we can't just get rid of the dst->obsolete check in dst_check() and
__sk_dst_check() because that will break DecNET because DecNET's
->dst_check() handler assumes that if it was called then the route
is obsolete and it just plainly returns NULL to tell the caller the
route is in fact invalid.

The current situation looks quite terrible, because these functions look
like they optimize away the check op call, but in reality for ipv4 and
ipv6 they do not.

^ permalink raw reply

* Re: NFSROOT mount fails on SPARC after 2.6.37
From: Trond Myklebust @ 2011-11-05  2:38 UTC (permalink / raw)
  To: David Miller
  Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA, linux-GLgANOly0l1BDLzU/O5InQ
In-Reply-To: <20111104.220334.1213367819937218830.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On Fri, 2011-11-04 at 22:03 -0400, David Miller wrote: 
> Should be fixed by:
> 
> commit 3fb72f1e6e6165c5f495e8dc11c5bbd14c73385c
> Author: Micha Nelissen <micha-NMpVyAPB7R5Fb86tMAXRjw@public.gmane.org>
> Date:   Thu May 19 10:14:06 2011 +0000
> 
>     ipconfig wait for carrier
>     
>     v3 -> v4: fix return boolean false instead of 0 for ic_is_init_dev
>     
>     Currently the ip auto configuration has a hardcoded delay of 1 second.
>     When (ethernet) link takes longer to come up (e.g. more than 3 seconds),
>     nfs root may not be found.
>     
>     Remove the hardcoded delay, and wait for carrier on at least one network
>     device.

What will happen if there is more than one NIC, but only one if those
NICs is being configured with a static IP address (i.e. no auto
configuration)?

Trond
-- 
Trond Myklebust
Linux NFS client maintainer

NetApp
Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org
www.netapp.com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 11/11] sch_hfsc.c: remove cl_vtadj, shift virtual curve instead
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

Instead of keeping intra-period difference, we can simply shift
the virtual curve to the right (as the adjustment is permanent).
---
 net/sched/sch_hfsc.c |   16 +++++++---------
 1 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 2109878..bf38551 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -151,8 +151,6 @@ struct hfsc_class {
 	u64	cl_cvtmin;		/* minimal virtual time among the
 					   children fit for link-sharing
 					   (monotonic within a period) */
-	u64	cl_vtadj;		/* intra-period cumulative vt
-					   adjustment */
 	u64	cl_cvtoff;		/* last (biggest) vt seen between
 					   siblings */
 
@@ -764,7 +762,6 @@ init_vf(struct hfsc_class *cl, unsigned int len)
 			/* update the virtual curve */
 			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
 
-			cl->cl_vtadj = 0;
 			cl->cl_vtperiod++;
 			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
 			if (cl->cl_parent->cl_nactive == 0)
@@ -810,17 +807,19 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 			continue;
 
 		/* update vt */
-		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj;
+		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total);
 
 		/*
 		 * fixup vt due to upperlimit (if applicable)
 		 *
-		 * if vt of the class is smaller than cvtmin,
-		 * the class was skipped in the past due to non-fit.
-		 * if so, we need to adjust vtadj.
+		 * if we detect we're lagging past virtual times of other siblings,
+		 * that means we were skipped in the past due to non-fit.
+		 *
+		 * If so we adjust vt and shift virtual curve to match current
+		 * situation.
 		 */
 		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
-			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+			cl->cl_virtual.x += cl->cl_parent->cl_cvtmin - cl->cl_vt;
 			cl->cl_vt = cl->cl_parent->cl_cvtmin;
 		}
 
@@ -1527,7 +1526,6 @@ hfsc_reset_class(struct hfsc_class *cl)
 	cl->cl_d            = 0;
 	cl->cl_e            = 0;
 	cl->cl_vt           = 0;
-	cl->cl_vtadj        = 0;
 	cl->cl_cvtmin       = 0;
 	cl->cl_cvtoff       = 0;
 	cl->cl_vtperiod     = 0;
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 10/11] sch_hfsc.c: make sure classes are able to use 1st segment on fresh backlog period
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

This change guarantees, that when a class starts fresh backlog period,
it will be able to use its 1st segment for subsequent vt update.
---
 net/sched/sch_hfsc.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 26cdfaa..2109878 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -88,6 +88,7 @@ struct internal_sc {
 	u64	dy;	/* the y-projection of the 1st segment */
 	u64	sm2;	/* scaled slope of the 2nd segment */
 	u64	ism2;	/* scaled inverse-slope of the 2nd segment */
+	u64	i2dy;	/* x-projection of dy, using 2nd slope */
 };
 
 /* runtime service curve */
@@ -531,6 +532,7 @@ sc2isc(struct tc_service_curve *sc, struct internal_sc *isc)
 	isc->dy   = seg_x2y(isc->dx, isc->sm1);
 	isc->sm2  = m2sm(sc->m2);
 	isc->ism2 = m2ism(sc->m2);
+	isc->i2dy = seg_y2x(isc->dy, isc->ism2);
 }
 
 /*
@@ -885,8 +887,8 @@ set_passive(struct hfsc_class *cl)
 			break;
 
 		/* update cl_cvtoff of the parent class */
-		if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
-			cl->cl_parent->cl_cvtoff = cl->cl_vt;
+		if (cl->cl_vt + cl->cl_fsc.i2dy > cl->cl_parent->cl_cvtoff)
+			cl->cl_parent->cl_cvtoff = cl->cl_vt + cl->cl_fsc.i2dy;
 
 		/* remove this class from the parent's vt & cf trees */
 		vttree_remove(cl);
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 09/11] sch_hfsc.c: split update_vf()
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

Split update_vf() into 2 spearate functions, one responsible for just
updating the vt, and the other responsible for setting the class [and
possibly parent nodes] passive.

This is actually how it was once done in the past, but at some point it
was merged into one larger function.

Two functions are shorter and cleaner, and during normal update_vf() it
has a bit less work to do.
---
 net/sched/sch_hfsc.c |   51 +++++++++++++++++--------------------------------
 1 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index e73d2e0..26cdfaa 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -800,15 +800,11 @@ static void
 update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 {
 	u64 f; /* , myf_bound, delta; */
-	int go_passive = 0;
-
-	if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC)
-		go_passive = 1;
 
 	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
 		cl->cl_total += len;
 
-		if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
+		if (!(cl->cl_flags & HFSC_FSC))
 			continue;
 
 		/* update vt */
@@ -826,27 +822,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 			cl->cl_vt = cl->cl_parent->cl_cvtmin;
 		}
 
-		if (go_passive && --cl->cl_nactive == 0)
-			go_passive = 1;
-		else
-			go_passive = 0;
-
-		if (go_passive) {
-			/* no more active child, going passive */
-
-			/* update cl_cvtoff of the parent class */
-			if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
-				cl->cl_parent->cl_cvtoff = cl->cl_vt;
-
-			/* remove this class from the vt tree */
-			vttree_remove(cl);
-
-			cftree_remove(cl);
-			update_cfmin(cl->cl_parent);
-
-			continue;
-		}
-
 		/* update the vt tree */
 		vttree_update(cl);
 
@@ -899,15 +874,27 @@ set_active(struct hfsc_class *cl, unsigned int len)
 static void
 set_passive(struct hfsc_class *cl)
 {
+	list_del(&cl->dlist);
+
 	if (cl->cl_flags & HFSC_RSC)
 		eltree_remove(cl);
 
-	list_del(&cl->dlist);
+	if (cl->cl_flags & HFSC_FSC)
+	for (; cl->cl_parent != NULL; cl = cl->cl_parent) {
+		if (--cl->cl_nactive > 0)
+			break;
 
-	/*
-	 * vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
-	 * needs to be called explicitly to remove a class from vttree.
-	 */
+		/* update cl_cvtoff of the parent class */
+		if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
+			cl->cl_parent->cl_cvtoff = cl->cl_vt;
+
+		/* remove this class from the parent's vt & cf trees */
+		vttree_remove(cl);
+		cftree_remove(cl);
+
+		/* update cfmin of the parent, after removal */
+		update_cfmin(cl->cl_parent);
+	}
 }
 
 static unsigned int
@@ -1286,7 +1273,6 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
 	struct hfsc_class *cl = (struct hfsc_class *)arg;
 
 	if (cl->qdisc->q.qlen == 0) {
-		update_vf(cl, 0, 0);
 		set_passive(cl);
 	}
 }
@@ -1729,7 +1715,6 @@ hfsc_drop(struct Qdisc *sch)
 		if (cl->qdisc->ops->drop != NULL &&
 		    (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) {
 			if (cl->qdisc->q.qlen == 0) {
-				update_vf(cl, 0, 0);
 				set_passive(cl);
 			} else {
 				list_move_tail(&cl->dlist, &q->droplist);
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 08/11] sch_hfsc.c: update speed table, add explanations, increase accuracy
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

This patch:

- updates the "speed table" in commented section before service curve
  helpers, to reflect current PSCHED_SHIFT value, and how things look in
  current kernels

- adds detailed explanation about possible shifts

- updates seg_x2y() and seg_y2x() functions to retain full accuracy
  regardless of the chosen shifts

- with help of seg_*() changes, and one div64_u64(), allows slopes to be
  shifted by 32

- swaps do_div()s to div_u64()s and <linux/math64.h> header

Another thing, that got fixed as a side-effect, is that earlier way of
deriving [I]SM_SHIFT was just an estimate and didn't yield proper values
after changing PSCHED_SHIFT. For example, if PSCHED_SHIFT was
changed from 6 to 8:

at 100 kbit:  0.0032 bytes / tick
at 1 gbit:        32 bytes / tick -> 0.03125 ticks / byte

Under assumption of keeping at least 4 full decimal digits:

0.0032  requires shift = 22 (log2(10000/.0032))
0.03125 requires shift = 19

But with old definitions:

SM_SHIFT = (30 - PSCHED_SHIFT)
ISM_SHIFT = (8 + PSCHED_SHIFT)

we would get 22 for SM_SHIFT (still fine, but for PSCHED_SHIFT == 10, we
would be 1 too little) and 16 for ISM_SHIFT (too small).

Not an issue anymore, as shifts are now set simply to 32.

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |  190 +++++++++++++++++++++++++++++++-------------------
 1 files changed, 117 insertions(+), 73 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index ceff8a6..e73d2e0 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -63,10 +63,10 @@
 #include <linux/init.h>
 #include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
+#include <linux/math64.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
-#include <asm/div64.h>
 
 /*
  * kernel internal service curve representation:
@@ -350,80 +350,147 @@ cftree_update(struct hfsc_class *cl)
 }
 
 /*
- * service curve support functions
+ * -- service curve support functions --
  *
  *  external service curve parameters
- *	m: bps
- *	d: us
+ *	m: bps (bytes per second)
+ *	d: us (microseconds)
  *  internal service curve parameters
- *	sm: (bytes/psched_us) << SM_SHIFT
- *	ism: (psched_us/byte) << ISM_SHIFT
- *	dx: psched_us
+ *	sm:  (bytes/pt) << SM_SHIFT
+ *	ism: (pts/byte) << ISM_SHIFT
+ *	dx: pts (psched ticks)
  *
- * The clock source resolution with ktime and PSCHED_SHIFT 10 is 1.024us.
+ * pt stands for psched tick. With PSCHED_SHIFT = 6, we have
+ * PSCHED_TICKS_PER_SEC = 1e9 >> 6 - so 1 psched tick is 64ns.
  *
  * sm and ism are scaled in order to keep effective digits.
  * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective
- * digits in decimal using the following table.
+ * full digits in decimal.
  *
- *  bits/sec      100Kbps     1Mbps     10Mbps     100Mbps    1Gbps
- *  ------------+-------------------------------------------------------
- *  bytes/1.024us 12.8e-3    128e-3     1280e-3    12800e-3   128000e-3
+ * -- calculation info, assuming 64ns psched tick --
  *
- *  1.024us/byte  78.125     7.8125     0.78125    0.078125   0.0078125
+ * 1gbit = 125,000,000 bytes / 1 s = 8 bytes / 1 pt
+ * inverse of that is 0.125 pts / 1 byte
+ * to keep our assumed 4 effective digits, we need to mul that by 8e4
+ * log2(8e4) ~= 16.3 - thus ISM_SHIFT = 17
  *
- * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18.
+ * similary:
+ * 10kbit = 1,250 bytes / 1 s = 0.00008 bytes / 1 pt
+ * we want at least full 4 digits, so we need to mul by 125e6
+ * log2(125e6) ~= 26.9 - thus SM_SHIFT = 27
+ *
+ *  bits/sec         10kbit   100kbit   1mbit    10mbit    100mbit    1gbit
+ *  ---------------+---------------------------------------------------------
+ *  bytes/pt (sm)     8e-5     8e-4     8e-3      8e-2      8e-1        8
+ *  pts/byte (ism)    12500    1250      125     125e-1     125e-2    125e-3
+ *
+ * as you can see:
+ *
+ * SM_SHIFT comes from the left-top
+ * ISM_SHIFT comes from the right-bottom
+ *
+ * In the code below we actually use flat 32bit shifts, giving us decent
+ * headroom on both ends of the above table.
+ *
+ * -- about allowed values (under assumption of 64ns psched tick) --
+ *
+ * We have to make sure, we have no overflows anywhere in the code. It's not a
+ * problem to just scale sm and ism, but then during computations we would
+ * easily end with >64bit values (remember our time resolution is 64ns).
+ *
+ * 1) seg_x2y and seg_y2x
+ *
+ * These use "school" math to derive final values, making sure all possible
+ * accuracy is preserved, regardless of the shifts. Note, that with this
+ * method we could use arbitrarily scaled sm/ism values, but we would have
+ * problems with divisions in the other places (and more complex code).
+ *
+ * 2) initialization functions: m2sm, m2ism, d2dx, dx2d
+ *
+ * These are used only during [re]setup/reports. Nothing particulary special
+ * about those, as user input is limited to 32bit. One remark though:
+ *
+ * Theoretically, due to rounding up in d2dx() and m2sm(), it's possible that
+ * in dx2d() and sm2m() we could overflow as well, but that would require input
+ * values near 2^32-1 provided during d2dx() and m2sm(). Just in case, we catch
+ * that possibility with WARN_ON().
+ *
+ * 3) rtsc_min
+ *
+ * The only other place, where a shift is used directly due to division. We are
+ * forced to use 64/64 division here: the (y1 - y) difference is guaranteed to
+ * take more than 32 bits due to the shift. Similary, the dsm difference is
+ * scaled, and with scalers == 32 will require more than 32 bits.
+ *
+ * Note - since v2.6.37-rc1~105^2~18, full 64/64 bit division produces fully
+ * accurate results.
+ *
+ */
+
+/*
+ * Within 10kbit .. 1gbit range, 32bit shifts still manage to keep 4 decimal
+ * digits even with PSCHED_SHIFT == 1. For reasonable == 6 we have currently
+ * there's lots of headroom for even smaller or faster speeds.
  */
-#define	SM_SHIFT	(30 - PSCHED_SHIFT)
-#define	ISM_SHIFT	(8 + PSCHED_SHIFT)
 
-#define	SM_MASK		((1ULL << SM_SHIFT) - 1)
-#define	ISM_MASK	((1ULL << ISM_SHIFT) - 1)
+#define SM_SHIFT	32
+#define ISM_SHIFT	32
+
+#define SM_MASK		((1ULL << SM_SHIFT) - 1)
+#define ISM_MASK	((1ULL << ISM_SHIFT) - 1)
+#define B32_MASK	((1ULL << 32) - 1)
+
+/* for readability */
+#define PTPS		PSCHED_TICKS_PER_SEC
+#define USPS		USEC_PER_SEC
 
 static inline u64
 seg_x2y(u64 x, u64 sm)
 {
-	u64 y;
-
 	/*
-	 * compute
-	 *	y = x * sm >> SM_SHIFT
-	 * but divide it for the upper and lower bits to avoid overflow
+	 * perform 3-stage computation, to allow shifts as high
+	 * as 32 while retaining full accuracy
 	 */
-	y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
-	return y;
+	return
+	(( (x & B32_MASK) * (sm & B32_MASK) ) >> SM_SHIFT) +
+	((
+		( (x >> 32) * sm ) +
+		( (x & B32_MASK) * (sm >> 32) )
+	) << (32 - SM_SHIFT));
 }
 
 static inline u64
 seg_y2x(u64 y, u64 ism)
 {
-	u64 x;
-
-	/* callers guarantee that ism is not infinity */
-#if 0
-	if (y == 0)
-		x = 0;
-	else if (ism == HT_INFINITY)
-		x = HT_INFINITY;
-	else {
-		x = (y >> ISM_SHIFT) * ism
-		    + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
-	}
-#endif
-	x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
-	return x;
+	/*
+	 * callers guarantee that ism is sane, otherwise we would have to check
+	 * for HT_INFINITY and return it in case of match
+	 */
+	/*
+	 * perform 3-stage computation, to allow shifts as high
+	 * as 32 while retaining full accuracy
+	 */
+	return
+	(( (y & B32_MASK) * (ism & B32_MASK) ) >> ISM_SHIFT) +
+	((
+		( (y >> 32) * ism ) +
+		( (y & B32_MASK) * (ism >> 32) )
+	) << (32 - ISM_SHIFT));
 }
 
 /* Convert m (bps) into sm (bytes/psched us) */
 static u64
 m2sm(u32 m)
 {
-	u64 sm;
+	WARN_ON(m & (1<<31));
+	return div_u64(((u64)m << SM_SHIFT) + PTPS - 1, PTPS);
+}
 
-	sm = ((u64)m << SM_SHIFT);
-	sm += PSCHED_TICKS_PER_SEC - 1;
-	do_div(sm, PSCHED_TICKS_PER_SEC);
-	return sm;
+/* convert sm (bytes/psched us) into m (bps) */
+static u32
+sm2m(u64 sm)
+{
+	return (u32)((sm * PTPS) >> SM_SHIFT);
 }
 
 /* convert m (bps) into ism (psched us/byte) */
@@ -435,9 +502,7 @@ m2ism(u32 m)
 	if (m == 0)
 		ism = HT_INFINITY;
 	else {
-		ism = ((u64)PSCHED_TICKS_PER_SEC << ISM_SHIFT);
-		ism += m - 1;
-		do_div(ism, m);
+		ism = div_u64(((u64)PTPS << ISM_SHIFT) + m - 1, m);
 	}
 	return ism;
 }
@@ -446,33 +511,15 @@ m2ism(u32 m)
 static u64
 d2dx(u32 d)
 {
-	u64 dx;
-
-	dx = ((u64)d * PSCHED_TICKS_PER_SEC);
-	dx += USEC_PER_SEC - 1;
-	do_div(dx, USEC_PER_SEC);
-	return dx;
-}
-
-/* convert sm (bytes/psched us) into m (bps) */
-static u32
-sm2m(u64 sm)
-{
-	u64 m;
-
-	m = (sm * PSCHED_TICKS_PER_SEC) >> SM_SHIFT;
-	return (u32)m;
+	WARN_ON(d & (1<<31));
+	return div_u64(((u64)d * PTPS) + USPS - 1, USPS);
 }
 
 /* convert dx (psched us) into d (us) */
 static u32
 dx2d(u64 dx)
 {
-	u64 d;
-
-	d = dx * USEC_PER_SEC;
-	do_div(d, PSCHED_TICKS_PER_SEC);
-	return (u32)d;
+	return (u32)div_u64(dx * USPS, PTPS);
 }
 
 static void
@@ -553,7 +600,6 @@ static void
 rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
 {
 	u64 y1, y2, dx, dy;
-	u32 dsm;
 
 	if (isc->sm1 <= isc->sm2) {
 		/* service curve is convex or linear */
@@ -602,9 +648,7 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
 	 * function of seg_x2y()
 	 *	seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y)
 	 */
-	dx = (y1 - y) << SM_SHIFT;
-	dsm = isc->sm1 - isc->sm2;
-	do_div(dx, dsm);
+	dx = div64_u64((y1 - y) << SM_SHIFT, isc->sm1 - isc->sm2);
 	/*
 	 * check if (x, y1) belongs to the 1st segment of rtsc.
 	 * if so, add the offset.
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 06/11] sch_hfsc.c: seg_y2x(), rtsc_y2x(), hfsc_change_class() minor changes
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

hfsc_change_class():

Make sure that invalid input (flat 2nd segment, USC without FSC) is not
allowed. Even though userspace (tc) does the respective checks as well - we
want to verify if everything is in order before the following seg_y2x() change
(and some other userspace tools not doing such checks might exist):

seg_y2x():

All callers (actually there's only one - rtsc_y2x()) guarantee that the
slope argument actually makese sense (otherwise the results would be
unusable) - so we don't heave to check for +inf case.

rtsc_y2x():

In case of flat 1st segment of a service curve, the function always
returns maximum a, such that f(a) = y (for our curves then, a = x + dx).
To keep things consistent (and continuous), we also return x+dx for
values < y.
---
 net/sched/sch_hfsc.c |   39 +++++++++++++++++++++++++++++----------
 1 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index e0cf11b..8da2d88 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -400,6 +400,8 @@ seg_y2x(u64 y, u64 ism)
 {
 	u64 x;
 
+	/* callers guarantee that ism is not infinity */
+#if 0
 	if (y == 0)
 		x = 0;
 	else if (ism == HT_INFINITY)
@@ -408,6 +410,8 @@ seg_y2x(u64 y, u64 ism)
 		x = (y >> ISM_SHIFT) * ism
 		    + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
 	}
+#endif
+	x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT);
 	return x;
 }
 
@@ -509,7 +513,7 @@ rtsc_y2x(struct runtime_sc *rtsc, u64 y)
 {
 	u64 x;
 
-	if (y < rtsc->y)
+	if (y < rtsc->y && rtsc->dy != 0)
 		x = rtsc->x;
 	else if (y <= rtsc->y + rtsc->dy) {
 		/* x belongs to the 1st segment */
@@ -985,22 +989,40 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 
 	if (tb[TCA_HFSC_RSC]) {
 		rsc = nla_data(tb[TCA_HFSC_RSC]);
-		if (rsc->m1 == 0 && rsc->m2 == 0)
-			rsc = NULL;
+		if (rsc->m2 == 0) {
+			if (rsc->m1 != 0)
+				return -EINVAL;
+			else
+				rsc = NULL;
+		}
 	}
 
 	if (tb[TCA_HFSC_FSC]) {
 		fsc = nla_data(tb[TCA_HFSC_FSC]);
-		if (fsc->m1 == 0 && fsc->m2 == 0)
-			fsc = NULL;
+		if (fsc->m2 == 0) {
+			if (fsc->m1 != 0)
+				return -EINVAL;
+			else
+				fsc = NULL;
+		}
 	}
 
 	if (tb[TCA_HFSC_USC]) {
 		usc = nla_data(tb[TCA_HFSC_USC]);
-		if (usc->m1 == 0 && usc->m2 == 0)
-			usc = NULL;
+		if (usc->m2 == 0) {
+			if (usc->m1 != 0)
+				return -EINVAL;
+			else
+				usc = NULL;
+		}
 	}
 
+	if (rsc == NULL && fsc == NULL)
+		return -EINVAL;
+
+	if (fsc == NULL && usc != NULL)
+		return -EINVAL;
+
 	if (cl != NULL) {
 		if (parentid) {
 			if (cl->cl_parent &&
@@ -1053,9 +1075,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	if (hfsc_find_class(classid, sch))
 		return -EEXIST;
 
-	if (rsc == NULL && fsc == NULL)
-		return -EINVAL;
-
 	cl = kzalloc(sizeof(struct hfsc_class), GFP_KERNEL);
 	if (cl == NULL)
 		return -ENOBUFS;
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 07/11] sch_hfsc.c: make cl_vt keep all periods in itself
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

Currently, cl_vt is kept at all cost to be relative to actual backlog
period - thus we have cl_vtoff that is carefully updated to track
current "start" of vt; cl_vtmax, that keeps highest vt seen among
siblings in last backlog period; cl_cvtoff that keeps sum of all
previous cl_vtmax.

Despite that, the actual service curve is always updated from "full" vt,
and subsequent computations have vtoff part subtracted.

This is unnecessary, as vt can simply keep all vtoff values in itself -
while keeping the algorithm behaviour exactly the same.

This change allows us to remove cl_vtoff (no longer needed - part of
cl_vt now) and cl_cvtmax (no longer needed, cl_cvtoff is updated
directly from cl_vt).

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |   37 ++++++++++++-------------------------
 1 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 8da2d88..ceff8a6 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -152,9 +152,8 @@ struct hfsc_class {
 					   (monotonic within a period) */
 	u64	cl_vtadj;		/* intra-period cumulative vt
 					   adjustment */
-	u64	cl_vtoff;		/* inter-period cumulative vt offset */
-	u64	cl_cvtmax;		/* max child's vt in the last period */
-	u64	cl_cvtoff;		/* cumulative cvtmax of all periods */
+	u64	cl_cvtoff;		/* last (biggest) vt seen between
+					   siblings */
 
 	struct internal_sc cl_rsc;	/* internal real-time service curve */
 	struct internal_sc cl_fsc;	/* internal fair service curve */
@@ -710,26 +709,17 @@ init_vf(struct hfsc_class *cl, unsigned int len)
 			} else {
 				/*
 				 * first child for a new parent backlog period.
-				 * add parent's cvtmax to cvtoff to make a new
-				 * vt (vtoff + vt) larger than the vt in the
-				 * last period for all children.
+				 * use the biggest vt seen between siblings so far
 				 */
-				vt = cl->cl_parent->cl_cvtmax;
-				cl->cl_parent->cl_cvtoff += vt;
-				cl->cl_parent->cl_cvtmax = 0;
-				cl->cl_parent->cl_cvtmin = 0;
-				cl->cl_vt = 0;
+				cl->cl_vt = cl->cl_parent->cl_cvtoff;
+				cl->cl_parent->cl_cvtmin = cl->cl_vt;
 			}
 
-			cl->cl_vtoff = cl->cl_parent->cl_cvtoff;
-
 			/* update the virtual curve */
-			vt = cl->cl_vt + cl->cl_vtoff;
-			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
-						      cl->cl_total);
-			cl->cl_vtadj = 0;
+			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total);
 
-			cl->cl_vtperiod++;  /* increment vt period */
+			cl->cl_vtadj = 0;
+			cl->cl_vtperiod++;
 			cl->cl_parentperiod = cl->cl_parent->cl_vtperiod;
 			if (cl->cl_parent->cl_nactive == 0)
 				cl->cl_parentperiod++;
@@ -778,8 +768,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 			continue;
 
 		/* update vt */
-		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
-			    - cl->cl_vtoff + cl->cl_vtadj;
+		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj;
 
 		/*
 		 * fixup vt due to upperlimit (if applicable)
@@ -801,9 +790,9 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 		if (go_passive) {
 			/* no more active child, going passive */
 
-			/* update cvtmax of the parent class */
-			if (cl->cl_vt > cl->cl_parent->cl_cvtmax)
-				cl->cl_parent->cl_cvtmax = cl->cl_vt;
+			/* update cl_cvtoff of the parent class */
+			if (cl->cl_vt > cl->cl_parent->cl_cvtoff)
+				cl->cl_parent->cl_cvtoff = cl->cl_vt;
 
 			/* remove this class from the vt tree */
 			vttree_remove(cl);
@@ -1507,9 +1496,7 @@ hfsc_reset_class(struct hfsc_class *cl)
 	cl->cl_e            = 0;
 	cl->cl_vt           = 0;
 	cl->cl_vtadj        = 0;
-	cl->cl_vtoff        = 0;
 	cl->cl_cvtmin       = 0;
-	cl->cl_cvtmax       = 0;
 	cl->cl_cvtoff       = 0;
 	cl->cl_vtperiod     = 0;
 	cl->cl_parentperiod = 0;
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 05/11] sch_hfsc.c: don't subtract from cl_vtoff and cl_cvtoff
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

Using pcvtoff to keep vtoff as close to 0 as possible is not necessary. All
curves (based on real or virtual time) use the same mechanics, and real time
based ones are obviously not even allowed anything like that.

This might be some leftover from earlier hfsc versions (or perhaps versions
[that are/were ?] limited to 32 bit values only, where such thing would help a
bit against hitting overflow ... though what about RSC/USC ?).

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |   11 +----------
 1 files changed, 1 insertions(+), 10 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 97d720c..e0cf11b 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -155,8 +155,6 @@ struct hfsc_class {
 	u64	cl_vtoff;		/* inter-period cumulative vt offset */
 	u64	cl_cvtmax;		/* max child's vt in the last period */
 	u64	cl_cvtoff;		/* cumulative cvtmax of all periods */
-	u64	cl_pcvtoff;		/* parent's cvtoff at initialization
-					   time */
 
 	struct internal_sc cl_rsc;	/* internal real-time service curve */
 	struct internal_sc cl_fsc;	/* internal fair service curve */
@@ -719,17 +717,12 @@ init_vf(struct hfsc_class *cl, unsigned int len)
 				cl->cl_vt = 0;
 			}
 
-			cl->cl_vtoff = cl->cl_parent->cl_cvtoff -
-							cl->cl_pcvtoff;
+			cl->cl_vtoff = cl->cl_parent->cl_cvtoff;
 
 			/* update the virtual curve */
 			vt = cl->cl_vt + cl->cl_vtoff;
 			rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt,
 						      cl->cl_total);
-			if (cl->cl_virtual.x == vt) {
-				cl->cl_virtual.x -= cl->cl_vtoff;
-				cl->cl_vtoff = 0;
-			}
 			cl->cl_vtadj = 0;
 
 			cl->cl_vtperiod++;  /* increment vt period */
@@ -1102,7 +1095,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
 	if (parent->level == 0)
 		hfsc_purge_queue(sch, parent);
 	hfsc_adjust_levels(parent);
-	cl->cl_pcvtoff = parent->cl_cvtoff;
 	sch_tree_unlock(sch);
 
 	qdisc_class_hash_grow(sch, &q->clhash);
@@ -1500,7 +1492,6 @@ hfsc_reset_class(struct hfsc_class *cl)
 	cl->cl_cvtmin       = 0;
 	cl->cl_cvtmax       = 0;
 	cl->cl_cvtoff       = 0;
-	cl->cl_pcvtoff      = 0;
 	cl->cl_vtperiod     = 0;
 	cl->cl_parentperiod = 0;
 	cl->cl_f            = 0;
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 04/11] sch_hfsc.c: remove initial check in vttree_get_minvt() (optional)
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

Remove initial check in vttree_get_minvt(), as it's implicitly
handled by vttree_firstfit().

Also see:
http://marc.info/?l=linux-netdev&m=128457329614914&w=2

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |    4 ----
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 710cbda..97d720c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -307,10 +307,6 @@ vttree_firstfit(struct hfsc_class *cl, u64 cur_time)
 static struct hfsc_class *
 vttree_get_minvt(struct hfsc_class *cl, u64 cur_time)
 {
-	/* if root-class's cfmin is bigger than cur_time nothing to do */
-	if (cl->cl_cfmin > cur_time)
-		return NULL;
-
 	while (cl->level > 0) {
 		cl = vttree_firstfit(cl, cur_time);
 		if (cl == NULL)
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 03/11] sch_hfsc.c: rtsc_min() convex/convex case fixup
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

The condition, which chooses smaller curve in convex/convex case is a bit too
simple. If we actually have intersecting curves, and base the test on initial
(x,y), instead of (x+dx,y+dy), the we can choose wrong curve:

                     +
                *   +
               *++++
       x,y  ++*+
    new ++++ *
            *
        ****
old ****

>From the above art, (x,y) will win the comparison and old curve will be chosen,
but it's new that should be selected (as its infinite 2nd segment is below the
old one).

Another possibility is to flatten the 1st segment (see hfsc paper), which has
an effect for *y2x() functions to return max x such that f(x) = y. In this way
the old curve would take precedence on the initial part. Worth considering for
another patch.

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |   22 +++++++++++++++-------
 1 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 6b73725..710cbda 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -559,13 +559,21 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
 	u32 dsm;
 
 	if (isc->sm1 <= isc->sm2) {
-		/* service curve is convex */
-		y1 = rtsc_x2y(rtsc, x);
-		if (y1 < y)
-			/* the current rtsc is smaller */
-			return;
-		rtsc->x = x;
-		rtsc->y = y;
+		/* service curve is convex or linear */
+		y1 = rtsc_x2y(rtsc, x + isc->dx);
+		if (y1 > y + isc->dy) {
+			/*
+			 * sm1/sm2 junction point is below the old curve, so
+			 * update the curve with ours; note that for strictly
+			 * convex curves a brief part of the first segment
+			 * might be above, but it's important that 2nd infinite
+			 * segment wins (which is below the old curve in this
+			 * case)
+			 */
+			rtsc->x = x;
+			rtsc->y = y;
+		}
+		/* else: the current rtsc is smaller */
 		return;
 	}
 
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 02/11] sch_hfsc.c: go passive after cl_vt update in update_vf()
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

In contrast to RT and UL curves - which are based on actual time and
can always be properly initialized, LS is based on virtual time, which
we initialize manually, in three possible ways:

1. (max + min) / 2 of active siblings (min must not be upperlimited)
2. do not adjust existing vt, if parent has the same backlog period and
   newly calculated vt is smaller
3. initialize to cumulative cvtmax of all periods

In the hfsc paper, update_vf() was not responsible for setting class
passive - it only updated linksharing related stuff. So update
chores first, passivity second (if applicable).

In one of the patches to altq, setting class passive was merged into
update_vf(), but in doing so, go_passive condition was checked /before/
vt update.

This conflicts with point (2.), as vt tested at that point doesn't
reflect the situation after the last dequeue - and we update virtual
time curve later, after updating vt itself.

It also kind of conflicts with (3.) for single-packet backlog periods,
as vt will never move to the right, but total work will keep increasing
(this is handled by rtsc_min() conditions though, so the results are
still generally valid).

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |   33 +++++++++++++++++----------------
 1 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 261accc..6b73725 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -776,6 +776,22 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 		if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0)
 			continue;
 
+		/* update vt */
+		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
+			    - cl->cl_vtoff + cl->cl_vtadj;
+
+		/*
+		 * fixup vt due to upperlimit (if applicable)
+		 *
+		 * if vt of the class is smaller than cvtmin,
+		 * the class was skipped in the past due to non-fit.
+		 * if so, we need to adjust vtadj.
+		 */
+		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
+			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
+			cl->cl_vt = cl->cl_parent->cl_cvtmin;
+		}
+
 		if (go_passive && --cl->cl_nactive == 0)
 			go_passive = 1;
 		else
@@ -797,25 +813,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 			continue;
 		}
 
-		/*
-		 * update vt and f
-		 */
-		cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total)
-			    - cl->cl_vtoff + cl->cl_vtadj;
-
-		/*
-		 * if vt of the class is smaller than cvtmin,
-		 * the class was skipped in the past due to non-fit.
-		 * if so, we need to adjust vtadj.
-		 */
-		if (cl->cl_vt < cl->cl_parent->cl_cvtmin) {
-			cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt;
-			cl->cl_vt = cl->cl_parent->cl_cvtmin;
-		}
-
 		/* update the vt tree */
 		vttree_update(cl);
 
+		/* update ft */
 		if (cl->cl_flags & HFSC_USC) {
 			cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit,
 							      cl->cl_total);
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH 01/11] sch_hfsc.c: update_d() fixup
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev
In-Reply-To: <1320460377-8682-1-git-send-email-soltys@ziu.info>

The deadline time is generated from total rt work + size of the next packet.

Now, when a packet gets dequeued by linkshare criterion, we have to
update the deadline time to match the new situation (that's the job of
update_d()) - but to do that properly, we have to subtract the size of
the packet just dequeued, when calling rtsc_min().

This is actually stated in hfsc paper very clearly, but got (probably)
missed during altq days (or due to some patch at some point).

Signed-off-by: Michal Soltys <soltys@ziu.info>
---
 net/sched/sch_hfsc.c |   14 ++++++++------
 1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 6488e64..261accc 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -650,9 +650,9 @@ update_ed(struct hfsc_class *cl, unsigned int next_len)
 }
 
 static inline void
-update_d(struct hfsc_class *cl, unsigned int next_len)
+update_d(struct hfsc_class *cl, unsigned int curr_len, unsigned int next_len)
 {
-	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len);
+	cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul - curr_len + next_len);
 }
 
 static inline void
@@ -1610,7 +1610,7 @@ hfsc_dequeue(struct Qdisc *sch)
 	struct hfsc_class *cl;
 	struct sk_buff *skb;
 	u64 cur_time;
-	unsigned int next_len;
+	unsigned int curr_len, next_len;
 	int realtime = 0;
 
 	if (sch->q.qlen == 0)
@@ -1640,14 +1640,16 @@ hfsc_dequeue(struct Qdisc *sch)
 	}
 
 	skb = qdisc_dequeue_peeked(cl->qdisc);
+	curr_len = qdisc_pkt_len(skb);
+
 	if (skb == NULL) {
 		qdisc_warn_nonwc("HFSC", cl->qdisc);
 		return NULL;
 	}
 
-	update_vf(cl, qdisc_pkt_len(skb), cur_time);
+	update_vf(cl, curr_len, cur_time);
 	if (realtime)
-		cl->cl_cumul += qdisc_pkt_len(skb);
+		cl->cl_cumul += curr_len;
 
 	if (cl->qdisc->q.qlen != 0) {
 		if (cl->cl_flags & HFSC_RSC) {
@@ -1656,7 +1658,7 @@ hfsc_dequeue(struct Qdisc *sch)
 			if (realtime)
 				update_ed(cl, next_len);
 			else
-				update_d(cl, next_len);
+				update_d(cl, curr_len, next_len);
 		}
 	} else {
 		/* the class becomes passive */
-- 
1.7.7.1

^ permalink raw reply related

* [PATCH/RFC 00/11] HFSC patches
From: Michal Soltys @ 2011-11-05  2:32 UTC (permalink / raw)
  To: kaber; +Cc: davem, netdev

Those are most of the patches I've been sitting on for a while. For the most
part they are either small corrections or simplifications (marked with s) - but
there are also some new things (marked with !), the most interesting being #8
probably.

All changes are richly commented in respective commit messages. I've been using
them for a while - so unless I missed some subtlety, all should be fine.

Apart from these, there's still one subtle thing to do w.r.t. cl_cvtmin (during
init_vf(), as this value is lagged relatively to the situation at the time of
enqueue).

On a side note, I was thinking about something like hfsc-strict or so - where
[uplink] interface could be upperlimited on hfsc qdisc level, but all the class
upperlimit would be otherwise gone. Not sure if anyone would be even interested
in something like that at all.


Michal Soltys (11):
  sch_hfsc.c: update_d() fixup
  sch_hfsc.c: go passive after cl_vt update in update_vf()
  sch_hfsc.c: rtsc_min() convex/convex case fixup
s sch_hfsc.c: remove initial check in vttree_get_minvt() (optional)
s sch_hfsc.c: don't subtract from cl_vtoff and cl_cvtoff
s sch_hfsc.c: seg_y2x(), rtsc_y2x(), hfsc_change_class() minor changes
s sch_hfsc.c: make cl_vt keep all periods in itself
! sch_hfsc.c: update speed table, add explanations, increase accuracy
s sch_hfsc.c: split update_vf()
! sch_hfsc.c: make sure classes are able to use 1st segment on fresh backlog period
s sch_hfsc.c: remove cl_vtadj, shift virtual curve instead

 net/sched/sch_hfsc.c |  373 +++++++++++++++++++++++++++-----------------------
 1 files changed, 203 insertions(+), 170 deletions(-)

-- 
1.7.7.1

^ permalink raw reply

* Re: [stable] net: Handle different key sizes between address families in flow cache
From: Kim Phillips @ 2011-11-05  2:29 UTC (permalink / raw)
  To: Greg KH; +Cc: David Miller, stable, eric.dumazet, zheng.z.yan, netdev,
	David Ward
In-Reply-To: <20111104212439.GA27324@kroah.com>

On Fri, 4 Nov 2011 14:24:39 -0700
Greg KH <greg@kroah.com> wrote:

> On Fri, Nov 04, 2011 at 02:46:59PM -0500, Kim Phillips wrote:
> > commit aa1c366e4febc7f5c2b84958a2dd7cd70e28f9d0 upstream.
> > 
> > With the conversion of struct flowi to a union of AF-specific structs, some
> > operations on the flow cache need to account for the exact size of the key.
> > 
> > Signed-off-by: David Ward <david.ward@ll.mit.edu>
> > Signed-off-by: David S. Miller <davem@davemloft.net>
> > Cc: stable@vger.kernel.org # v3.0.x
> > ---
> > This patch is the result of a clean cherry-pick onto v3.0.8.
> > It restores IPSec fwding performance from ~4kpps back to ~44kpps on
> > a P2020DS board.
> 
> Too bad you forgot to build this patch after you applied it:
>   CC      init/main.o
> In file included from include/linux/security.h:39:0,
>                  from init/main.c:32:
> include/net/flow.h: In function 'flow_key_size':
> include/net/flow.h:174:3: error: size of unnamed array is negative
> include/net/flow.h:177:3: error: size of unnamed array is negative
> 
> Please be kind to your poor over-worked stable kernel maintainer and do
> the decent thing and TEST YOUR PATCH before you ask him to accept it.
> 
> bah, I think it's time for the weekend to start a bit earlier than normal...

so sorry I hadn't tested 64-bit.

I found the problem - upstream commit aa1c366 depends on
upstream commit 728871b:

commit 728871bc05afc8ff310b17dba3e57a2472792b13
Author: David Ward <david.ward@ll.mit.edu>
Date:   Mon Sep 5 16:47:23 2011 +0000

    net: Align AF-specific flowi structs to long
    
    AF-specific flowi structs are now passed to flow_key_compare, which must
    also be aligned to a long.
    
    Signed-off-by: David Ward <david.ward@ll.mit.edu>
    Signed-off-by: David S. Miller <davem@davemloft.net>

so, one more time, in stable sign-off area format:

Cc: <stable@vger.kernel.org> # v3.0.x: 728871b: net: Align AF-specific flowi structs to long
Cc: <stable@vger.kernel.org> # v3.0.x

build tested on 32- and 64-bit powerpc, and ARCH=x86
{i386,x86_64}_defconfig.

Kim

^ permalink raw reply

* Re: PROBLEM: pppol2tp over pppoe NULL pointer dereference
From: David Miller @ 2011-11-05  2:28 UTC (permalink / raw)
  To: eric.dumazet; +Cc: spiked.yar, netdev
In-Reply-To: <1320191893.4728.13.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 02 Nov 2011 00:58:13 +0100

> Please try following patch, thanks !
> 
> [PATCH] l2tp: handle fragmented skbs in receive path
> 
> Modern drivers provide skb with fragments, and L2TP doesnt properly
> handles them.
> 
> Some bad frames can also trigger panics because of insufficent checks.
> 
> Reported-by: Misha Labjuk <spiked.yar@gmail.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

I'm still waiting for testing results of this patch.

^ permalink raw reply

* Re: [PATCH net v3 0/9] forcedeth: minor fixes for stats, rmmod, sparse
From: David Miller @ 2011-11-05  2:27 UTC (permalink / raw)
  To: david.decotigny
  Cc: netdev, linux-kernel, ian.campbell, eric.dumazet,
	jeffrey.t.kirsher, jpirko, joe, szymon
In-Reply-To: <cover.1320457247.git.david.decotigny@google.com>

From: David Decotigny <david.decotigny@google.com>
Date: Fri,  4 Nov 2011 18:53:24 -0700

> Changes since v2:
>  - removed "Fix a race during rmmod of forcedeth" from the series
>    (will look at it separately with original author)
>  - added "remove unneeded stats updates" and "64-bit stats"
>  - reordered patches
> 
> Changes since v1:
>  - rebased on top of netdev tip
>  - do not repeat name of device in netdev_dbg
>  - do not completely mute TX timeout messages when debug_tx_timeout is
>    not set
>  - make debug_tx_timeout writable in /sys/module
>  Note: I am re-submitting "expose module parameters in /sys/module" as
>        it can be useful in production and I was assured it doesn't add
>        much memory overhead by the sysfs maintainers.

If you want me to apply any of this now you're going to have to split
out the pure bug fixes from the feature additions and submit the feature
bits later when net-next opens back up.

Because things like 64-bit stats and new stat counters are not
appropriate at this time in the development cycle.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox