Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: pch_gbe: oops with vlan (new)
From: Eric Dumazet @ 2012-05-11 22:10 UTC (permalink / raw)
  To: Andy Cress; +Cc: netdev
In-Reply-To: <1336770777.31653.283.camel@edumazet-glaptop>

On Fri, 2012-05-11 at 23:13 +0200, Eric Dumazet wrote:
> On Fri, 2012-05-11 at 13:48 -0700, Andy Cress wrote:
> > Folks,
> > 
> > I am looking for help in debugging a pch_gbe driver oops/abort.
> > 
> > Kernel: version 2.6.32-220.el6.i686 (RHEL6.2)
> > Driver: pch_gbe version 0.91-NAPI  (source tarball we used is at
> > https://sendfile.kontron.com/message/24tdUi6MXklnUtBLnOsumq until May
> > 16)
> > NIC: 0b:00.1 Ethernet controller [0200]: Intel Corporation Platform
> > Controller Hub EG20T Gigabit Ethernet Controller [8086:8802] (rev 02)
> > 
> > Configuration, with VLAN:
> >  eth0 (not started)
> >  eth0.100 = 192.168.100.1
> >  eth0.200 = 192.168.200.1
> >  eth0.6  = 192.168.6.1
> > 
> > When starting the VLAN configuration, then doing a ping test for >= 5
> > minutes, I get a kernel oop/abort message as shown below.  This does not
> > happen without configuring VLAN.
> > Where should I look for possible causes for a transmit queue timeout
> > like this?  
> > 
> > I have contacted the OKI/LAPIS driver authors, but no response so far.
> > I thought that this group might be able to comment from similar
> > experiences.
> > 
> > Andy
> 
> typical sign of a buggy driver
> 
> A quick look in current Linus tree show a non existent synchronization
> between ndo_start_xmit and TX completion.
> 
> tx completion uses a tx_queue_lock spinlock for nothing but false sense
> of correctness.

Please try the following patch : (based on current net-next tree)

Also this driver has a strange RX path : It does a copy of incoming
frames on fixed size skbs, (2048+overhead -> kmalloc-4096 pool) instead
of using skb of the right size...



 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h      |    2 
 drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c |   25 ++++------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
index 9f3dbc4..b07311e 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h
@@ -584,7 +584,6 @@ struct pch_gbe_hw_stats {
 /**
  * struct pch_gbe_adapter - board specific private data structure
  * @stats_lock:	Spinlock structure for status
- * @tx_queue_lock:	Spinlock structure for transmit
  * @ethtool_lock:	Spinlock structure for ethtool
  * @irq_sem:		Semaphore for interrupt
  * @netdev:		Pointer of network device structure
@@ -609,7 +608,6 @@ struct pch_gbe_hw_stats {
 
 struct pch_gbe_adapter {
 	spinlock_t stats_lock;
-	spinlock_t tx_queue_lock;
 	spinlock_t ethtool_lock;
 	atomic_t irq_sem;
 	struct net_device *netdev;
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 9dc7e50..3787c64 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -645,14 +645,11 @@ static void pch_gbe_mac_set_pause_packet(struct pch_gbe_hw *hw)
  */
 static int pch_gbe_alloc_queues(struct pch_gbe_adapter *adapter)
 {
-	int size;
-
-	size = (int)sizeof(struct pch_gbe_tx_ring);
-	adapter->tx_ring = kzalloc(size, GFP_KERNEL);
+	adapter->tx_ring = kzalloc(sizeof(*adapter->tx_ring), GFP_KERNEL);
 	if (!adapter->tx_ring)
 		return -ENOMEM;
-	size = (int)sizeof(struct pch_gbe_rx_ring);
-	adapter->rx_ring = kzalloc(size, GFP_KERNEL);
+
+	adapter->rx_ring = kzalloc(sizeof(*adapter->rx_ring), GFP_KERNEL);
 	if (!adapter->rx_ring) {
 		kfree(adapter->tx_ring);
 		return -ENOMEM;
@@ -1169,7 +1166,6 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 	struct sk_buff *tmp_skb;
 	unsigned int frame_ctrl;
 	unsigned int ring_num;
-	unsigned long flags;
 
 	/*-- Set frame control --*/
 	frame_ctrl = 0;
@@ -1216,14 +1212,14 @@ static void pch_gbe_tx_queue(struct pch_gbe_adapter *adapter,
 			}
 		}
 	}
-	spin_lock_irqsave(&tx_ring->tx_lock, flags);
+
 	ring_num = tx_ring->next_to_use;
 	if (unlikely((ring_num + 1) == tx_ring->count))
 		tx_ring->next_to_use = 0;
 	else
 		tx_ring->next_to_use = ring_num + 1;
 
-	spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
+
 	buffer_info = &tx_ring->buffer_info[ring_num];
 	tmp_skb = buffer_info->skb;
 
@@ -1525,7 +1521,7 @@ pch_gbe_alloc_rx_buffers_pool(struct pch_gbe_adapter *adapter,
 						&rx_ring->rx_buff_pool_logic,
 						GFP_KERNEL);
 	if (!rx_ring->rx_buff_pool) {
-		pr_err("Unable to allocate memory for the receive poll buffer\n");
+		pr_err("Unable to allocate memory for the receive pool buffer\n");
 		return -ENOMEM;
 	}
 	memset(rx_ring->rx_buff_pool, 0, size);
@@ -1644,15 +1640,17 @@ pch_gbe_clean_tx(struct pch_gbe_adapter *adapter,
 	pr_debug("called pch_gbe_unmap_and_free_tx_resource() %d count\n",
 		 cleaned_count);
 	/* Recover from running out of Tx resources in xmit_frame */
+	spin_lock(&tx_ring->tx_lock);
 	if (unlikely(cleaned && (netif_queue_stopped(adapter->netdev)))) {
 		netif_wake_queue(adapter->netdev);
 		adapter->stats.tx_restart_count++;
 		pr_debug("Tx wake queue\n");
 	}
-	spin_lock(&adapter->tx_queue_lock);
+
 	tx_ring->next_to_clean = i;
-	spin_unlock(&adapter->tx_queue_lock);
+
 	pr_debug("next_to_clean : %d\n", tx_ring->next_to_clean);
+	spin_unlock(&tx_ring->tx_lock);
 	return cleaned;
 }
 
@@ -2043,7 +2041,6 @@ static int pch_gbe_sw_init(struct pch_gbe_adapter *adapter)
 		return -ENOMEM;
 	}
 	spin_lock_init(&adapter->hw.miim_lock);
-	spin_lock_init(&adapter->tx_queue_lock);
 	spin_lock_init(&adapter->stats_lock);
 	spin_lock_init(&adapter->ethtool_lock);
 	atomic_set(&adapter->irq_sem, 0);
@@ -2148,10 +2145,10 @@ static int pch_gbe_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 			 tx_ring->next_to_use, tx_ring->next_to_clean);
 		return NETDEV_TX_BUSY;
 	}
-	spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
 
 	/* CRC,ITAG no support */
 	pch_gbe_tx_queue(adapter, tx_ring, skb);
+	spin_unlock_irqrestore(&tx_ring->tx_lock, flags);
 	return NETDEV_TX_OK;
 }
 

^ permalink raw reply related

* Re: pull request: batman-adv 2012-05-11
From: Sven Eckelmann @ 2012-05-11 22:10 UTC (permalink / raw)
  To: b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, David Miller
In-Reply-To: <20120511.180144.174337959525316777.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1121 bytes --]

On Friday, May 11, 2012 06:01:44 PM David Miller wrote:
> From: Antonio Quartulli <ordex-GaUfNO9RBHfsrOwW+9ziJQ@public.gmane.org>
> Date: Fri, 11 May 2012 14:21:17 +0200
> 
> > this is a fixed version of the pull request issued on 2012-05-09.
> > 
> > Comments introduced in this patchset are not following the net-tree
> > guidelines. Another patch changing all the already existing comments
> > will follow later.
> > 
> > New exported functions follow the new name convention we discussed so
> > far. A patch renaming all the existing exported functions will follow.
> Pulled, but you have to cleanup the namespace on lots of other
> functions too.
> 
> For example, the packet receive handle register/unregister routines
> you added in this series as well.
> 
> Go through the entire batman-adv stack, and if the function or data
> object is not static, make sure it has a batadv_*() prefix.

The patches for this are posted on the batman-adv mailinglist [1]. I am 
currently splitting this large patch for easier consumption.

Thanks,
	Sven

[1] https://lists.open-mesh.org/pipermail/b.a.t.m.a.n/2012-May/007062.html

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH] net/ipv6/af_inet6.c: checkpatch cleanup
From: David Miller @ 2012-05-11 22:07 UTC (permalink / raw)
  To: eldad; +Cc: kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel
In-Reply-To: <1336248834-25271-1-git-send-email-eldad@fogrefinery.com>

From: Eldad Zack <eldad@fogrefinery.com>
Date: Sat,  5 May 2012 22:13:53 +0200

 ...
> Signed-off-by: Eldad Zack <eldad@fogrefinery.com>

Applied.

^ permalink raw reply

* Re: [PATCH] Net ipv6: Fixed checkpatch errors
From: David Miller @ 2012-05-11 22:06 UTC (permalink / raw)
  To: cristian.chilipirea
  Cc: kuznet, jmorris, yoshfuji, kaber, netdev, linux-kernel,
	daniel.baluta
In-Reply-To: <1336755300-6821-1-git-send-email-cristian.chilipirea@gmail.com>

From: Cristian Chilipirea <cristian.chilipirea@gmail.com>
Date: Fri, 11 May 2012 19:55:00 +0300

> Fixed all of ERROR "foo* bar" should be "foo *bar"
> 
> Signed-off-by: Cristian Chilipirea <cristian.chilipirea@gmail.com>

Don't ever do this.

This has already been fixed in my net-next tree since April 1st, which
is more than a month ago.  What this means is that you wrote your
patch against something other than reality and your patch won't apply
at all.

All non-bug-fix networking patches should be against the net-next
tree, doing otherwise is at your own peril.

^ permalink raw reply

* Re: [PATCH] net: of/phy: fix build error when phylib is built as a module
From: David Miller @ 2012-05-11 22:03 UTC (permalink / raw)
  To: ddaney.cavm
  Cc: sfr, netdev, linux-kernel, paul.gortmaker, rdunlap, linux-next,
	linuxppc-dev, bjorn
In-Reply-To: <4FAD532E.401@gmail.com>

From: David Daney <ddaney.cavm@gmail.com>
Date: Fri, 11 May 2012 10:58:06 -0700

> On 05/11/2012 08:47 AM, Bjørn Mork wrote:
>> CONFIG_OF_MDIO is tristate and will be m if PHYLIB is m.  Use
>> IS_ENABLED macro to prevent build error:
>>
>>   ERROR: "of_mdio_find_bus" [drivers/net/phy/mdio-mux.ko] undefined!
>>
>> Reported-by: Randy Dunlap<rdunlap@xenotime.net>
>> Cc: David Daney<david.daney@cavium.com>
>> Signed-off-by: Bjørn Mork<bjorn@mork.no>
> 
> I was able to reproduce the failure, and this patch both fixes it and
> seems correct, so...
> 
> Acked-by: David Daney<david.daney@cavium.com>

Applied.

^ permalink raw reply

* Re: pull request: batman-adv 2012-05-11
From: David Miller @ 2012-05-11 22:01 UTC (permalink / raw)
  To: ordex-GaUfNO9RBHfsrOwW+9ziJQ
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	b.a.t.m.a.n-ZwoEplunGu2X36UT3dwllkB+6BGkLq7r
In-Reply-To: <1336738892-7401-1-git-send-email-ordex-GaUfNO9RBHfsrOwW+9ziJQ@public.gmane.org>

From: Antonio Quartulli <ordex-GaUfNO9RBHfsrOwW+9ziJQ@public.gmane.org>
Date: Fri, 11 May 2012 14:21:17 +0200

> this is a fixed version of the pull request issued on 2012-05-09.
> 
> Comments introduced in this patchset are not following the net-tree
> guidelines. Another patch changing all the already existing comments will follow
> later. 
> 
> New exported functions follow the new name convention we discussed so far.
> A patch renaming all the existing exported functions will follow.

Pulled, but you have to cleanup the namespace on lots of other
functions too.

For example, the packet receive handle register/unregister routines
you added in this series as well.

Go through the entire batman-adv stack, and if the function or data
object is not static, make sure it has a batadv_*() prefix.

^ permalink raw reply

* Re: [PATCH 00/17] Swap-over-NBD without deadlocking V10
From: Andrew Morton @ 2012-05-11 21:29 UTC (permalink / raw)
  To: David Miller
  Cc: mgorman, linux-mm, netdev, linux-kernel, neilb, a.p.zijlstra,
	michaelc, emunson
In-Reply-To: <20120511.172339.2007927803884694483.davem@davemloft.net>

On Fri, 11 May 2012 17:23:39 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Mel Gorman <mgorman@suse.de>
> Date: Fri, 11 May 2012 16:45:40 +0100
> 
> > From my point of view, the ideal would be that all the patches go
> > through akpm's tree or yours but that probably will cause merge
> > difficulties.
> > 
> > Any recommendations?
> 
> I know there will be networking side conflicts very soon, it's not a
> matter of 'if' but 'when'.
> 
> But the trick is that I bet the 'mm' and 'slab' folks are in a similar
> situation.
> 
> In any event I'm more than happy to take it all in my tree.

I guess either is OK.  The main thing is to get it all reviewed and
tested, after all.

I can take all the patches once it's all lined up and everyone is
happy.  If the net bits later take significant damage then I can squirt them
at you once the core MM bits are merged.  That would give you a few
days to check them over and get them into Linus.  If that's a problem,
we can hold the net bits over for a cycle.

That's all assuming that the core MM parts are mergeable without the
net parts being merged.  I trust that's the case!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] net: of/phy: fix build error when phylib is built as a module
From: David Miller @ 2012-05-11 21:24 UTC (permalink / raw)
  To: bjorn
  Cc: sfr, david.daney, netdev, linux-kernel, paul.gortmaker, rdunlap,
	linux-next, linuxppc-dev
In-Reply-To: <1336751221-19127-1-git-send-email-bjorn@mork.no>

From: Bjørn Mork <bjorn@mork.no>
Date: Fri, 11 May 2012 17:47:01 +0200

> Should be wrapped into commit 25106022 if it works, to ensure
> bisectability.

Wrapped into?

Commits made to my net-next tree are permanent and irreversible, so we
cannot go back and change a commit.  I never rebase my tree, too many
people use it directly and pull it into their tree, so I'd break their
world if I ever did that.

^ permalink raw reply

* Re: [PATCH 00/17] Swap-over-NBD without deadlocking V10
From: David Miller @ 2012-05-11 21:23 UTC (permalink / raw)
  To: mgorman
  Cc: akpm, linux-mm, netdev, linux-kernel, neilb, a.p.zijlstra,
	michaelc, emunson
In-Reply-To: <20120511154540.GV11435@suse.de>

From: Mel Gorman <mgorman@suse.de>
Date: Fri, 11 May 2012 16:45:40 +0100

> From my point of view, the ideal would be that all the patches go
> through akpm's tree or yours but that probably will cause merge
> difficulties.
> 
> Any recommendations?

I know there will be networking side conflicts very soon, it's not a
matter of 'if' but 'when'.

But the trick is that I bet the 'mm' and 'slab' folks are in a similar
situation.

In any event I'm more than happy to take it all in my tree.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 10/17] netvm: Allow skb allocation to use PFMEMALLOC reserves
From: David Miller @ 2012-05-11 21:17 UTC (permalink / raw)
  To: mgorman
  Cc: akpm, linux-mm, netdev, linux-kernel, neilb, a.p.zijlstra,
	michaelc, emunson
In-Reply-To: <20120511143218.GS11435@suse.de>

From: Mel Gorman <mgorman@suse.de>
Date: Fri, 11 May 2012 15:32:18 +0100

> On Fri, May 11, 2012 at 12:57:40AM -0400, David Miller wrote:
>> Please change this to be a static branch.
> 
> Will do. I renamed memalloc_socks to sk_memalloc_socks, made it a int as
> atomics are unnecessary and I check it directly in a branch instead of a
> static inline. It should be relatively easy for the branch predictor.

No branch predictor can beat an unconditional branch :-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: pch_gbe: oops with vlan (new)
From: Eric Dumazet @ 2012-05-11 21:12 UTC (permalink / raw)
  To: Andy Cress; +Cc: netdev
In-Reply-To: <40680C535D6FE6498883F1640FACD44DDF9105@ka-exchange-1.kontronamerica.local>

On Fri, 2012-05-11 at 13:48 -0700, Andy Cress wrote:
> Folks,
> 
> I am looking for help in debugging a pch_gbe driver oops/abort.
> 
> Kernel: version 2.6.32-220.el6.i686 (RHEL6.2)
> Driver: pch_gbe version 0.91-NAPI  (source tarball we used is at
> https://sendfile.kontron.com/message/24tdUi6MXklnUtBLnOsumq until May
> 16)
> NIC: 0b:00.1 Ethernet controller [0200]: Intel Corporation Platform
> Controller Hub EG20T Gigabit Ethernet Controller [8086:8802] (rev 02)
> 
> Configuration, with VLAN:
>  eth0 (not started)
>  eth0.100 = 192.168.100.1
>  eth0.200 = 192.168.200.1
>  eth0.6  = 192.168.6.1
> 
> When starting the VLAN configuration, then doing a ping test for >= 5
> minutes, I get a kernel oop/abort message as shown below.  This does not
> happen without configuring VLAN.
> Where should I look for possible causes for a transmit queue timeout
> like this?  
> 
> I have contacted the OKI/LAPIS driver authors, but no response so far.
> I thought that this group might be able to comment from similar
> experiences.
> 
> Andy

typical sign of a buggy driver

A quick look at the current Linus tree shows nonexistent synchronization
between ndo_start_xmit and TX completion.

tx completion uses a tx_queue_lock spinlock for nothing but a false sense
of correctness.

# find drivers/net/ethernet/oki-semi/pch_gbe -name "*.[ch]"|xargs grep -4 -n tx_queue_lock
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-583-
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-584-/**
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-585- * struct pch_gbe_adapter - board specific private data structure
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-586- * @stats_lock:	Spinlock structure for status
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h:587: * @tx_queue_lock:	Spinlock structure for transmit
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-588- * @ethtool_lock:	Spinlock structure for ethtool
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-589- * @irq_sem:		Semaphore for interrupt
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-590- * @netdev:		Pointer of network device structure
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-591- * @pdev:		Pointer of pci device structure
--
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-608- */
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-609-
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-610-struct pch_gbe_adapter {
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-611-	spinlock_t stats_lock;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h:612:	spinlock_t tx_queue_lock;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-613-	spinlock_t ethtool_lock;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-614-	atomic_t irq_sem;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-615-	struct net_device *netdev;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h-616-	struct pci_dev *pdev;
--
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1641-		netif_wake_queue(adapter->netdev);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1642-		adapter->stats.tx_restart_count++;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1643-		pr_debug("Tx wake queue\n");
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1644-	}
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:1645:	spin_lock(&adapter->tx_queue_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1646-	tx_ring->next_to_clean = i;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:1647:	spin_unlock(&adapter->tx_queue_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1648-	pr_debug("next_to_clean : %d\n", tx_ring->next_to_clean);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1649-	return cleaned;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1650-}
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-1651-
--
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2036-		pr_err("Unable to allocate memory for queues\n");
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2037-		return -ENOMEM;
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2038-	}
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2039-	spin_lock_init(&adapter->hw.miim_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2040:	spin_lock_init(&adapter->tx_queue_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2041-	spin_lock_init(&adapter->stats_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2042-	spin_lock_init(&adapter->ethtool_lock);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2043-	atomic_set(&adapter->irq_sem, 0);
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c-2044-	pch_gbe_irq_disable(adapter);

^ permalink raw reply

* Re: [PATCH RFC 1/6] skbuff: support per-page destructors in copy_ubufs
From: David Miller @ 2012-05-11 21:12 UTC (permalink / raw)
  To: mst; +Cc: Ian.Campbell, netdev, eric.dumazet
In-Reply-To: <20120511120836.GA4637@redhat.com>

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 11 May 2012 15:08:37 +0300

> On Fri, May 11, 2012 at 11:58:12AM +0100, Ian Campbell wrote:
>> On Fri, 2012-05-11 at 10:00 +0100, Ian Campbell wrote:
>> > I'm seeing copy_ubufs called in my remote NFS test, which I don't
>> > think I expected -- I'll investigate why this is happening today. 
>> 
>> It's tcp_transmit_skb which can (conditionally) call skb_clone
>> (backtrace below)
> 
> Interesting. I didn't realise we clone skbs on data path:
> tcp_write_xmit calls tcp_transmit_skb with clone_it flag.
> Could someone comment on why we need to clone on good path
> like this?

We can't send the original SKB that's linked into the retransmit
queue.  Its linkage must stay secure in that queue.

^ permalink raw reply

* pch_gbe: oops with vlan (new)
From: Andy Cress @ 2012-05-11 20:48 UTC (permalink / raw)
  To: netdev

Folks,

I am looking for help in debugging a pch_gbe driver oops/abort.

Kernel: version 2.6.32-220.el6.i686 (RHEL6.2)
Driver: pch_gbe version 0.91-NAPI  (source tarball we used is at
https://sendfile.kontron.com/message/24tdUi6MXklnUtBLnOsumq until May
16)
NIC: 0b:00.1 Ethernet controller [0200]: Intel Corporation Platform
Controller Hub EG20T Gigabit Ethernet Controller [8086:8802] (rev 02)

Configuration, with VLAN:
 eth0 (not started)
 eth0.100 = 192.168.100.1
 eth0.200 = 192.168.200.1
 eth0.6  = 192.168.6.1

When starting the VLAN configuration, then doing a ping test for >= 5
minutes, I get a kernel oops/abort message as shown below.  This does not
happen without configuring VLAN.
Where should I look for possible causes for a transmit queue timeout
like this?  

I have contacted the OKI/LAPIS driver authors, but no response so far.
I thought that this group might be able to comment from similar
experiences.

Andy

May 11 11:06:09 kontron kernel: ------------[ cut here ]------------
May 11 11:06:09 kontron kernel: WARNING: at net/sched/sch_generic.c:261
dev_watchdog+0x1ec/0x200() (Not tainted)
May 11 11:06:09 kontron kernel: Hardware name: N/A
May 11 11:06:09 kontron kernel: NETDEV WATCHDOG: eth0 (pch_gbe):
transmit queue 0 timed out
May 11 11:06:09 kontron kernel: Modules linked in: fuse ip6table_filter
ip6_tables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4
nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM
iptable_mangle iptable_filter ip_tables tun bridge autofs4 sunrpc
cpufreq_ondemand acpi_cpufreq mperf 8021q garp stp llc ipv6 ext3 jbd
uinput ppdev parport_pc parport sg microcode pch_gbe(U) mii serio_raw
snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq
snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc ext4
mbcache jbd2 sd_mod crc_t10dif ahci sdhci_pci sdhci mmc_core video
output dm_mirror dm_region_hash dm_log dm_mod [last unloaded:
scsi_wait_scan]
May 11 11:06:09 kontron kernel: Pid: 0, comm: swapper Not tainted
2.6.32-220.el6.i686 #1
May 11 11:06:09 kontron kernel: Call Trace:
May 11 11:06:09 kontron kernel: [<c0454c81>] ?
warn_slowpath_common+0x81/0xc0
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c0454d53>] ?
warn_slowpath_fmt+0x33/0x40
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c0471bfa>] ? insert_work+0x5a/0xb0
May 11 11:06:09 kontron kernel: [<c04656f9>] ?
run_timer_softirq+0x139/0x2c0
May 11 11:06:09 kontron kernel: [<c0831315>] ?
apic_timer_interrupt+0x31/0x38
May 11 11:06:09 kontron kernel: [<c07a14d0>] ? dev_watchdog+0x0/0x200
May 11 11:06:09 kontron kernel: [<c045be4a>] ? __do_softirq+0x8a/0x1a0
May 11 11:06:09 kontron kernel: [<c045bf9d>] ? do_softirq+0x3d/0x50
May 11 11:06:09 kontron kernel: [<c045c0f5>] ? irq_exit+0x65/0x70
May 11 11:06:09 kontron kernel: [<c0428473>] ?
smp_apic_timer_interrupt+0x53/0x90
May 11 11:06:09 kontron kernel: [<c0831315>] ?
apic_timer_interrupt+0x31/0x38
May 11 11:06:09 kontron kernel: [<c045007b>] ?
throttle_cfs_rq+0x6b/0x130
May 11 11:06:09 kontron kernel: [<c064735f>] ? intel_idle+0xaf/0x140
May 11 11:06:09 kontron kernel: [<c075c282>] ?
cpuidle_idle_call+0x72/0x100
May 11 11:06:09 kontron kernel: [<c0408964>] ? cpu_idle+0x94/0xd0
May 11 11:06:09 kontron kernel: [<c082a645>] ?
start_secondary+0x20d/0x252
May 11 11:06:09 kontron kernel: ---[ end trace 3672ff56500ae344 ]---
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0): carrier now
OFF (device state 3)
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0): device
state change: 3 -> 2 (reason 40)
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0):
deactivating device (reason: 40).

^ permalink raw reply

* Re: pch_gbe oops with vlan
From: David Miller @ 2012-05-11 20:36 UTC (permalink / raw)
  To: andy.cress; +Cc: netdev
In-Reply-To: <40680C535D6FE6498883F1640FACD44DDF9055@ka-exchange-1.kontronamerica.local>

Ummm, no.  You can't do this.

You replied to Eric Dumazet's patch posting, which is completely
unrelated to what you want to post about.  Then you edited the
Subject: and thought that was OK.

This is wrong, because the thread ID and other related fields still
refer to Eric's posting, so all thread indexing facilities still
think your posting is a reply to Eric's.

Don't do this, write a new email to the list properly.

^ permalink raw reply

* Re: pull request: bluetooth 2012-05-04
From: Gustavo Padovan @ 2012-05-11 20:16 UTC (permalink / raw)
  To: David Miller
  Cc: linville-2XuSBdqkA4R54TAoqtyWWQ,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20120508070211.GB11843@joana>

[-- Attachment #1: Type: text/plain, Size: 3881 bytes --]

Hi Dave,

I couldn't get this pulled by John this week; he's been unresponsive.
We would love to see this code in the 3.4 kernel (not sure if this would be
possible if Linus does not release an -rc7). There are important fixes
in the pull request, such as a fix for a regression that was breaking Bluetooth
keyboards.

Please let me know if you have any problems with this! I checked this code for
coding style issues too. Thanks.

	Gustavo

* Gustavo Padovan <gustavo-THi1TnShQwVAfugRpC6u6w@public.gmane.org> [2012-05-08 04:07:54 -0300]:

> Hi John,
> 
> * Gustavo Padovan <gustavo-THi1TnShQwVAfugRpC6u6w@public.gmane.org> [2012-05-04 21:12:43 -0300]:
> 
> > Hi John,
> > 
> > A few more patch to 3.4. There is 3 fixes from Mat Martineau, a fix for
> > a incoming MTU check that was breaking ERTM, still on ERTM there is a
> > patch that fixes concurrency with one of our locks. His third patch is
> > related to lock fixes too.
> > Besides that Johan fixed a wrong bit set in the Inquiry code, Vishal
> > Agarwal added a EIR fix and finally we have support for one more usb id.
> > 
> > Please pull or let me know of any problems!
> 
> I saw that you didn't pull this yet, so I decided to re-do this pull request.
> We have more 3 important fixes to go in. They fix a regression that was
> preventing Bluetooth keyboard to work. I'll re-do the whole pull request
> message, so you can copy and paste it easily:
> 
> A few more patch to 3.4. There is 3 fixes from Mat Martineau, a fix for a
> incoming MTU check that was breaking ERTM, still on ERTM there is a patch that
> fixes concurrency with one of our locks. His third patch is related to lock
> fixes too.
> Besides that Johan fixed a wrong bit set in the Inquiry code, Vishal Agarwal
> added a EIR fix and finally we have support for one more usb id.  Finally
> Johan and I fixed a regression that was preventing bluetooth keyboard to work.
> 
> Please pull or let me know of any problems!
> 
> 	Gustavo
> 
> ---
> 
> The following changes since commit 985140369be1e886754d8ac0375dd64e4f727311:
> 
>   Add Foxconn / Hon Hai IDs for btusb module (2012-04-24 11:38:41 -0300)
> 
> are available in the git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git for-upstream
> 
> for you to fetch changes up to fbcb6a061652b38257a1d8bcf6ebd70fc1771549:
> 
>   Bluetooth: mgmt: Fix device_connected sending order (2012-05-07 20:56:30 +0300)
> 
> ----------------------------------------------------------------
> Gustavo Padovan (2):
>       Bluetooth: notify userspace of security level change
>       Bluetooth: report the right security level in getsockopt
> 
> Johan Hedberg (2):
>       Bluetooth: Fix Inquiry with RSSI event mask
>       Bluetooth: mgmt: Fix device_connected sending order
> 
> Mat Martineau (3):
>       Bluetooth: Fix a redundant and problematic incoming MTU check
>       Bluetooth: Restore locking semantics when looking up L2CAP channels
>       Bluetooth: Lock the L2CAP channel when sending
> 
> Michael Gruetzner (1):
>       bluetooth: Add support for Foxconn/Hon Hai AR5BBU22 0489:E03C
> 
> Vishal Agarwal (1):
>       Bluetooth: Fix EIR data generation for mgmt_device_found
> 
>  drivers/bluetooth/ath3k.c         |    6 ++++++
>  drivers/bluetooth/btusb.c         |    3 +++
>  include/net/bluetooth/bluetooth.h |    3 +--
>  include/net/bluetooth/hci_core.h  |   17 +++++++++++++++++
>  net/bluetooth/af_bluetooth.c      |    2 +-
>  net/bluetooth/hci_core.c          |    8 ++++++++
>  net/bluetooth/hci_event.c         |   17 +++++++++++++----
>  net/bluetooth/l2cap_core.c        |   35 ++++++++---------------------------
>  net/bluetooth/l2cap_sock.c        |   39 +++++++++++++++++++++++++--------------
>  9 files changed, 82 insertions(+), 48 deletions(-)


[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [PATCH v5 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-05-11 20:11 UTC (permalink / raw)
  To: cgroups
  Cc: linux-mm, devel, kamezawa.hiroyu, netdev, Tejun Heo, Li Zefan,
	Glauber Costa, Johannes Weiner, Michal Hocko
In-Reply-To: <1336767077-25351-1-git-send-email-glommer@parallels.com>

We call the destroy function when a cgroup starts to be removed,
such as by a rmdir event.

However, because of our reference counters, some objects are still
inflight. Right now, we are decrementing the static_keys at destroy()
time, meaning that if we get rid of the last static_key reference,
some objects will still have charges, but the code to properly
uncharge them won't be run.

This becomes a problem especially if it is ever enabled again, because
new charges will then be added to the stale charges, making it pretty
much impossible to keep the accounting consistent.

We just need to be careful with the static branch activation:
since there is no particular preferred order of their activation,
we need to make sure that we only start using it after all
call sites are active. This is achieved by having a per-memcg
flag that is only updated after static_key_slow_inc() returns.
At this time, we are sure all sites are active.

This is made per-memcg, not global, for a reason:
it also has the effect of making socket accounting more
consistent. The first memcg to be limited will trigger static_key()
activation, therefore, accounting. But all the others will then be
accounted no matter what. After this patch, only limited memcgs
will have their sockets accounted.

[v2: changed a tcp limited flag for a generic proto limited flag ]
[v3: update the current active flag only after the static_key update ]
[v4: disarm_static_keys() inside free_work ]
[v5: got rid of tcp_limit_mutex, now in the static_key interface ]

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 include/net/sock.h        |    9 +++++++++
 mm/memcontrol.c           |   26 ++++++++++++++++++++++++--
 net/ipv4/tcp_memcontrol.c |   32 +++++++++++++++++++++++++-------
 3 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3ebe6b..5c620bd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -914,6 +914,15 @@ struct cg_proto {
 	int			*memory_pressure;
 	long			*sysctl_mem;
 	/*
+	 * active means it is currently active, and new sockets should
+	 * be assigned to cgroups.
+	 *
+	 * activated means it was ever activated, and we need to
+	 * disarm the static keys on destruction
+	 */
+	bool			activated;
+	bool			active;
+	/*
 	 * memcg field is used to find which memcg we belong directly
 	 * Each memcg struct can hold more than one cg_proto, so container_of
 	 * won't really cut.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b4b4c8..d1b0849 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -404,6 +404,7 @@ void sock_update_memcg(struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled) {
 		struct mem_cgroup *memcg;
+		struct cg_proto *cg_proto;
 
 		BUG_ON(!sk->sk_prot->proto_cgroup);
 
@@ -423,9 +424,10 @@ void sock_update_memcg(struct sock *sk)
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
-		if (!mem_cgroup_is_root(memcg)) {
+		cg_proto = sk->sk_prot->proto_cgroup(memcg);
+		if (!mem_cgroup_is_root(memcg) && cg_proto->active) {
 			mem_cgroup_get(memcg);
-			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
 	}
@@ -442,6 +444,14 @@ void sock_release_memcg(struct sock *sk)
 	}
 }
 
+static void disarm_static_keys(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_INET
+	if (memcg->tcp_mem.cg_proto.activated)
+		static_key_slow_dec(&memcg_socket_limit_enabled);
+#endif
+}
+
 #ifdef CONFIG_INET
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
@@ -452,6 +462,11 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
+#else
+static inline void disarm_static_keys(struct mem_cgroup *memcg)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4836,6 +4851,13 @@ static void free_work(struct work_struct *work)
 	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
+	/*
+	 * We need to make sure that (at least for now), the jump label
+	 * destruction code runs outside of the cgroup lock. schedule_work()
+	 * will guarantee this happens. Be careful if you need to move this
+	 * disarm_static_keys around
+	 */
+	disarm_static_keys(memcg);
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1517037..7ea4f79 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -74,9 +74,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
 
 	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
-	if (val != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -107,10 +104,31 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
-	else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
-		static_key_slow_inc(&memcg_socket_limit_enabled);
+	if (val == RESOURCE_MAX)
+		cg_proto->active = false;
+	else if (val != RESOURCE_MAX) {
+		/*
+		 * ->activated needs to be written after the static_key update.
+		 *  This is what guarantees that the socket activation function
+		 *  is the last one to run. See sock_update_memcg() for details,
+		 *  and note that we don't mark any socket as belonging to this
+		 *  memcg until that flag is up.
+		 *
+		 *  We need to do this, because static_keys will span multiple
+		 *  sites, but we can't control their order. If we mark a socket
+		 *  as accounted, but the accounting functions are not patched in
+		 *  yet, we'll lose accounting.
+		 *
+		 *  We never race with the readers in sock_update_memcg(), because
+		 *  when this value change, the code to process it is not patched in
+		 *  yet.
+		 */
+		if (!cg_proto->activated) {
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+			cg_proto->activated = true;
+		}
+		cg_proto->active = true;
+	}
 
 	return 0;
 }
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH v5 1/2] Always free struct memcg through schedule_work()
From: Glauber Costa @ 2012-05-11 20:11 UTC (permalink / raw)
  To: cgroups
  Cc: linux-mm, devel, kamezawa.hiroyu, netdev, Tejun Heo, Li Zefan,
	Glauber Costa, Johannes Weiner, Michal Hocko
In-Reply-To: <1336767077-25351-1-git-send-email-glommer@parallels.com>

Right now we free struct memcg with kfree() right after an
RCU grace period, but defer it if we need to use vfree() to get
rid of that memory area. We do that out of necessity, because vfree()
must be called in process context.

This patch unifies this behavior, by ensuring that even kfree will
happen in a separate thread. The goal is to have a stable place to
call the upcoming jump label destruction function outside the realm
of the complicated and quite far-reaching cgroup lock (which can't be
held while taking either the cpu_hotplug.lock or the jump_label_mutex).

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 mm/memcontrol.c |   24 +++++++++++++-----------
 1 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 932a734..0b4b4c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -245,8 +245,8 @@ struct mem_cgroup {
 		 */
 		struct rcu_head rcu_freeing;
 		/*
-		 * But when using vfree(), that cannot be done at
-		 * interrupt time, so we must then queue the work.
+		 * We also need some space for a worker in deferred freeing.
+		 * By the time we call it, rcu_freeing is not longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
@@ -4826,23 +4826,28 @@ out_free:
 }
 
 /*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;
+	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	vfree(memcg);
+	if (size < PAGE_SIZE)
+		kfree(memcg);
+	else
+		vfree(memcg);
 }
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, vfree_work);
+	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 
@@ -4868,10 +4873,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
-	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree_rcu(memcg, rcu_freeing);
-	else
-		call_rcu(&memcg->rcu_freeing, vfree_rcu);
+	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 
 static void mem_cgroup_get(struct mem_cgroup *memcg)
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH v5 0/2] fix static_key disabling problem in memcg
From: Glauber Costa @ 2012-05-11 20:11 UTC (permalink / raw)
  To: cgroups; +Cc: linux-mm, devel, kamezawa.hiroyu, netdev, Tejun Heo, Li Zefan

Hi, Tejun, Kame,

This series is composed of the two patches of the last fix, with no changes
(only exception is the removal of x = false assignments that Tejun requested,
that is done now). Note also that patch 1 of this series was reused by me
in the slab accounting patches for memcg.

The first patch, which adds a mutex to memcg, is dropped. I didn't post it
earlier so that I could wait for Kame to get back from his vacation and
properly review it.

Kame: Steven Rostedt pointed out that our analysis of the static branch updates
was wrong, so the mutex is really not needed.

The key to understanding that is that atomic_inc_not_zero will only return right
away if the value is not yet zero - as the name implies - but the update of the
atomic variable only happens after the code is patched.

Therefore, if two callers enter with a key value of zero, both will be held at
the jump_label_lock() call, effectively guaranteeing the behavior we need.

Glauber Costa (2):
  Always free struct memcg through schedule_work()
  decrement static keys on real destroy time

 include/net/sock.h        |    9 ++++++++
 mm/memcontrol.c           |   50 +++++++++++++++++++++++++++++++++-----------
 net/ipv4/tcp_memcontrol.c |   32 ++++++++++++++++++++++------
 3 files changed, 71 insertions(+), 20 deletions(-)

-- 
1.7.7.6

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH v2 iproute2] fq_codel: Fair Queue Codel AQM
From: Eric Dumazet @ 2012-05-11 19:49 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Changli Gao, netdev, Dave Taht, Kathleen Nichols,
	Van Jacobson, Tom Herbert, Matt Mathis, Yuchung Cheng,
	Stephen Hemminger, Maciej Żenczykowski, Nandita Dukkipati
In-Reply-To: <1336764650.31653.277.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

Fair Queue Codel packet scheduler

Principles :

- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
                              be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
  so that new flows have priority on old ones.

- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)

tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
                      [ target TIME ] [ interval TIME ] [ noecn ]
                      [ quantum BYTES ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Changli Gao <xiaosuo@gmail.com>
---
 include/linux/pkt_sched.h |   54 ++++++++
 tc/Makefile               |    1 
 tc/q_fq_codel.c           |  232 ++++++++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index cde56c2..32aef0a 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -681,4 +681,58 @@ struct tc_codel_xstats {
 	__u32	dropping;  /* are we in dropping state ? */
 };
 
+/* FQ_CODEL */
+
+enum {
+	TCA_FQ_CODEL_UNSPEC,
+	TCA_FQ_CODEL_TARGET,
+	TCA_FQ_CODEL_LIMIT,
+	TCA_FQ_CODEL_INTERVAL,
+	TCA_FQ_CODEL_ECN,
+	TCA_FQ_CODEL_FLOWS,
+	TCA_FQ_CODEL_QUANTUM,
+	__TCA_FQ_CODEL_MAX
+};
+
+#define TCA_FQ_CODEL_MAX	(__TCA_FQ_CODEL_MAX - 1)
+
+enum {
+	TCA_FQ_CODEL_XSTATS_QDISC,
+	TCA_FQ_CODEL_XSTATS_CLASS,
+};
+
+struct tc_fq_codel_qd_stats {
+	__u32	maxpacket;	/* largest packet we've seen so far */
+	__u32	drop_overlimit; /* number of time max qdisc
+				 * packet limit was hit
+				 */
+	__u32	ecn_mark;	/* number of packets we ECN marked
+				 * instead of being dropped
+				 */
+	__u32	new_flow_count; /* number of time packets
+				 * created a 'new flow'
+				 */
+	__u32	new_flows_len;	/* count of flows in new list */
+	__u32	old_flows_len;	/* count of flows in old list */
+};
+
+struct tc_fq_codel_cl_stats {
+	__s32	deficit;
+	__u32	ldelay;		/* in-queue delay seen by most recently
+				 * dequeued packet
+				 */
+	__u32	count;
+	__u32	lastcount;
+	__u32	dropping;
+	__s32	drop_next;
+};
+
+struct tc_fq_codel_xstats {
+	__u32	type;
+	union {
+		struct tc_fq_codel_qd_stats qdisc_stats;
+		struct tc_fq_codel_cl_stats class_stats;
+	};
+};
+
 #endif
diff --git a/tc/Makefile b/tc/Makefile
index 8a7cc8d..64d93ad 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -48,6 +48,7 @@ TCMODULES += em_u32.o
 TCMODULES += em_meta.o
 TCMODULES += q_mqprio.o
 TCMODULES += q_codel.o
+TCMODULES += q_fq_codel.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_fq_codel.c b/tc/q_fq_codel.c
new file mode 100644
index 0000000..3b3b074
--- /dev/null
+++ b/tc/q_fq_codel.c
@@ -0,0 +1,232 @@
+/*
+ * Fair Queue Codel
+ *
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The names of the authors may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: ... fq_codel [ limit PACKETS ] [ flows NUMBER ]\n");
+	fprintf(stderr, "                    [ target TIME] [ interval TIME ]\n");
+	fprintf(stderr, "                    [ quantum BYTES ] [ [no]ecn ]\n");
+}
+
+static int fq_codel_parse_opt(struct qdisc_util *qu, int argc, char **argv,
+			      struct nlmsghdr *n)
+{
+	unsigned limit = 0;
+	unsigned flows = 0;
+	unsigned target = 0;
+	unsigned interval = 0;
+	unsigned quantum = 0;
+	int ecn = -1;
+	struct rtattr *tail;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "limit") == 0) {
+			NEXT_ARG();
+			if (get_unsigned(&limit, *argv, 0)) {
+				fprintf(stderr, "Illegal \"limit\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "flows") == 0) {
+			NEXT_ARG();
+			if (get_unsigned(&flows, *argv, 0)) {
+				fprintf(stderr, "Illegal \"flows\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "quantum") == 0) {
+			NEXT_ARG();
+			if (get_unsigned(&quantum, *argv, 0)) {
+				fprintf(stderr, "Illegal \"quantum\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "target") == 0) {
+			NEXT_ARG();
+			if (get_time(&target, *argv)) {
+				fprintf(stderr, "Illegal \"target\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "interval") == 0) {
+			NEXT_ARG();
+			if (get_time(&interval, *argv)) {
+				fprintf(stderr, "Illegal \"interval\"\n");
+				return -1;
+			}
+		} else if (strcmp(*argv, "ecn") == 0) {
+			ecn = 1;
+		} else if (strcmp(*argv, "noecn") == 0) {
+			ecn = 0;
+		} else if (strcmp(*argv, "help") == 0) {
+			explain();
+			return -1;
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+	if (limit)
+		addattr_l(n, 1024, TCA_FQ_CODEL_LIMIT, &limit, sizeof(limit));
+	if (flows)
+		addattr_l(n, 1024, TCA_FQ_CODEL_FLOWS, &flows, sizeof(flows));
+	if (quantum)
+		addattr_l(n, 1024, TCA_FQ_CODEL_QUANTUM, &quantum, sizeof(quantum));
+	if (interval)
+		addattr_l(n, 1024, TCA_FQ_CODEL_INTERVAL, &interval, sizeof(interval));
+	if (target)
+		addattr_l(n, 1024, TCA_FQ_CODEL_TARGET, &target, sizeof(target));
+	if (ecn != -1)
+		addattr_l(n, 1024, TCA_FQ_CODEL_ECN, &ecn, sizeof(ecn));
+	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+	return 0;
+}
+
+static int fq_codel_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+	struct rtattr *tb[TCA_FQ_CODEL_MAX + 1];
+	unsigned limit;
+	unsigned flows;
+	unsigned interval;
+	unsigned target;
+	unsigned ecn;
+	unsigned quantum;
+	SPRINT_BUF(b1);
+
+	if (opt == NULL)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_FQ_CODEL_MAX, opt);
+
+	if (tb[TCA_FQ_CODEL_LIMIT] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_LIMIT]) >= sizeof(__u32)) {
+		limit = rta_getattr_u32(tb[TCA_FQ_CODEL_LIMIT]);
+		fprintf(f, "limit %up ", limit);
+	}
+	if (tb[TCA_FQ_CODEL_FLOWS] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_FLOWS]) >= sizeof(__u32)) {
+		flows = rta_getattr_u32(tb[TCA_FQ_CODEL_FLOWS]);
+		fprintf(f, "flows %u ", flows);
+	}
+	if (tb[TCA_FQ_CODEL_QUANTUM] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_QUANTUM]) >= sizeof(__u32)) {
+		quantum = rta_getattr_u32(tb[TCA_FQ_CODEL_QUANTUM]);
+		fprintf(f, "quantum %u ", quantum);
+	}
+	if (tb[TCA_FQ_CODEL_TARGET] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_TARGET]) >= sizeof(__u32)) {
+		target = rta_getattr_u32(tb[TCA_FQ_CODEL_TARGET]);
+		fprintf(f, "target %s ", sprint_time(target, b1));
+	}
+	if (tb[TCA_FQ_CODEL_INTERVAL] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_INTERVAL]) >= sizeof(__u32)) {
+		interval = rta_getattr_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+		fprintf(f, "interval %s ", sprint_time(interval, b1));
+	}
+	if (tb[TCA_FQ_CODEL_ECN] &&
+	    RTA_PAYLOAD(tb[TCA_FQ_CODEL_ECN]) >= sizeof(__u32)) {
+		ecn = rta_getattr_u32(tb[TCA_FQ_CODEL_ECN]);
+		if (ecn)
+			fprintf(f, "ecn ");
+	}
+
+	return 0;
+}
+
+static int fq_codel_print_xstats(struct qdisc_util *qu, FILE *f,
+				 struct rtattr *xstats)
+{
+	struct tc_fq_codel_xstats *st;
+	SPRINT_BUF(b1);
+
+	if (xstats == NULL)
+		return 0;
+
+	if (RTA_PAYLOAD(xstats) < sizeof(*st))
+		return -1;
+
+	st = RTA_DATA(xstats);
+	if (st->type == TCA_FQ_CODEL_XSTATS_QDISC) {
+		fprintf(f, "  maxpacket %u drop_overlimit %u new_flow_count %u ecn_mark %u",
+			st->qdisc_stats.maxpacket,
+			st->qdisc_stats.drop_overlimit,
+			st->qdisc_stats.new_flow_count,
+			st->qdisc_stats.ecn_mark);
+		fprintf(f, "\n  new_flows_len %u old_flows_len %u",
+			st->qdisc_stats.new_flows_len,
+			st->qdisc_stats.old_flows_len);
+	}
+	if (st->type == TCA_FQ_CODEL_XSTATS_CLASS) {
+		fprintf(f, "  deficit %d count %u lastcount %u ldelay %s",
+			st->class_stats.deficit,
+			st->class_stats.count,
+			st->class_stats.lastcount,
+			sprint_time(st->class_stats.ldelay, b1));
+		if (st->class_stats.dropping) {
+			fprintf(f, " dropping");
+			if (st->class_stats.drop_next < 0)
+				fprintf(f, " drop_next -%s",
+					sprint_time(-st->class_stats.drop_next, b1));
+			else
+				fprintf(f, " drop_next %s",
+					sprint_time(st->class_stats.drop_next, b1));
+		}
+	}
+	return 0;
+
+}
+
+struct qdisc_util fq_codel_qdisc_util = {
+	.id		= "fq_codel",
+	.parse_qopt	= fq_codel_parse_opt,
+	.print_qopt	= fq_codel_print_opt,
+	.print_xstats	= fq_codel_print_xstats,
+};

^ permalink raw reply related

* RE: [net-next 06/12] ixgbe: Hardware Timestamping + PTP Hardware Clock (PHC)
From: Keller, Jacob E @ 2012-05-11 19:34 UTC (permalink / raw)
  To: Richard Cochran
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <20120511051509.GA2170@netboy.at.omicron.at>

> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Thursday, May 10, 2012 10:15 PM
> To: Keller, Jacob E
> Cc: Kirsher, Jeffrey T; davem@davemloft.net; netdev@vger.kernel.org;
> gospo@redhat.com; sassmann@redhat.com
> Subject: Re: [net-next 06/12] ixgbe: Hardware Timestamping + PTP Hardware
> Clock (PHC)
> 
> On Thu, May 10, 2012 at 09:53:18PM +0000, Keller, Jacob E wrote:
> > > > +	/*
> > > > +	 * If this bit is set, then the RX registers contain the time
> stamp. No
> > > > +	 * other packet will be time stamped until we read these
> registers, so
> > > > +	 * read the registers to make them available again. Because only
> one
> > > > +	 * packet can be time stamped at a time, we know that the
> register
> > > > +	 * values must belong to this one here and therefore we don't
> need to
> > > > +	 * compare any of the additional attributes stored for it.
> > >
> > > I suspect that this assumption is wrong. What happens if the time
> > > stamping logic locks a value but the packet is lost because the ring is
> full?
> > >
> > > BTW, the IGB driver also has this defect.
> > >
> >
> > Note how I read the rx registers first? So it will always clear the value.
> > That should unlock the value for the next rx stamp packet.
> 
> 1. Hw recognizes ptp event packet, locks time stamp 2. Hw drops packet because
> queue is full 3. No more time stamps are ever generated
> 
> Can this happen? The docs seems to say it can.
> 
> Richard

Sorry for the spam here, but I looked at the ixgbe code, and found a solution. When ptp4l discovers a missing rx timestamp, it faults and then waits for 15 seconds until the fault is cleared. After this, it reopens the socket, and reruns the hwtstamp ioctl. This function actually does clear the tx/rx timestamps (just in case), so after a fault the values should end up being reset. Is this good enough?

- Jake

^ permalink raw reply

* [PATCH v2 net-next] fq_codel: Fair Queue Codel AQM
From: Eric Dumazet @ 2012-05-11 19:30 UTC (permalink / raw)
  To: Changli Gao, David Miller
  Cc: netdev, Dave Taht, Kathleen Nichols, Van Jacobson, Tom Herbert,
	Matt Mathis, Yuchung Cheng, Stephen Hemminger,
	Maciej Żenczykowski, Nandita Dukkipati
In-Reply-To: <1336752516.31653.196.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

Fair Queue Codel packet scheduler

Principles :

- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
                              be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
  so that new flows have priority on old ones.

- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)

tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
                      [ target TIME ] [ interval TIME ] [ noecn ]
                      [ quantum BYTES ]

defaults : 1024 flows, 10240 packets limit, quantum : device MTU
           target : 5ms (CoDel default)
           interval : 100ms (CoDel default)

Impressive results on load :

# tc -s -d cl show dev eth9

class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0 
 Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0) 
 rate 201691Kbit 28595pps backlog 0b 312p requeues 0 
 lended: 33063109 borrowed: 0 giants: 0
 tokens: -912 ctokens: -912

class fq_codel 10:1735 parent 10: 
 (dropped 1292, overlimits 0 requeues 0) 
 backlog 15140b 10p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4524 parent 10: 
 (dropped 1291, overlimits 0 requeues 0) 
 backlog 16654b 11p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4e74 parent 10: 
 (dropped 1290, overlimits 0 requeues 0) 
 backlog 6056b 4p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms
class fq_codel 10:628a parent 10: 
 (dropped 1289, overlimits 0 requeues 0) 
 backlog 7570b 5p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms
class fq_codel 10:a4b3 parent 10: 
 (dropped 302, overlimits 0 requeues 0) 
 backlog 16654b 11p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:c3c2 parent 10: 
 (dropped 1284, overlimits 0 requeues 0) 
 backlog 13626b 9p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:d331 parent 10: 
 (dropped 299, overlimits 0 requeues 0) 
 backlog 15140b 10p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.0ms
class fq_codel 10:d526 parent 10: 
 (dropped 12160, overlimits 0 requeues 0) 
 backlog 35870b 211p requeues 0 
  deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us
class fq_codel 10:e2c6 parent 10: 
 (dropped 1288, overlimits 0 requeues 0) 
 backlog 15140b 10p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:eab5 parent 10: 
 (dropped 1285, overlimits 0 requeues 0) 
 backlog 16654b 11p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:f220 parent 10: 
 (dropped 1289, overlimits 0 requeues 0) 
 backlog 15140b 10p requeues 0 
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms

# tc -s -d qd show dev eth9

qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17
 Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71) 
 rate 201697Kbit 28602pps backlog 0b 260p requeues 71 
qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn 
 Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0) 
 rate 201697Kbit 28602pps backlog 189352b 260p requeues 0 
  maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593
  new_flows_len 0 old_flows_len 11


# ping -c 10 172.30.42.18
PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data.
64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms
64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms
64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms
64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms
64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms
64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms
64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms
64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms
64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms
64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms

--- 172.30.42.18 ping statistics ---
10 packets transmitted, 10 received, 0% packet loss, time 8999ms
rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms

Much better than SFQ because of priority given to new flows, and fast
path dirtying fewer cache lines.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Cc: Changli Gao <xiaosuo@gmail.com>
---
v2: added 'dropped' counter per flow (sum of drops and marks)
    .change method allowed (tc qdisc change .... )
    quantum is a tunable
    no starvation of old flows because of new ones.
    drop_count correctly handled in dequeue() (upcall to parents)
    pkt_sched.h cleanups

 include/linux/pkt_sched.h |   54 +++
 net/sched/Kconfig         |   11 
 net/sched/Makefile        |    1 
 net/sched/sch_fq_codel.c  |  625 ++++++++++++++++++++++++++++++++++++
 4 files changed, 691 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index cde56c2..32aef0a 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -681,4 +681,58 @@ struct tc_codel_xstats {
 	__u32	dropping;  /* are we in dropping state ? */
 };
 
+/* FQ_CODEL */
+
+enum {
+	TCA_FQ_CODEL_UNSPEC,
+	TCA_FQ_CODEL_TARGET,
+	TCA_FQ_CODEL_LIMIT,
+	TCA_FQ_CODEL_INTERVAL,
+	TCA_FQ_CODEL_ECN,
+	TCA_FQ_CODEL_FLOWS,
+	TCA_FQ_CODEL_QUANTUM,
+	__TCA_FQ_CODEL_MAX
+};
+
+#define TCA_FQ_CODEL_MAX	(__TCA_FQ_CODEL_MAX - 1)
+
+enum {
+	TCA_FQ_CODEL_XSTATS_QDISC,
+	TCA_FQ_CODEL_XSTATS_CLASS,
+};
+
+struct tc_fq_codel_qd_stats {
+	__u32	maxpacket;	/* largest packet we've seen so far */
+	__u32	drop_overlimit; /* number of time max qdisc
+				 * packet limit was hit
+				 */
+	__u32	ecn_mark;	/* number of packets we ECN marked
+				 * instead of being dropped
+				 */
+	__u32	new_flow_count; /* number of time packets
+				 * created a 'new flow'
+				 */
+	__u32	new_flows_len;	/* count of flows in new list */
+	__u32	old_flows_len;	/* count of flows in old list */
+};
+
+struct tc_fq_codel_cl_stats {
+	__s32	deficit;
+	__u32	ldelay;		/* in-queue delay seen by most recently
+				 * dequeued packet
+				 */
+	__u32	count;
+	__u32	lastcount;
+	__u32	dropping;
+	__s32	drop_next;
+};
+
+struct tc_fq_codel_xstats {
+	__u32	type;
+	union {
+		struct tc_fq_codel_qd_stats qdisc_stats;
+		struct tc_fq_codel_cl_stats class_stats;
+	};
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index fadd252..e7a8976 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -261,6 +261,17 @@ config NET_SCH_CODEL
 
 	  If unsure, say N.
 
+config NET_SCH_FQ_CODEL
+	tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
+	help
+	  Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
+	  packet scheduling algorithm.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_fq_codel.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 30fab03..5940a19 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_SCH_CHOKE)	+= sch_choke.o
 obj-$(CONFIG_NET_SCH_QFQ)	+= sch_qfq.o
 obj-$(CONFIG_NET_SCH_CODEL)	+= sch_codel.o
+obj-$(CONFIG_NET_SCH_FQ_CODEL)	+= sch_fq_codel.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
new file mode 100644
index 0000000..1f538e4
--- /dev/null
+++ b/net/sched/sch_fq_codel.c
@@ -0,0 +1,625 @@
+/*
+ * Fair Queue CoDel discipline
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/flow_keys.h>
+#include <net/codel.h>
+
+/*	Fair Queue CoDel.
+ *
+ * Principles :
+ * Packets are classified (internal classifier or external) on flows.
+ * This is a Stochastic model (as we use a hash, several flows
+ *			       might be hashed on same slot)
+ * Each flow has a CoDel managed queue.
+ * Flows are linked onto two (Round Robin) lists,
+ * so that new flows have priority on old ones.
+ *
+ * For a given flow, packets are not reordered (CoDel uses a FIFO)
+ * head drops only.
+ * ECN capability is on by default.
+ * Low memory footprint (64 bytes per flow)
+ */
+
+struct fq_codel_flow {
+	struct sk_buff	  *head;	/* oldest queued packet, NULL when the flow is empty */
+	struct sk_buff	  *tail;	/* newest queued packet; only meaningful while head != NULL */
+	struct list_head  flowchain;	/* link in new_flows/old_flows; empty list while the flow is idle */
+	int		  deficit;	/* bytes the flow may still send before it is moved to old_flows */
+	u32		  dropped; /* number of drops (or ECN marks) on this flow */
+	struct codel_vars cvars;	/* per-flow CoDel state */
+}; /* please try to keep this structure <= 64 bytes */
+
+struct fq_codel_sched_data {
+	struct tcf_proto *filter_list;	/* optional external classifier */
+	struct fq_codel_flow *flows;	/* Flows table [flows_cnt] */
+	u32		*backlogs;	/* backlog table [flows_cnt] */
+	u32		flows_cnt;	/* number of flows */
+	u32		perturbation;	/* hash perturbation */
+	u32		quantum;	/* deficit refill, in bytes; defaults to psched_mtu(qdisc_dev(sch)) */
+	struct codel_params cparams;	/* CoDel parameters shared by all flows */
+	struct codel_stats cstats;	/* CoDel statistics shared by all flows */
+	u32		drop_overlimit;	/* number of times the qdisc packet limit was hit */
+	u32		new_flow_count;	/* number of times a flow was put on new_flows */
+
+	struct list_head new_flows;	/* list of new flows */
+	struct list_head old_flows;	/* list of old flows */
+};
+
+/* Map @skb to a slot of the flow table, in the range [0, q->flows_cnt).
+ *
+ * The dissected destination, source (xored with the IP protocol) and ports
+ * are mixed by jhash with the per-qdisc perturbation. The 32bit hash is then
+ * scaled to the table size with a multiply and a shift.
+ */
+static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
+				  const struct sk_buff *skb)
+{
+	struct flow_keys fk;
+	u32 a, b, c;
+	u64 scaled;
+
+	skb_flow_dissect(skb, &fk);
+
+	a = (__force u32)fk.dst;
+	b = (__force u32)fk.src ^ fk.ip_proto;
+	c = (__force u32)fk.ports;
+
+	scaled = (u64)jhash_3words(a, b, c, q->perturbation) * q->flows_cnt;
+	return scaled >> 32;
+}
+
+/* Pick the flow an skb belongs to.
+ *
+ * Returns a 1-based flow number, or 0 when the packet must not be queued.
+ * *qerr is only written when the external classifier is consulted; it then
+ * holds the NET_XMIT_* code the caller should return if 0 comes back.
+ */
+static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
+				      int *qerr)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tcf_result res;
+	u32 minor;
+	int verdict;
+
+	/* skb->priority may directly name one of our flows */
+	minor = TC_H_MIN(skb->priority);
+	if (TC_H_MAJ(skb->priority) == sch->handle &&
+	    minor > 0 && minor <= q->flows_cnt)
+		return minor;
+
+	/* no external classifier attached: use the internal hash */
+	if (!q->filter_list)
+		return fq_codel_hash(q, skb) + 1;
+
+	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	verdict = tc_classify(skb, q->filter_list, &res);
+	if (verdict < 0)
+		return 0;
+
+#ifdef CONFIG_NET_CLS_ACT
+	if (verdict == TC_ACT_STOLEN || verdict == TC_ACT_QUEUED) {
+		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		return 0;
+	}
+	if (verdict == TC_ACT_SHOT)
+		return 0;
+#endif
+
+	minor = TC_H_MIN(res.classid);
+	return minor <= q->flows_cnt ? minor : 0;
+}
+
+/* helper functions : might be changed when/if skb use a standard list_head */
+
+/* remove one skb from head of slot queue
+ *
+ * The flow must not be empty: flow->head is dereferenced without a NULL
+ * check. flow->tail is not updated here.
+ */
+static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
+{
+	struct sk_buff *skb = flow->head;
+
+	flow->head = skb->next;
+	skb->next = NULL;
+	return skb;
+}
+
+/* Append @skb at the tail of @flow's singly linked FIFO.
+ * flow->tail is only trusted when flow->head is set.
+ */
+static inline void flow_queue_add(struct fq_codel_flow *flow,
+				  struct sk_buff *skb)
+{
+	if (flow->head)
+		flow->tail->next = skb;
+	else
+		flow->head = skb;
+
+	flow->tail = skb;
+	skb->next = NULL;
+}
+
+/* Drop the packet at the head of the flow holding the largest backlog.
+ *
+ * Returns the index of the flow that lost a packet. At least one packet
+ * must be queued: when every backlog is zero, flow 0 is selected and its
+ * head is dequeued unconditionally.
+ */
+static unsigned int fq_codel_drop(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int fat = 0, fat_backlog = 0, n, pkt_len;
+	struct fq_codel_flow *victim;
+	struct sk_buff *skb;
+
+	/* Queue is full! Find the fat flow and drop packet from it.
+	 * This might sound expensive, but with 1024 flows, we scan
+	 * 4KB of memory, and we don't need to handle a complex tree
+	 * in fast path (packet queue/enqueue) with many cache misses.
+	 */
+	for (n = 0; n < q->flows_cnt; n++) {
+		if (q->backlogs[n] <= fat_backlog)
+			continue;
+		fat_backlog = q->backlogs[n];
+		fat = n;
+	}
+
+	victim = &q->flows[fat];
+	skb = dequeue_head(victim);
+	pkt_len = qdisc_pkt_len(skb);
+
+	q->backlogs[fat] -= pkt_len;
+	sch->qstats.backlog -= pkt_len;
+	sch->qstats.drops++;
+	sch->q.qlen--;
+	victim->dropped++;
+
+	kfree_skb(skb);
+	return fat;
+}
+
+/* Classify @skb, append it to its flow and enforce the qdisc packet limit.
+ *
+ * Returns NET_XMIT_SUCCESS when the packet was queued and nothing was
+ * dropped from its own flow, NET_XMIT_CN when the limit was exceeded and
+ * the packet dropped to make room came from the same flow, or the code
+ * chosen by fq_codel_classify() when no flow could be selected (the skb is
+ * freed in that case).
+ */
+static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int idx;
+	struct fq_codel_flow *flow;
+	int uninitialized_var(ret);
+
+	idx = fq_codel_classify(skb, sch, &ret);
+	if (idx == 0) {
+		if (ret & __NET_XMIT_BYPASS)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+	idx--;	/* classifier results are 1-based, the tables are 0-based */
+
+	codel_set_enqueue_time(skb);
+	flow = &q->flows[idx];
+	flow_queue_add(flow, skb);
+	q->backlogs[idx] += qdisc_pkt_len(skb);
+	sch->qstats.backlog += qdisc_pkt_len(skb);
+
+	/* an idle flow is not on any list: (re)start it as a new flow */
+	if (list_empty(&flow->flowchain)) {
+		list_add_tail(&flow->flowchain, &q->new_flows);
+		codel_vars_init(&flow->cvars);
+		q->new_flow_count++;
+		flow->deficit = q->quantum;
+		flow->dropped = 0;
+	}
+
+	/* qlen already counts the new packet, so the qdisc is within its
+	 * limit as long as qlen <= limit. Testing with '<' would drop one
+	 * packet too early and make a limit of 1 unable to hold anything.
+	 */
+	if (++sch->q.qlen <= sch->limit)
+		return NET_XMIT_SUCCESS;
+
+	q->drop_overlimit++;
+	/* Return Congestion Notification only if we dropped a packet
+	 * from this flow.
+	 */
+	if (fq_codel_drop(sch) == idx)
+		return NET_XMIT_CN;
+
+	/* As we dropped a packet, better let upper stack know this */
+	qdisc_tree_decrease_qlen(sch, 1);
+	return NET_XMIT_SUCCESS;
+}
+
+/* Callback handed to codel_dequeue() to pull one packet out of a flow.
+ * Only the qdisc-wide backlog and qlen are updated here; the per-flow
+ * backlog counter is passed to codel_dequeue() by the caller and is not
+ * touched by this function.
+ */
+static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
+{
+	struct fq_codel_flow *flow;
+	struct sk_buff *skb;
+
+	flow = container_of(vars, struct fq_codel_flow, cvars);
+	if (!flow->head)
+		return NULL;
+
+	skb = dequeue_head(flow);
+	sch->qstats.backlog -= qdisc_pkt_len(skb);
+	sch->q.qlen--;
+	return skb;
+}
+
+/* Return the next packet to transmit, or NULL when no flow is active.
+ *
+ * Flows on new_flows are served before flows on old_flows. A flow whose
+ * deficit is used up gets one more quantum and moves to the tail of
+ * old_flows; a flow that yields no packet is either parked on old_flows or
+ * taken off the lists altogether.
+ */
+static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct fq_codel_flow *flow;
+	struct list_head *head;
+	struct sk_buff *skb;
+	u32 drops_before, marks_before;
+
+	for (;;) {
+		head = &q->new_flows;
+		if (list_empty(head)) {
+			head = &q->old_flows;
+			if (list_empty(head))
+				return NULL;
+		}
+		flow = list_first_entry(head, struct fq_codel_flow, flowchain);
+
+		/* out of credit: refill and queue behind the other old flows */
+		if (flow->deficit <= 0) {
+			flow->deficit += q->quantum;
+			list_move_tail(&flow->flowchain, &q->old_flows);
+			continue;
+		}
+
+		drops_before = q->cstats.drop_count;
+		marks_before = q->cstats.ecn_mark;
+
+		skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
+				    dequeue, &q->backlogs[flow - q->flows]);
+
+		/* charge the drops and ECN marks of this call to the flow */
+		flow->dropped += q->cstats.drop_count - drops_before;
+		flow->dropped += q->cstats.ecn_mark - marks_before;
+
+		if (skb)
+			break;
+
+		/* force a pass through old_flows to prevent starvation */
+		if ((head == &q->new_flows) && !list_empty(&q->old_flows))
+			list_move_tail(&flow->flowchain, &q->old_flows);
+		else
+			list_del_init(&flow->flowchain);
+	}
+
+	qdisc_bstats_update(sch, skb);
+	flow->deficit -= qdisc_pkt_len(skb);
+	/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
+	 * or HTB crashes. Defer it for next round.
+	 */
+	if (q->cstats.drop_count && sch->q.qlen) {
+		qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+		q->cstats.drop_count = 0;
+	}
+	return skb;
+}
+
+/* Empty the qdisc by dequeueing and freeing packets until none is left. */
+static void fq_codel_reset(struct Qdisc *sch)
+{
+	for (;;) {
+		struct sk_buff *skb = fq_codel_dequeue(sch);
+
+		if (!skb)
+			break;
+		kfree_skb(skb);
+	}
+}
+
+static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {	/* every attribute is a u32 */
+	[TCA_FQ_CODEL_TARGET]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_LIMIT]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_INTERVAL]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_ECN]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_FLOWS]	= { .type = NLA_U32 },
+	[TCA_FQ_CODEL_QUANTUM]	= { .type = NLA_U32 },
+};
+
+/* Apply the attributes nested in @opt to the qdisc.
+ *
+ * The number of flows can only be set while the flow table has not been
+ * allocated yet. All other parameters are updated under the qdisc tree
+ * lock, and packets in excess of the (possibly new) limit are dropped.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
+	int err;
+
+	if (!opt)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_FQ_CODEL_FLOWS]) {
+		/* the flow table cannot be resized once it exists */
+		if (q->flows)
+			return -EINVAL;
+		q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
+		if (q->flows_cnt == 0 || q->flows_cnt > 65536)
+			return -EINVAL;
+	}
+
+	sch_tree_lock(sch);
+
+	if (tb[TCA_FQ_CODEL_TARGET]) {
+		u64 usecs = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
+
+		q->cparams.target = (usecs * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_FQ_CODEL_INTERVAL]) {
+		u64 usecs = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
+
+		q->cparams.interval = (usecs * NSEC_PER_USEC) >> CODEL_SHIFT;
+	}
+
+	if (tb[TCA_FQ_CODEL_LIMIT])
+		sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
+
+	if (tb[TCA_FQ_CODEL_ECN])
+		q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
+
+	if (tb[TCA_FQ_CODEL_QUANTUM])
+		q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
+
+	/* trim the queue down to the limit, counting each freed packet */
+	while (sch->q.qlen > sch->limit) {
+		kfree_skb(fq_codel_dequeue(sch));
+		q->cstats.drop_count++;
+	}
+	qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+	q->cstats.drop_count = 0;
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Allocate @sz zeroed bytes: try kzalloc() quietly first, then fall back
+ * to vzalloc(). Memory obtained here must be released with fq_codel_free().
+ */
+static void *fq_codel_zalloc(size_t sz)
+{
+	void *mem;
+
+	mem = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
+	return mem ? mem : vzalloc(sz);
+}
+
+/* Release memory obtained from fq_codel_zalloc(); NULL is accepted. */
+static void fq_codel_free(void *addr)
+{
+	if (!addr)
+		return;
+
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+/* Tear down the filter chain and release the flow and backlog tables. */
+static void fq_codel_destroy(struct Qdisc *sch)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+
+	tcf_destroy_chain(&priv->filter_list);
+
+	fq_codel_free(priv->backlogs);
+	fq_codel_free(priv->flows);
+}
+
+/* Set up a new fq_codel qdisc.
+ *
+ * Defaults (10240 packets limit, 1024 flows, quantum taken from
+ * psched_mtu(), ECN enabled) are installed first, then overridden by the
+ * attributes in @opt if any, and finally the flow and backlog tables are
+ * allocated.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int i;
+
+	sch->limit = 10*1024;
+	q->flows_cnt = 1024;
+	q->quantum = psched_mtu(qdisc_dev(sch));
+	q->perturbation = net_random();
+	INIT_LIST_HEAD(&q->new_flows);
+	INIT_LIST_HEAD(&q->old_flows);
+	codel_params_init(&q->cparams);
+	codel_stats_init(&q->cstats);
+	q->cparams.ecn = true;
+
+	if (opt) {
+		int err = fq_codel_change(sch, opt);
+		if (err)
+			return err;
+	}
+
+	if (!q->flows) {
+		q->flows = fq_codel_zalloc(q->flows_cnt *
+					   sizeof(struct fq_codel_flow));
+		if (!q->flows)
+			return -ENOMEM;
+		q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
+		if (!q->backlogs) {
+			fq_codel_free(q->flows);
+			/* do not leave a dangling pointer behind: a later
+			 * fq_codel_destroy() would free it a second time
+			 */
+			q->flows = NULL;
+			return -ENOMEM;
+		}
+		/* an empty flowchain marks a flow as idle */
+		for (i = 0; i < q->flows_cnt; i++) {
+			struct fq_codel_flow *flow = q->flows + i;
+
+			INIT_LIST_HEAD(&flow->flowchain);
+		}
+	}
+	if (sch->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;
+}
+
+/* Dump the current configuration as a TCA_OPTIONS nest into @skb.
+ * Target and interval are converted with codel_time_to_us().
+ *
+ * Returns skb->len on success, -1 if an attribute could not be added.
+ */
+static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+
+	if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
+			codel_time_to_us(q->cparams.target)))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_FQ_CODEL_LIMIT, sch->limit))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
+			codel_time_to_us(q->cparams.interval)))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_FQ_CODEL_ECN, q->cparams.ecn))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, q->quantum))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, q->flows_cnt))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return skb->len;
+
+nla_put_failure:
+	return -1;
+}
+
+/* Export the qdisc-wide statistics. The lengths of the new and old flow
+ * lists are obtained by walking both lists.
+ */
+static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	struct tc_fq_codel_xstats st = {
+		.type				= TCA_FQ_CODEL_XSTATS_QDISC,
+		.qdisc_stats.maxpacket		= q->cstats.maxpacket,
+		.qdisc_stats.drop_overlimit	= q->drop_overlimit,
+		.qdisc_stats.ecn_mark		= q->cstats.ecn_mark,
+		.qdisc_stats.new_flow_count	= q->new_flow_count,
+	};
+	struct list_head *entry;
+	u32 new_len = 0, old_len = 0;
+
+	list_for_each(entry, &q->new_flows)
+		new_len++;
+	list_for_each(entry, &q->old_flows)
+		old_len++;
+
+	st.qdisc_stats.new_flows_len = new_len;
+	st.qdisc_stats.old_flows_len = old_len;
+
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
+static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;	/* no child qdisc is ever reported for a class */
+}
+
+static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
+{
+	return 0;	/* always 0, whatever classid is asked for */
+}
+
+static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
+			      u32 classid)
+{
+	/* a filter is being bound: we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
+	return 0;	/* parent and classid are ignored */
+}
+
+static void fq_codel_put(struct Qdisc *q, unsigned long cl)
+{	/* intentionally empty: there is nothing to release */
+}
+
+/* Return the qdisc's filter chain; filters can only hang off the qdisc
+ * itself (cl == 0), any other class handle yields NULL.
+ */
+static struct tcf_proto **fq_codel_find_tcf(struct Qdisc *sch, unsigned long cl)
+{
+	struct fq_codel_sched_data *priv = qdisc_priv(sch);
+
+	return cl ? NULL : &priv->filter_list;
+}
+
+static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
+			  struct sk_buff *skb, struct tcmsg *tcm)
+{
+	tcm->tcm_handle |= TC_H_MIN(cl);	/* cl is the 1-based flow number */
+	return 0;
+}
+
+/* Export the statistics of one flow.
+ *
+ * @cl is the 1-based flow number. When it designates a valid flow, the
+ * queue statistics (packet count, backlog, drops) and a snapshot of the
+ * flow's CoDel state are reported; otherwise only zeroed queue statistics
+ * are sent.
+ *
+ * Returns -1 if the queue statistics could not be copied, otherwise the
+ * result of gnet_stats_copy_app() for a valid flow, or 0.
+ */
+static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+				     struct gnet_dump *d)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	u32 idx = cl - 1;
+	struct gnet_stats_queue qs = { 0 };
+	struct tc_fq_codel_xstats xstats;
+
+	/* cl == 0 wraps idx to a huge value and is rejected here as well */
+	if (idx < q->flows_cnt) {
+		const struct fq_codel_flow *flow = &q->flows[idx];
+		const struct sk_buff *skb = flow->head;
+
+		memset(&xstats, 0, sizeof(xstats));
+		xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
+		xstats.class_stats.deficit = flow->deficit;
+		xstats.class_stats.ldelay =
+			codel_time_to_us(flow->cvars.ldelay);
+		xstats.class_stats.count = flow->cvars.count;
+		xstats.class_stats.lastcount = flow->cvars.lastcount;
+		xstats.class_stats.dropping = flow->cvars.dropping;
+		if (flow->cvars.dropping) {
+			codel_tdiff_t delta = flow->cvars.drop_next -
+					      codel_get_time();
+
+			/* convert the magnitude, then restore the sign */
+			xstats.class_stats.drop_next = (delta >= 0) ?
+				codel_time_to_us(delta) :
+				-codel_time_to_us(-delta);
+		}
+		/* there is no per-flow packet counter: walk the queue */
+		while (skb) {
+			qs.qlen++;
+			skb = skb->next;
+		}
+		qs.backlog = q->backlogs[idx];
+		qs.drops = flow->dropped;
+	}
+	if (gnet_stats_copy_queue(d, &qs) < 0)
+		return -1;
+	if (idx < q->flows_cnt)
+		return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
+	return 0;
+}
+
+/* Call arg->fn for every active flow, honouring arg->skip.
+ *
+ * Idle flows (empty flowchain) and entries below arg->skip are counted but
+ * not reported. A negative return from the callback sets arg->stop and
+ * ends the walk without counting that entry.
+ */
+static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct fq_codel_sched_data *q = qdisc_priv(sch);
+	unsigned int slot;
+
+	if (arg->stop)
+		return;
+
+	for (slot = 0; slot < q->flows_cnt; slot++) {
+		bool skipped = list_empty(&q->flows[slot].flowchain) ||
+			       arg->count < arg->skip;
+
+		/* flows are exposed with 1-based handles */
+		if (!skipped && arg->fn(sch, slot + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops fq_codel_class_ops = {
+	.leaf		=	fq_codel_leaf,
+	.get		=	fq_codel_get,
+	.put		=	fq_codel_put,
+	.tcf_chain	=	fq_codel_find_tcf,
+	.bind_tcf	=	fq_codel_bind,
+	.unbind_tcf	=	fq_codel_put,	/* same empty handler as .put */
+	.dump		=	fq_codel_dump_class,
+	.dump_stats	=	fq_codel_dump_class_stats,
+	.walk		=	fq_codel_walk,
+};
+
+static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+	.cl_ops		=	&fq_codel_class_ops,
+	.id		=	"fq_codel",
+	.priv_size	=	sizeof(struct fq_codel_sched_data),
+	.enqueue	=	fq_codel_enqueue,
+	.dequeue	=	fq_codel_dequeue,
+	.peek		=	qdisc_peek_dequeued,
+	.drop		=	fq_codel_drop,	/* NOTE(review): returns a flow index, not a length - confirm this is what callers of ->drop expect */
+	.init		=	fq_codel_init,
+	.reset		=	fq_codel_reset,
+	.destroy	=	fq_codel_destroy,
+	.change		=	fq_codel_change,
+	.dump		=	fq_codel_dump,
+	.dump_stats =	fq_codel_dump_stats,
+	.owner		=	THIS_MODULE,
+};
+
+static int __init fq_codel_module_init(void)
+{
+	return register_qdisc(&fq_codel_qdisc_ops);	/* hand register_qdisc()'s result back unchanged */
+}
+
+static void __exit fq_codel_module_exit(void)
+{
+	unregister_qdisc(&fq_codel_qdisc_ops);	/* undo the registration done at module init */
+}
+
+module_init(fq_codel_module_init)
+module_exit(fq_codel_module_exit)
+MODULE_AUTHOR("Eric Dumazet");
+MODULE_LICENSE("GPL");

^ permalink raw reply related

* RE: [net-next 06/12] ixgbe: Hardware Timestamping + PTP Hardware Clock (PHC)
From: Keller, Jacob E @ 2012-05-11 19:23 UTC (permalink / raw)
  To: Richard Cochran
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <20120511051509.GA2170@netboy.at.omicron.at>

> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Thursday, May 10, 2012 10:15 PM
> To: Keller, Jacob E
> Cc: Kirsher, Jeffrey T; davem@davemloft.net; netdev@vger.kernel.org;
> gospo@redhat.com; sassmann@redhat.com
> Subject: Re: [net-next 06/12] ixgbe: Hardware Timestamping + PTP Hardware
> Clock (PHC)
> 
> On Thu, May 10, 2012 at 09:53:18PM +0000, Keller, Jacob E wrote:
> > > > +	/*
> > > > +	 * If this bit is set, then the RX registers contain the time stamp. No
> > > > +	 * other packet will be time stamped until we read these registers, so
> > > > +	 * read the registers to make them available again. Because only one
> > > > +	 * packet can be time stamped at a time, we know that the register
> > > > +	 * values must belong to this one here and therefore we don't need to
> > > > +	 * compare any of the additional attributes stored for it.
> > >
> > > I suspect that this assumption is wrong. What happens if the time
> > > stamping logic locks a value but the packet is lost because the ring is full?
> > >
> > > BTW, the IGB driver also has this defect.
> > >
> >
> > Note how I read the rx registers first? So it will always clear the value.
> > That should unlock the value for the next rx stamp packet.
> 
> 1. Hw recognizes ptp event packet, locks time stamp
> 2. Hw drops packet because queue is full
> 3. No more time stamps are ever generated
> 
> Can this happen? The docs seem to say it can.
> 
> Richard

I believe this very rare case might be possible, but I don't think that checking the ptp seqid will fix anything. In normal cases, hardware latches Rx packet timestamp, then the ptp packet goes into the queue and we process it shortly after. Before we process that packet there will never be another packet in the queue that needs a timestamp. We know this because the hardware stops timestamping until we unlatch the RX registers. This should mean we don't need to check the sequence ID, and spending time doing it would never fix the issue you are talking about.

The issue is for when a packet is timestamped and then never reaches the queue. Then the rx stamp registers are locked for good, because we never clear them, and hardware would never timestamp another receive packet. I don't know a good solution to this, except to clear the registers periodically. Do you have any suggestions?

- Jake

^ permalink raw reply

* pch_gbe oops with vlan
From: Andy Cress @ 2012-05-11 18:49 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1336752516.31653.196.camel@edumazet-glaptop>

Folks,

I am looking for help in debugging a pch_gbe driver oops/abort.

Kernel: version 2.6.32-220.el6.i686 (RHEL6.2)
Driver: pch_gbe version 0.91-NAPI  (source tarball we used is at https://sendfile.kontron.com/message/24tdUi6MXklnUtBLnOsumq until May 16)
NIC: 0b:00.1 Ethernet controller [0200]: Intel Corporation Platform Controller Hub EG20T Gigabit Ethernet Controller [8086:8802] (rev 02)

Configuration, with VLAN:
 eth0 (not started)
 eth0.100 = 192.168.100.1 
 eth0.200 = 192.168.200.1 
 eth0.6  = 192.168.6.1

When starting the VLAN configuration, then doing a ping test for >= 5 minutes, I get a kernel oops/abort message as shown below.  This does not happen without configuring VLAN.
Where should I look for possible causes for a transmit queue timeout like this?  

I have contacted the OKI/LAPIS driver authors, but no response so far.  I thought that this group might be able to comment from similar experiences.

Andy

May 11 11:06:09 kontron kernel: ------------[ cut here ]------------
May 11 11:06:09 kontron kernel: WARNING: at net/sched/sch_generic.c:261 dev_watchdog+0x1ec/0x200() (Not tainted)
May 11 11:06:09 kontron kernel: Hardware name: N/A
May 11 11:06:09 kontron kernel: NETDEV WATCHDOG: eth0 (pch_gbe): transmit queue 0 timed out
May 11 11:06:09 kontron kernel: Modules linked in: fuse ip6table_filter ip6_tables ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_CHECKSUM iptable_mangle iptable_filter ip_tables tun bridge autofs4 sunrpc cpufreq_ondemand acpi_cpufreq mperf 8021q garp stp llc ipv6 ext3 jbd uinput ppdev parport_pc parport sg microcode pch_gbe(U) mii serio_raw snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc ext4 mbcache jbd2 sd_mod crc_t10dif ahci sdhci_pci sdhci mmc_core video output dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_wait_scan]
May 11 11:06:09 kontron kernel: Pid: 0, comm: swapper Not tainted 2.6.32-220.el6.i686 #1
May 11 11:06:09 kontron kernel: Call Trace:
May 11 11:06:09 kontron kernel: [<c0454c81>] ? warn_slowpath_common+0x81/0xc0
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c0454d53>] ? warn_slowpath_fmt+0x33/0x40
May 11 11:06:09 kontron kernel: [<c07a16bc>] ? dev_watchdog+0x1ec/0x200
May 11 11:06:09 kontron kernel: [<c0471bfa>] ? insert_work+0x5a/0xb0
May 11 11:06:09 kontron kernel: [<c04656f9>] ? run_timer_softirq+0x139/0x2c0
May 11 11:06:09 kontron kernel: [<c0831315>] ? apic_timer_interrupt+0x31/0x38
May 11 11:06:09 kontron kernel: [<c07a14d0>] ? dev_watchdog+0x0/0x200
May 11 11:06:09 kontron kernel: [<c045be4a>] ? __do_softirq+0x8a/0x1a0
May 11 11:06:09 kontron kernel: [<c045bf9d>] ? do_softirq+0x3d/0x50
May 11 11:06:09 kontron kernel: [<c045c0f5>] ? irq_exit+0x65/0x70
May 11 11:06:09 kontron kernel: [<c0428473>] ? smp_apic_timer_interrupt+0x53/0x90
May 11 11:06:09 kontron kernel: [<c0831315>] ? apic_timer_interrupt+0x31/0x38
May 11 11:06:09 kontron kernel: [<c045007b>] ? throttle_cfs_rq+0x6b/0x130
May 11 11:06:09 kontron kernel: [<c064735f>] ? intel_idle+0xaf/0x140
May 11 11:06:09 kontron kernel: [<c075c282>] ? cpuidle_idle_call+0x72/0x100
May 11 11:06:09 kontron kernel: [<c0408964>] ? cpu_idle+0x94/0xd0
May 11 11:06:09 kontron kernel: [<c082a645>] ? start_secondary+0x20d/0x252
May 11 11:06:09 kontron kernel: ---[ end trace 3672ff56500ae344 ]---
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0): carrier now OFF (device state 3)
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0): device state change: 3 -> 2 (reason 40)
May 11 11:06:09 kontron NetworkManager[1608]: <info> (eth0): deactivating device (reason: 40).
May 11 11:06:10 kontron abrtd: Directory 'oops-2012-05-11-11:06:10-1924-0' creation detected
May 11 11:06:10 kontron abrt-dump-oops: Reported 1 kernel oopses to Abrt

^ permalink raw reply

* Re: [PATCH RFC 1/6] skbuff: support per-page destructors in copy_ubufs
From: Michael S. Tsirkin @ 2012-05-11 16:30 UTC (permalink / raw)
  To: Ian Campbell; +Cc: David Miller, netdev@vger.kernel.org, eric.dumazet@gmail.com
In-Reply-To: <20120511120836.GA4637@redhat.com>

On Fri, May 11, 2012 at 03:08:36PM +0300, Michael S. Tsirkin wrote:
> On Fri, May 11, 2012 at 11:58:12AM +0100, Ian Campbell wrote:
> > On Fri, 2012-05-11 at 10:00 +0100, Ian Campbell wrote:
> > > I'm seeing copy_ubufs called in my remote NFS test, which I don't
> > > think I expected -- I'll investigate why this is happening today. 
> > 
> > It's tcp_transmit_skb which can (conditionally) call skb_clone
> > (backtrace below)
> 
> Interesting. I didn't realise we clone skbs on data path:
> tcp_write_xmit calls tcp_transmit_skb with clone_it flag.
> Could someone comment on why we need to clone on good path
> like this?

Hmm, it's in case we need to retransmit it later.

> -- 
> MST

^ permalink raw reply

* RE: [net-next 07/12] ixgbe: Enable timesync clock-out feature for PPS support on X540
From: Keller, Jacob E @ 2012-05-11 18:10 UTC (permalink / raw)
  To: Richard Cochran
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <20120511054036.GC2170@netboy.at.omicron.at>

> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Thursday, May 10, 2012 10:41 PM
> To: Keller, Jacob E
> Cc: Kirsher, Jeffrey T; davem@davemloft.net; netdev@vger.kernel.org;
> gospo@redhat.com; sassmann@redhat.com
> Subject: Re: [net-next 07/12] ixgbe: Enable timesync clock-out feature for PPS
> support on X540
> 
> On Thu, May 10, 2012 at 10:08:44PM +0000, Keller, Jacob E wrote:
> >
> > Oops stupid mail program sent that on accident. Anyways: I think you
> > might be right, Richard. We don't read those timestamp values unless
> > the stat err bit for timestamps is set on the descriptor. But I am not
> > sure what happens when the timestamped packet is dropped off the end
> > of the ring. What would you propose here? How can we detect if this
> > timestamp doesn't match the packet? I can look into using the extra
> > hardware features for matching timestamps. That might be more useful, in
> > that it would help prevent this case.
> 
> [ Talking about the Rx time stamping locking from other patch... ]
> 
> The IGB provides some PTP event packet identification fields (seqNum,
> etc) just for the purpose of matching time stamps to packets. Some of the
> other PHC drivers (ixp4xx, dp83640) have code that does the matching.
> 

Ixgbe has the sequence number also. I'll take a look at the PHC drivers
that match already, and see how difficult it would be to perform this check.

- Jake

> HTH,
> Richard
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox