Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next-2.6 PATCH 2/2] be2net: Code changes in Tx path to use skb_dma_map/skb_dma_unmap
From: Ajit Khaparde @ 2009-09-04 13:12 UTC (permalink / raw)
  To: davem, netdev

Code changes to
 - In the tx completion processing, there were instances of unmapping a
memory as a page which was originally mapped as single. This patch takes care
of this by using skb_dma_map()/skb_dma_unmap() to map/unmap Tx buffers.
 - set gso_max_size to 65535. This was not done till now.


Signed-off-by: Ajit Khaparde <ajitk@serverengines.com>
---
 drivers/net/benet/be_main.c |   62 ++++++++++++++++++++++--------------------
 1 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index d09106f..ce11bba 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -385,15 +385,19 @@ static int make_tx_wrbs(struct be_adapter *adapter,
 	struct be_eth_wrb *wrb;
 	struct be_eth_hdr_wrb *hdr;
 
-	atomic_add(wrb_cnt, &txq->used);
 	hdr = queue_head_node(txq);
+	atomic_add(wrb_cnt, &txq->used);
 	queue_head_inc(txq);
 
+	if (skb_dma_map(&pdev->dev, skb, DMA_TO_DEVICE)) {
+		dev_err(&pdev->dev, "TX DMA mapping failed\n");
+		return 0;
+	}
+
 	if (skb->len > skb->data_len) {
 		int len = skb->len - skb->data_len;
-		busaddr = pci_map_single(pdev, skb->data, len,
-					 PCI_DMA_TODEVICE);
 		wrb = queue_head_node(txq);
+		busaddr = skb_shinfo(skb)->dma_head;
 		wrb_fill(wrb, busaddr, len);
 		be_dws_cpu_to_le(wrb, sizeof(*wrb));
 		queue_head_inc(txq);
@@ -403,9 +407,8 @@ static int make_tx_wrbs(struct be_adapter *adapter,
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		struct skb_frag_struct *frag =
 			&skb_shinfo(skb)->frags[i];
-		busaddr = pci_map_page(pdev, frag->page,
-					frag->page_offset,
-					frag->size, PCI_DMA_TODEVICE);
+
+		busaddr = skb_shinfo(skb)->dma_maps[i];
 		wrb = queue_head_node(txq);
 		wrb_fill(wrb, busaddr, frag->size);
 		be_dws_cpu_to_le(wrb, sizeof(*wrb));
@@ -429,6 +432,7 @@ static int make_tx_wrbs(struct be_adapter *adapter,
 
 static netdev_tx_t be_xmit(struct sk_buff *skb,
 				 struct net_device *netdev)
+
 {
 	struct be_adapter *adapter = netdev_priv(netdev);
 	struct be_tx_obj *tx_obj = &adapter->tx_obj;
@@ -440,23 +444,28 @@ static netdev_tx_t be_xmit(struct sk_buff *skb,
 	wrb_cnt = wrb_cnt_for_skb(skb, &dummy_wrb);
 
 	copied = make_tx_wrbs(adapter, skb, wrb_cnt, dummy_wrb);
+	if (copied) {
+		/* record the sent skb in the sent_skb table */
+		BUG_ON(tx_obj->sent_skb_list[start]);
+		tx_obj->sent_skb_list[start] = skb;
+
+		/* Ensure txq has space for the next skb; Else stop the queue
+		 * *BEFORE* ringing the tx doorbell, so that we serialze the
+		 * tx compls of the current transmit which'll wake up the queue
+		 */
+		if ((BE_MAX_TX_FRAG_COUNT + atomic_read(&txq->used)) >=
+								txq->len) {
+			netif_stop_queue(netdev);
+			stopped = true;
+		}
 
-	/* record the sent skb in the sent_skb table */
-	BUG_ON(tx_obj->sent_skb_list[start]);
-	tx_obj->sent_skb_list[start] = skb;
+		be_txq_notify(adapter, txq->id, wrb_cnt);
 
-	/* Ensure that txq has space for the next skb; Else stop the queue
-	 * *BEFORE* ringing the tx doorbell, so that we serialze the
-	 * tx compls of the current transmit which'll wake up the queue
-	 */
-	if ((BE_MAX_TX_FRAG_COUNT + atomic_read(&txq->used)) >= txq->len) {
-		netif_stop_queue(netdev);
-		stopped = true;
+		be_tx_stats_update(adapter, wrb_cnt, copied, stopped);
+	} else {
+		txq->head = start;
+		dev_kfree_skb_any(skb);
 	}
-
-	be_txq_notify(adapter, txq->id, wrb_cnt);
-
-	be_tx_stats_update(adapter, wrb_cnt, copied, stopped);
 	return NETDEV_TX_OK;
 }
 
@@ -958,10 +967,8 @@ static struct be_eth_tx_compl *be_tx_compl_get(struct be_queue_info *tx_cq)
 static void be_tx_compl_process(struct be_adapter *adapter, u16 last_index)
 {
 	struct be_queue_info *txq = &adapter->tx_obj.q;
-	struct be_eth_wrb *wrb;
 	struct sk_buff **sent_skbs = adapter->tx_obj.sent_skb_list;
 	struct sk_buff *sent_skb;
-	u64 busaddr;
 	u16 cur_index, num_wrbs = 0;
 
 	cur_index = txq->tail;
@@ -971,19 +978,12 @@ static void be_tx_compl_process(struct be_adapter *adapter, u16 last_index)
 
 	do {
 		cur_index = txq->tail;
-		wrb = queue_tail_node(txq);
-		be_dws_le_to_cpu(wrb, sizeof(*wrb));
-		busaddr = ((u64)wrb->frag_pa_hi << 32) | (u64)wrb->frag_pa_lo;
-		if (busaddr != 0) {
-			pci_unmap_single(adapter->pdev, busaddr,
-				wrb->frag_len, PCI_DMA_TODEVICE);
-		}
 		num_wrbs++;
 		queue_tail_inc(txq);
 	} while (cur_index != last_index);
 
 	atomic_sub(num_wrbs, &txq->used);
-
+	skb_dma_unmap(&adapter->pdev->dev, sent_skb, DMA_TO_DEVICE);
 	kfree_skb(sent_skb);
 }
 
@@ -1892,6 +1892,8 @@ static void be_netdev_init(struct net_device *netdev)
 
 	adapter->rx_csum = true;
 
+	netif_set_gso_max_size(netdev, 65535);
+
 	BE_SET_NETDEV_OPS(netdev, &be_netdev_ops);
 
 	SET_ETHTOOL_OPS(netdev, &be_ethtool_ops);
-- 
1.6.0.4


^ permalink raw reply related

* Re: [PATCH net-next] can: add can_free_echo_skb() for upcoming drivers
From: Wolfgang Grandegger @ 2009-09-04 14:34 UTC (permalink / raw)
  To: David Miller
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	oliver.hartkopp-l29pVbxQd1IUtdQbppsyvg,
	urs.thuermann-l29pVbxQd1IUtdQbppsyvg
In-Reply-To: <20090904.021350.234672852.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

On 09/04/2009 11:13 AM, David Miller wrote:
> From: Wolfgang Grandegger<wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>
> Date: Fri, 04 Sep 2009 09:49:38 +0200
>
>> David Miller wrote:
>>> Can a CAN expert please review these patches?  They are rotting
>>> in patchwork and I want to move forward with them somehow.I'
>>
>> I'm maintaining the CAN device drivers interface and drivers as listed
>> in MAINTAINERS. Though, these patches have already the official blessing.
>
> My bad :-)  I'll apply these then.
>
>> Do you want me to setup a git repository "can-next-2.6" where you can
>> pull from? So far, not that much can-drv patches have been posted to
>> this list.
>
> You can do that in the future if you like, sure.  But it is not a
> requirement, as with the rate you guys send things I am handling it
> efficiently using patchwork.

OK, then I will continue sending patches for the time being. Please let 
me know when a git repository would ease your work.

Thanks,

Wolfgang.

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-04 15:33 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <20090903220300.GN6761@linux.vnet.ibm.com>

On Thu, 3 Sep 2009, Paul E. McKenney wrote:

> > You need to ensure that no objects are in use before destroying a slab. In
> > case of DESTROY_BY_RCU you must ensure that there are no potential
> > readers. So use a suitable rcu barrier or something else like a
> > synchronize_rcu...
>
> If by "you must ensure" you mean "kmem_cache_destroy must ensure", then
> we are in complete agreement.  Otherwise, not a chance.

Well then we are going down to a crappy interface and mixing rcu with slab
semantics. More bugs to follow in the future.

> If by "must ensure that no objects are in use", you mean "must have
> no further references to them", then we are in agreement.  And in my
> scenario above, it is not the -user- who later references the memory,
> but rather the slab code itself.

The slab code itself is not referencing the later memory with my patch. It
can only be the user.

> Put the rcu_barrier() in kmem_cache_free().  Please.

Guess we are doing this ... Sigh. Are you going to add support other rcu
versions to slab as well as time permits and as the need arises? Should we
add you as a maintainer? ;-)

^ permalink raw reply

* [PATCH net-next] wan: dlci/sdla transmit return dehacking
From: Stephen Hemminger @ 2009-09-04 15:33 UTC (permalink / raw)
  To: David Miller, Krzysztof Halasa; +Cc: sfr, linux-next, netdev
In-Reply-To: <20090903.213548.46609322.davem@davemloft.net>

This is a brute force removal of the wierd slave interface done for DLCI -> SDLA
transmit. Before it was using non-standard return values and freeing skb in caller.
This changes it to using normal return values, and freeing in the callee.
Luckly only one driver pair was doing this. Not tested on real hardware,
in fact I wonder if this driver pair is even being used by any users.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 drivers/net/wan/dlci.c  |   43 +++++--------------------------------------
 drivers/net/wan/sdla.c  |    8 ++++----
 include/linux/if_frad.h |    5 -----
 3 files changed, 9 insertions(+), 47 deletions(-)

--- a/drivers/net/wan/dlci.c	2009-09-04 08:14:38.993377174 -0700
+++ b/drivers/net/wan/dlci.c	2009-09-04 08:25:03.024404089 -0700
@@ -186,46 +186,13 @@ static void dlci_receive(struct sk_buff 
 		dev_kfree_skb(skb);
 }
 
-static netdev_tx_t dlci_transmit(struct sk_buff *skb,
-				       struct net_device *dev)
+static netdev_tx_t dlci_transmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct dlci_local *dlp;
-	netdev_tx_t ret;
-
-	if (!skb || !dev)
-		return NETDEV_TX_OK;
-
-	dlp = netdev_priv(dev);
-
-	netif_stop_queue(dev);
-	
-	/* This is hackish, overloads driver specific return values
-	   on top of normal transmit return! */
-	ret = dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave);
-	switch (ret)
-	{
-		case DLCI_RET_OK:
-			dev->stats.tx_packets++;
-			ret = NETDEV_TX_OK;
-			break;
-		case DLCI_RET_ERR:
-			dev->stats.tx_errors++;
-			ret = NETDEV_TX_OK;
-			break;
-		case DLCI_RET_DROP:
-			dev->stats.tx_dropped++;
-			ret = NETDEV_TX_BUSY;
-			break;
-	}
-	/* Alan Cox recommends always returning 0, and always freeing the packet */
-	/* experience suggest a slightly more conservative approach */
+	struct dlci_local *dlp = netdev_priv(dev);
 
-	if (ret == NETDEV_TX_OK)
-	{
-		dev_kfree_skb(skb);
-		netif_wake_queue(dev);
-	}
-	return(ret);
+	if (skb)
+		dlp->slave->netdev_ops->ndo_start_xmit(skb, dlp->slave);
+	return NETDEV_TX_OK;
 }
 
 static int dlci_config(struct net_device *dev, struct dlci_conf __user *conf, int get)
--- a/drivers/net/wan/sdla.c	2009-09-04 08:18:22.917065991 -0700
+++ b/drivers/net/wan/sdla.c	2009-09-04 08:24:53.587189004 -0700
@@ -652,7 +652,7 @@ static int sdla_dlci_conf(struct net_dev
 
 /* NOTE: the DLCI driver deals with freeing the SKB!! */
 static netdev_tx_t sdla_transmit(struct sk_buff *skb,
-				       struct net_device *dev)
+				 struct net_device *dev)
 {
 	struct frad_local *flp;
 	int               ret, addr, accept, i;
@@ -712,23 +712,21 @@ static netdev_tx_t sdla_transmit(struct 
 				}
 				break;
 		}
+
 		switch (ret)
 		{
 			case SDLA_RET_OK:
 				dev->stats.tx_packets++;
-				ret = DLCI_RET_OK;
 				break;
 
 			case SDLA_RET_CIR_OVERFLOW:
 			case SDLA_RET_BUF_OVERSIZE:
 			case SDLA_RET_NO_BUFS:
 				dev->stats.tx_dropped++;
-				ret = DLCI_RET_DROP;
 				break;
 
 			default:
 				dev->stats.tx_errors++;
-				ret = DLCI_RET_ERR;
 				break;
 		}
 	}
@@ -738,6 +736,8 @@ static netdev_tx_t sdla_transmit(struct 
 		if(flp->master[i]!=NULL)
 			netif_wake_queue(flp->master[i]);
 	}		
+
+	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
 
--- a/include/linux/if_frad.h	2009-09-04 08:19:04.459045748 -0700
+++ b/include/linux/if_frad.h	2009-09-04 08:19:47.487006328 -0700
@@ -69,11 +69,6 @@ struct dlci_conf {
 
 #define DLCI_VALID_FLAGS	0x000B
 
-/* FRAD driver uses these to indicate what it did with packet */
-#define DLCI_RET_OK		0x00
-#define DLCI_RET_ERR		0x01
-#define DLCI_RET_DROP		0x02
-
 /* defines for the actual Frame Relay hardware */
 #define FRAD_GET_CONF	(SIOCDEVPRIVATE)
 #define FRAD_SET_CONF	(SIOCDEVPRIVATE + 1)

^ permalink raw reply

* Re: [RFC] net/fs_enet: send a reset request to the PHY on init
From: Sebastian Andrzej Siewior @ 2009-09-04 15:38 UTC (permalink / raw)
  To: Grant Likely; +Cc: linuxppc-dev, netdev, Vitaly Bordug
In-Reply-To: <fa686aa40909030948h4acf6d3x1c318baa2fdefe1f@mail.gmail.com>

Grant Likely wrote:
  > What version of the kernel are you using?  The line numbers don't
> match up with kernel mainline, so I wonder if this is before or after
> the OF MDIO rework changes.
It is the kernel which was shipped in ads5121's bsp which is 2.6.24.

> Regardless, this doesn't look right.  It certainly isn't right for the
> driver to do an unconditional PHY reset when it doesn't actually know
> what phy is attached.  For most boards I'm sure this is not desirable
> because it will cause a delay while the PHY auto negotiates.
> Depending on when the first network traffic begins, can cause several
> seconds of boot delay.
> 
> Best would be to do this in U-Boot.  Otherwise, I think I would rather
> see it at phy_device probe time.  At least then it would be on a
> per-phy basis, or could be controlled by a property in the device tree
> so that all boards don't get the same impact.
I have no network support in boot loader so I can't do it there. Doing it 
at phy-probe time sounds reasonable.
So all other boards are doing this kind of reset in u-boot?

> g.
> 


Sebastian

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-04 15:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Paul E. McKenney, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <4AA0407E.8030505@gmail.com>

On Fri, 4 Sep 2009, Eric Dumazet wrote:

> And another one for SLAB_DESTROY_BY_RCU to make sure slabs where freed

Now its two rcu_barriers(). What I feared comes true.

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-04 15:42 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <20090903231757.GP6761@linux.vnet.ibm.com>

On Thu, 3 Sep 2009, Paul E. McKenney wrote:

> If it were the user of the slab who was invoking some variant of
> call_rcu(), then I would agree with you.

The user already has to deal with it as explained by Eric.

> However, call_rcu() is instead being invoked by the slab itself in the
> case of SLAB_DESTROY_BY_RCU, so that there is no variation in usage.
> Requiring that the user call rcu_barrier() is asking for subtle bugs.
> Therefore, the best approach is to have kmem_cache_destroy() handle
> the RCU cleanup, given that this cleanup is for actions taken by
> kmem_cache_free(), not by the user.

The user already has to properly handle barriers and rcu logic in order to
use objects handled with RCU properly. Moreover the user even has to check
that the object is not suddenly checked under it. Its already complex.


^ permalink raw reply

* Re: [PATCH 5/5] Updating documentation accordingly
From: Randy Dunlap @ 2009-09-04 15:45 UTC (permalink / raw)
  To: Ivo Calado; +Cc: dccp, netdev
In-Reply-To: <1252067111.6172.5.camel@localhost>

On Fri, 04 Sep 2009 09:25:11 -0300 Ivo Calado wrote:

> Updating documentation accordingly
> 
> Signed-off-by: Ivo Calado, Erivaldo Xavier, Leandro Sales
> <ivocalado@embedded.ufcg.edu.br>, <desadoc@gmail.com>,
> <leandroal@gmail.com>
> 
> Index: dccp_tree_work4/net/dccp/ccids/lib/loss_interval_sp.c
> ===================================================================
> --- dccp_tree_work4.orig/net/dccp/ccids/lib/loss_interval_sp.c
> 2009-09-04 00:28:03.000000000 -0300
> +++ dccp_tree_work4/net/dccp/ccids/lib/loss_interval_sp.c 2009-09-04
> 01:00:22.000000000 -0300
> @@ -324,7 +342,7 @@
> }
> 
> /**
> - * tfrc_lh_update_i_mean  -  Update the `open' loss interval I_0
> + * tfrc_sp_lh_update_i_mean  -  Update the `open' loss interval I_0
>   * This updates I_mean as the sequence numbers increase. As a
> consequence, the
>   * open loss interval I_0 increases, hence p = W_tot/max(I_tot0,
> I_tot1)
>   * decreases, and thus there is no need to send renewed feedback.
> @@ -372,7 +390,7 @@
> return cur->li_is_closed;
> }
> 
> -/** tfrc_lh_interval_add  -  Insert new record into the Loss Interval
> database
> +/** tfrc_sp_lh_interval_add - Insert new record into the Loss Interval
> database

The beginning kernel-doc marker ("/**") should be on a line by itself,
like all of the others are.


>   * @lh:    Loss Interval database
>   * @rh:    Receive history containing a fresh loss event
>   * @calc_first_li: Caller-dependent routine to compute length of first
> interval

> Index: dccp_tree_work4/net/dccp/ccids/lib/tfrc_equation_sp.c
> ===================================================================
> --- dccp_tree_work4.orig/net/dccp/ccids/lib/tfrc_equation_sp.c
> 2009-09-03 22:01:08.000000000 -0300
> +++ dccp_tree_work4/net/dccp/ccids/lib/tfrc_equation_sp.c 2009-09-04
> 00:54:05.000000000 -0300
> @@ -607,7 +609,7 @@
> }
> 
> /**
> - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
> + * tfrc_sp_calc_x - Calculate the send rate as per section 3.1 of
> RFC3448

Looks like some mail-handling software is splitting lines where they
should not be split.  You should check that the mailed patches (on the
receiving side) can be applied cleanly.


>   *
>   *  @s: packet size          in bytes
>   *  @R: RTT                  scaled by 1000000   (i.e., microseconds)



---
~Randy
LPC 2009, Sept. 23-25, Portland, Oregon
http://linuxplumbersconf.org/2009/

^ permalink raw reply

* Re: [RFC] net/fs_enet: send a reset request to the PHY on init
From: Grant Likely @ 2009-09-04 15:45 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Pantelis Antoniou, Vitaly Bordug, linuxppc-dev, netdev,
	Wolfgang Denk
In-Reply-To: <4AA1347A.5060702@linutronix.de>

On Fri, Sep 4, 2009 at 9:38 AM, Sebastian Andrzej
Siewior<bigeasy@linutronix.de> wrote:
> Grant Likely wrote:
>  > What version of the kernel are you using?  The line numbers don't
>>
>> match up with kernel mainline, so I wonder if this is before or after
>> the OF MDIO rework changes.
>
> It is the kernel which was shipped in ads5121's bsp which is 2.6.24.

Okay, I can safely ignore this then.  Wolfgang may be interested
though.  He's been doing some work to get 5121 support mainlined.

> I have no network support in boot loader so I can't do it there. Doing it at
> phy-probe time sounds reasonable.
> So all other boards are doing this kind of reset in u-boot?

In general I take the approach that as much as possible firmware
should have devices in a sane state before booting the kernel just to
avoid doing board specific fixup stuff in the kernel tree.  But this
isn't law, just more of a rule of thumb that I go by.  2nd resort is
to create a board specific platform code file and put it there
(arch/powerpc/platforms/*).

g.

-- 
Grant Likely, B.Sc., P.Eng.
Secret Lab Technologies Ltd.

^ permalink raw reply

* net_sched 00/07: classful multiqueue dummy scheduler
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy

These patches contain a classful multiqueue ("mq") dummy scheduler to fix a
couple of problems with the current multiqueue TC API integration. The
changelogs of patch 05 and 07 contain more details.

The mq scheduler does two things:

- present device TX queues as classes, allowing to attach different qdiscs
  to them, which are grafted to the TX queues

- present accumulated statistics of all device queue root qdiscs

Its used by default for multiqueue devices instead of the regular pfifo_fast
qdisc, but can also be attached manually to restore multiqueue behaviour
after attaching a non-multiqueue (shared) qdisc.

Patches 1-4 contain some preparatory cleanups because I was getting tired
of copying unnecessary checks and dummy functions :)

Patch 5 introduces a dev->qdisc pointer, which points to the root qdisc from
userspace's point of view. This is later used for the mq qdisc, which isn't
actually attached to any device queues. Patch 7 contains the mq scheduler.

I've tested the scheduler with a hacked macvlan version which uses 4 queues,
but since I don't own a multiqueue capable device I couldn't test this on
real hardware.

Any comments and test results welcome :)

 include/linux/netdevice.h |    3 +
 include/net/sch_generic.h |    6 +
 net/core/rtnetlink.c      |    6 +-
 net/sched/Makefile        |    2 +-
 net/sched/cls_api.c       |   10 +-
 net/sched/sch_api.c       |   99 ++++++++-----------
 net/sched/sch_cbq.c       |   38 ++++----
 net/sched/sch_generic.c   |   58 +++++++++--
 net/sched/sch_hfsc.c      |    4 +-
 net/sched/sch_htb.c       |   35 ++++----
 net/sched/sch_ingress.c   |   14 ---
 net/sched/sch_mq.c        |  234 +++++++++++++++++++++++++++++++++++++++++++++
 net/sched/sch_multiq.c    |   33 +------
 net/sched/sch_prio.c      |   32 +------
 net/sched/sch_red.c       |   21 ----
 net/sched/sch_sfq.c       |    7 --
 net/sched/sch_tbf.c       |   22 ----
 17 files changed, 375 insertions(+), 249 deletions(-)
 create mode 100644 net/sched/sch_mq.c

Patrick McHardy (7):
      net_sched: fix class grafting errno codes
      net_sched: make cls_ops->tcf_chain() optional
      net_sched: make cls_ops->change and cls_ops->delete optional
      net_sched: remove some unnecessary checks in classful schedulers
      net_sched: reintroduce dev->qdisc for use by sch_api
      net_sched: move dev_graft_qdisc() to sch_generic.c
      net_sched: add classful multiqueue dummy scheduler

^ permalink raw reply

* net_sched 01/07: fix class grafting errno codes
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit 1cf555183f0ae7e256381ea8993272c0a321f5b5
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 14:28:10 2009 +0200

    net_sched: fix class grafting errno codes
    
    If the parent qdisc doesn't support classes, use EOPNOTSUPP.
    If the parent class doesn't exist, use ENOENT. Currently EINVAL
    is returned in both cases.
    
    Additionally check whether grafting is supported and remove a now
    unnecessary graft function from sch_ingress.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 24d17ce..bef2d64 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -728,14 +728,14 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 	} else {
 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
 
-		err = -EINVAL;
-
-		if (cops) {
+		err = -EOPNOTSUPP;
+		if (cops && cops->graft) {
 			unsigned long cl = cops->get(parent, classid);
 			if (cl) {
 				err = cops->graft(parent, cl, new, &old);
 				cops->put(parent, cl);
-			}
+			} else
+				err = -ENOENT;
 		}
 		if (!err)
 			notify_and_destroy(skb, n, classid, old, new);
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 4a2b773..ace7902 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -22,12 +22,6 @@ struct ingress_qdisc_data {
 
 /* ------------------------- Class/flow operations ------------------------- */
 
-static int ingress_graft(struct Qdisc *sch, unsigned long arg,
-			 struct Qdisc *new, struct Qdisc **old)
-{
-	return -EOPNOTSUPP;
-}
-
 static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
 {
 	return NULL;
@@ -123,7 +117,6 @@ nla_put_failure:
 }
 
 static const struct Qdisc_class_ops ingress_class_ops = {
-	.graft		=	ingress_graft,
 	.leaf		=	ingress_leaf,
 	.get		=	ingress_get,
 	.put		=	ingress_put,

^ permalink raw reply related

* net_sched 02/07: make cls_ops->tcf_chain() optional
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit 6ea4233ef8f398289a14a3305d4ed440fb026d43
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 14:28:11 2009 +0200

    net_sched: make cls_ops->tcf_chain() optional
    
    Some qdiscs don't support attaching filters. Handle this centrally in
    cls_api and return a proper errno code (EOPNOTSUPP) instead of EINVAL.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 09cdcdf..eaa8f43 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -181,6 +181,9 @@ replay:
 	if ((cops = q->ops->cl_ops) == NULL)
 		return -EINVAL;
 
+	if (cops->tcf_chain == NULL)
+		return -EOPNOTSUPP;
+
 	/* Do we search for filter, attached to class? */
 	if (TC_H_MIN(parent)) {
 		cl = cops->get(q, parent);
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 2bdf241..c27b802 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -331,11 +331,6 @@ static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_proto **red_find_tcf(struct Qdisc *sch, unsigned long cl)
-{
-	return NULL;
-}
-
 static const struct Qdisc_class_ops red_class_ops = {
 	.graft		=	red_graft,
 	.leaf		=	red_leaf,
@@ -344,7 +339,6 @@ static const struct Qdisc_class_ops red_class_ops = {
 	.change		=	red_change_class,
 	.delete		=	red_delete,
 	.walk		=	red_walk,
-	.tcf_chain	=	red_find_tcf,
 	.dump		=	red_dump_class,
 };
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index e22dfe8..2890969 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -433,11 +433,6 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
-{
-	return NULL;
-}
-
 static const struct Qdisc_class_ops tbf_class_ops =
 {
 	.graft		=	tbf_graft,
@@ -447,7 +442,6 @@ static const struct Qdisc_class_ops tbf_class_ops =
 	.change		=	tbf_change_class,
 	.delete		=	tbf_delete,
 	.walk		=	tbf_walk,
-	.tcf_chain	=	tbf_find_tcf,
 	.dump		=	tbf_dump_class,
 };
 

^ permalink raw reply related

* net_sched 03/07: make cls_ops->change and cls_ops->delete optional
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit c357570bb4fdd3c608dce92174acccb9b5b8163b
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 15:14:58 2009 +0200

    net_sched: make cls_ops->change and cls_ops->delete optional
    
    Some schedulers don't support creating, changing or deleting classes.
    Make the respective callbacks optionally and consistently return
    -EOPNOTSUPP for unsupported operations, instead of currently either
    -EOPNOTSUPP, -ENOSYS or no error.
    
    In case of sch_prio and sch_multiq, the removed operations additionally
    checked for an invalid class. This is not necessary since the class
    argument can only orginate from ->get() or in case of ->change is 0
    for creation of new classes, in which case ->change() incorrectly
    returned -ENOENT.
    
    As a side-effect, this patch fixes a possible (root-only) NULL pointer
    function call in sch_ingress, which didn't implement a so far mandatory
    ->delete() operation.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index bef2d64..166fcca 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1417,7 +1417,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 				goto out;
 			break;
 		case RTM_DELTCLASS:
-			err = cops->delete(q, cl);
+			err = -EOPNOTSUPP;
+			if (cops->delete)
+				err = cops->delete(q, cl);
 			if (err == 0)
 				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
 			goto out;
@@ -1431,7 +1433,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	}
 
 	new_cl = cl;
-	err = cops->change(q, clid, pid, tca, &new_cl);
+	err = -EOPNOTSUPP;
+	if (cops->change)
+		err = cops->change(q, clid, pid, tca, &new_cl);
 	if (err == 0)
 		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
 
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index ace7902..a9e646b 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -42,12 +42,6 @@ static void ingress_put(struct Qdisc *sch, unsigned long cl)
 {
 }
 
-static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent,
-			  struct nlattr **tca, unsigned long *arg)
-{
-	return 0;
-}
-
 static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 {
 	return;
@@ -120,7 +114,6 @@ static const struct Qdisc_class_ops ingress_class_ops = {
 	.leaf		=	ingress_leaf,
 	.get		=	ingress_get,
 	.put		=	ingress_put,
-	.change		=	ingress_change,
 	.walk		=	ingress_walk,
 	.tcf_chain	=	ingress_find_tcf,
 	.bind_tcf	=	ingress_bind_filter,
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 9127312..a0ffe71 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -348,26 +348,6 @@ static void multiq_put(struct Qdisc *q, unsigned long cl)
 	return;
 }
 
-static int multiq_change(struct Qdisc *sch, u32 handle, u32 parent,
-			 struct nlattr **tca, unsigned long *arg)
-{
-	unsigned long cl = *arg;
-	struct multiq_sched_data *q = qdisc_priv(sch);
-
-	if (cl - 1 > q->bands)
-		return -ENOENT;
-	return 0;
-}
-
-static int multiq_delete(struct Qdisc *sch, unsigned long cl)
-{
-	struct multiq_sched_data *q = qdisc_priv(sch);
-	if (cl - 1 > q->bands)
-		return -ENOENT;
-	return 0;
-}
-
-
 static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
 			     struct sk_buff *skb, struct tcmsg *tcm)
 {
@@ -430,8 +410,6 @@ static const struct Qdisc_class_ops multiq_class_ops = {
 	.leaf		=	multiq_leaf,
 	.get		=	multiq_get,
 	.put		=	multiq_put,
-	.change		=	multiq_change,
-	.delete		=	multiq_delete,
 	.walk		=	multiq_walk,
 	.tcf_chain	=	multiq_find_tcf,
 	.bind_tcf	=	multiq_bind,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 94cecef..209a4ca 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -311,25 +311,6 @@ static void prio_put(struct Qdisc *q, unsigned long cl)
 	return;
 }
 
-static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct nlattr **tca, unsigned long *arg)
-{
-	unsigned long cl = *arg;
-	struct prio_sched_data *q = qdisc_priv(sch);
-
-	if (cl - 1 > q->bands)
-		return -ENOENT;
-	return 0;
-}
-
-static int prio_delete(struct Qdisc *sch, unsigned long cl)
-{
-	struct prio_sched_data *q = qdisc_priv(sch);
-	if (cl - 1 > q->bands)
-		return -ENOENT;
-	return 0;
-}
-
-
 static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
 			   struct tcmsg *tcm)
 {
@@ -392,8 +373,6 @@ static const struct Qdisc_class_ops prio_class_ops = {
 	.leaf		=	prio_leaf,
 	.get		=	prio_get,
 	.put		=	prio_put,
-	.change		=	prio_change,
-	.delete		=	prio_delete,
 	.walk		=	prio_walk,
 	.tcf_chain	=	prio_find_tcf,
 	.bind_tcf	=	prio_bind,
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index c27b802..a2c4d1a 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -308,17 +308,6 @@ static void red_put(struct Qdisc *sch, unsigned long arg)
 	return;
 }
 
-static int red_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
-{
-	return -ENOSYS;
-}
-
-static int red_delete(struct Qdisc *sch, unsigned long cl)
-{
-	return -ENOSYS;
-}
-
 static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 {
 	if (!walker->stop) {
@@ -336,8 +325,6 @@ static const struct Qdisc_class_ops red_class_ops = {
 	.leaf		=	red_leaf,
 	.get		=	red_get,
 	.put		=	red_put,
-	.change		=	red_change_class,
-	.delete		=	red_delete,
 	.walk		=	red_walk,
 	.dump		=	red_dump_class,
 };
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 8706920..cb21380 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -496,12 +496,6 @@ nla_put_failure:
 	return -1;
 }
 
-static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
-{
-	return -EOPNOTSUPP;
-}
-
 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 {
 	return 0;
@@ -560,7 +554,6 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 
 static const struct Qdisc_class_ops sfq_class_ops = {
 	.get		=	sfq_get,
-	.change		=	sfq_change_class,
 	.tcf_chain	=	sfq_find_tcf,
 	.dump		=	sfq_dump_class,
 	.dump_stats	=	sfq_dump_class_stats,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 2890969..d904167 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -410,17 +410,6 @@ static void tbf_put(struct Qdisc *sch, unsigned long arg)
 {
 }
 
-static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
-			    struct nlattr **tca, unsigned long *arg)
-{
-	return -ENOSYS;
-}
-
-static int tbf_delete(struct Qdisc *sch, unsigned long arg)
-{
-	return -ENOSYS;
-}
-
 static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 {
 	if (!walker->stop) {
@@ -439,8 +428,6 @@ static const struct Qdisc_class_ops tbf_class_ops =
 	.leaf		=	tbf_leaf,
 	.get		=	tbf_get,
 	.put		=	tbf_put,
-	.change		=	tbf_change_class,
-	.delete		=	tbf_delete,
 	.walk		=	tbf_walk,
 	.dump		=	tbf_dump_class,
 };

^ permalink raw reply related

* net_sched 04/07: remove some unnecessary checks in classful schedulers
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit eca9dbb05c5bf47f4bc3162b4f19dcb4dd85acfc
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 16:12:45 2009 +0200

    net_sched: remove some unnecessary checks in classful schedulers
    
    The class argument to the ->graft(), ->leaf(), ->dump(), ->dump_stats() all
    originate from either ->get() or ->walk() and are always valid.
    
    Remove unnecessary checks.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d5798e1..5b132c4 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1621,29 +1621,25 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct cbq_class *cl = (struct cbq_class*)arg;
 
-	if (cl) {
-		if (new == NULL) {
-			new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-						&pfifo_qdisc_ops,
-						cl->common.classid);
-			if (new == NULL)
-				return -ENOBUFS;
-		} else {
+	if (new == NULL) {
+		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+					&pfifo_qdisc_ops, cl->common.classid);
+		if (new == NULL)
+			return -ENOBUFS;
+	} else {
 #ifdef CONFIG_NET_CLS_ACT
-			if (cl->police == TC_POLICE_RECLASSIFY)
-				new->reshape_fail = cbq_reshape_fail;
+		if (cl->police == TC_POLICE_RECLASSIFY)
+			new->reshape_fail = cbq_reshape_fail;
 #endif
-		}
-		sch_tree_lock(sch);
-		*old = cl->q;
-		cl->q = new;
-		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-		qdisc_reset(*old);
-		sch_tree_unlock(sch);
-
-		return 0;
 	}
-	return -ENOENT;
+	sch_tree_lock(sch);
+	*old = cl->q;
+	cl->q = new;
+	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+	qdisc_reset(*old);
+	sch_tree_unlock(sch);
+
+	return 0;
 }
 
 static struct Qdisc *
@@ -1651,7 +1647,7 @@ cbq_leaf(struct Qdisc *sch, unsigned long arg)
 {
 	struct cbq_class *cl = (struct cbq_class*)arg;
 
-	return cl ? cl->q : NULL;
+	return cl->q;
 }
 
 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index dad0144..375d64c 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1203,8 +1203,6 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct hfsc_class *cl = (struct hfsc_class *)arg;
 
-	if (cl == NULL)
-		return -ENOENT;
 	if (cl->level > 0)
 		return -EINVAL;
 	if (new == NULL) {
@@ -1228,7 +1226,7 @@ hfsc_class_leaf(struct Qdisc *sch, unsigned long arg)
 {
 	struct hfsc_class *cl = (struct hfsc_class *)arg;
 
-	if (cl != NULL && cl->level == 0)
+	if (cl->level == 0)
 		return cl->qdisc;
 
 	return NULL;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index ec4d463..85acab9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1117,30 +1117,29 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct htb_class *cl = (struct htb_class *)arg;
 
-	if (cl && !cl->level) {
-		if (new == NULL &&
-		    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
-					     &pfifo_qdisc_ops,
-					     cl->common.classid))
-		    == NULL)
-			return -ENOBUFS;
-		sch_tree_lock(sch);
-		*old = cl->un.leaf.q;
-		cl->un.leaf.q = new;
-		if (*old != NULL) {
-			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
-			qdisc_reset(*old);
-		}
-		sch_tree_unlock(sch);
-		return 0;
+	if (cl->level)
+		return -EINVAL;
+	if (new == NULL &&
+	    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
+				     &pfifo_qdisc_ops,
+				     cl->common.classid)) == NULL)
+		return -ENOBUFS;
+
+	sch_tree_lock(sch);
+	*old = cl->un.leaf.q;
+	cl->un.leaf.q = new;
+	if (*old != NULL) {
+		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
+		qdisc_reset(*old);
 	}
-	return -ENOENT;
+	sch_tree_unlock(sch);
+	return 0;
 }
 
 static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
 {
 	struct htb_class *cl = (struct htb_class *)arg;
-	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
+	return !cl->level ? cl->un.leaf.q : NULL;
 }
 
 static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index a0ffe71..069f81c 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -298,9 +298,6 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
 
-	if (band >= q->bands)
-		return -EINVAL;
-
 	if (new == NULL)
 		new = &noop_qdisc;
 
@@ -320,9 +317,6 @@ multiq_leaf(struct Qdisc *sch, unsigned long arg)
 	struct multiq_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
 
-	if (band >= q->bands)
-		return NULL;
-
 	return q->queues[band];
 }
 
@@ -353,11 +347,8 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
 {
 	struct multiq_sched_data *q = qdisc_priv(sch);
 
-	if (cl - 1 > q->bands)
-		return -ENOENT;
 	tcm->tcm_handle |= TC_H_MIN(cl);
-	if (q->queues[cl-1])
-		tcm->tcm_info = q->queues[cl-1]->handle;
+	tcm->tcm_info = q->queues[cl-1]->handle;
 	return 0;
 }
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 209a4ca..0f73c41 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -262,9 +262,6 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
 
-	if (band >= q->bands)
-		return -EINVAL;
-
 	if (new == NULL)
 		new = &noop_qdisc;
 
@@ -284,9 +281,6 @@ prio_leaf(struct Qdisc *sch, unsigned long arg)
 	struct prio_sched_data *q = qdisc_priv(sch);
 	unsigned long band = arg - 1;
 
-	if (band >= q->bands)
-		return NULL;
-
 	return q->queues[band];
 }
 
@@ -316,11 +310,8 @@ static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 
-	if (cl - 1 > q->bands)
-		return -ENOENT;
 	tcm->tcm_handle |= TC_H_MIN(cl);
-	if (q->queues[cl-1])
-		tcm->tcm_info = q->queues[cl-1]->handle;
+	tcm->tcm_info = q->queues[cl-1]->handle;
 	return 0;
 }
 
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a2c4d1a..072cdf4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -268,8 +268,6 @@ static int red_dump_class(struct Qdisc *sch, unsigned long cl,
 {
 	struct red_sched_data *q = qdisc_priv(sch);
 
-	if (cl != 1)
-		return -ENOENT;
 	tcm->tcm_handle |= TC_H_MIN(1);
 	tcm->tcm_info = q->qdisc->handle;
 	return 0;
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index d904167..8fb8107 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -368,9 +368,6 @@ static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
-	if (cl != 1) 	/* only one class */
-		return -ENOENT;
-
 	tcm->tcm_handle |= TC_H_MIN(1);
 	tcm->tcm_info = q->qdisc->handle;
 

^ permalink raw reply related

* net_sched 05/07: reintroduce dev->qdisc for use by sch_api
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit 57a016350a3d85dc351ab90ce91e4dc49ce2183a
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 16:12:45 2009 +0200

    net_sched: reintroduce dev->qdisc for use by sch_api
    
    Currently the multiqueue integration with the qdisc API suffers from
    a few problems:
    
    - with multiple queues, all root qdiscs use the same handle. This means
      they can't be exposed to userspace in a backwards compatible fashion.
    
    - all API operations always refer to queue number 0. Newly created
      qdiscs are automatically shared between all queues, its not possible
      to address individual queues or restore multiqueue behaviour once a
      shared qdisc has been attached.
    
    - Dumps only contain the root qdisc of queue 0, in case of non-shared
      qdiscs this means the statistics are incomplete.
    
    This patch reintroduces dev->qdisc, which points to the (single) root qdisc
    from userspace's point of view. Currently it either points to the first
    (non-shared) default qdisc, or a qdisc shared between all queues. The
    following patches will introduce a classful dummy qdisc, which will be used
    as root qdisc and contain the per-queue qdiscs as children.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 121cbad..a44118b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -832,6 +832,9 @@ struct net_device
 	/* Number of TX queues currently active in device  */
 	unsigned int		real_num_tx_queues;
 
+	/* root qdisc from userspace point of view */
+	struct Qdisc		*qdisc;
+
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
 	spinlock_t		tx_global_lock;
 /*
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bbcba2a..eb42873 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -606,7 +606,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags)
 {
-	struct netdev_queue *txq;
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
 	const struct net_device_stats *stats;
@@ -637,9 +636,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (dev->master)
 		NLA_PUT_U32(skb, IFLA_MASTER, dev->master->ifindex);
 
-	txq = netdev_get_tx_queue(dev, 0);
-	if (txq->qdisc_sleeping)
-		NLA_PUT_STRING(skb, IFLA_QDISC, txq->qdisc_sleeping->ops->id);
+	if (dev->qdisc)
+		NLA_PUT_STRING(skb, IFLA_QDISC, dev->qdisc->ops->id);
 
 	if (dev->ifalias)
 		NLA_PUT_STRING(skb, IFLA_IFALIAS, dev->ifalias);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index eaa8f43..8cbc66f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -168,8 +168,7 @@ replay:
 
 	/* Find qdisc */
 	if (!parent) {
-		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0);
-		q = dev_queue->qdisc_sleeping;
+		q = dev->qdisc;
 		parent = q->handle;
 	} else {
 		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
@@ -408,7 +407,6 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
 static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct netdev_queue *dev_queue;
 	int t;
 	int s_t;
 	struct net_device *dev;
@@ -427,9 +425,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
-	dev_queue = netdev_get_tx_queue(dev, 0);
 	if (!tcm->tcm_parent)
-		q = dev_queue->qdisc_sleeping;
+		q = dev->qdisc;
 	else
 		q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
 	if (!q)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 166fcca..8aa9a0c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -207,7 +207,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
 static void qdisc_list_add(struct Qdisc *q)
 {
 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
-		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
+		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
 }
 
 void qdisc_list_del(struct Qdisc *q)
@@ -219,17 +219,11 @@ EXPORT_SYMBOL(qdisc_list_del);
 
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
-	unsigned int i;
 	struct Qdisc *q;
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-		struct Qdisc *txq_root = txq->qdisc_sleeping;
-
-		q = qdisc_match_from_root(txq_root, handle);
-		if (q)
-			goto out;
-	}
+	q = qdisc_match_from_root(dev->qdisc, handle);
+	if (q)
+		goto out;
 
 	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
 out:
@@ -720,9 +714,14 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 			if (new && i > 0)
 				atomic_inc(&new->refcnt);
 
-			notify_and_destroy(skb, n, classid, old, new);
+			qdisc_destroy(old);
 		}
 
+		notify_and_destroy(skb, n, classid, dev->qdisc, new);
+		if (new)
+			atomic_inc(&new->refcnt);
+		dev->qdisc = new ? : &noop_qdisc;
+
 		if (dev->flags & IFF_UP)
 			dev_activate(dev);
 	} else {
@@ -974,9 +973,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 				q = dev->rx_queue.qdisc_sleeping;
 			}
 		} else {
-			struct netdev_queue *dev_queue;
-			dev_queue = netdev_get_tx_queue(dev, 0);
-			q = dev_queue->qdisc_sleeping;
+			q = dev->qdisc;
 		}
 		if (!q)
 			return -ENOENT;
@@ -1044,9 +1041,7 @@ replay:
 				q = dev->rx_queue.qdisc_sleeping;
 			}
 		} else {
-			struct netdev_queue *dev_queue;
-			dev_queue = netdev_get_tx_queue(dev, 0);
-			q = dev_queue->qdisc_sleeping;
+			q = dev->qdisc;
 		}
 
 		/* It may be default qdisc, ignore it */
@@ -1291,8 +1286,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			s_q_idx = 0;
 		q_idx = 0;
 
-		dev_queue = netdev_get_tx_queue(dev, 0);
-		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
+		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
 			goto done;
 
 		dev_queue = &dev->rx_queue;
@@ -1323,7 +1317,6 @@ done:
 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
-	struct netdev_queue *dev_queue;
 	struct tcmsg *tcm = NLMSG_DATA(n);
 	struct nlattr *tca[TCA_MAX + 1];
 	struct net_device *dev;
@@ -1361,7 +1354,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 
 	/* Step 1. Determine qdisc handle X:0 */
 
-	dev_queue = netdev_get_tx_queue(dev, 0);
 	if (pid != TC_H_ROOT) {
 		u32 qid1 = TC_H_MAJ(pid);
 
@@ -1372,7 +1364,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 		} else if (qid1) {
 			qid = qid1;
 		} else if (qid == 0)
-			qid = dev_queue->qdisc_sleeping->handle;
+			qid = dev->qdisc->handle;
 
 		/* Now qid is genuine qdisc handle consistent
 		   both with parent and child.
@@ -1383,7 +1375,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 			pid = TC_H_MAKE(qid, pid);
 	} else {
 		if (qid == 0)
-			qid = dev_queue->qdisc_sleeping->handle;
+			qid = dev->qdisc->handle;
 	}
 
 	/* OK. Locate qdisc */
@@ -1588,8 +1580,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	s_t = cb->args[0];
 	t = 0;
 
-	dev_queue = netdev_get_tx_queue(dev, 0);
-	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
+	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
 		goto done;
 
 	dev_queue = &dev->rx_queue;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6128e6f..a91f079 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -623,19 +623,6 @@ void qdisc_destroy(struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_destroy);
 
-static bool dev_all_qdisc_sleeping_noop(struct net_device *dev)
-{
-	unsigned int i;
-
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
-
-		if (txq->qdisc_sleeping != &noop_qdisc)
-			return false;
-	}
-	return true;
-}
-
 static void attach_one_default_qdisc(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_unused)
@@ -677,6 +664,7 @@ static void transition_one_qdisc(struct net_device *dev,
 
 void dev_activate(struct net_device *dev)
 {
+	struct netdev_queue *txq;
 	int need_watchdog;
 
 	/* No queueing discipline is attached to device;
@@ -685,9 +673,14 @@ void dev_activate(struct net_device *dev)
 	   virtual interfaces
 	 */
 
-	if (dev_all_qdisc_sleeping_noop(dev))
+	if (dev->qdisc == &noop_qdisc) {
 		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
 
+		txq = netdev_get_tx_queue(dev, 0);
+		dev->qdisc = txq->qdisc_sleeping;
+		atomic_inc(&dev->qdisc->refcnt);
+	}
+
 	if (!netif_carrier_ok(dev))
 		/* Delay activation until next carrier-on event */
 		return;
@@ -777,6 +770,7 @@ static void dev_init_scheduler_queue(struct net_device *dev,
 
 void dev_init_scheduler(struct net_device *dev)
 {
+	dev->qdisc = &noop_qdisc;
 	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
 	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
 
@@ -802,5 +796,8 @@ void dev_shutdown(struct net_device *dev)
 {
 	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
 	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
+	qdisc_destroy(dev->qdisc);
+	dev->qdisc = &noop_qdisc;
+
 	WARN_ON(timer_pending(&dev->watchdog_timer));
 }

^ permalink raw reply related

* net_sched 06/07: move dev_graft_qdisc() to sch_generic.c
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit 7d0411697d850bcabf79bdc5bce9bf140fb317ef
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 16:12:45 2009 +0200

    net_sched: move dev_graft_qdisc() to sch_generic.c
    
    It will be used in a following patch by the multiqueue qdisc.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a48a4cc..a92dc62 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -302,6 +302,8 @@ extern void dev_init_scheduler(struct net_device *dev);
 extern void dev_shutdown(struct net_device *dev);
 extern void dev_activate(struct net_device *dev);
 extern void dev_deactivate(struct net_device *dev);
+extern struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
+				     struct Qdisc *qdisc);
 extern void qdisc_reset(struct Qdisc *qdisc);
 extern void qdisc_destroy(struct Qdisc *qdisc);
 extern void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 8aa9a0c..d71f12b 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -610,32 +610,6 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
 	return i>0 ? autohandle : 0;
 }
 
-/* Attach toplevel qdisc to device queue. */
-
-static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
-				     struct Qdisc *qdisc)
-{
-	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
-	spinlock_t *root_lock;
-
-	root_lock = qdisc_lock(oqdisc);
-	spin_lock_bh(root_lock);
-
-	/* Prune old scheduler */
-	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
-		qdisc_reset(oqdisc);
-
-	/* ... and graft new one */
-	if (qdisc == NULL)
-		qdisc = &noop_qdisc;
-	dev_queue->qdisc_sleeping = qdisc;
-	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
-
-	spin_unlock_bh(root_lock);
-
-	return oqdisc;
-}
-
 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
 {
 	const struct Qdisc_class_ops *cops;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a91f079..e7c47ce 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -623,6 +623,31 @@ void qdisc_destroy(struct Qdisc *qdisc)
 }
 EXPORT_SYMBOL(qdisc_destroy);
 
+/* Attach toplevel qdisc to device queue. */
+struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
+			      struct Qdisc *qdisc)
+{
+	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
+	spinlock_t *root_lock;
+
+	root_lock = qdisc_lock(oqdisc);
+	spin_lock_bh(root_lock);
+
+	/* Prune old scheduler */
+	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+		qdisc_reset(oqdisc);
+
+	/* ... and graft new one */
+	if (qdisc == NULL)
+		qdisc = &noop_qdisc;
+	dev_queue->qdisc_sleeping = qdisc;
+	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
+
+	spin_unlock_bh(root_lock);
+
+	return oqdisc;
+}
+
 static void attach_one_default_qdisc(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
 				     void *_unused)

^ permalink raw reply related

* net_sched 07/07: add classful multiqueue dummy scheduler
From: Patrick McHardy @ 2009-09-04 16:41 UTC (permalink / raw)
  To: netdev; +Cc: Patrick McHardy
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

commit f114d0f02c9e72fea7bbc4d28a113946183fc65f
Author: Patrick McHardy <kaber@trash.net>
Date:   Fri Sep 4 18:25:04 2009 +0200

    net_sched: add classful multiqueue dummy scheduler
    
    This patch adds a classful dummy scheduler which can be used as root qdisc
    for multiqueue devices and exposes each device queue as a child class.
    
    This allows to address queues individually and graft them similar to regular
    classes. Additionally it presents an accumulated view of the statistics of
    all real root qdiscs in the dummy root.
    
    Two new callbacks are added to the qdisc_ops and qdisc_class_ops:
    
    - cl_ops->select_queue selects the tx queue number for new child classes.
    
    - qdisc_ops->attach() overrides root qdisc device grafting to attach
      non-shared qdiscs to the queues.
    
    Signed-off-by: Patrick McHardy <kaber@trash.net>

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a92dc62..9c69585 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -80,6 +80,7 @@ struct Qdisc
 struct Qdisc_class_ops
 {
 	/* Child qdisc manipulation */
+	unsigned int		(*select_queue)(struct Qdisc *, struct tcmsg *);
 	int			(*graft)(struct Qdisc *, unsigned long cl,
 					struct Qdisc *, struct Qdisc **);
 	struct Qdisc *		(*leaf)(struct Qdisc *, unsigned long cl);
@@ -122,6 +123,7 @@ struct Qdisc_ops
 	void			(*reset)(struct Qdisc *);
 	void			(*destroy)(struct Qdisc *);
 	int			(*change)(struct Qdisc *, struct nlattr *arg);
+	void			(*attach)(struct Qdisc *);
 
 	int			(*dump)(struct Qdisc *, struct sk_buff *);
 	int			(*dump_stats)(struct Qdisc *, struct gnet_dump *);
@@ -255,6 +257,8 @@ static inline void sch_tree_unlock(struct Qdisc *q)
 
 extern struct Qdisc noop_qdisc;
 extern struct Qdisc_ops noop_qdisc_ops;
+extern struct Qdisc_ops pfifo_fast_ops;
+extern struct Qdisc_ops mq_qdisc_ops;
 
 struct Qdisc_class_common
 {
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 54d950c..f14e71b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux Traffic Control Unit.
 #
 
-obj-y	:= sch_generic.o
+obj-y	:= sch_generic.o sch_mq.o
 
 obj-$(CONFIG_NET_SCHED)		+= sch_api.o sch_blackhole.o
 obj-$(CONFIG_NET_CLS)		+= cls_api.o
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d71f12b..2a78d54 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -678,6 +678,11 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		if (dev->flags & IFF_UP)
 			dev_deactivate(dev);
 
+		if (new && new->ops->attach) {
+			new->ops->attach(new);
+			num_q = 0;
+		}
+
 		for (i = 0; i < num_q; i++) {
 			struct netdev_queue *dev_queue = &dev->rx_queue;
 
@@ -692,7 +697,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 		}
 
 		notify_and_destroy(skb, n, classid, dev->qdisc, new);
-		if (new)
+		if (new && !new->ops->attach)
 			atomic_inc(&new->refcnt);
 		dev->qdisc = new ? : &noop_qdisc;
 
@@ -1095,10 +1100,16 @@ create_n_graft:
 		q = qdisc_create(dev, &dev->rx_queue,
 				 tcm->tcm_parent, tcm->tcm_parent,
 				 tca, &err);
-	else
-		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
+	else {
+		unsigned int ntx = 0;
+
+		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
+			ntx = p->ops->cl_ops->select_queue(p, tcm);
+
+		q = qdisc_create(dev, netdev_get_tx_queue(dev, ntx),
 				 tcm->tcm_parent, tcm->tcm_handle,
 				 tca, &err);
+	}
 	if (q == NULL) {
 		if (err == -EAGAIN)
 			goto replay;
@@ -1674,6 +1685,7 @@ static int __init pktsched_init(void)
 {
 	register_qdisc(&pfifo_qdisc_ops);
 	register_qdisc(&bfifo_qdisc_ops);
+	register_qdisc(&mq_qdisc_ops);
 	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
 
 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e7c47ce..4ae6aa5 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -514,7 +514,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
 	return 0;
 }
 
-static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
+struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.id		=	"pfifo_fast",
 	.priv_size	=	sizeof(struct pfifo_fast_priv),
 	.enqueue	=	pfifo_fast_enqueue,
@@ -670,6 +670,26 @@ static void attach_one_default_qdisc(struct net_device *dev,
 	dev_queue->qdisc_sleeping = qdisc;
 }
 
+static void attach_default_qdiscs(struct net_device *dev)
+{
+	struct netdev_queue *txq;
+	struct Qdisc *qdisc;
+
+	txq = netdev_get_tx_queue(dev, 0);
+
+	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
+		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
+		dev->qdisc = txq->qdisc_sleeping;
+		atomic_inc(&dev->qdisc->refcnt);
+	} else {
+		qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
+		if (qdisc) {
+			qdisc->ops->attach(qdisc);
+			dev->qdisc = qdisc;
+		}
+	}
+}
+
 static void transition_one_qdisc(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
 				 void *_need_watchdog)
@@ -689,7 +709,6 @@ static void transition_one_qdisc(struct net_device *dev,
 
 void dev_activate(struct net_device *dev)
 {
-	struct netdev_queue *txq;
 	int need_watchdog;
 
 	/* No queueing discipline is attached to device;
@@ -698,13 +717,8 @@ void dev_activate(struct net_device *dev)
 	   virtual interfaces
 	 */
 
-	if (dev->qdisc == &noop_qdisc) {
-		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
-
-		txq = netdev_get_tx_queue(dev, 0);
-		dev->qdisc = txq->qdisc_sleeping;
-		atomic_inc(&dev->qdisc->refcnt);
-	}
+	if (dev->qdisc == &noop_qdisc)
+		attach_default_qdiscs(dev);
 
 	if (!netif_carrier_ok(dev))
 		/* Delay activation until next carrier-on event */
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
new file mode 100644
index 0000000..5e453fd
--- /dev/null
+++ b/net/sched/sch_mq.c
@@ -0,0 +1,234 @@
+/*
+ * net/sched/sch_mq.c		Classful multiqueue dummy scheduler
+ *
+ * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+struct mq_sched {
+	struct Qdisc		**qdiscs;
+};
+
+static void mq_destroy(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (priv->qdiscs)
+		return;
+	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+		qdisc_destroy(priv->qdiscs[ntx]);
+	kfree(priv->qdiscs);
+}
+
+static int mq_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	struct netdev_queue *dev_queue;
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	/* pre-allocate qdiscs, attachment can't fail */
+	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+			       GFP_KERNEL);
+	if (priv->qdiscs == NULL)
+		return -ENOMEM;
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		dev_queue = netdev_get_tx_queue(dev, ntx);
+		qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(ntx + 1)));
+		if (qdisc == NULL)
+			goto err;
+		qdisc->flags |= TCQ_F_CAN_BYPASS;
+		priv->qdiscs[ntx] = qdisc;
+	}
+
+	return 0;
+
+err:
+	mq_destroy(sch);
+	return -ENOMEM;
+}
+
+static void mq_attach(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mq_sched *priv = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = priv->qdiscs[ntx];
+		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (qdisc)
+			qdisc_destroy(qdisc);
+	}
+	kfree(priv->qdiscs);
+	priv->qdiscs = NULL;
+}
+
+static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	sch->q.qlen = 0;
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
+		spin_lock_bh(qdisc_lock(qdisc));
+		sch->q.qlen		+= qdisc->q.qlen;
+		sch->bstats.bytes	+= qdisc->bstats.bytes;
+		sch->bstats.packets	+= qdisc->bstats.packets;
+		sch->qstats.qlen	+= qdisc->qstats.qlen;
+		sch->qstats.backlog	+= qdisc->qstats.backlog;
+		sch->qstats.drops	+= qdisc->qstats.drops;
+		sch->qstats.requeues	+= qdisc->qstats.requeues;
+		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+	return 0;
+}
+
+static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1;
+
+	if (ntx >= dev->num_tx_queues)
+		return NULL;
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static unsigned int mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm)
+{
+	unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
+
+	if (!mq_queue_get(sch, ntx))
+		return 0;
+	return ntx - 1;
+}
+
+static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = dev_graft_qdisc(dev_queue, new);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+	return 0;
+}
+
+static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+	return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mq_get(struct Qdisc *sch, u32 classid)
+{
+	unsigned int ntx = TC_H_MIN(classid);
+
+	if (!mq_queue_get(sch, ntx))
+		return 0;
+	return ntx;
+}
+
+static void mq_put(struct Qdisc *sch, unsigned long cl)
+{
+	return;
+}
+
+static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+	tcm->tcm_parent = TC_H_ROOT;
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	return 0;
+}
+
+static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
+
+	sch = dev_queue->qdisc_sleeping;
+	if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+	    gnet_stats_copy_queue(d, &sch->qstats) < 0)
+		return -1;
+	return 0;
+}
+
+static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx;
+
+	if (arg->stop)
+		return;
+
+	arg->count = arg->skip;
+	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops mq_class_ops = {
+	.select_queue	= mq_select_queue,
+	.graft		= mq_graft,
+	.leaf		= mq_leaf,
+	.get		= mq_get,
+	.put		= mq_put,
+	.walk		= mq_walk,
+	.dump		= mq_dump_class,
+	.dump_stats	= mq_dump_class_stats,
+};
+
+struct Qdisc_ops mq_qdisc_ops __read_mostly = {
+	.cl_ops		= &mq_class_ops,
+	.id		= "mq",
+	.priv_size	= sizeof(struct mq_sched),
+	.init		= mq_init,
+	.destroy	= mq_destroy,
+	.attach		= mq_attach,
+	.dump		= mq_dump,
+	.owner		= THIS_MODULE,
+};

^ permalink raw reply related

* Re: net_sched 00/07: classful multiqueue dummy scheduler
From: Patrick McHardy @ 2009-09-04 16:42 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20090904164111.27300.29929.sendpatchset@x2.localnet>

[-- Attachment #1: Type: text/plain, Size: 1279 bytes --]

Patrick McHardy wrote:
> These patches contain a classful multiqueue ("mq") dummy scheduler to fix a
> couple of problems with the current multiqueue TC API integration. The
> changelogs of patch 05 and 07 contain more details.
> 
> The mq scheduler does two things:
> 
> - present device TX queues as classes, allowing to attach different qdiscs
>   to them, which are grafted to the TX queues
> 
> - present accumulated statistics of all device queue root qdiscs
> 
> Its used by default for multiqueue devices instead of the regular pfifo_fast
> qdisc, but can also be attached manually to restore multiqueue behaviour
> after attaching a non-multiqueue (shared) qdisc.
> 
> Patches 1-4 contain some preparatory cleanups because I was getting tired
> of copying unnecessary checks and dummy functions :)
> 
> Patch 5 introduces a dev->qdisc pointer, which points to the root qdisc from
> userspace's point of view. This is later used for the mq qdisc, which isn't
> actually attached to any device queues. Patch 7 contains the mq scheduler.
> 
> I've tested the scheduler with a hacked macvlan version which uses 4 queues,
> but since I don't own a multiqueue capable device I couldn't test this on
> real hardware.

And for reference, this is the script I used for testing:


[-- Attachment #2: test.sh --]
[-- Type: application/x-sh, Size: 928 bytes --]

^ permalink raw reply

* [PATCH] WAN: remove deprecated PCI_DEVICE_ID from PCI200SYN driver.
From: Krzysztof Halasa @ 2009-09-04 17:50 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

PCI200SYN has its own PCI subsystem device ID for 3+ years, now it's
time to remove the generic PLX905[02] ID from the driver. Anyone with
old EEPROM data will have to run the upgrade.

Having the generic PLX905[02] (PCI-local bus bridge) ID is harmful
as the driver tries to handle other devices based on these bridges.
-- 
Krzysztof Halasa

--- a/drivers/net/wan/pci200syn.c
+++ b/drivers/net/wan/pci200syn.c
@@ -360,15 +360,6 @@ static int __devinit pci200_pci_init_one(struct pci_dev *pdev,
 	       " %u RX packets rings\n", ramsize / 1024, ramphys,
 	       pdev->irq, card->tx_ring_buffers, card->rx_ring_buffers);
 
-	if (pdev->subsystem_device == PCI_DEVICE_ID_PLX_9050) {
-		printk(KERN_ERR "Detected PCI200SYN card with old "
-		       "configuration data.\n");
-		printk(KERN_ERR "See <http://www.kernel.org/pub/"
-		       "linux/utils/net/hdlc/pci200syn/> for update.\n");
-		printk(KERN_ERR "The card will stop working with"
-		       " future versions of Linux if not updated.\n");
-	}
-
 	if (card->tx_ring_buffers < 1) {
 		printk(KERN_ERR "pci200syn: RAM test failed\n");
 		pci200_pci_remove_one(pdev);
@@ -427,8 +418,6 @@ static int __devinit pci200_pci_init_one(struct pci_dev *pdev,
 
 static struct pci_device_id pci200_pci_tbl[] __devinitdata = {
 	{ PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050, PCI_VENDOR_ID_PLX,
-	  PCI_DEVICE_ID_PLX_9050, 0, 0, 0 },
-	{ PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050, PCI_VENDOR_ID_PLX,
 	  PCI_DEVICE_ID_PLX_PCI200SYN, 0, 0, 0 },
 	{ 0, }
 };

^ permalink raw reply

* Re: [PATCH net-next] wan: dlci/sdla transmit return dehacking
From: Krzysztof Halasa @ 2009-09-04 18:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, sfr, linux-next, netdev
In-Reply-To: <20090904083346.43885303@nehalam>

Stephen Hemminger <shemminger@vyatta.com> writes:

> This is a brute force removal of the wierd slave interface done for
> DLCI -> SDLA transmit. Before it was using non-standard return values
> and freeing skb in caller. This changes it to using normal return
> values, and freeing in the callee. Luckly only one driver pair was
> doing this. Not tested on real hardware, in fact I wonder if this
> driver pair is even being used by any users.

The only hardware which seems to be driven by dlci.c is sdla.c =
old Sangoma ISA cards.

Sangoma seems to maintain their own drivers for their hw (including
these ISA cards).

Are the in-kernel drivers functional after all those years? I don't
know.
-- 
Krzysztof Halasa

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Christoph Lameter @ 2009-09-04 15:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Paul E. McKenney, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <4AA03E6A.7070800@gmail.com>

On Fri, 4 Sep 2009, Eric Dumazet wrote:

> Problem is not _objects_ Christoph, but _slabs_, and your patch is not working.

Why?

> Its true that when User calls kmem_cache_destroy(), all _objects_ were
> previously freed. This is mandatory, with or withou SLAB_DESTROY_BY_RCU
> thing

Right.

> Problem is that slub has some internal state, including some to-be-freed _slabs_,
> that User have no control at all on it.

Those are going to be freed without calls to rcu with my patch. The only
reason for earlier rcu frees are user calls to kfree.

> Face it, SLAB_DESTROY_BY_RCU is internal affair (to slub/slab/... allocators)

Nope the user must follow RCU guidelines when using objects.

> We absolutely need a rcu_barrier() somewhere, believe it or not. You can
> argue that it should be done *before*, but it gives no speedup, only
> potential bugs.

I never said that you do not need an rcu_barrier() for this particular
situation? Why suggest such a thing?

The insertion of rcu stuff in the slab code will lead to future bugs since
now the slab logic is tied to the semantics of a particular rcu
implementatin.

> Only case User should do its rcu_barrier() itself is if it knows some
> call_rcu() are pending and are delaying _objects_ freeing (typical
> !SLAB_DESTROY_RCU usage in RCU algos).

Ok then the user already has to deal with the barriers. The API is
inconsistent if you put this into kmem_cache_destroy.

> I dont even understand why you care so much about
> kmem_cache_destroy(SLAB_DESTROY_BY_RCU), given that almost nobody use
> it. We took almost one month to find out what the bug was in first
> place...

This is already the second bug on this issue. Given the complexity of rcu
it is to be experted that inserting more RCU semantics into the slab
allocators is likely to cause future chains of new features and
bugs in slab allocators.

^ permalink raw reply

* Re: [PATCH] forcedeth: updated phy errata
From: Ayaz Abdulla @ 2009-09-04 13:57 UTC (permalink / raw)
  To: David Miller
  Cc: manfred@colorfullife.com, akpm@osdl.org, netdev@vger.kernel.org
In-Reply-To: <20090902.232201.248326790.davem@davemloft.net>



David Miller wrote:
> From: Ayaz Abdulla <aabdulla@nvidia.com>
> Date: Mon, 31 Aug 2009 19:08:37 -0400
> 
> 
>>This patch updates the special programming (and/or errata) needed in
>>order to setup the phy for various vendor models.
>>
>>The new models include:
>>Marvell E1116
>>Marvell E1111
>>Marvell E1011
>>Marvell E3016
>>Broadcom 9507
>>Broadcom AC131
>>Broadcom 50610
>>
>>Signed-off-by: Ayaz Abdulla <aabdulla@nvidia.com>
> 
> 
> Please document what these individual bits mean which you
> are clearing, by using an individual define for each register
> bit to describe it's purpose, and then define the mask as
> a concatenation of these bits.
> 
> Having an opaque bitmask is not how to do this.

Unfortunately, the phy vendors don't want us exposing the meaning of 
their non-standard bits.


> 
> Thanks.

^ permalink raw reply

* Re: [PATCH 24/29] fsldma: implement a private tx_list
From: Dan Williams @ 2009-09-04 19:42 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-raid, netdev, Li Yang, maciej.sosnowski, Ira Snyder
In-Reply-To: <20090904023232.32667.58041.stgit@dwillia2-linux.ch.intel.com>

[-- Attachment #1: Type: text/plain, Size: 521 bytes --]

On Thu, Sep 3, 2009 at 7:32 PM, Dan Williams<dan.j.williams@intel.com> wrote:
> Drop fsldma's use of tx_list from struct dma_async_tx_descriptor in
> preparation for removal of this field.
>
> Cc: Li Yang <leoli@freescale.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---

As I merged the pending fsldma queue with this patch I noticed that
this missed a tx_list conversion in fsl_dma_tx_submit().  New patch
attached.  I also fixed up the pending dma-slave support for fsldma
(attached).

Regards,
Dan

[-- Attachment #2: fsldma-local-tx-list.patch --]
[-- Type: text/x-diff, Size: 3396 bytes --]

fsldma: implement a private tx_list

From: Dan Williams <dan.j.williams@intel.com>

Drop fsldma's use of tx_list from struct dma_async_tx_descriptor in
preparation for removal of this field.

Cc: Li Yang <leoli@freescale.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dma/fsldma.c |   16 +++++++++-------
 drivers/dma/fsldma.h |    1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index ef87a89..73dd748 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -326,7 +326,8 @@ static void fsl_chan_toggle_ext_start(struct fsl_dma_chan *fsl_chan, int enable)
 static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
 	struct fsl_dma_chan *fsl_chan = to_fsl_chan(tx->chan);
-	struct fsl_desc_sw *desc;
+	struct fsl_desc_sw *desc = tx_to_fsl_desc(tx);
+	struct fsl_desc_sw *child;
 	unsigned long flags;
 	dma_cookie_t cookie;
 
@@ -334,7 +335,7 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
 
 	cookie = fsl_chan->common.cookie;
-	list_for_each_entry(desc, &tx->tx_list, node) {
+	list_for_each_entry(child, &desc->tx_list, node) {
 		cookie++;
 		if (cookie < 0)
 			cookie = 1;
@@ -343,8 +344,8 @@ static dma_cookie_t fsl_dma_tx_submit(struct dma_async_tx_descriptor *tx)
 	}
 
 	fsl_chan->common.cookie = cookie;
-	append_ld_queue(fsl_chan, tx_to_fsl_desc(tx));
-	list_splice_init(&tx->tx_list, fsl_chan->ld_queue.prev);
+	append_ld_queue(fsl_chan, desc);
+	list_splice_init(&desc->tx_list, fsl_chan->ld_queue.prev);
 
 	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
 
@@ -366,6 +367,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor(
 	desc_sw = dma_pool_alloc(fsl_chan->desc_pool, GFP_ATOMIC, &pdesc);
 	if (desc_sw) {
 		memset(desc_sw, 0, sizeof(struct fsl_desc_sw));
+		INIT_LIST_HEAD(&desc_sw->tx_list);
 		dma_async_tx_descriptor_init(&desc_sw->async_tx,
 						&fsl_chan->common);
 		desc_sw->async_tx.tx_submit = fsl_dma_tx_submit;
@@ -455,7 +457,7 @@ fsl_dma_prep_interrupt(struct dma_chan *chan, unsigned long flags)
 	new->async_tx.flags = flags;
 
 	/* Insert the link descriptor to the LD ring */
-	list_add_tail(&new->node, &new->async_tx.tx_list);
+	list_add_tail(&new->node, &new->tx_list);
 
 	/* Set End-of-link to the last link descriptor of new list*/
 	set_ld_eol(fsl_chan, new);
@@ -513,7 +515,7 @@ static struct dma_async_tx_descriptor *fsl_dma_prep_memcpy(
 		dma_dest += copy;
 
 		/* Insert the link descriptor to the LD ring */
-		list_add_tail(&new->node, &first->async_tx.tx_list);
+		list_add_tail(&new->node, &first->tx_list);
 	} while (len);
 
 	new->async_tx.flags = flags; /* client is in control of this ack */
@@ -528,7 +530,7 @@ fail:
 	if (!first)
 		return NULL;
 
-	list = &first->async_tx.tx_list;
+	list = &first->tx_list;
 	list_for_each_entry_safe_reverse(new, prev, list, node) {
 		list_del(&new->node);
 		dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index dc7f268..4493afe 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -90,6 +90,7 @@ struct fsl_dma_ld_hw {
 struct fsl_desc_sw {
 	struct fsl_dma_ld_hw hw;
 	struct list_head node;
+	struct list_head tx_list;
 	struct dma_async_tx_descriptor async_tx;
 	struct list_head *ld;
 	void *priv;

[-- Attachment #3: fsldma-add-dma_slave-support.patch --]
[-- Type: text/x-diff, Size: 12785 bytes --]

fsldma: Add DMA_SLAVE support

From: Ira Snyder <iws@ovro.caltech.edu>

Use the DMA_SLAVE capability of the DMAEngine API to copy/from a
scatterlist into an arbitrary list of hardware address/length pairs.

This allows a single DMA transaction to copy data from several different
devices into a scatterlist at the same time.

This also adds support to enable some controller-specific features such as
external start and external pause for a DMA transaction.

[dan.j.williams@intel.com: rebased on tx_list movement]
Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Acked-by: Li Yang <leoli@freescale.com>
Acked-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/include/asm/fsldma.h |  136 ++++++++++++++++++++++
 drivers/dma/fsldma.c              |  227 +++++++++++++++++++++++++++++++++++++
 2 files changed, 363 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fsldma.h

diff --git a/arch/powerpc/include/asm/fsldma.h b/arch/powerpc/include/asm/fsldma.h
new file mode 100644
index 0000000..a67aeed
--- /dev/null
+++ b/arch/powerpc/include/asm/fsldma.h
@@ -0,0 +1,136 @@
+/*
+ * Freescale MPC83XX / MPC85XX DMA Controller
+ *
+ * Copyright (c) 2009 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ARCH_POWERPC_ASM_FSLDMA_H__
+#define __ARCH_POWERPC_ASM_FSLDMA_H__
+
+#include <linux/dmaengine.h>
+
+/*
+ * Definitions for the Freescale DMA controller's DMA_SLAVE implemention
+ *
+ * The Freescale DMA_SLAVE implementation was designed to handle many-to-many
+ * transfers. An example usage would be an accelerated copy between two
+ * scatterlists. Another example use would be an accelerated copy from
+ * multiple non-contiguous device buffers into a single scatterlist.
+ *
+ * A DMA_SLAVE transaction is defined by a struct fsl_dma_slave. This
+ * structure contains a list of hardware addresses that should be copied
+ * to/from the scatterlist passed into device_prep_slave_sg(). The structure
+ * also has some fields to enable hardware-specific features.
+ */
+
+/**
+ * struct fsl_dma_hw_addr
+ * @entry: linked list entry
+ * @address: the hardware address
+ * @length: length to transfer
+ *
+ * Holds a single physical hardware address / length pair for use
+ * with the DMAEngine DMA_SLAVE API.
+ */
+struct fsl_dma_hw_addr {
+	struct list_head entry;
+
+	dma_addr_t address;
+	size_t length;
+};
+
+/**
+ * struct fsl_dma_slave
+ * @addresses: a linked list of struct fsl_dma_hw_addr structures
+ * @request_count: value for DMA request count
+ * @src_loop_size: setup and enable constant source-address DMA transfers
+ * @dst_loop_size: setup and enable constant destination address DMA transfers
+ * @external_start: enable externally started DMA transfers
+ * @external_pause: enable externally paused DMA transfers
+ *
+ * Holds a list of address / length pairs for use with the DMAEngine
+ * DMA_SLAVE API implementation for the Freescale DMA controller.
+ */
+struct fsl_dma_slave {
+
+	/* List of hardware address/length pairs */
+	struct list_head addresses;
+
+	/* Support for extra controller features */
+	unsigned int request_count;
+	unsigned int src_loop_size;
+	unsigned int dst_loop_size;
+	bool external_start;
+	bool external_pause;
+};
+
+/**
+ * fsl_dma_slave_append - add an address/length pair to a struct fsl_dma_slave
+ * @slave: the &struct fsl_dma_slave to add to
+ * @address: the hardware address to add
+ * @length: the length of bytes to transfer from @address
+ *
+ * Add a hardware address/length pair to a struct fsl_dma_slave. Returns 0 on
+ * success, -ERRNO otherwise.
+ */
+static inline int fsl_dma_slave_append(struct fsl_dma_slave *slave,
+				       dma_addr_t address, size_t length)
+{
+	struct fsl_dma_hw_addr *addr;
+
+	addr = kzalloc(sizeof(*addr), GFP_ATOMIC);
+	if (!addr)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&addr->entry);
+	addr->address = address;
+	addr->length = length;
+
+	list_add_tail(&addr->entry, &slave->addresses);
+	return 0;
+}
+
+/**
+ * fsl_dma_slave_free - free a struct fsl_dma_slave
+ * @slave: the struct fsl_dma_slave to free
+ *
+ * Free a struct fsl_dma_slave and all associated address/length pairs
+ */
+static inline void fsl_dma_slave_free(struct fsl_dma_slave *slave)
+{
+	struct fsl_dma_hw_addr *addr, *tmp;
+
+	if (slave) {
+		list_for_each_entry_safe(addr, tmp, &slave->addresses, entry) {
+			list_del(&addr->entry);
+			kfree(addr);
+		}
+
+		kfree(slave);
+	}
+}
+
+/**
+ * fsl_dma_slave_alloc - allocate a struct fsl_dma_slave
+ * @gfp: the flags to pass to kmalloc when allocating this structure
+ *
+ * Allocate a struct fsl_dma_slave for use by the DMA_SLAVE API. Returns a new
+ * struct fsl_dma_slave on success, or NULL on failure.
+ */
+static inline struct fsl_dma_slave *fsl_dma_slave_alloc(gfp_t gfp)
+{
+	struct fsl_dma_slave *slave;
+
+	slave = kzalloc(sizeof(*slave), gfp);
+	if (!slave)
+		return NULL;
+
+	INIT_LIST_HEAD(&slave->addresses);
+	return slave;
+}
+
+#endif /* __ARCH_POWERPC_ASM_FSLDMA_H__ */
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 7a0cb60..296f9e7 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -34,6 +34,7 @@
 #include <linux/dmapool.h>
 #include <linux/of_platform.h>
 
+#include <asm/fsldma.h>
 #include "fsldma.h"
 
 static void dma_init(struct fsl_dma_chan *fsl_chan)
@@ -552,6 +553,229 @@ fail:
 }
 
 /**
+ * fsl_dma_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
+ * @chan: DMA channel
+ * @sgl: scatterlist to transfer to/from
+ * @sg_len: number of entries in @scatterlist
+ * @direction: DMA direction
+ * @flags: DMAEngine flags
+ *
+ * Prepare a set of descriptors for a DMA_SLAVE transaction. Following the
+ * DMA_SLAVE API, this gets the device-specific information from the
+ * chan->private variable.
+ */
+static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
+	struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len,
+	enum dma_data_direction direction, unsigned long flags)
+{
+	struct fsl_dma_chan *fsl_chan;
+	struct fsl_desc_sw *first = NULL, *prev = NULL, *new = NULL;
+	struct fsl_dma_slave *slave;
+	struct list_head *tx_list;
+	size_t copy;
+
+	int i;
+	struct scatterlist *sg;
+	size_t sg_used;
+	size_t hw_used;
+	struct fsl_dma_hw_addr *hw;
+	dma_addr_t dma_dst, dma_src;
+
+	if (!chan)
+		return NULL;
+
+	if (!chan->private)
+		return NULL;
+
+	fsl_chan = to_fsl_chan(chan);
+	slave = chan->private;
+
+	if (list_empty(&slave->addresses))
+		return NULL;
+
+	hw = list_first_entry(&slave->addresses, struct fsl_dma_hw_addr, entry);
+	hw_used = 0;
+
+	/*
+	 * Build the hardware transaction to copy from the scatterlist to
+	 * the hardware, or from the hardware to the scatterlist
+	 *
+	 * If you are copying from the hardware to the scatterlist and it
+	 * takes two hardware entries to fill an entire page, then both
+	 * hardware entries will be coalesced into the same page
+	 *
+	 * If you are copying from the scatterlist to the hardware and a
+	 * single page can fill two hardware entries, then the data will
+	 * be read out of the page into the first hardware entry, and so on
+	 */
+	for_each_sg(sgl, sg, sg_len, i) {
+		sg_used = 0;
+
+		/* Loop until the entire scatterlist entry is used */
+		while (sg_used < sg_dma_len(sg)) {
+
+			/*
+			 * If we've used up the current hardware address/length
+			 * pair, we need to load a new one
+			 *
+			 * This is done in a while loop so that descriptors with
+			 * length == 0 will be skipped
+			 */
+			while (hw_used >= hw->length) {
+
+				/*
+				 * If the current hardware entry is the last
+				 * entry in the list, we're finished
+				 */
+				if (list_is_last(&hw->entry, &slave->addresses))
+					goto finished;
+
+				/* Get the next hardware address/length pair */
+				hw = list_entry(hw->entry.next,
+						struct fsl_dma_hw_addr, entry);
+				hw_used = 0;
+			}
+
+			/* Allocate the link descriptor from DMA pool */
+			new = fsl_dma_alloc_descriptor(fsl_chan);
+			if (!new) {
+				dev_err(fsl_chan->dev, "No free memory for "
+						       "link descriptor\n");
+				goto fail;
+			}
+#ifdef FSL_DMA_LD_DEBUG
+			dev_dbg(fsl_chan->dev, "new link desc alloc %p\n", new);
+#endif
+
+			/*
+			 * Calculate the maximum number of bytes to transfer,
+			 * making sure it is less than the DMA controller limit
+			 */
+			copy = min_t(size_t, sg_dma_len(sg) - sg_used,
+					     hw->length - hw_used);
+			copy = min_t(size_t, copy, FSL_DMA_BCR_MAX_CNT);
+
+			/*
+			 * DMA_FROM_DEVICE
+			 * from the hardware to the scatterlist
+			 *
+			 * DMA_TO_DEVICE
+			 * from the scatterlist to the hardware
+			 */
+			if (direction == DMA_FROM_DEVICE) {
+				dma_src = hw->address + hw_used;
+				dma_dst = sg_dma_address(sg) + sg_used;
+			} else {
+				dma_src = sg_dma_address(sg) + sg_used;
+				dma_dst = hw->address + hw_used;
+			}
+
+			/* Fill in the descriptor */
+			set_desc_cnt(fsl_chan, &new->hw, copy);
+			set_desc_src(fsl_chan, &new->hw, dma_src);
+			set_desc_dest(fsl_chan, &new->hw, dma_dst);
+
+			/*
+			 * If this is not the first descriptor, chain the
+			 * current descriptor after the previous descriptor
+			 */
+			if (!first) {
+				first = new;
+			} else {
+				set_desc_next(fsl_chan, &prev->hw,
+					      new->async_tx.phys);
+			}
+
+			new->async_tx.cookie = 0;
+			async_tx_ack(&new->async_tx);
+
+			prev = new;
+			sg_used += copy;
+			hw_used += copy;
+
+			/* Insert the link descriptor into the LD ring */
+			list_add_tail(&new->node, &first->tx_list);
+		}
+	}
+
+finished:
+
+	/* All of the hardware address/length pairs had length == 0 */
+	if (!first || !new)
+		return NULL;
+
+	new->async_tx.flags = flags;
+	new->async_tx.cookie = -EBUSY;
+
+	/* Set End-of-link to the last link descriptor of new list */
+	set_ld_eol(fsl_chan, new);
+
+	/* Enable extra controller features */
+	if (fsl_chan->set_src_loop_size)
+		fsl_chan->set_src_loop_size(fsl_chan, slave->src_loop_size);
+
+	if (fsl_chan->set_dest_loop_size)
+		fsl_chan->set_dest_loop_size(fsl_chan, slave->dst_loop_size);
+
+	if (fsl_chan->toggle_ext_start)
+		fsl_chan->toggle_ext_start(fsl_chan, slave->external_start);
+
+	if (fsl_chan->toggle_ext_pause)
+		fsl_chan->toggle_ext_pause(fsl_chan, slave->external_pause);
+
+	if (fsl_chan->set_request_count)
+		fsl_chan->set_request_count(fsl_chan, slave->request_count);
+
+	return &first->async_tx;
+
+fail:
+	/* If first was not set, then we failed to allocate the very first
+	 * descriptor, and we're done */
+	if (!first)
+		return NULL;
+
+	/*
+	 * First is set, so all of the descriptors we allocated have been added
+	 * to first->tx_list, INCLUDING "first" itself. Therefore we
+	 * must traverse the list backwards freeing each descriptor in turn
+	 *
+	 * We're re-using variables for the loop, oh well
+	 */
+	tx_list = &first->tx_list;
+	list_for_each_entry_safe_reverse(new, prev, tx_list, node) {
+		list_del_init(&new->node);
+		dma_pool_free(fsl_chan->desc_pool, new, new->async_tx.phys);
+	}
+
+	return NULL;
+}
+
+static void fsl_dma_device_terminate_all(struct dma_chan *chan)
+{
+	struct fsl_dma_chan *fsl_chan;
+	struct fsl_desc_sw *desc, *tmp;
+	unsigned long flags;
+
+	if (!chan)
+		return;
+
+	fsl_chan = to_fsl_chan(chan);
+
+	/* Halt the DMA engine */
+	dma_halt(fsl_chan);
+
+	spin_lock_irqsave(&fsl_chan->desc_lock, flags);
+
+	/* Remove and free all of the descriptors in the LD queue */
+	list_for_each_entry_safe(desc, tmp, &fsl_chan->ld_queue, node) {
+		list_del(&desc->node);
+		dma_pool_free(fsl_chan->desc_pool, desc, desc->async_tx.phys);
+	}
+
+	spin_unlock_irqrestore(&fsl_chan->desc_lock, flags);
+}
+
+/**
  * fsl_dma_update_completed_cookie - Update the completed cookie.
  * @fsl_chan : Freescale DMA channel
  */
@@ -977,12 +1201,15 @@ static int __devinit of_fsl_dma_probe(struct of_device *dev,
 
 	dma_cap_set(DMA_MEMCPY, fdev->common.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, fdev->common.cap_mask);
+	dma_cap_set(DMA_SLAVE, fdev->common.cap_mask);
 	fdev->common.device_alloc_chan_resources = fsl_dma_alloc_chan_resources;
 	fdev->common.device_free_chan_resources = fsl_dma_free_chan_resources;
 	fdev->common.device_prep_dma_interrupt = fsl_dma_prep_interrupt;
 	fdev->common.device_prep_dma_memcpy = fsl_dma_prep_memcpy;
 	fdev->common.device_is_tx_complete = fsl_dma_is_complete;
 	fdev->common.device_issue_pending = fsl_dma_memcpy_issue_pending;
+	fdev->common.device_prep_slave_sg = fsl_dma_prep_slave_sg;
+	fdev->common.device_terminate_all = fsl_dma_device_terminate_all;
 	fdev->common.dev = &dev->dev;
 
 	fdev->irq = irq_of_parse_and_map(dev->node, 0);

^ permalink raw reply related

* Re: [PATCH] slub: fix slab_pad_check()
From: Paul E. McKenney @ 2009-09-04 20:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Christoph Lameter, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <4AA0407E.8030505@gmail.com>

On Fri, Sep 04, 2009 at 12:17:34AM +0200, Eric Dumazet wrote:
> Eric Dumazet a écrit :
> > 
> > 
> > 
> > Problem is not _objects_ Christoph, but _slabs_, and your patch is not working.
> > 
> > Its true that when User calls kmem_cache_destroy(), all _objects_ were previously freed.
> > This is mandatory, with or withou SLAB_DESTROY_BY_RCU thing
> > 
> > Problem is that slub has some internal state, including some to-be-freed _slabs_,
> > that User have no control at all on it.
> > 
> > User cannot "know" slabs are freed, inuse, or whatever state in cache or call_rcu queues.
> > 
> > Face it, SLAB_DESTROY_BY_RCU is internal affair (to slub/slab/... allocators)
> > 
> > We absolutely need a rcu_barrier() somewhere, believe it or not. You can argue that it should
> > be done *before*, but it gives no speedup, only potential bugs.
> > 
> > Only case User should do its rcu_barrier() itself is if it knows some call_rcu() are pending
> > and are delaying _objects_ freeing (typical !SLAB_DESTROY_RCU usage in RCU algos).
> > 
> > I dont even understand why you care so much about kmem_cache_destroy(SLAB_DESTROY_BY_RCU),
> > given that almost nobody use it. We took almost one month to find out what the bug was in first
> > place...
> 
> 
> So maybe the safest thing would be to include the rcu_barrier() to
> insure all objects where freed

I argue that the above is the user's responsibility.  That said, I don't
see why the user would pass a SLAB_DESTROY_BY_RCU object to call_rcu().
So I would want to see an example of this before inflicting a pair
of rcu_barrier() calls on kmem_cache_destroy().

> And another one for SLAB_DESTROY_BY_RCU to make sure slabs where freed

This last is I believe kmem_cache's responsibility.

							Thanx, Paul

> void kmem_cache_destroy(struct kmem_cache *s)
> {
> 	/*
> 	 * Make sure no objects are waiting in call_rcu queues to be freed
> 	 */
> 	rcu_barrier();
> 
> 	down_write(&slub_lock);
> 	s->refcount--;
> 	if (!s->refcount) {
>                 list_del(&s->list);
>                 up_write(&slub_lock);
>                 if (kmem_cache_close(s)) {
>                         printk(KERN_ERR "SLUB %s: %s called for cache that "
>                                 "still has objects.\n", s->name, __func__);
>                         dump_stack();
>                 }
> 		/*
> 		 * Make sure no slabs are waiting in call_rcu queues to be freed
> 		 */
>                 if (s->flags & SLAB_DESTROY_BY_RCU)
>                         rcu_barrier();
>                 sysfs_slab_remove(s);
>         } else
>                 up_write(&slub_lock);
> }
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] slub: fix slab_pad_check()
From: Paul E. McKenney @ 2009-09-04 20:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Eric Dumazet, Pekka Enberg, Zdenek Kabelac, Patrick McHardy,
	Robin Holt, Linux Kernel Mailing List, Jesper Dangaard Brouer,
	Linux Netdev List, Netfilter Developers
In-Reply-To: <alpine.DEB.1.10.0909041140040.9781@V090114053VZO-1>

On Fri, Sep 04, 2009 at 11:42:17AM -0400, Christoph Lameter wrote:
> On Thu, 3 Sep 2009, Paul E. McKenney wrote:
> 
> > If it were the user of the slab who was invoking some variant of
> > call_rcu(), then I would agree with you.
> 
> The user already has to deal with it as explained by Eric.

I didn't read his email that way.  Perhaps I misinterpreted it.

> > However, call_rcu() is instead being invoked by the slab itself in the
> > case of SLAB_DESTROY_BY_RCU, so that there is no variation in usage.
> > Requiring that the user call rcu_barrier() is asking for subtle bugs.
> > Therefore, the best approach is to have kmem_cache_destroy() handle
> > the RCU cleanup, given that this cleanup is for actions taken by
> > kmem_cache_free(), not by the user.
> 
> The user already has to properly handle barriers and rcu logic in order to
> use objects handled with RCU properly. Moreover the user even has to check
> that the object is not suddenly checked under it. Its already complex.

mm/slab.c has had RCU calls since 2.6.9, so this is not new.

> Guess we are doing this ... Sigh. Are you going to add support other rcu
> versions to slab as well as time permits and as the need arises? Should
> we add you as a maintainer? ;-)

I don't see any point in adding anything resembling SLAB_DESTROY_BY_RCU_BH,
SLAB_DESTROY_BY_RCU_SCHED, or SLAB_DESTROY_BY_SRCU unless and until
someone needs it.  And I most especially don't see the point of adding
(say) SLAB_DESTROY_BY_RCU_BH_SCHED until someone convinces me of the
need for it.  I would prefer to put the energy into further streamlining
the underlying RCU implementation, maybe someday collapsing RCU-BH back
into RCU.  ;-)

We have gotten along fine with only SLAB_DESTROY_BY_RCU for almost
five years, so I think we are plenty fine with what we have.  So, as
you say, "as the need arises".

I don't see any more need to add me as maintainer of slab and friends
than of btrfs, netfilter, selinux, decnet, afs, wireless, or any of a
number of other subsystems that use RCU.

							Thanx, Paul

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox