* [PATCH 2/3][NET_BATCH] net core use batching @ 2007-10-08 18:26 jamal 2007-10-08 19:46 ` Waskiewicz Jr, Peter P 0 siblings, 1 reply; 79+ messages in thread From: jamal @ 2007-10-08 18:26 UTC (permalink / raw) To: David Miller Cc: krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jeff, mchan, general, kumarkr, tgraf, randy.dunlap, sri [-- Attachment #1: Type: text/plain, Size: 71 bytes --] This patch adds the usage of batching within the core. cheers, jamal [-- Attachment #2: 02-net-core-use-batching.patch --] [-- Type: text/x-patch, Size: 4790 bytes --] [NET_BATCH] net core use batching This patch adds the usage of batching within the core. Performance results demonstrating improvement are provided separately. I have #if-0ed some of the old functions so the patch is more readable. A future patch will remove all if-0ed content. Patrick McHardy eyeballed a bug that will cause re-ordering in case of a requeue. 
Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca> --- commit 63381156b35719a364d0f81fec487e6263fb5467 tree 615dfb5f8617a8a4edffbb965e81a75c60597319 parent 8b9960cfbb98d48c0ac30b2c00d27c25217adb26 author Jamal Hadi Salim <hadi@cyberus.ca> Mon, 08 Oct 2007 09:10:14 -0400 committer Jamal Hadi Salim <hadi@cyberus.ca> Mon, 08 Oct 2007 09:10:14 -0400 net/sched/sch_generic.c | 116 +++++++++++++++++++++++++++++++++++++++++++---- 1 files changed, 106 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 95ae119..96bfdcb 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -56,6 +56,7 @@ static inline int qdisc_qlen(struct Qdisc *q) return q->q.qlen; } +#if 0 static inline int dev_requeue_skb(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q) { @@ -110,6 +111,97 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, return ret; } +#endif + +static inline int handle_dev_cpu_collision(struct net_device *dev) +{ + if (unlikely(dev->xmit_lock_owner == smp_processor_id())) { + if (net_ratelimit()) + printk(KERN_WARNING + "Dead loop on netdevice %s, fix it urgently!\n", + dev->name); + return 1; + } + __get_cpu_var(netdev_rx_stat).cpu_collision++; + return 0; +} + +static inline int +dev_requeue_skbs(struct sk_buff_head *skbs, struct net_device *dev, + struct Qdisc *q) +{ + + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(skbs)) != NULL) + q->ops->requeue(skb, q); + + netif_schedule(dev); + return 0; +} + +static inline int +xmit_islocked(struct sk_buff_head *skbs, struct net_device *dev, + struct Qdisc *q) +{ + int ret = handle_dev_cpu_collision(dev); + + if (ret) { + if (!skb_queue_empty(skbs)) + skb_queue_purge(skbs); + return qdisc_qlen(q); + } + + return dev_requeue_skbs(skbs, dev, q); +} + +static int xmit_count_skbs(struct sk_buff *skb) +{ + int count = 0; + for (; skb; skb = skb->next) { + count += skb_shinfo(skb)->nr_frags; + count += 1; + } + return count; +} + +static int 
xmit_get_pkts(struct net_device *dev, + struct Qdisc *q, + struct sk_buff_head *pktlist) +{ + struct sk_buff *skb; + int count = dev->xmit_win; + + if (count && dev->gso_skb) { + skb = dev->gso_skb; + dev->gso_skb = NULL; + count -= xmit_count_skbs(skb); + __skb_queue_tail(pktlist, skb); + } + + while (count > 0) { + skb = q->dequeue(q); + if (!skb) + break; + + count -= xmit_count_skbs(skb); + __skb_queue_tail(pktlist, skb); + } + + return skb_queue_len(pktlist); +} + +static int xmit_prepare_pkts(struct net_device *dev, + struct sk_buff_head *tlist) +{ + struct sk_buff *skb; + struct sk_buff_head *flist = &dev->blist; + + while ((skb = __skb_dequeue(tlist)) != NULL) + xmit_prepare_skb(skb, dev); + + return skb_queue_len(flist); +} /* * NOTE: Called under dev->queue_lock with locally disabled BH. @@ -130,22 +222,23 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, * >0 - queue is not empty. * */ + static inline int qdisc_restart(struct net_device *dev) { struct Qdisc *q = dev->qdisc; - struct sk_buff *skb; - int ret; + int ret = 0; - /* Dequeue packet */ - if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL)) - return 0; + ret = xmit_get_pkts(dev, q, &dev->blist); + if (!ret) + return 0; - /* And release queue */ + /* We got em packets */ spin_unlock(&dev->queue_lock); + /* bye packets ....*/ HARD_TX_LOCK(dev, smp_processor_id()); - ret = dev_hard_start_xmit(skb, dev); + ret = dev_batch_xmit(dev); HARD_TX_UNLOCK(dev); spin_lock(&dev->queue_lock); @@ -158,8 +251,8 @@ static inline int qdisc_restart(struct net_device *dev) break; case NETDEV_TX_LOCKED: - /* Driver try lock failed */ - ret = handle_dev_cpu_collision(skb, dev, q); + /* Driver lock failed */ + ret = xmit_islocked(&dev->blist, dev, q); break; default: @@ -168,7 +261,7 @@ static inline int qdisc_restart(struct net_device *dev) printk(KERN_WARNING "BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); - ret = dev_requeue_skb(skb, dev, q); + ret = dev_requeue_skbs(&dev->blist, dev, q); 
break; } @@ -564,6 +657,9 @@ void dev_deactivate(struct net_device *dev) skb = dev->gso_skb; dev->gso_skb = NULL; + if (!skb_queue_empty(&dev->blist)) + skb_queue_purge(&dev->blist); + dev->xmit_win = 1; spin_unlock_bh(&dev->queue_lock); kfree_skb(skb); ^ permalink raw reply related [flat|nested] 79+ messages in thread
* RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 18:26 [PATCH 2/3][NET_BATCH] net core use batching jamal @ 2007-10-08 19:46 ` Waskiewicz Jr, Peter P 2007-10-08 20:48 ` jamal 0 siblings, 1 reply; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-08 19:46 UTC (permalink / raw) To: hadi, David Miller Cc: krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, jeff, mchan, general, tgraf, randy.dunlap, sri > -----Original Message----- > From: J Hadi Salim [mailto:j.hadi123@gmail.com] On Behalf Of jamal > Sent: Monday, October 08, 2007 11:27 AM > To: David Miller > Cc: krkumar2@in.ibm.com; johnpol@2ka.mipt.ru; > herbert@gondor.apana.org.au; kaber@trash.net; > shemminger@linux-foundation.org; jagana@us.ibm.com; > Robert.Olsson@data.slu.se; rick.jones2@hp.com; > xma@us.ibm.com; gaagaan@gmail.com; netdev@vger.kernel.org; > rdreier@cisco.com; Waskiewicz Jr, Peter P; > mcarlson@broadcom.com; jeff@garzik.org; mchan@broadcom.com; > general@lists.openfabrics.org; kumarkr@linux.ibm.com; > tgraf@suug.ch; randy.dunlap@oracle.com; sri@us.ibm.com > Subject: [PATCH 2/3][NET_BATCH] net core use batching > > This patch adds the usage of batching within the core. > > cheers, > jamal Hey Jamal, I still have concerns how this will work with Tx multiqueue. The way the batching code looks right now, you will probably send a batch of skb's from multiple bands from PRIO or RR to the driver. For non-Tx multiqueue drivers, this is fine. For Tx multiqueue drivers, this isn't fine, since the Tx ring is selected by the value of skb->queue_mapping (set by the qdisc on {prio|rr}_classify()). If the whole batch comes in with different queue_mappings, this could prove to be an interesting issue. Now I see in the driver HOWTO you recently sent that the driver will be expected to loop over the list and call it's ->hard_start_xmit() for each skb. 
I think that should be fine for multiqueue, I just wanted to see if you had any thoughts on how it should work, any performance issues you can see (I can't think of any). Since the batching feature and Tx multiqueue are very new features, I'd like to make sure we can think of any possible issues with them coexisting before they are both mainline. Looking ahead for multiqueue, I'm still working on the per-queue lock implementation for multiqueue, which I know will not work with batching as it's designed today. I'm still not sure how to handle this, because it really would require the batch you send to have the same queue_mapping in each skb, so you're grabbing the correct queue_lock. Or, we could have the core grab all the queue locks for each skb->queue_mapping represented in the batch. That would block another batch though if it had any of those queues in it's next batch before the first one completed. Thoughts? Thanks Jamal, -PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com> ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 19:46 ` Waskiewicz Jr, Peter P @ 2007-10-08 20:48 ` jamal 2007-10-08 21:26 ` [ofa-general] " David Miller 2007-10-08 22:33 ` Waskiewicz Jr, Peter P 0 siblings, 2 replies; 79+ messages in thread From: jamal @ 2007-10-08 20:48 UTC (permalink / raw) To: Waskiewicz Jr, Peter P Cc: David Miller, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, jeff, mchan, general, tgraf, randy.dunlap, sri On Mon, 2007-08-10 at 12:46 -0700, Waskiewicz Jr, Peter P wrote: > I still have concerns how this will work with Tx multiqueue. > The way the batching code looks right now, you will probably send a > batch of skb's from multiple bands from PRIO or RR to the driver. For > non-Tx multiqueue drivers, this is fine. For Tx multiqueue drivers, > this isn't fine, since the Tx ring is selected by the value of > skb->queue_mapping (set by the qdisc on {prio|rr}_classify()). If the > whole batch comes in with different queue_mappings, this could prove to > be an interesting issue. true, that needs some resolution. Heres a hand-waving thought: Assuming all packets of a specific map end up in the same qdiscn queue, it seems feasible to ask the qdisc scheduler to give us enough packages (ive seen people use that terms to refer to packets) for each hardware ring's available space. With the patches i posted, i do that via dev->xmit_win that assumes only one view of the driver; essentially a single ring. If that is doable, then it is up to the driver to say "i have space for 5 in ring[0], 10 in ring[1] 0 in ring[2]" based on what scheduling scheme the driver implements - the dev->blist can stay the same. Its a handwave, so there may be issues there and there could be better ways to handle this. Note: The other issue that needs resolving that i raised earlier was in regards to multiqueue running on multiple cpus servicing different rings concurently. 
> Now I see in the driver HOWTO you recently sent that the driver > will be expected to loop over the list and call it's ->hard_start_xmit() > for each skb. It's the core that does that, not the driver; the driver continues to use ->hard_start_xmit() (albeit modified one). The idea is not to have many new interfaces. > I think that should be fine for multiqueue, I just wanted > to see if you had any thoughts on how it should work, any performance > issues you can see (I can't think of any). Since the batching feature > and Tx multiqueue are very new features, I'd like to make sure we can > think of any possible issues with them coexisting before they are both > mainline. Isnt multiqueue mainline already? > Looking ahead for multiqueue, I'm still working on the per-queue > lock implementation for multiqueue, which I know will not work with > batching as it's designed today. The point behind batching is to reduce the cost of the locks by amortizing across the locks. Even better if one can, they should get rid of locks. Remind me, why do you need the per-queuemap lock? And is it needed from the enqueuing side too? Maybe lets start there to help me understand things? > I'm still not sure how to handle this, > because it really would require the batch you send to have the same > queue_mapping in each skb, so you're grabbing the correct queue_lock. Sure, that is doable if the driver can set a per queue_mapping xmit_win and the qdisc can be taught to say "give me packets for queue_mapping X" > Or, we could have the core grab all the queue locks for each > skb->queue_mapping represented in the batch. That would block another > batch though if it had any of those queues in it's next batch before the > first one completed. Thoughts? I am not understanding the desire to have locks on a per-queuemap. I think the single queuelock we have today should suffice. 
If the intent is to have concurrent cpus running to each hardware ring, then this is what i questioned earlier whether it was the right thing to do (very top of email where i mention it as "other issue"). cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 20:48 ` jamal @ 2007-10-08 21:26 ` David Miller 2007-10-08 22:34 ` jamal 2007-10-08 22:33 ` Waskiewicz Jr, Peter P 1 sibling, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-08 21:26 UTC (permalink / raw) To: hadi Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, randy.dunlap, jagana, general, mchan, tgraf, jeff, sri, shemminger, kaber From: jamal <hadi@cyberus.ca> Date: Mon, 08 Oct 2007 16:48:50 -0400 > On Mon, 2007-08-10 at 12:46 -0700, Waskiewicz Jr, Peter P wrote: > > > I still have concerns how this will work with Tx multiqueue. > > The way the batching code looks right now, you will probably send a > > batch of skb's from multiple bands from PRIO or RR to the driver. For > > non-Tx multiqueue drivers, this is fine. For Tx multiqueue drivers, > > this isn't fine, since the Tx ring is selected by the value of > > skb->queue_mapping (set by the qdisc on {prio|rr}_classify()). If the > > whole batch comes in with different queue_mappings, this could prove to > > be an interesting issue. > > true, that needs some resolution. Heres a hand-waving thought: > Assuming all packets of a specific map end up in the same qdiscn queue, > it seems feasible to ask the qdisc scheduler to give us enough packages > (ive seen people use that terms to refer to packets) for each hardware > ring's available space. With the patches i posted, i do that via > dev->xmit_win that assumes only one view of the driver; essentially a > single ring. > If that is doable, then it is up to the driver to say > "i have space for 5 in ring[0], 10 in ring[1] 0 in ring[2]" based on > what scheduling scheme the driver implements - the dev->blist can stay > the same. Its a handwave, so there may be issues there and there could > be better ways to handle this. Add xmit_win to struct net_device_subqueue, problem solved. 
^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 21:26 ` [ofa-general] " David Miller @ 2007-10-08 22:34 ` jamal 2007-10-08 22:36 ` [ofa-general] " Waskiewicz Jr, Peter P 0 siblings, 1 reply; 79+ messages in thread From: jamal @ 2007-10-08 22:34 UTC (permalink / raw) To: David Miller Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, randy.dunlap, jagana, general, mchan, tgraf, jeff, sri, shemminger, kaber On Mon, 2007-08-10 at 14:26 -0700, David Miller wrote: > Add xmit_win to struct net_device_subqueue, problem solved. If net_device_subqueue is visible from both driver and core scheduler area (couldn't tell from looking at what's in there already), then that'll do it. cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 22:34 ` jamal @ 2007-10-08 22:36 ` Waskiewicz Jr, Peter P 0 siblings, 0 replies; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-08 22:36 UTC (permalink / raw) To: hadi, David Miller Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, mcarlson, randy.dunlap, jagana, general, mchan, tgraf, jeff, sri, shemminger, kaber > If net_device_subqueue is visible from both driver and core > scheduler area (couldnt tell from looking at whats in there > already), then that'll do it. Yes, I use the net_device_subqueue structs (the state variable in there) in the prio and rr qdiscs right now. It's an indexed list at the very end of struct netdevice. -PJ Waskiewicz ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 20:48 ` jamal 2007-10-08 21:26 ` [ofa-general] " David Miller @ 2007-10-08 22:33 ` Waskiewicz Jr, Peter P 2007-10-08 23:40 ` jamal 2007-10-09 10:58 ` [ofa-general] " Krishna Kumar2 1 sibling, 2 replies; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-08 22:33 UTC (permalink / raw) To: hadi Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, mcarlson, kaber, randy.dunlap, jagana, general, mchan, tgraf, jeff, sri, shemminger, David Miller > true, that needs some resolution. Heres a hand-waving thought: > Assuming all packets of a specific map end up in the same > qdiscn queue, it seems feasible to ask the qdisc scheduler to > give us enough packages (ive seen people use that terms to > refer to packets) for each hardware ring's available space. > With the patches i posted, i do that via > dev->xmit_win that assumes only one view of the driver; essentially a > single ring. > If that is doable, then it is up to the driver to say "i have > space for 5 in ring[0], 10 in ring[1] 0 in ring[2]" based on > what scheduling scheme the driver implements - the dev->blist > can stay the same. Its a handwave, so there may be issues > there and there could be better ways to handle this. > > Note: The other issue that needs resolving that i raised > earlier was in regards to multiqueue running on multiple cpus > servicing different rings concurently. I can see the qdisc being modified to send batches per queue_mapping. This shouldn't be too difficult, and if we had the xmit_win per queue (in the subqueue struct like Dave pointed out). Addressing your note/issue with different rings being services concurrently: I'd like to remove the QDISC_RUNNING bit from the global device; with Tx multiqueue, this bit should be set on each queue (if at all), allowing multiple Tx rings to be loaded simultaneously. The biggest issue today with the multiqueue implementation is the global queue_lock. 
I see it being a hot source of contention in my testing; my setup is a 8-core machine (dual quad-core procs) with a 10GbE NIC, using 8 Tx and 8 Rx queues. On transmit, when loading all 8 queues, the enqueue/dequeue are hitting that lock quite a bit for the whole device. I really think that the queue_lock should join the queue_state, so the device no longer manages the top-level state (since we're operating per-queue instead of per-device). > It's the core that does that, not the driver; the driver > continues to use ->hard_start_xmit() (albeit modified one). > The idea is not to have many new interfaces. I'll look closer at this, since I think I confused myself. > Isnt multiqueue mainline already? Well, it's in 2.6.23-rc*. I imagine it won't see much action though until 2.6.24, since people will be porting drivers during that time. Plus having the native Rx multiqueue w/NAPI code in 2.6.24 makes sense to have Tx multiqueue at that time. > The point behind batching is to reduce the cost of the locks > by amortizing across the locks. Even better if one can, they > should get rid of locks. Remind me, why do you need the > per-queuemap lock? And is it needed from the enqueuing side > too? Maybe lets start there to help me understand things? The multiqueue implementation today enforces the number of qdisc bands (RR or PRIO) to be equal to the number of Tx rings your hardware/driver is supporting. Therefore, the queue_lock and queue_state in the kernel directly relate to the qdisc band management. If the queue stops from the driver, then the qdisc won't try to dequeue from the band. What I'm working on is to move the lock there too, so I can lock the queue when I enqueue (protect the band from multiple sources modifying the skb chain), and lock it when I dequeue. This is purely for concurrency of adding/popping skb's from the qdisc queues. Right now, we take the whole global lock to add and remove skb's. 
This is the next logical step for separating the queue dependancy on each other. Please let me know if this doesn't make sense, or if you have any questions at all about my reasoning. I agree that this is where we should be on the same page before moving onto anything else in this discussion. :) > Sure, that is doable if the driver can set a per > queue_mapping xmit_win and the qdisc can be taught to say > "give me packets for queue_mapping X" Yes, I like this idea very much. Do that, modify the qdisc to send in chunks from a queue, and the problem should be solved. I will try and find some additional cycles to get my patches completely working, and send them. It'd be easier I think to see what's going on if I did that. I'll also try to make them work with the ideas of xmit_win per queue and batched queue qdisc sends. Stay tuned... Thanks Jamal, -PJ Waskiewicz <peter.p.waskiewicz.jr@intel.com> ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 22:33 ` Waskiewicz Jr, Peter P @ 2007-10-08 23:40 ` jamal 2007-10-09 1:13 ` Jeff Garzik 2007-10-09 1:31 ` Jeff Garzik 2007-10-09 10:58 ` [ofa-general] " Krishna Kumar2 1 sibling, 2 replies; 79+ messages in thread From: jamal @ 2007-10-08 23:40 UTC (permalink / raw) To: Waskiewicz Jr, Peter P Cc: David Miller, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, jeff, mchan, general, tgraf, randy.dunlap, sri On Mon, 2007-08-10 at 15:33 -0700, Waskiewicz Jr, Peter P wrote: > Addressing your note/issue with different rings being services > concurrently: I'd like to remove the QDISC_RUNNING bit from the global The challenge to deal with is that netdevices, filters, the queues and scheduler are closely inter-twined. So it is not just the scheduling region and QDISC_RUNNING. For example, lets pick just the filters because they are simple to see: You need to attach them to something - whatever that is, you then need to synchronize against config and multiple cpus trying to use them. You could: a) replicate them across cpus and only lock on config, but you are wasting RAM then b) attach them to rings instead of netdevices - but that makes me wonder if those subqueues are now going to become netdevices. This also means you change all user space interfaces to know about subqueues. If you recall this was a major contention in our earlier discussion. > device; with Tx multiqueue, this bit should be set on each queue (if at > all), allowing multiple Tx rings to be loaded simultaneously. This is the issue i raised - refer to Dave's wording of it. If you run access to the rings simultenously you may not be able to guarantee any ordering or proper qos in contention for wire-resources (think strict prio in hardware) - as long as you have the qdisc area. You may actually get away with it with something like DRR. 
You could totaly bypass the qdisc region and go to the driver directly and let it worry about the scheduling but youd have to make the qdisc area a "passthrough" while providing the illusion to user space that all is as before. > The > biggest issue today with the multiqueue implementation is the global > queue_lock. I see it being a hot source of contention in my testing; my > setup is a 8-core machine (dual quad-core procs) with a 10GbE NIC, using > 8 Tx and 8 Rx queues. On transmit, when loading all 8 queues, the > enqueue/dequeue are hitting that lock quite a bit for the whole device. Yes, the queuelock is expensive; in your case if all 8 hardware threads are contending for that one device, you will suffer. The txlock on the other hand is not that expensive since the contention is for a max of 2 cpus (tx and rx softirq). I tried to use that fact in the batching to move things that i processed under queue lock into the area for txlock. I'd be very interested in some results on such a piece of hardware with the 10G nic to see if these theories make any sense. > I really think that the queue_lock should join the queue_state, so the > device no longer manages the top-level state (since we're operating > per-queue instead of per-device). Refer to above. > > The multiqueue implementation today enforces the number of qdisc bands > (RR or PRIO) to be equal to the number of Tx rings your hardware/driver > is supporting. Therefore, the queue_lock and queue_state in the kernel > directly relate to the qdisc band management. If the queue stops from > the driver, then the qdisc won't try to dequeue from the band. Good start. > What I'm > working on is to move the lock there too, so I can lock the queue when I > enqueue (protect the band from multiple sources modifying the skb > chain), and lock it when I dequeue. This is purely for concurrency of > adding/popping skb's from the qdisc queues. Ok, so the "concurency" aspect is what worries me. 
What i am saying is that sooner or later you have to serialize (which is anti-concurency) For example, consider CPU0 running a high prio queue and CPU1 running the low prio queue of the same netdevice. Assume CPU0 is getting a lot of interupts or other work while CPU1 doesnt (so as to create a condition that CPU1 is slower). Then as long as there packets and there is space on the drivers rings, CPU1 will send more packets per unit time than CPU0. This contradicts the strict prio scheduler which says higher priority packets ALWAYS go out first regardless of the presence of low prio packets. I am not sure i made sense. cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 23:40 ` jamal @ 2007-10-09 1:13 ` Jeff Garzik 2007-10-09 1:41 ` [ofa-general] " David Miller 2007-10-09 1:31 ` Jeff Garzik 1 sibling, 1 reply; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 1:13 UTC (permalink / raw) To: hadi Cc: Waskiewicz Jr, Peter P, David Miller, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri jamal wrote: > Ok, so the "concurency" aspect is what worries me. What i am saying is > that sooner or later you have to serialize (which is anti-concurency) > For example, consider CPU0 running a high prio queue and CPU1 running > the low prio queue of the same netdevice. > Assume CPU0 is getting a lot of interupts or other work while CPU1 > doesnt (so as to create a condition that CPU1 is slower). Then as long > as there packets and there is space on the drivers rings, CPU1 will send > more packets per unit time than CPU0. > This contradicts the strict prio scheduler which says higher priority > packets ALWAYS go out first regardless of the presence of low prio > packets. I am not sure i made sense. You made sense. I think it is important to note simply that the packet scheduling algorithm itself will dictate the level of concurrency you can achieve. Strict prio is fundamentally an interface to a big imaginary queue, with multiple packet insertion points (the individual bands/rings for each prio band). If you assume a scheduler implementation where each prio band is mapped to a separate CPU, you can certainly see where some CPUs could be substantially idle while others are overloaded, largely depending on the data workload (and priority contained within). 
Moreover, you increase L1/L2 cache traffic, not just because of locks, but because of data dependencies: user prio packet NIC TX ring process band scheduler cpu7 1 cpu1 1 cpu5 1 cpu1 1 cpu2 0 cpu0 0 At that point, it is probably more cache- and lock-friendly to keep the current TX softirq scheme. In contrast, a pure round-robin approach is more friendly to concurrency. Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 1:13 ` Jeff Garzik @ 2007-10-09 1:41 ` David Miller 2007-10-09 2:01 ` Herbert Xu ` (3 more replies) 0 siblings, 4 replies; 79+ messages in thread From: David Miller @ 2007-10-09 1:41 UTC (permalink / raw) To: jeff Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber From: Jeff Garzik <jeff@garzik.org> Date: Mon, 08 Oct 2007 21:13:59 -0400 > If you assume a scheduler implementation where each prio band is mapped > to a separate CPU, you can certainly see where some CPUs could be > substantially idle while others are overloaded, largely depending on the > data workload (and priority contained within). Right, which is why Peter added the prio DRR scheduler stuff for TX multiqueue (see net/sched/sch_prio.c:rr_qdisc_ops) because this is what the chips do. But this doesn't get us to where we want to be as Peter has been explaining a bit these past few days. Ok, we're talking a lot but not pouring much concrete, let's start doing that. I propose: 1) A library for transmit load balancing functions, with an interface that can be made visible to userspace. I can write this and test it on real multiqueue hardware. The whole purpose of this library is to set skb->queue_mapping based upon the load balancing function. Facilities will be added to handle virtualization port selection based upon destination MAC address as one of the "load balancing" methods. 2) Switch the default qdisc away from pfifo_fast to a new DRR fifo with load balancing using the code in #1. I think this is kind of in the territory of what Peter said he is working on. I know this is controversial, but realistically I doubt users benefit at all from the prioritization that pfifo provides. They will, on the other hand, benefit from TX queue load balancing on fast interfaces. 
3) Work on discovering a way to make the locking on transmit as localized to the current thread of execution as possible. Things like RCU and statistic replication, techniques we use widely elsewhere in the stack, begin to come to mind. I also want to point out another issue. Any argument wrt. reordering is specious at best because right now reordering from qdisc to device happens anyways. And that's because we drop the qdisc lock first, then we grab the transmit lock on the device and submit the packet. So, after we drop the qdisc lock, another cpu can get the qdisc lock, get the next packet (perhaps a lower priority one) and then sneak in to get the device transmit lock before the first thread can, and thus the packets will be submitted out of order. This, along with other things, makes me believe that ordering really doesn't matter in practice. And therefore, in practice, we can treat everything from the qdisc to the real hardware as a FIFO even if something else is going on inside the black box which might reorder packets on the wire. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 1:41 ` [ofa-general] " David Miller @ 2007-10-09 2:01 ` Herbert Xu 2007-10-09 2:03 ` Herbert Xu 2007-10-09 2:12 ` [ofa-general] " Jeff Garzik ` (2 subsequent siblings) 3 siblings, 1 reply; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:01 UTC (permalink / raw) To: David Miller Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri On Mon, Oct 08, 2007 at 06:41:26PM -0700, David Miller wrote: > > I also want to point out another issue. Any argument wrt. reordering > is specious at best because right now reordering from qdisc to device > happens anyways. This is not true. If your device has a qdisc at all, then you will end up in the function qdisc_restart, where we release the queue lock only after acquiring the TX lock. So right now this path does not create any reordering. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:01 ` Herbert Xu @ 2007-10-09 2:03 ` Herbert Xu 2007-10-09 2:04 ` Herbert Xu 2007-10-09 2:43 ` David Miller 0 siblings, 2 replies; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:03 UTC (permalink / raw) To: David Miller Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri On Tue, Oct 09, 2007 at 10:01:15AM +0800, Herbert Xu wrote: > On Mon, Oct 08, 2007 at 06:41:26PM -0700, David Miller wrote: > > > > I also want to point out another issue. Any argument wrt. reordering > > is specious at best because right now reordering from qdisc to device > > happens anyways. > > This is not true. > > If your device has a qdisc at all, then you will end up in the > function qdisc_restart, where we release the queue lock only > after acquiring the TX lock. > > So right now this path does not create any reordering. Argh! Someone's just broken this. I think we should restore the original behaviour. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:03 ` Herbert Xu @ 2007-10-09 2:04 ` Herbert Xu 2007-10-09 2:15 ` jamal 2007-10-09 2:45 ` [ofa-general] " David Miller 2007-10-09 2:43 ` David Miller 1 sibling, 2 replies; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:04 UTC (permalink / raw) To: David Miller Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri On Tue, Oct 09, 2007 at 10:03:18AM +0800, Herbert Xu wrote: > On Tue, Oct 09, 2007 at 10:01:15AM +0800, Herbert Xu wrote: > > On Mon, Oct 08, 2007 at 06:41:26PM -0700, David Miller wrote: > > > > > > I also want to point out another issue. Any argument wrt. reordering > > > is specious at best because right now reordering from qdisc to device > > > happens anyways. > > > > This is not true. > > > > If your device has a qdisc at all, then you will end up in the > > function qdisc_restart, where we release the queue lock only > > after acquiring the TX lock. > > > > So right now this path does not create any reordering. > > Argh! Someone's just broken this. I think we should restore > the original behaviour. Please revert commit 41843197b17bdfb1f97af0a87c06d24c1620ba90 Author: Jamal Hadi Salim <hadi@cyberus.ca> Date: Tue Sep 25 19:27:13 2007 -0700 [NET_SCHED]: explict hold dev tx lock As this change introduces potential reordering and I don't think we've discussed this aspect sufficiently. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:04 ` Herbert Xu @ 2007-10-09 2:15 ` jamal 2007-10-09 2:16 ` Herbert Xu 2007-10-09 2:45 ` [ofa-general] " David Miller 1 sibling, 1 reply; 79+ messages in thread From: jamal @ 2007-10-09 2:15 UTC (permalink / raw) To: Herbert Xu Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, kaber, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, David Miller, sri On Tue, 2007-09-10 at 10:04 +0800, Herbert Xu wrote: > Please revert > > commit 41843197b17bdfb1f97af0a87c06d24c1620ba90 > Author: Jamal Hadi Salim <hadi@cyberus.ca> > Date: Tue Sep 25 19:27:13 2007 -0700 > > [NET_SCHED]: explict hold dev tx lock > > As this change introduces potential reordering and I don't think > we've discussed this aspect sufficiently. How does it introduce reordering? cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:15 ` jamal @ 2007-10-09 2:16 ` Herbert Xu 2007-10-09 2:19 ` [ofa-general] " jamal 0 siblings, 1 reply; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:16 UTC (permalink / raw) To: jamal Cc: David Miller, jeff, peter.p.waskiewicz.jr, krkumar2, johnpol, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri On Mon, Oct 08, 2007 at 10:15:49PM -0400, jamal wrote: > On Tue, 2007-09-10 at 10:04 +0800, Herbert Xu wrote: > > > Please revert > > > > commit 41843197b17bdfb1f97af0a87c06d24c1620ba90 > > Author: Jamal Hadi Salim <hadi@cyberus.ca> > > Date: Tue Sep 25 19:27:13 2007 -0700 > > > > [NET_SCHED]: explict hold dev tx lock > > > > As this change introduces potential reordering and I don't think > > we've discussed this aspect sufficiently. > > How does it introduce reordering? No it doesn't. I'd forgotten about the QDISC_RUNNING bit :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:16 ` Herbert Xu @ 2007-10-09 2:19 ` jamal 2007-10-09 2:20 ` Herbert Xu 0 siblings, 1 reply; 79+ messages in thread From: jamal @ 2007-10-09 2:19 UTC (permalink / raw) To: Herbert Xu Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, kaber, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, David Miller, sri On Tue, 2007-09-10 at 10:16 +0800, Herbert Xu wrote: > > No it doesn't. I'd forgotten about the QDISC_RUNNING bit :) You should know better, you wrote it and I've been going insane trying to break it for at least a year now ;-> cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:19 ` [ofa-general] " jamal @ 2007-10-09 2:20 ` Herbert Xu 0 siblings, 0 replies; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:20 UTC (permalink / raw) To: jamal Cc: David Miller, jeff, peter.p.waskiewicz.jr, krkumar2, johnpol, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri On Mon, Oct 08, 2007 at 10:19:02PM -0400, jamal wrote: > On Tue, 2007-09-10 at 10:16 +0800, Herbert Xu wrote: > > > > > No it doesn't. I'd forgotten about the QDISC_RUNNING bit :) > > You should not better, you wrote it and ive been going insane trying to > break for at least a year now ;-> Well you've broken me at least :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:04 ` Herbert Xu 2007-10-09 2:15 ` jamal @ 2007-10-09 2:45 ` David Miller 1 sibling, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-09 2:45 UTC (permalink / raw) To: herbert Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 9 Oct 2007 10:04:42 +0800 > On Tue, Oct 09, 2007 at 10:03:18AM +0800, Herbert Xu wrote: > > On Tue, Oct 09, 2007 at 10:01:15AM +0800, Herbert Xu wrote: > > > On Mon, Oct 08, 2007 at 06:41:26PM -0700, David Miller wrote: > > > > > > > > I also want to point out another issue. Any argument wrt. reordering > > > > is specious at best because right now reordering from qdisc to device > > > > happens anyways. > > > > > > This is not true. > > > > > > If your device has a qdisc at all, then you will end up in the > > > function qdisc_restart, where we release the queue lock only > > > after acquiring the TX lock. > > > > > > So right now this path does not create any reordering. > > > > Argh! Someone's just broken this. I think we should restore > > the original behaviour. > > Please revert > > commit 41843197b17bdfb1f97af0a87c06d24c1620ba90 > Author: Jamal Hadi Salim <hadi@cyberus.ca> > Date: Tue Sep 25 19:27:13 2007 -0700 > > [NET_SCHED]: explict hold dev tx lock > > As this change introduces potential reordering and I don't think > we've discussed this aspect sufficiently. Agreed, and done. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:03 ` Herbert Xu 2007-10-09 2:04 ` Herbert Xu @ 2007-10-09 2:43 ` David Miller 2007-10-09 2:46 ` Herbert Xu 1 sibling, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-09 2:43 UTC (permalink / raw) To: herbert Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 9 Oct 2007 10:03:18 +0800 > On Tue, Oct 09, 2007 at 10:01:15AM +0800, Herbert Xu wrote: > > On Mon, Oct 08, 2007 at 06:41:26PM -0700, David Miller wrote: > > > > > > I also want to point out another issue. Any argument wrt. reordering > > > is specious at best because right now reordering from qdisc to device > > > happens anyways. > > > > This is not true. > > > > If your device has a qdisc at all, then you will end up in the > > function qdisc_restart, where we release the queue lock only > > after acquiring the TX lock. > > > > So right now this path does not create any reordering. > > Argh! Someone's just broken this. I think we should restore > the original behaviour. Right, that's Jamal's recent patch. It looked funny to me too. I think we can't make this change, the acquisition of the device transmit lock before we release the qdisc is the only thing that prevents reordering between qdisc and device. Otherwise all of the prioritization is pretty much for nothing as I described in another email today. Jamal, I'm pretty sure we have to revert this, you can't change the locking in this way. commit 41843197b17bdfb1f97af0a87c06d24c1620ba90 Author: Jamal Hadi Salim <hadi@cyberus.ca> Date: Tue Sep 25 19:27:13 2007 -0700 [NET_SCHED]: explict hold dev tx lock For N cpus, with full throttle traffic on all N CPUs, funneling traffic to the same ethernet device, the devices queue lock is contended by all N CPUs constantly. 
The TX lock is only contended by a max of 2 CPUS. In the current mode of operation, after all the work of entering the dequeue region, we may endup aborting the path if we are unable to get the tx lock and go back to contend for the queue lock. As N goes up, this gets worse. The changes in this patch result in a small increase in performance with a 4CPU (2xdual-core) with no irq binding. Both e1000 and tg3 showed similar behavior; Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca> Signed-off-by: David S. Miller <davem@davemloft.net> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e970e8e..95ae119 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -134,34 +134,19 @@ static inline int qdisc_restart(struct net_device *dev) { struct Qdisc *q = dev->qdisc; struct sk_buff *skb; - unsigned lockless; int ret; /* Dequeue packet */ if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL)) return 0; - /* - * When the driver has LLTX set, it does its own locking in - * start_xmit. These checks are worth it because even uncongested - * locks can be quite expensive. The driver can do a trylock, as - * is being done here; in case of lock contention it should return - * NETDEV_TX_LOCKED and the packet will be requeued. - */ - lockless = (dev->features & NETIF_F_LLTX); - - if (!lockless && !netif_tx_trylock(dev)) { - /* Another CPU grabbed the driver tx lock */ - return handle_dev_cpu_collision(skb, dev, q); - } /* And release queue */ spin_unlock(&dev->queue_lock); + HARD_TX_LOCK(dev, smp_processor_id()); ret = dev_hard_start_xmit(skb, dev); - - if (!lockless) - netif_tx_unlock(dev); + HARD_TX_UNLOCK(dev); spin_lock(&dev->queue_lock); q = dev->qdisc; ^ permalink raw reply related [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:43 ` David Miller @ 2007-10-09 2:46 ` Herbert Xu 0 siblings, 0 replies; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:46 UTC (permalink / raw) To: David Miller Cc: jeff, hadi, peter.p.waskiewicz.jr, krkumar2, johnpol, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri On Mon, Oct 08, 2007 at 07:43:43PM -0700, David Miller wrote: > > Right, that's Jamal's recent patch. It looked funny to me too. Hang on Dave. It was too early in the morning for me :) I'd forgotten about the QDISC_RUNNING bit which did what the queue lock did without actually holding the queue lock. So there is no reordering with or without Jamal's patch. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 1:41 ` [ofa-general] " David Miller 2007-10-09 2:01 ` Herbert Xu @ 2007-10-09 2:12 ` Jeff Garzik 2007-10-09 2:46 ` David Miller 2007-10-09 18:48 ` [ofa-general] " Waskiewicz Jr, Peter P 2007-10-09 2:14 ` [ofa-general] " jamal 2007-10-09 16:51 ` Andi Kleen 3 siblings, 2 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 2:12 UTC (permalink / raw) To: David Miller Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber David Miller wrote: > 1) A library for transmit load balancing functions, with an interface > that can be made visible to userspace. I can write this and test > it on real multiqueue hardware. > > The whole purpose of this library is to set skb->queue_mapping > based upon the load balancing function. > > Facilities will be added to handle virtualization port selection > based upon destination MAC address as one of the "load balancing" > methods. Groovy. I'm interested in working on a load balancer function that approximates skb->queue_mapping = smp_processor_id() I'd be happy to code and test in that direction, based on your lib. > 2) Switch the default qdisc away from pfifo_fast to a new DRR fifo > with load balancing using the code in #1. I think this is kind > of in the territory of what Peter said he is working on. > > I know this is controversial, but realistically I doubt users > benefit at all from the prioritization that pfifo provides. They > will, on the other hand, benefit from TX queue load balancing on > fast interfaces. IMO the net driver really should provide a hint as to what it wants. 8139cp and tg3 would probably prefer multiple TX queue behavior to match silicon behavior -- strict prio. And I'll volunteer to write the net driver code for that, if people want to see how things would look for that type of hardware packet scheduling. 
> 3) Work on discovering a way to make the locking on transmit as > localized to the current thread of execution as possible. Things > like RCU and statistic replication, techniques we use widely > elsewhere in the stack, begin to come to mind. Definitely. Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:12 ` [ofa-general] " Jeff Garzik @ 2007-10-09 2:46 ` David Miller 2007-10-09 18:48 ` [ofa-general] " Waskiewicz Jr, Peter P 1 sibling, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-09 2:46 UTC (permalink / raw) To: jeff Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber From: Jeff Garzik <jeff@garzik.org> Date: Mon, 08 Oct 2007 22:12:03 -0400 > I'm interested in working on a load balancer function that approximates > > skb->queue_mapping = smp_processor_id() > > I'd be happy to code and test in that direction, based on your lib. It's the second algorithm that will be available :-) Just add a "% num_tx_queues" to the result. > IMO the net driver really should provide a hint as to what it wants. > > 8139cp and tg3 would probably prefer multiple TX queue behavior to match > silicon behavior -- strict prio. > > And I'll volunteer to write the net driver code for that, if people want > to see how things would look for that type of hardware packet scheduling. Ok. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:12 ` [ofa-general] " Jeff Garzik 2007-10-09 2:46 ` David Miller @ 2007-10-09 18:48 ` Waskiewicz Jr, Peter P 2007-10-09 19:04 ` Jeff Garzik 1 sibling, 1 reply; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-09 18:48 UTC (permalink / raw) To: Jeff Garzik, David Miller Cc: johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber > IMO the net driver really should provide a hint as to what it wants. > > 8139cp and tg3 would probably prefer multiple TX queue > behavior to match silicon behavior -- strict prio. If I understand what you just said, I disagree. If your hardware is running strict prio, you don't want to enforce strict prio in the qdisc layer; performing two layers of QoS is excessive, and may lead to results you don't want. The reason I added the DRR qdisc is for the Si that has its own queueing strategy that is not RR. For Si that implements RR (like e1000), you can either use the DRR qdisc, or if you want to prioritize your flows, use PRIO. -PJ Waskiewicz peter.p.waskiewicz.jr@intel.com ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 18:48 ` [ofa-general] " Waskiewicz Jr, Peter P @ 2007-10-09 19:04 ` Jeff Garzik 2007-10-09 19:07 ` Waskiewicz Jr, Peter P 0 siblings, 1 reply; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 19:04 UTC (permalink / raw) To: Waskiewicz Jr, Peter P Cc: David Miller, hadi, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri Waskiewicz Jr, Peter P wrote: >> IMO the net driver really should provide a hint as to what it wants. >> >> 8139cp and tg3 would probably prefer multiple TX queue >> behavior to match silicon behavior -- strict prio. > > If I understand what you just said, I disagree. If your hardware is > running strict prio, you don't want to enforce strict prio in the qdisc > layer; performing two layers of QoS is excessive, and may lead to > results you don't want. The reason I added the DRR qdisc is for the Si > that has its own queueing strategy that is not RR. For Si that > implements RR (like e1000), you can either use the DRR qdisc, or if you > want to prioritize your flows, use PRIO. A misunderstanding, I think. To my brain, DaveM's item #2 seemed to assume/require the NIC hardware to balance fairly across hw TX rings, which seemed to preclude the 8139cp/tg3 style of strict-prio hardware. That's what I was responding to. As long as there is some modular way to fit 8139cp/tg3 style multi-TX into our universe, I'm happy :) Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 19:04 ` Jeff Garzik @ 2007-10-09 19:07 ` Waskiewicz Jr, Peter P 0 siblings, 0 replies; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-09 19:07 UTC (permalink / raw) To: Jeff Garzik Cc: David Miller, hadi, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri > A misunderstanding, I think. > > To my brain, DaveM's item #2 seemed to assume/require the NIC > hardware to balance fairly across hw TX rings, which seemed > to preclude the > 8139cp/tg3 style of strict-prio hardware. That's what I was > responding to. > > As long as there is some modular way to fit 8139cp/tg3 style > multi-TX into our universe, I'm happy :) Ah hah. Yes, a misunderstanding on my part. Thanks for the clarification. Methinks more caffeine is required for today... -PJ ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 1:41 ` [ofa-general] " David Miller 2007-10-09 2:01 ` Herbert Xu 2007-10-09 2:12 ` [ofa-general] " Jeff Garzik @ 2007-10-09 2:14 ` jamal 2007-10-09 2:16 ` Herbert Xu 2007-10-09 16:51 ` Andi Kleen 3 siblings, 1 reply; 79+ messages in thread From: jamal @ 2007-10-09 2:14 UTC (permalink / raw) To: David Miller Cc: johnpol, herbert, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber On Mon, 2007-08-10 at 18:41 -0700, David Miller wrote: > I also want to point out another issue. Any argument wrt. reordering > is specious at best because right now reordering from qdisc to device > happens anyways. > > And that's because we drop the qdisc lock first, then we grab the > transmit lock on the device and submit the packet. So, after we > drop the qdisc lock, another cpu can get the qdisc lock, get the > next packet (perhaps a lower priority one) and then sneak in to > get the device transmit lock before the first thread can, and > thus the packets will be submitted out of order. > You forgot QDISC_RUNNING Dave;-> the above cant happen. Essentially at any one point in time, we are guaranteed that we can have multiple cpus enqueueing but only one can be dequeueing (the one that managed to grab QDISC_RUNNING) i.e multiple producers to the qdisc queue but only one consumer. Only the dequeuer has access to the txlock. > This, along with other things, makes me believe that ordering really > doesn't matter in practice. And therefore, in practice, we can treat > everything from the qdisc to the real hardware as a FIFO even if > something else is going on inside the black box which might reorder > packets on the wire. I think it is important to get the scheduling right - estimations can be a last resort. 
For example, if I have voip competing for the wire with ftp on two different rings/cpus and I specified that voip should be more important, I may consider equipment faulty if it works "most of the time" (when ftp is not clogging the wire) and at times I am asked to repeat what I just said. cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:14 ` [ofa-general] " jamal @ 2007-10-09 2:16 ` Herbert Xu 2007-10-09 2:47 ` [ofa-general] " David Miller 0 siblings, 1 reply; 79+ messages in thread From: Herbert Xu @ 2007-10-09 2:16 UTC (permalink / raw) To: jamal Cc: David Miller, jeff, peter.p.waskiewicz.jr, krkumar2, johnpol, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri On Mon, Oct 08, 2007 at 10:14:30PM -0400, jamal wrote: > > You forgot QDISC_RUNNING Dave;-> the above cant happen. > Essentially at any one point in time, we are guaranteed that we can have > multiple cpus enqueueing but only can be dequeueing (the one that > managed to grab QDISC_RUNNING) i.e multiple producers to the qdisc queue > but only one consumer. Only the dequeuer has access to the txlock. Good point. You had me worried for a sec :) Dave, Jamal's patch is fine as it is and doesn't actually create any packet reordering. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 2:16 ` Herbert Xu @ 2007-10-09 2:47 ` David Miller 0 siblings, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-09 2:47 UTC (permalink / raw) To: herbert Cc: johnpol, jeff, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, jagana, general, mchan, tgraf, randy.dunlap, shemminger, kaber, sri From: Herbert Xu <herbert@gondor.apana.org.au> Date: Tue, 9 Oct 2007 10:16:20 +0800 > On Mon, Oct 08, 2007 at 10:14:30PM -0400, jamal wrote: > > > > You forgot QDISC_RUNNING Dave;-> the above cant happen. > > Essentially at any one point in time, we are guaranteed that we can have > > multiple cpus enqueueing but only can be dequeueing (the one that > > managed to grab QDISC_RUNNING) i.e multiple producers to the qdisc queue > > but only one consumer. Only the dequeuer has access to the txlock. > > Good point. You had me worried for a sec :) > > Dave, Jamal's patch is fine as it is and doesn't actually create > any packet reordering. Ok, then, I'll un-revert. :-) ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 1:41 ` [ofa-general] " David Miller ` (2 preceding siblings ...) 2007-10-09 2:14 ` [ofa-general] " jamal @ 2007-10-09 16:51 ` Andi Kleen 2007-10-09 18:22 ` Stephen Hemminger 2007-10-09 20:43 ` David Miller 3 siblings, 2 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-09 16:51 UTC (permalink / raw) To: David Miller Cc: jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber David Miller <davem@davemloft.net> writes: > > 2) Switch the default qdisc away from pfifo_fast to a new DRR fifo > with load balancing using the code in #1. I think this is kind > of in the territory of what Peter said he is working on. Hopefully that new qdisc will just use the TX rings of the hardware directly. They are typically large enough these days. That might avoid some locking in this critical path. > I know this is controversial, but realistically I doubt users > benefit at all from the prioritization that pfifo provides. I agree. For most interfaces the priority is probably dubious. Even for DSL the prioritization will be likely usually done in a router these days. Also for the fast interfaces where we do TSO priority doesn't work very well anyways -- with large packets there is not too much to prioritize. > 3) Work on discovering a way to make the locking on transmit as > localized to the current thread of execution as possible. Things > like RCU and statistic replication, techniques we use widely > elsewhere in the stack, begin to come to mind. If the data is just passed on to the hardware queue, why is any locking needed at all? (except for the driver locking of course) -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 16:51 ` Andi Kleen @ 2007-10-09 18:22 ` Stephen Hemminger 2007-10-09 18:30 ` Andi Kleen 2007-10-09 20:43 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Stephen Hemminger @ 2007-10-09 18:22 UTC (permalink / raw) To: Andi Kleen Cc: johnpol, Robert.Olsson, jeff, gaagaan, kaber, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, David Miller, herbert On 09 Oct 2007 18:51:51 +0200 Andi Kleen <andi@firstfloor.org> wrote: > David Miller <davem@davemloft.net> writes: > > > > 2) Switch the default qdisc away from pfifo_fast to a new DRR fifo > > with load balancing using the code in #1. I think this is kind > > of in the territory of what Peter said he is working on. > > Hopefully that new qdisc will just use the TX rings of the hardware > directly. They are typically large enough these days. That might avoid > some locking in this critical path. > > > I know this is controversial, but realistically I doubt users > > benefit at all from the prioritization that pfifo provides. > > I agree. For most interfaces the priority is probably dubious. > Even for DSL the prioritization will be likely usually done in a router > these days. > > Also for the fast interfaces where we do TSO priority doesn't work > very well anyways -- with large packets there is not too much > to prioritize. > > > 3) Work on discovering a way to make the locking on transmit as > > localized to the current thread of execution as possible. Things > > like RCU and statistic replication, techniques we use widely > > elsewhere in the stack, begin to come to mind. > > If the data is just passed on to the hardware queue, why is any > locking needed at all? (except for the driver locking of course) > > -Andi I wonder about the whole idea of queueing in general at such high speeds. 
Given the normal bi-modal distribution of packets, and the predominance of 1500 byte MTU; does it make sense to even have any queueing in software at all? -- Stephen Hemminger <shemminger@linux-foundation.org> ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 18:22 ` Stephen Hemminger @ 2007-10-09 18:30 ` Andi Kleen 0 siblings, 0 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-09 18:30 UTC (permalink / raw) To: Stephen Hemminger Cc: Andi Kleen, David Miller, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > I wonder about the whole idea of queueing in general at such high speeds. > Given the normal bi-modal distribution of packets, and the predominance > of 1500 byte MTU; does it make sense to even have any queueing in software > at all? Yes that is my point -- it should just pass it through directly and the driver can then put it into the different per CPU (or per whatever) queues managed by the hardware. The only thing the qdisc needs to do is to set some bit that says "it is ok to put this into different queues; don't need strict ordering" Otherwise if the drivers did that unconditionally they might cause problems with other qdiscs. This would also require that the driver exports some hint to the upper layer on how large its internal queues are. A device with a short queue would still require pfifo_fast. Long queue devices could just pass through. That again could be a single flag. -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 16:51 ` Andi Kleen 2007-10-09 18:22 ` Stephen Hemminger @ 2007-10-09 20:43 ` David Miller 2007-10-09 20:53 ` Stephen Hemminger 2007-10-11 6:52 ` Krishna Kumar2 1 sibling, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-09 20:43 UTC (permalink / raw) To: andi Cc: johnpol, Robert.Olsson, herbert, gaagaan, shemminger, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jeff, general, mchan, tgraf, randy.dunlap, jagana, kaber, sri From: Andi Kleen <andi@firstfloor.org> Date: 09 Oct 2007 18:51:51 +0200 > Hopefully that new qdisc will just use the TX rings of the hardware > directly. They are typically large enough these days. That might avoid > some locking in this critical path. Indeed, I also realized last night that for the default qdiscs we do a lot of stupid useless work. If the queue is a FIFO and the device can take packets, we should send it directly and not stick it into the qdisc at all. > If the data is just passed on to the hardware queue, why is any > locking needed at all? (except for the driver locking of course) Absolutely. Our packet scheduler subsystem is great, but by default it should just get out of the way. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:43 ` David Miller @ 2007-10-09 20:53 ` Stephen Hemminger 2007-10-09 21:22 ` David Miller 2007-10-11 6:52 ` Krishna Kumar2 1 sibling, 1 reply; 79+ messages in thread From: Stephen Hemminger @ 2007-10-09 20:53 UTC (permalink / raw) To: David Miller Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, andi, general, netdev, tgraf, randy.dunlap, jagana, kaber, mchan, sri On Tue, 09 Oct 2007 13:43:31 -0700 (PDT) David Miller <davem@davemloft.net> wrote: > From: Andi Kleen <andi@firstfloor.org> > Date: 09 Oct 2007 18:51:51 +0200 > > > Hopefully that new qdisc will just use the TX rings of the hardware > > directly. They are typically large enough these days. That might avoid > > some locking in this critical path. > > Indeed, I also realized last night that for the default qdiscs > we do a lot of stupid useless work. If the queue is a FIFO > and the device can take packets, we should send it directly > and not stick it into the qdisc at all. > > > If the data is just passed on to the hardware queue, why is any > > locking needed at all? (except for the driver locking of course) > > Absolutely. > > Our packet scheduler subsystem is great, but by default it should just > get out of the way. I was thinking why not have a default transmit queue len of 0 like the virtual devices. -- Stephen Hemminger <shemminger@linux-foundation.org> ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:53 ` Stephen Hemminger @ 2007-10-09 21:22 ` David Miller 2007-10-09 21:56 ` jamal 0 siblings, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-09 21:22 UTC (permalink / raw) To: shemminger Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, andi, general, netdev, tgraf, randy.dunlap, jagana, kaber, mchan, sri From: Stephen Hemminger <shemminger@linux-foundation.org> Date: Tue, 9 Oct 2007 13:53:40 -0700 > I was thinking why not have a default transmit queue len of 0 like > the virtual devices. I'm not so sure. Even if the device has "huge queues" I still think we need a software queue for when the hardware one backs up. It is even beneficial to stick with reasonably sized TX queues because it keeps the total resident state accessed by the CPU within the bounds of the L2 cache. If you go past that it actually hurts to make the TX queue larger instead of helps even if it means you never hit back pressure. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 21:22 ` David Miller @ 2007-10-09 21:56 ` jamal 2007-10-10 0:04 ` David Miller 0 siblings, 1 reply; 79+ messages in thread From: jamal @ 2007-10-09 21:56 UTC (permalink / raw) To: David Miller Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, mcarlson, andi, general, netdev, tgraf, randy.dunlap, sri, shemminger, kaber, mchan, jagana On Tue, 2007-09-10 at 14:22 -0700, David Miller wrote: > Even if the device has "huge queues" I still think we need a software > queue for when the hardware one backs up. It should be fine to just "pretend" the qdisc exists despite it sitting in the driver and not have s/ware queues at all to avoid all the challenges that qdiscs bring; if the h/ware queues are full because of link pressure etc, you drop. We drop today when the s/ware queues are full. The driver txmit lock takes place of the qdisc queue lock etc. I am assuming there is still need for that locking. The filter/classification scheme still works as is and select classes which map to rings. tc still works as is etc. cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 21:56 ` jamal @ 2007-10-10 0:04 ` David Miller 2007-10-10 0:37 ` Andi Kleen 2007-10-10 16:02 ` Bill Fink 0 siblings, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-10 0:04 UTC (permalink / raw) To: hadi Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, mcarlson, andi, general, netdev, tgraf, randy.dunlap, sri, shemminger, kaber, mchan, jagana From: jamal <hadi@cyberus.ca> Date: Tue, 09 Oct 2007 17:56:46 -0400 > if the h/ware queues are full because of link pressure etc, you drop. We > drop today when the s/ware queues are full. The driver txmit lock takes > place of the qdisc queue lock etc. I am assuming there is still need for > that locking. The filter/classification scheme still works as is and > select classes which map to rings. tc still works as is etc. I understand your suggestion. We have to keep in mind, however, that the sw queue right now is 1000 packets. I heavily discourage any driver author to try and use any single TX queue of that size. Which means that just dropping on back pressure might not work so well. Or it might be perfect and signal TCP to backoff, who knows! :-) While working out this issue in my mind, it occurred to me that we can put the sw queue into the driver as well. The idea is that the network stack, as in the pure hw queue scheme, unconditionally always submits new packets to the driver. Therefore even if the hw TX queue is full, the driver can still queue to an internal sw queue with some limit (say 1000 for ethernet, as is used now). When the hw TX queue gains space, the driver self-batches packets from the sw queue to the hw queue. It sort of obviates the need for mid-level queue batching in the generic networking. Compared to letting the driver self-batch, the mid-level batching approach is pure overhead. We seem to be sort of all mentioning similar ideas. 
For example, you can get the above kind of scheme today by using a mid-level queue length of zero, and I believe this idea was mentioned by Stephen Hemminger earlier. I may experiment with this in the NIU driver. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 0:04 ` David Miller @ 2007-10-10 0:37 ` Andi Kleen 2007-10-10 0:50 ` David Miller 2007-10-12 16:08 ` Brandeburg, Jesse 2007-10-10 16:02 ` Bill Fink 1 sibling, 2 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-10 0:37 UTC (permalink / raw) To: David Miller Cc: johnpol, Robert.Olsson, herbert, jeff, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, gaagaan, andi, general, jagana, tgraf, randy.dunlap, shemminger, kaber, mchan, sri On Tue, Oct 09, 2007 at 05:04:35PM -0700, David Miller wrote: > We have to keep in mind, however, that the sw queue right now is 1000 > packets. I heavily discourage any driver author to try and use any > single TX queue of that size. Why would you discourage them? If 1000 is ok for a software queue why would it not be ok for a hardware queue? > Which means that just dropping on back > pressure might not work so well. > > Or it might be perfect and signal TCP to backoff, who knows! :-) 1000 packets is a lot. I don't have hard data, but gut feeling is less would also do. And if the hw queues are not enough a better scheme might be to just manage this in the sockets in sendmsg. e.g. provide a wait queue that drivers can wake up and let them block on more queue. > The idea is that the network stack, as in the pure hw queue scheme, > unconditionally always submits new packets to the driver. Therefore > even if the hw TX queue is full, the driver can still queue to an > internal sw queue with some limit (say 1000 for ethernet, as is used > now). > > > When the hw TX queue gains space, the driver self-batches packets > from the sw queue to the hw queue. I don't really see the advantage over the qdisc in that scheme. It's certainly not simpler and probably more code and would likely also not require less locks (e.g. a currently lockless driver would need a new lock for its sw queue). Also it is unclear to me it would be really any faster. 
-Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 0:37 ` Andi Kleen @ 2007-10-10 0:50 ` David Miller 2007-10-10 9:16 ` Andi Kleen 2007-10-12 16:08 ` Brandeburg, Jesse 1 sibling, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-10 0:50 UTC (permalink / raw) To: andi Cc: johnpol, Robert.Olsson, herbert, gaagaan, netdev, rdreier, peter.p.waskiewicz.jr, hadi, mcarlson, jeff, general, jagana, tgraf, randy.dunlap, shemminger, kaber, mchan, sri From: Andi Kleen <andi@firstfloor.org> Date: Wed, 10 Oct 2007 02:37:16 +0200 > On Tue, Oct 09, 2007 at 05:04:35PM -0700, David Miller wrote: > > We have to keep in mind, however, that the sw queue right now is 1000 > > packets. I heavily discourage any driver author to try and use any > > single TX queue of that size. > > Why would you discourage them? > > If 1000 is ok for a software queue why would it not be ok > for a hardware queue? Because with the software queue, you aren't accessing 1000 slots shared with the hardware device which does shared-ownership transactions on those L2 cache lines with the cpu. Long ago I did a test on gigabit on a cpu with only 256K of L2 cache. Using a smaller TX queue make things go faster, and it's exactly because of these L2 cache effects. > 1000 packets is a lot. I don't have hard data, but gut feeling > is less would also do. I'll try to see how backlogged my 10Gb tests get when a strong sender is sending to a weak receiver. > And if the hw queues are not enough a better scheme might be to > just manage this in the sockets in sendmsg. e.g. provide a wait queue that > drivers can wake up and let them block on more queue. TCP does this already, but it operates in a lossy manner. > I don't really see the advantage over the qdisc in that scheme. > It's certainly not simpler and probably more code and would likely > also not require less locks (e.g. a currently lockless driver > would need a new lock for its sw queue). 
Also it is unclear to me > it would be really any faster. You still need a lock to guard hw TX enqueue from hw TX reclaim. A 256 entry TX hw queue fills up trivially on 1GB and 10GB, but if you increase the size much more performance starts to go down due to L2 cache thrashing. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 0:50 ` David Miller @ 2007-10-10 9:16 ` Andi Kleen 2007-10-10 9:25 ` David Miller 2007-10-10 9:53 ` Herbert Xu 0 siblings, 2 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-10 9:16 UTC (permalink / raw) To: David Miller Cc: andi, hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > A 256 entry TX hw queue fills up trivially on 1GB and 10GB, but if you With TSO really? > increase the size much more performance starts to go down due to L2 > cache thrashing. Another possibility would be to consider using cache avoidance instructions while updating the TX ring (e.g. write combining on x86) -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 9:16 ` Andi Kleen @ 2007-10-10 9:25 ` David Miller 2007-10-10 10:23 ` Andi Kleen 2007-10-10 9:53 ` Herbert Xu 1 sibling, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-10 9:25 UTC (permalink / raw) To: andi Cc: hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber From: Andi Kleen <andi@firstfloor.org> Date: Wed, 10 Oct 2007 11:16:44 +0200 > > A 256 entry TX hw queue fills up trivially on 1GB and 10GB, but if you > > With TSO really? Yes. > > increase the size much more performance starts to go down due to L2 > > cache thrashing. > > Another possibility would be to consider using cache avoidance > instructions while updating the TX ring (e.g. write combining > on x86) The chip I was working with at the time (UltraSPARC-IIi) compressed all the linear stores into 64-byte full cacheline transactions via the store buffer. It's true that it would allocate in the L2 cache on a miss, which is different from your suggestion. In fact, such a thing might not pan out well, because most of the time you write a single descriptor or two, and that isn't a full cacheline, which means a read/modify/write is the only coherent way to make such a write to RAM. Sure you could batch, but I'd rather give the chip work to do unless I unequivocably knew I'd have enough pending to fill a cacheline's worth of descriptors. And since you suggest we shouldn't queue in software... :-) ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 9:25 ` David Miller @ 2007-10-10 10:23 ` Andi Kleen 2007-10-10 10:44 ` David Miller 0 siblings, 1 reply; 79+ messages in thread From: Andi Kleen @ 2007-10-10 10:23 UTC (permalink / raw) To: David Miller Cc: andi, hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber On Wed, Oct 10, 2007 at 02:25:50AM -0700, David Miller wrote: > The chip I was working with at the time (UltraSPARC-IIi) compressed > all the linear stores into 64-byte full cacheline transactions via > the store buffer. That's a pretty old CPU. Conclusions on more modern ones might be different. > In fact, such a thing might not pan out well, because most of the time > you write a single descriptor or two, and that isn't a full cacheline, > which means a read/modify/write is the only coherent way to make such > a write to RAM. x86 WC does R-M-W and is coherent of course. The main difference is just that the result is not cached. When the hardware accesses the cache line then the cache should be also invalidated. > Sure you could batch, but I'd rather give the chip work to do unless > I unequivocably knew I'd have enough pending to fill a cacheline's > worth of descriptors. And since you suggest we shouldn't queue in > software... :-) Hmm, it probably would need to be coupled with batched submission if multiple packets are available you're right. Probably not worth doing explicit queueing though. I suppose it would be an interesting experiment at least. -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 10:23 ` Andi Kleen @ 2007-10-10 10:44 ` David Miller 2007-10-10 13:08 ` jamal 2007-10-10 15:35 ` Waskiewicz Jr, Peter P 0 siblings, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-10 10:44 UTC (permalink / raw) To: andi Cc: hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber From: Andi Kleen <andi@firstfloor.org> Date: Wed, 10 Oct 2007 12:23:31 +0200 > On Wed, Oct 10, 2007 at 02:25:50AM -0700, David Miller wrote: > > The chip I was working with at the time (UltraSPARC-IIi) compressed > > all the linear stores into 64-byte full cacheline transactions via > > the store buffer. > > That's a pretty old CPU. Conclusions on more modern ones might be different. Cache matters, just scale the numbers. > I suppose it would be an interesting experiment at least. Absolutely. I've always gotten very poor results when increasing the TX queue a lot, for example with NIU the point of diminishing returns seems to be in the range of 256-512 TX descriptor entries and this was with 1.6Ghz cpus. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 10:44 ` David Miller @ 2007-10-10 13:08 ` jamal 2007-10-10 22:37 ` David Miller 2007-10-10 15:35 ` Waskiewicz Jr, Peter P 1 sibling, 1 reply; 79+ messages in thread From: jamal @ 2007-10-10 13:08 UTC (permalink / raw) To: David Miller Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, mcarlson, andi, general, netdev, tgraf, randy.dunlap, sri, shemminger, kaber, mchan, jagana On Wed, 2007-10-10 at 03:44 -0700, David Miller wrote: > I've always gotten very poor results when increasing the TX queue a > lot, for example with NIU the point of diminishing returns seems to > be in the range of 256-512 TX descriptor entries and this was with > 1.6Ghz cpus. Is it interupt per packet? From my experience, you may find interesting results varying tx interupt mitigation parameters in addition to the ring parameters. Unfortunately when you do that, optimal parameters also depends on packet size. so what may work for 64B, wont work well for 1400B. cheers, jamal ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 13:08 ` jamal @ 2007-10-10 22:37 ` David Miller 0 siblings, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-10 22:37 UTC (permalink / raw) To: hadi Cc: johnpol, Robert.Olsson, herbert, gaagaan, jeff, rdreier, peter.p.waskiewicz.jr, mcarlson, andi, general, netdev, tgraf, randy.dunlap, sri, shemminger, kaber, mchan, jagana From: jamal <hadi@cyberus.ca> Date: Wed, 10 Oct 2007 09:08:48 -0400 > On Wed, 2007-10-10 at 03:44 -0700, David Miller wrote: > > > I've always gotten very poor results when increasing the TX queue a > > lot, for example with NIU the point of diminishing returns seems to > > be in the range of 256-512 TX descriptor entries and this was with > > 1.6Ghz cpus. > > Is it interupt per packet? From my experience, you may find interesting > results varying tx interupt mitigation parameters in addition to the > ring parameters. > Unfortunately when you do that, optimal parameters also depends on > packet size. so what may work for 64B, wont work well for 1400B. No, it was not interrupt per-packet, I was telling the chip to interrupt me every 1/4 of the ring. ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 10:44 ` David Miller 2007-10-10 13:08 ` jamal @ 2007-10-10 15:35 ` Waskiewicz Jr, Peter P 2007-10-10 16:02 ` Andi Kleen 1 sibling, 1 reply; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-10 15:35 UTC (permalink / raw) To: David Miller, andi Cc: hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > From: Andi Kleen <andi@firstfloor.org> > Date: Wed, 10 Oct 2007 12:23:31 +0200 > > > On Wed, Oct 10, 2007 at 02:25:50AM -0700, David Miller wrote: > > > The chip I was working with at the time (UltraSPARC-IIi) > compressed > > > all the linear stores into 64-byte full cacheline > transactions via > > > the store buffer. > > > > That's a pretty old CPU. Conclusions on more modern ones > might be different. > > Cache matters, just scale the numbers. > > > I suppose it would be an interesting experiment at least. > > Absolutely. > > I've always gotten very poor results when increasing the TX > queue a lot, for example with NIU the point of diminishing > returns seems to be in the range of 256-512 TX descriptor > entries and this was with 1.6Ghz cpus. We've done similar testing with ixgbe to push maximum descriptor counts, and we lost performance very quickly in the same range you're quoting on NIU. Cheers, -PJ Waskiewicz ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 15:35 ` Waskiewicz Jr, Peter P @ 2007-10-10 16:02 ` Andi Kleen 2007-10-10 16:42 ` Waskiewicz Jr, Peter P 0 siblings, 1 reply; 79+ messages in thread From: Andi Kleen @ 2007-10-10 16:02 UTC (permalink / raw) To: Waskiewicz Jr, Peter P Cc: johnpol, Robert.Olsson, herbert, jeff, kaber, netdev, rdreier, mchan, hadi, mcarlson, gaagaan, andi, general, jagana, tgraf, randy.dunlap, shemminger, David Miller, sri > We've done similar testing with ixgbe to push maximum descriptor counts, > and we lost performance very quickly in the same range you're quoting on > NIU. Did you try it with WC writes to the ring or CLFLUSH? -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 16:02 ` Andi Kleen @ 2007-10-10 16:42 ` Waskiewicz Jr, Peter P 0 siblings, 0 replies; 79+ messages in thread From: Waskiewicz Jr, Peter P @ 2007-10-10 16:42 UTC (permalink / raw) To: Andi Kleen Cc: David Miller, hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > -----Original Message----- > From: Andi Kleen [mailto:andi@firstfloor.org] > Sent: Wednesday, October 10, 2007 9:02 AM > To: Waskiewicz Jr, Peter P > Cc: David Miller; andi@firstfloor.org; hadi@cyberus.ca; > shemminger@linux-foundation.org; jeff@garzik.org; > johnpol@2ka.mipt.ru; herbert@gondor.apana.org.au; > gaagaan@gmail.com; Robert.Olsson@data.slu.se; > netdev@vger.kernel.org; rdreier@cisco.com; > mcarlson@broadcom.com; jagana@us.ibm.com; > general@lists.openfabrics.org; mchan@broadcom.com; > tgraf@suug.ch; randy.dunlap@oracle.com; sri@us.ibm.com; > kaber@trash.net > Subject: Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net > core use batching > > > We've done similar testing with ixgbe to push maximum descriptor > > counts, and we lost performance very quickly in the same > range you're > > quoting on NIU. > > Did you try it with WC writes to the ring or CLFLUSH? > > -Andi Hmm, I think it might be slightly different, but it still shows queue depth vs. performance. I was actually referring to how many descriptors we can represent a packet with before it becomes a problem wrt performance. This morning I tried to actually push my ixgbe NIC hard enough to come close to filling the ring with packets (384-byte packets), and even on my 8-core Xeon I can't do it. My system can't generate enough I/O to fill the hardware queues before CPUs max out. -PJ Waskiewicz ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 9:16 ` Andi Kleen 2007-10-10 9:25 ` David Miller @ 2007-10-10 9:53 ` Herbert Xu 1 sibling, 0 replies; 79+ messages in thread From: Herbert Xu @ 2007-10-10 9:53 UTC (permalink / raw) To: Andi Kleen Cc: David Miller, hadi, shemminger, jeff, johnpol, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber On Wed, Oct 10, 2007 at 11:16:44AM +0200, Andi Kleen wrote: > > A 256 entry TX hw queue fills up trivially on 1GB and 10GB, but if you > > With TSO really? Hardware queues are generally per-page rather than per-skb so it'd fill up quicker than a software queue even with TSO. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* RE: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 0:37 ` Andi Kleen 2007-10-10 0:50 ` David Miller @ 2007-10-12 16:08 ` Brandeburg, Jesse 2007-10-12 17:05 ` Stephen Hemminger 2007-10-12 18:27 ` Andi Kleen 1 sibling, 2 replies; 79+ messages in thread From: Brandeburg, Jesse @ 2007-10-12 16:08 UTC (permalink / raw) To: Andi Kleen, David Miller Cc: hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, Waskiewicz Jr, Peter P, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber Andi Kleen wrote: >> When the hw TX queue gains space, the driver self-batches packets >> from the sw queue to the hw queue. > > I don't really see the advantage over the qdisc in that scheme. > It's certainly not simpler and probably more code and would likely > also not require less locks (e.g. a currently lockless driver > would need a new lock for its sw queue). Also it is unclear to me > it would be really any faster. related to this comment, does Linux have a lockless (using atomics) singly linked list element? That would be very useful in a driver hot path. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-12 16:08 ` Brandeburg, Jesse @ 2007-10-12 17:05 ` Stephen Hemminger 2007-10-12 18:29 ` Andi Kleen 2007-10-12 18:27 ` Andi Kleen 1 sibling, 1 reply; 79+ messages in thread From: Stephen Hemminger @ 2007-10-12 17:05 UTC (permalink / raw) To: Brandeburg, Jesse Cc: Andi Kleen, David Miller, hadi, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, Waskiewicz Jr, Peter P, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber On Fri, 12 Oct 2007 09:08:58 -0700 "Brandeburg, Jesse" <jesse.brandeburg@intel.com> wrote: > Andi Kleen wrote: > >> When the hw TX queue gains space, the driver self-batches packets > >> from the sw queue to the hw queue. > > > > I don't really see the advantage over the qdisc in that scheme. > > It's certainly not simpler and probably more code and would likely > > also not require less locks (e.g. a currently lockless driver > > would need a new lock for its sw queue). Also it is unclear to me > > it would be really any faster. > > related to this comment, does Linux have a lockless (using atomics) > singly linked list element? That would be very useful in a driver hot > path. Use RCU? or write a generic version and get it reviewed. You really want someone with knowledge of all the possible barrier impacts to review it. -- Stephen Hemminger <shemminger@linux-foundation.org> ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-12 17:05 ` Stephen Hemminger @ 2007-10-12 18:29 ` Andi Kleen 0 siblings, 0 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-12 18:29 UTC (permalink / raw) To: Stephen Hemminger Cc: Brandeburg, Jesse, Andi Kleen, David Miller, hadi, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, Waskiewicz Jr, Peter P, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > Use RCU? or write a generic version and get it reviewed. You really > want someone with knowledge of all the possible barrier impacts to > review it. I guess he was thinking of using cmpxchg; but we don't support this in portable code. RCU is not really suitable for this because it assume writing is relatively rare which is definitely not the case for a qdisc. Also general list management with RCU is quite expensive anyways -- it would require a full copy (that is the 'C' in RCU which Linux generally doesn't use at all) -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-12 16:08 ` Brandeburg, Jesse 2007-10-12 17:05 ` Stephen Hemminger @ 2007-10-12 18:27 ` Andi Kleen 1 sibling, 0 replies; 79+ messages in thread From: Andi Kleen @ 2007-10-12 18:27 UTC (permalink / raw) To: Brandeburg, Jesse Cc: Andi Kleen, David Miller, hadi, shemminger, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, Waskiewicz Jr, Peter P, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber > related to this comment, does Linux have a lockless (using atomics) > singly linked list element? That would be very useful in a driver hot > path. No; it doesn't. At least not a portable one. Besides they tend to be not faster anyways because e.g. cmpxchg tends to be as slow as an explicit spinlock. -Andi ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 0:04 ` David Miller 2007-10-10 0:37 ` Andi Kleen @ 2007-10-10 16:02 ` Bill Fink 2007-10-10 22:53 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Bill Fink @ 2007-10-10 16:02 UTC (permalink / raw) To: David Miller Cc: hadi, shemminger, andi, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber On Tue, 09 Oct 2007, David Miller wrote: > From: jamal <hadi@cyberus.ca> > Date: Tue, 09 Oct 2007 17:56:46 -0400 > > > if the h/ware queues are full because of link pressure etc, you drop. We > > drop today when the s/ware queues are full. The driver txmit lock takes > > place of the qdisc queue lock etc. I am assuming there is still need for > > that locking. The filter/classification scheme still works as is and > > select classes which map to rings. tc still works as is etc. > > I understand your suggestion. > > We have to keep in mind, however, that the sw queue right now is 1000 > packets. I heavily discourage any driver author to try and use any > single TX queue of that size. Which means that just dropping on back > pressure might not work so well. > > Or it might be perfect and signal TCP to backoff, who knows! :-) I can't remember the details anymore, but for 10-GigE, I have encountered cases where I was able to significantly increase TCP performance by increasing the txqueuelen to 10000, which is the setting I now use for any 10-GigE testing. -Bill ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-10 16:02 ` Bill Fink @ 2007-10-10 22:53 ` David Miller 0 siblings, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-10 22:53 UTC (permalink / raw) To: billfink Cc: hadi, shemminger, andi, jeff, johnpol, herbert, gaagaan, Robert.Olsson, netdev, rdreier, peter.p.waskiewicz.jr, mcarlson, jagana, general, mchan, tgraf, randy.dunlap, sri, kaber From: Bill Fink <billfink@mindspring.com> Date: Wed, 10 Oct 2007 12:02:15 -0400 > On Tue, 09 Oct 2007, David Miller wrote: > > > We have to keep in mind, however, that the sw queue right now is 1000 > > packets. I heavily discourage any driver author to try and use any > > single TX queue of that size. Which means that just dropping on back > > pressure might not work so well. > > > > Or it might be perfect and signal TCP to backoff, who knows! :-) > > I can't remember the details anymore, but for 10-GigE, I have encountered > cases where I was able to significantly increase TCP performance by > increasing the txqueuelen to 10000, which is the setting I now use for > any 10-GigE testing. For some reason this does not surprise me. We bumped the ethernet default up to 1000 for gigabit. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:43 ` David Miller 2007-10-09 20:53 ` Stephen Hemminger @ 2007-10-11 6:52 ` Krishna Kumar2 1 sibling, 0 replies; 79+ messages in thread From: Krishna Kumar2 @ 2007-10-11 6:52 UTC (permalink / raw) To: David Miller Cc: andi, gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rdreier, Robert.Olsson, shemminger, sri, tgraf Hi Dave, David Miller wrote on 10/10/2007 02:13:31 AM: > > Hopefully that new qdisc will just use the TX rings of the hardware > > directly. They are typically large enough these days. That might avoid > > some locking in this critical path. > > Indeed, I also realized last night that for the default qdiscs > we do a lot of stupid useless work. If the queue is a FIFO > and the device can take packets, we should send it directly > and not stick it into the qdisc at all. Since you are talking of how it should be done in the *current* code, I feel LLTX drivers will not work nicely with this. Actually I was trying this change a couple of weeks back, but felt that doing so would result in out of order packets (skbs present in q which were not sent out for LLTX failure will be sent out only at next net_tx_action, while other skbs are sent ahead). One option is to first call qdisc_run() and then process this skb, but that is ugly (requeue handling). However I guess this can be done cleanly once LLTX is removed. Thanks, - KK ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 23:40 ` jamal 2007-10-09 1:13 ` Jeff Garzik @ 2007-10-09 1:31 ` Jeff Garzik 1 sibling, 0 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 1:31 UTC (permalink / raw) To: hadi Cc: Waskiewicz Jr, Peter P, David Miller, krkumar2, johnpol, herbert, kaber, shemminger, jagana, Robert.Olsson, rick.jones2, xma, gaagaan, netdev, rdreier, mcarlson, mchan, general, tgraf, randy.dunlap, sri jamal wrote: > The challenge to deal with is that netdevices, filters, the queues and > scheduler are closely inter-twined. So it is not just the scheduling > region and QDISC_RUNNING. For example, lets pick just the filters > because they are simple to see: You need to attach them to something - > whatever that is, you then need to synchronize against config and > multiple cpus trying to use them. You could: > a) replicate them across cpus and only lock on config, but you are > wasting RAM then I think you've pretty much bought into the cost of wasting RAM, when doing multiple TX rings. So logic implies associated costs, like the ones you describe, come along for the ride. > b) attach them to rings instead of netdevices - but that makes me wonder > if those subqueues are now going to become netdevices. This also means > you change all user space interfaces to know about subqueues. If you > recall this was a major contention in our earlier discussion. That's definitely a good question, and I honestly don't see any easy solutions. Multiple net devices makes a -lot- of things easier, with regards to existing infrastructure, but it also imposes potentially annoying administrative burdens: Not only must each interface be set up individually, but the userland apps must be made aware of this unique method of concurrency. Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] RE: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-08 22:33 ` Waskiewicz Jr, Peter P 2007-10-08 23:40 ` jamal @ 2007-10-09 10:58 ` Krishna Kumar2 2007-10-09 11:02 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Krishna Kumar2 @ 2007-10-09 10:58 UTC (permalink / raw) To: Waskiewicz Jr, Peter P Cc: jagana, johnpol, herbert, gaagaan, Robert.Olsson, mcarlson, rdreier, hadi, kaber, randy.dunlap, jeff, general, mchan, tgraf, netdev, shemminger, David Miller, sri Hi Peter, "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@intel.com> wrote on 10/09/2007 04:03:42 AM: > > true, that needs some resolution. Heres a hand-waving thought: > > Assuming all packets of a specific map end up in the same > > qdiscn queue, it seems feasible to ask the qdisc scheduler to > > give us enough packages (ive seen people use that terms to > > refer to packets) for each hardware ring's available space. > > With the patches i posted, i do that via > > dev->xmit_win that assumes only one view of the driver; essentially a > > single ring. > > If that is doable, then it is up to the driver to say "i have > > space for 5 in ring[0], 10 in ring[1] 0 in ring[2]" based on > > what scheduling scheme the driver implements - the dev->blist > > can stay the same. Its a handwave, so there may be issues > > there and there could be better ways to handle this. > > > > Note: The other issue that needs resolving that i raised > > earlier was in regards to multiqueue running on multiple cpus > > servicing different rings concurently. > > I can see the qdisc being modified to send batches per queue_mapping. > This shouldn't be too difficult, and if we had the xmit_win per queue > (in the subqueue struct like Dave pointed out). I hope my understanding of multiqueue is correct for this mail to make sense :-) Isn't it enough that the multiqueue+batching drivers handle skbs belonging to different queue's themselves, instead of qdisc having to figure that out? 
This will reduce costs for most skbs that are neither batched nor sent to multiqueue devices. Eg, driver can keep processing skbs and put to the correct tx_queue as long as mapping remains the same. If the mapping changes, it posts earlier skbs (with the correct lock) and then iterates for the other skbs that have the next different mapping, and so on. (This is required only if driver is supposed to transmit >1 skb in one call, otherwise it is not an issue) Alternatively, supporting drivers could return a different code on mapping change, like: NETDEV_TX_MAPPING_CHANGED (for batching only) so that qdisc_run() could retry. Would that work? Secondly having xmit_win per queue: would it help in multiple skb case? Currently there is no way to tell qdisc to dequeue skbs from a particular band - it returns skb from highest priority band. thanks, - KK ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 10:58 ` [ofa-general] " Krishna Kumar2 @ 2007-10-09 11:02 ` David Miller 2007-10-09 11:20 ` [ofa-general] " Krishna Kumar2 2007-10-09 11:21 ` Krishna Kumar2 0 siblings, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-09 11:02 UTC (permalink / raw) To: krkumar2 Cc: peter.p.waskiewicz.jr, gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber, mcarlson, mchan, netdev, randy.dunlap, rdreier, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma From: Krishna Kumar2 <krkumar2@in.ibm.com> Date: Tue, 9 Oct 2007 16:28:27 +0530 > Isn't it enough that the multiqueue+batching drivers handle skbs > belonging to different queue's themselves, instead of qdisc having > to figure that out? This will reduce costs for most skbs that are > neither batched nor sent to multiqueue devices. > > Eg, driver can keep processing skbs and put to the correct tx_queue > as long as mapping remains the same. If the mapping changes, it posts > earlier skbs (with the correct lock) and then iterates for the other > skbs that have the next different mapping, and so on. The complexity in most of these suggestions is beginning to drive me a bit crazy :-) This should be the simplest thing in the world, when TX queue has space, give it packets. Period. When I hear suggestions like "have the driver pick the queue in ->hard_start_xmit() and return some special status if the queue becomes different"..... you know, I really begin to wonder :-) If we have to go back, get into the queueing layer locks, have these special cases, and whatnot, what's the point? This code should eventually be able to run lockless all the way to the TX queue handling code of the driver. The queueing code should know what TX queue the packet will be bound for, and always precisely invoke the driver in a state where the driver can accept the packet. Ignore LLTX, it sucks, it was a big mistake, and we will get rid of it. 
^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 11:02 ` David Miller @ 2007-10-09 11:20 ` Krishna Kumar2 2007-10-09 11:21 ` Krishna Kumar2 1 sibling, 0 replies; 79+ messages in thread From: Krishna Kumar2 @ 2007-10-09 11:20 UTC (permalink / raw) To: David Miller Cc: jagana, johnpol, gaagaan, jeff, Robert.Olsson, mcarlson, rdreier, peter.p.waskiewicz.jr, hadi, netdev, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber, herbert Hi Dave, David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: > > Isn't it enough that the multiqueue+batching drivers handle skbs > > belonging to different queue's themselves, instead of qdisc having > > to figure that out? This will reduce costs for most skbs that are > > neither batched nor sent to multiqueue devices. > > > > Eg, driver can keep processing skbs and put to the correct tx_queue > > as long as mapping remains the same. If the mapping changes, it posts > > earlier skbs (with the correct lock) and then iterates for the other > > skbs that have the next different mapping, and so on. > > The complexity in most of these suggestions is beginning to drive me a > bit crazy :-) > > This should be the simplest thing in the world, when TX queue has > space, give it packets. Period. > > When I hear suggestions like "have the driver pick the queue in > ->hard_start_xmit() and return some special status if the queue > becomes different"..... you know, I really begin to wonder :-) > > If we have to go back, get into the queueing layer locks, have these > special cases, and whatnot, what's the point? I understand your point, but the qdisc code itself needs almost no change, as small as: qdisc_restart() { ... case NETDEV_TX_MAPPING_CHANGED: /* * Driver sent some skbs from one mapping, and found others * are for different queue_mapping. Try again. */ ret = 1; /* guaranteed to have atleast 1 skb in batch list */ break; ... 
} Alternatively if the driver does all the dirty work, qdisc needs no change at all. However, I am not sure if this addresses all the concerns raised by you, Peter, Jamal, others. > This code should eventually be able to run lockless all the way to the > TX queue handling code of the driver. The queueing code should know > what TX queue the packet will be bound for, and always precisely > invoke the driver in a state where the driver can accept the packet. This sounds like a good idea :) I need to think more on this, esp as my batching sends multiple skbs of possibly different mappings to device, and those skbs stay in batch list if driver couldn't send them out. thanks, - KK ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 11:02 ` David Miller 2007-10-09 11:20 ` [ofa-general] " Krishna Kumar2 @ 2007-10-09 11:21 ` Krishna Kumar2 2007-10-09 11:24 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Krishna Kumar2 @ 2007-10-09 11:21 UTC (permalink / raw) To: David Miller Cc: jagana, johnpol, gaagaan, jeff, Robert.Olsson, mcarlson, rdreier, peter.p.waskiewicz.jr, hadi, netdev, general, mchan, tgraf, randy.dunlap, sri, shemminger, kaber, herbert David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: > Ignore LLTX, it sucks, it was a big mistake, and we will get rid of > it. Great, this will make life easy. Any idea how long that would take? It seems simple enough to do. thanks, - KK ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 11:21 ` Krishna Kumar2 @ 2007-10-09 11:24 ` David Miller 2007-10-09 12:44 ` [ofa-general] " Jeff Garzik 2007-10-09 20:22 ` [ofa-general] " Roland Dreier 0 siblings, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-09 11:24 UTC (permalink / raw) To: krkumar2 Cc: gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rdreier, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma From: Krishna Kumar2 <krkumar2@in.ibm.com> Date: Tue, 9 Oct 2007 16:51:14 +0530 > David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: > > > Ignore LLTX, it sucks, it was a big mistake, and we will get rid of > > it. > > Great, this will make life easy. Any idea how long that would take? > It seems simple enough to do. I'd say we can probably try to get rid of it in 2.6.25, this is assuming we get driver authors to cooperate and do the conversions or alternatively some other motivated person. I can just threaten to do them all and that should get the driver maintainers going :-) ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 11:24 ` David Miller @ 2007-10-09 12:44 ` Jeff Garzik 2007-10-09 12:55 ` Herbert Xu 2007-10-09 20:14 ` David Miller 2007-10-09 20:22 ` [ofa-general] " Roland Dreier 1 sibling, 2 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 12:44 UTC (permalink / raw) To: David Miller Cc: jagana, herbert, gaagaan, Robert.Olsson, mcarlson, rdreier, peter.p.waskiewicz.jr, hadi, netdev, general, mchan, tgraf, randy.dunlap, johnpol, shemminger, kaber, sri [-- Attachment #1: Type: text/plain, Size: 678 bytes --] David Miller wrote: > From: Krishna Kumar2 <krkumar2@in.ibm.com> > Date: Tue, 9 Oct 2007 16:51:14 +0530 > >> David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: >> >>> Ignore LLTX, it sucks, it was a big mistake, and we will get rid of >>> it. >> Great, this will make life easy. Any idea how long that would take? >> It seems simple enough to do. > > I'd say we can probably try to get rid of it in 2.6.25, this is > assuming we get driver authors to cooperate and do the conversions > or alternatively some other motivated person. > > I can just threaten to do them all and that should get the driver > maintainers going :-) What, like this? 
:) Jeff [-- Attachment #2: patch --] [-- Type: text/plain, Size: 13349 bytes --] drivers/net/atl1/atl1_main.c | 16 +++++----------- drivers/net/chelsio/cxgb2.c | 1 - drivers/net/chelsio/sge.c | 20 +++++++++----------- drivers/net/e1000/e1000_main.c | 6 +----- drivers/net/ixgb/ixgb_main.c | 24 ------------------------ drivers/net/pasemi_mac.c | 2 +- drivers/net/rionet.c | 19 +++++++------------ drivers/net/spider_net.c | 2 +- drivers/net/sungem.c | 17 ++++++----------- drivers/net/tehuti.c | 12 +----------- drivers/net/tehuti.h | 3 +-- 11 files changed, 32 insertions(+), 90 deletions(-) diff --git a/drivers/net/atl1/atl1_main.c b/drivers/net/atl1/atl1_main.c index 4c728f1..03e94fe 100644 --- a/drivers/net/atl1/atl1_main.c +++ b/drivers/net/atl1/atl1_main.c @@ -1665,10 +1665,7 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) len -= skb->data_len; - if (unlikely(skb->len == 0)) { - dev_kfree_skb_any(skb); - return NETDEV_TX_OK; - } + WARN_ON(skb->len == 0); param.data = 0; param.tso.tsopu = 0; @@ -1703,11 +1700,7 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } } - if (!spin_trylock_irqsave(&adapter->lock, flags)) { - /* Can't get lock - tell upper layer to requeue */ - dev_printk(KERN_DEBUG, &adapter->pdev->dev, "tx locked\n"); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&adapter->lock, flags); if (atl1_tpd_avail(&adapter->tpd_ring) < count) { /* not enough descriptors */ @@ -1749,8 +1742,11 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) atl1_tx_map(adapter, skb, 1 == val); atl1_tx_queue(adapter, count, ¶m); netdev->trans_start = jiffies; + spin_unlock_irqrestore(&adapter->lock, flags); + atl1_update_mailbox(adapter); + return NETDEV_TX_OK; } @@ -2301,8 +2297,6 @@ static int __devinit atl1_probe(struct pci_dev *pdev, */ /* netdev->features |= NETIF_F_TSO; */ - netdev->features |= NETIF_F_LLTX; - /* * patch for some L1 of old version, * the final version of L1 may not 
need these diff --git a/drivers/net/chelsio/cxgb2.c b/drivers/net/chelsio/cxgb2.c index 2dbf8dc..0aba7e7 100644 --- a/drivers/net/chelsio/cxgb2.c +++ b/drivers/net/chelsio/cxgb2.c @@ -1084,7 +1084,6 @@ static int __devinit init_one(struct pci_dev *pdev, netdev->mem_end = mmio_start + mmio_len - 1; netdev->priv = adapter; netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; - netdev->features |= NETIF_F_LLTX; adapter->flags |= RX_CSUM_ENABLED | TCP_CSUM_CAPABLE; if (pci_using_dac) diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c index ffa7e64..84f5869 100644 --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1739,8 +1739,7 @@ static int t1_sge_tx(struct sk_buff *skb, struct adapter *adapter, struct cmdQ *q = &sge->cmdQ[qid]; unsigned int credits, pidx, genbit, count, use_sched_skb = 0; - if (!spin_trylock(&q->lock)) - return NETDEV_TX_LOCKED; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); @@ -1817,12 +1816,12 @@ use_sched: } if (use_sched_skb) { - if (spin_trylock(&q->lock)) { - credits = q->size - q->in_use; - skb = NULL; - goto use_sched; - } + spin_lock(&q->lock); + credits = q->size - q->in_use; + skb = NULL; + goto use_sched; } + return NETDEV_TX_OK; } @@ -1977,13 +1976,12 @@ static void sge_tx_reclaim_cb(unsigned long data) for (i = 0; i < SGE_CMDQ_N; ++i) { struct cmdQ *q = &sge->cmdQ[i]; - if (!spin_trylock(&q->lock)) - continue; + spin_lock(&q->lock); reclaim_completed_tx(sge, q); - if (i == 0 && q->in_use) { /* flush pending credits */ + if (i == 0 && q->in_use) /* flush pending credits */ writel(F_CMDQ0_ENABLE, sge->adapter->regs + A_SG_DOORBELL); - } + spin_unlock(&q->lock); } mod_timer(&sge->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD); diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 10505de..b64b03b 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -992,8 +992,6 @@ e1000_probe(struct pci_dev *pdev, if (pci_using_dac) netdev->features |= 
NETIF_F_HIGHDMA; - netdev->features |= NETIF_F_LLTX; - adapter->en_mng_pt = e1000_enable_mng_pass_thru(&adapter->hw); /* initialize eeprom parameters */ @@ -3368,9 +3366,7 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) (adapter->hw.mac_type == e1000_82573)) e1000_transfer_dhcp_info(adapter, skb); - if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) - /* Collision - tell upper layer to requeue */ - return NETDEV_TX_LOCKED; + spin_lock_irqsave(&tx_ring->tx_lock, flags); /* need: count + 2 desc gap to keep tail from touching * head, otherwise try next time */ diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c index d444de5..c04b59b 100644 --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -449,9 +449,6 @@ ixgb_probe(struct pci_dev *pdev, NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER; netdev->features |= NETIF_F_TSO; -#ifdef NETIF_F_LLTX - netdev->features |= NETIF_F_LLTX; -#endif if(pci_using_dac) netdev->features |= NETIF_F_HIGHDMA; @@ -1455,16 +1452,7 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return 0; } -#ifdef NETIF_F_LLTX - local_irq_save(flags); - if (!spin_trylock(&adapter->tx_lock)) { - /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } -#else spin_lock_irqsave(&adapter->tx_lock, flags); -#endif if (unlikely(ixgb_maybe_stop_tx(netdev, &adapter->tx_ring, DESC_NEEDED))) { @@ -1473,9 +1461,7 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_BUSY; } -#ifndef NETIF_F_LLTX spin_unlock_irqrestore(&adapter->tx_lock, flags); -#endif if(adapter->vlgrp && vlan_tx_tag_present(skb)) { tx_flags |= IXGB_TX_FLAGS_VLAN; @@ -1487,9 +1473,6 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) tso = ixgb_tso(adapter, skb); if (tso < 0) { dev_kfree_skb_any(skb); -#ifdef NETIF_F_LLTX - spin_unlock_irqrestore(&adapter->tx_lock, flags); -#endif return NETDEV_TX_OK; } @@ -1503,13 +1486,6 @@ 
ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) netdev->trans_start = jiffies; -#ifdef NETIF_F_LLTX - /* Make sure there is space in the ring for the next send. */ - ixgb_maybe_stop_tx(netdev, &adapter->tx_ring, DESC_NEEDED); - - spin_unlock_irqrestore(&adapter->tx_lock, flags); - -#endif return NETDEV_TX_OK; } diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c index 9f9a421..78f939b 100644 --- a/drivers/net/pasemi_mac.c +++ b/drivers/net/pasemi_mac.c @@ -1352,7 +1352,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(dev, &mac->napi, pasemi_mac_poll, 64); - dev->features = NETIF_F_HW_CSUM | NETIF_F_LLTX | NETIF_F_SG; + dev->features = NETIF_F_HW_CSUM | NETIF_F_SG; /* These should come out of the device tree eventually */ mac->dma_txch = index; diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index e7fd08a..cd2d25c 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -180,11 +180,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct net_device *ndev) u16 destid; unsigned long flags; - local_irq_save(flags); - if (!spin_trylock(&rnet->tx_lock)) { - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&rnet->tx_lock, flags); if ((rnet->tx_cnt + 1) > RIONET_TX_RING_SIZE) { netif_stop_queue(ndev); @@ -259,7 +255,7 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo struct net_device *ndev = dev_id; struct rionet_private *rnet = ndev->priv; - spin_lock(&rnet->lock); + spin_lock(&rnet->tx_lock); if (netif_msg_intr(rnet)) printk(KERN_INFO @@ -278,7 +274,7 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo if (rnet->tx_cnt < RIONET_TX_RING_SIZE) netif_wake_queue(ndev); - spin_unlock(&rnet->lock); + spin_unlock(&rnet->tx_lock); } static int rionet_open(struct net_device *ndev) @@ -420,10 +416,10 @@ static void rionet_set_msglevel(struct net_device *ndev, u32 value) } static const struct 
ethtool_ops rionet_ethtool_ops = { - .get_drvinfo = rionet_get_drvinfo, - .get_msglevel = rionet_get_msglevel, - .set_msglevel = rionet_set_msglevel, - .get_link = ethtool_op_get_link, + .get_drvinfo = rionet_get_drvinfo, + .get_msglevel = rionet_get_msglevel, + .set_msglevel = rionet_set_msglevel, + .get_link = ethtool_op_get_link, }; static int rionet_setup_netdev(struct rio_mport *mport) @@ -461,7 +457,6 @@ static int rionet_setup_netdev(struct rio_mport *mport) ndev->hard_start_xmit = &rionet_start_xmit; ndev->stop = &rionet_close; ndev->mtu = RIO_MAX_MSG_SIZE - 14; - ndev->features = NETIF_F_LLTX; SET_ETHTOOL_OPS(ndev, &rionet_ethtool_ops); spin_lock_init(&rnet->lock); diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c index fab055f..22193a9 100644 --- a/drivers/net/spider_net.c +++ b/drivers/net/spider_net.c @@ -2329,7 +2329,7 @@ spider_net_setup_netdev(struct spider_net_card *card) spider_net_setup_netdev_ops(netdev); - netdev->features = NETIF_F_IP_CSUM | NETIF_F_LLTX; + netdev->features = NETIF_F_IP_CSUM; /* some time: NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX | * NETIF_F_HW_VLAN_FILTER */ diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 53b8344..6aaf56d 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -932,7 +932,6 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id) { struct net_device *dev = dev_id; struct gem *gp = dev->priv; - unsigned long flags; /* Swallow interrupts when shutting the chip down, though * that shouldn't happen, we should have done free_irq() at @@ -941,14 +940,14 @@ static irqreturn_t gem_interrupt(int irq, void *dev_id) if (!gp->running) return IRQ_HANDLED; - spin_lock_irqsave(&gp->lock, flags); + spin_lock(&gp->lock); if (netif_rx_schedule_prep(dev, &gp->napi)) { u32 gem_status = readl(gp->regs + GREG_STAT); if (gem_status == 0) { napi_enable(&gp->napi); - spin_unlock_irqrestore(&gp->lock, flags); + spin_unlock(&gp->lock); return IRQ_NONE; } gp->status = gem_status; @@ -956,7 +955,7 @@ 
static irqreturn_t gem_interrupt(int irq, void *dev_id) __netif_rx_schedule(dev, &gp->napi); } - spin_unlock_irqrestore(&gp->lock, flags); + spin_unlock(&gp->lock); /* If polling was disabled at the time we received that * interrupt, we may return IRQ_HANDLED here while we @@ -1031,12 +1030,8 @@ static int gem_start_xmit(struct sk_buff *skb, struct net_device *dev) (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { - /* Tell upper layer to requeue */ - local_irq_restore(flags); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&gp->tx_lock, flags); + /* We raced with gem_do_stop() */ if (!gp->running) { spin_unlock_irqrestore(&gp->tx_lock, flags); @@ -3160,7 +3155,7 @@ static int __devinit gem_init_one(struct pci_dev *pdev, gp->phy_mii.def ? gp->phy_mii.def->name : "no"); /* GEM can do it all... */ - dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_LLTX; + dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; if (pci_using_dac) dev->features |= NETIF_F_HIGHDMA; diff --git a/drivers/net/tehuti.c b/drivers/net/tehuti.c index 2483431..e0da1e0 100644 --- a/drivers/net/tehuti.c +++ b/drivers/net/tehuti.c @@ -1606,7 +1606,6 @@ static inline int bdx_tx_space(struct bdx_priv *priv) * o NETDEV_TX_BUSY Cannot transmit packet, try later * Usually a bug, means queue start/stop flow control is broken in * the driver. Note: the driver must NOT put the skb in its DMA ring. - * o NETDEV_TX_LOCKED Locking failed, please retry quickly. 
*/ static int bdx_tx_transmit(struct sk_buff *skb, struct net_device *ndev) { @@ -1624,13 +1623,7 @@ static int bdx_tx_transmit(struct sk_buff *skb, struct net_device *ndev) unsigned long flags; ENTER; - local_irq_save(flags); - if (!spin_trylock(&priv->tx_lock)) { - local_irq_restore(flags); - DBG("%s[%s]: TX locked, returning NETDEV_TX_LOCKED\n", - BDX_DRV_NAME, ndev->name); - return NETDEV_TX_LOCKED; - } + spin_lock_irqsave(&priv->tx_lock, flags); /* build tx descriptor */ BDX_ASSERT(f->m.wptr >= f->m.memsz); /* started with valid wptr */ @@ -2048,9 +2041,6 @@ bdx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * between transmit and TX irq cleanup. In addition * set multicast list callback has to use priv->tx_lock. */ -#ifdef BDX_LLTX - ndev->features |= NETIF_F_LLTX; -#endif spin_lock_init(&priv->tx_lock); /*bdx_hw_reset(priv); */ diff --git a/drivers/net/tehuti.h b/drivers/net/tehuti.h index efd170f..c55f69a 100644 --- a/drivers/net/tehuti.h +++ b/drivers/net/tehuti.h @@ -35,7 +35,6 @@ /* Compile Time Switches */ /* start */ #define BDX_TSO -#define BDX_LLTX #define BDX_DELAY_WPTR /* #define BDX_MSI */ /* end */ @@ -270,7 +269,7 @@ struct bdx_priv { int tx_update_mark; int tx_noupd; #endif - spinlock_t tx_lock; /* NETIF_F_LLTX mode */ + spinlock_t tx_lock; /* rarely used */ u8 port; [-- Attachment #3: Type: text/plain, Size: 0 bytes --] ^ permalink raw reply related [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 12:44 ` [ofa-general] " Jeff Garzik @ 2007-10-09 12:55 ` Herbert Xu 2007-10-09 13:00 ` Jeff Garzik 2007-10-09 20:14 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Herbert Xu @ 2007-10-09 12:55 UTC (permalink / raw) To: Jeff Garzik Cc: jagana, gaagaan, Robert.Olsson, mcarlson, rdreier, peter.p.waskiewicz.jr, hadi, kaber, netdev, general, mchan, tgraf, randy.dunlap, johnpol, shemminger, David Miller, sri On Tue, Oct 09, 2007 at 08:44:25AM -0400, Jeff Garzik wrote: > David Miller wrote: > > > >I can just threaten to do them all and that should get the driver > >maintainers going :-) > > What, like this? :) Awsome :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 12:55 ` Herbert Xu @ 2007-10-09 13:00 ` Jeff Garzik 0 siblings, 0 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 13:00 UTC (permalink / raw) To: Herbert Xu Cc: David Miller, krkumar2, gaagaan, general, hadi, jagana, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rdreier, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma Herbert Xu wrote: > On Tue, Oct 09, 2007 at 08:44:25AM -0400, Jeff Garzik wrote: >> David Miller wrote: >>> I can just threaten to do them all and that should get the driver >>> maintainers going :-) >> What, like this? :) > > Awsome :) Note my patch is just to get the maintainers going. :) I'm not going to commit that, since I don't have any way to test any of the drivers I touched (but I wouldn't scream if it appeared in net-2.6.24 either) Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 12:44 ` [ofa-general] " Jeff Garzik 2007-10-09 12:55 ` Herbert Xu @ 2007-10-09 20:14 ` David Miller 2007-10-09 20:20 ` [ofa-general] " Jeff Garzik 1 sibling, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-09 20:14 UTC (permalink / raw) To: jeff Cc: krkumar2, gaagaan, general, hadi, herbert, jagana, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rdreier, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma From: Jeff Garzik <jeff@garzik.org> Date: Tue, 09 Oct 2007 08:44:25 -0400 > David Miller wrote: > > From: Krishna Kumar2 <krkumar2@in.ibm.com> > > Date: Tue, 9 Oct 2007 16:51:14 +0530 > > > >> David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: > >> > >>> Ignore LLTX, it sucks, it was a big mistake, and we will get rid of > >>> it. > >> Great, this will make life easy. Any idea how long that would take? > >> It seems simple enough to do. > > > > I'd say we can probably try to get rid of it in 2.6.25, this is > > assuming we get driver authors to cooperate and do the conversions > > or alternatively some other motivated person. > > > > I can just threaten to do them all and that should get the driver > > maintainers going :-) > > What, like this? :) Thanks, but it's probably going to need some corrections and/or an audit. If you unconditionally take those locks in the transmit function, there is probably an ABBA deadlock elsewhere in the driver now, most likely in the TX reclaim processing, and you therefore need to handle that too. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:14 ` David Miller @ 2007-10-09 20:20 ` Jeff Garzik 2007-10-09 21:25 ` David Miller 0 siblings, 1 reply; 79+ messages in thread From: Jeff Garzik @ 2007-10-09 20:20 UTC (permalink / raw) To: David Miller Cc: jagana, herbert, gaagaan, Robert.Olsson, mcarlson, rdreier, peter.p.waskiewicz.jr, hadi, netdev, general, mchan, tgraf, randy.dunlap, johnpol, shemminger, kaber, sri David Miller wrote: > From: Jeff Garzik <jeff@garzik.org> > Date: Tue, 09 Oct 2007 08:44:25 -0400 > >> David Miller wrote: >>> From: Krishna Kumar2 <krkumar2@in.ibm.com> >>> Date: Tue, 9 Oct 2007 16:51:14 +0530 >>> >>>> David Miller <davem@davemloft.net> wrote on 10/09/2007 04:32:55 PM: >>>> >>>>> Ignore LLTX, it sucks, it was a big mistake, and we will get rid of >>>>> it. >>>> Great, this will make life easy. Any idea how long that would take? >>>> It seems simple enough to do. >>> I'd say we can probably try to get rid of it in 2.6.25, this is >>> assuming we get driver authors to cooperate and do the conversions >>> or alternatively some other motivated person. >>> >>> I can just threaten to do them all and that should get the driver >>> maintainers going :-) >> What, like this? :) > > Thanks, but it's probably going to need some corrections and/or > an audit. I would be happy if someone wanted to audit that patch. > If you unconditionally take those locks in the transmit function, > there is probably an ABBA deadlock elsewhere in the driver now, most > likely in the TX reclaim processing, and you therefore need to handle > that too. And I most certainly checked the relevant transmit paths and other locking to make sure lock ordering was correct. Jeff ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:20 ` [ofa-general] " Jeff Garzik @ 2007-10-09 21:25 ` David Miller 0 siblings, 0 replies; 79+ messages in thread From: David Miller @ 2007-10-09 21:25 UTC (permalink / raw) To: jeff Cc: krkumar2, gaagaan, general, hadi, herbert, jagana, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rdreier, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma From: Jeff Garzik <jeff@garzik.org> Date: Tue, 09 Oct 2007 16:20:14 -0400 > David Miller wrote: > > If you unconditionally take those locks in the transmit function, > > there is probably an ABBA deadlock elsewhere in the driver now, most > > likely in the TX reclaim processing, and you therefore need to handle > > that too. > > And I most certainly checked the relevant transmit paths and other > locking to make sure lock ordering was correct. Awesome. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 11:24 ` David Miller 2007-10-09 12:44 ` [ofa-general] " Jeff Garzik @ 2007-10-09 20:22 ` Roland Dreier 2007-10-09 20:51 ` David Miller 1 sibling, 1 reply; 79+ messages in thread From: Roland Dreier @ 2007-10-09 20:22 UTC (permalink / raw) To: David Miller Cc: jagana, herbert, gaagaan, Robert.Olsson, mcarlson, peter.p.waskiewicz.jr, hadi, randy.dunlap, jeff, general, mchan, tgraf, netdev, johnpol, shemminger, kaber, sri > I'd say we can probably try to get rid of it in 2.6.25, this is > assuming we get driver authors to cooperate and do the conversions > or alternatively some other motivated person. > > I can just threaten to do them all and that should get the driver > maintainers going :-) I can definitely kill LLTX for IPoIB by 2.6.25 and I just added it to my TODO list so I don't forget. In fact if 2.6.23 drags on long enough I may do it for 2.6.24.... ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:22 ` [ofa-general] " Roland Dreier @ 2007-10-09 20:51 ` David Miller 2007-10-09 21:40 ` Roland Dreier 2007-10-09 22:44 ` [ofa-general] " Roland Dreier 0 siblings, 2 replies; 79+ messages in thread From: David Miller @ 2007-10-09 20:51 UTC (permalink / raw) To: rdreier Cc: krkumar2, gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma From: Roland Dreier <rdreier@cisco.com> Date: Tue, 09 Oct 2007 13:22:44 -0700 > I can definitely kill LLTX for IPoIB by 2.6.25 and I just added it to > my TODO list so I don't forget. > > In fact if 2.6.23 drags on long enough I may do it for 2.6.24.... Before you add new entries to your list, how is that ibm driver NAPI conversion coming along? :-) Right now that's a more pressing task to complete. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:51 ` David Miller @ 2007-10-09 21:40 ` Roland Dreier 2007-10-09 22:44 ` [ofa-general] " Roland Dreier 1 sibling, 0 replies; 79+ messages in thread From: Roland Dreier @ 2007-10-09 21:40 UTC (permalink / raw) To: David Miller Cc: krkumar2, gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, randy.dunlap, rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma > Before you add new entries to your list, how is that ibm driver NAPI > conversion coming along? :-) I still haven't done much. OK, I will try to get my board booting again this week. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 2/3][NET_BATCH] net core use batching 2007-10-09 20:51 ` David Miller 2007-10-09 21:40 ` Roland Dreier @ 2007-10-09 22:44 ` Roland Dreier 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier 1 sibling, 1 reply; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:44 UTC (permalink / raw) To: David Miller Cc: jagana, herbert, gaagaan, Robert.Olsson, mcarlson, peter.p.waskiewicz.jr, hadi, randy.dunlap, jeff, general, mchan, tgraf, netdev, johnpol, shemminger, kaber, sri > Before you add new entries to your list, how is that ibm driver NAPI > conversion coming along? :-) OK, thanks for the kick in the pants, I have a couple of patches for net-2.6.24 coming (including an unrelated trivial warning fix for IPoIB). - R. ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning 2007-10-09 22:44 ` [ofa-general] " Roland Dreier @ 2007-10-09 22:46 ` Roland Dreier 2007-10-09 22:47 ` [ofa-general] [PATCH 2/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier ` (4 more replies) 0 siblings, 5 replies; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:46 UTC (permalink / raw) To: David Miller; +Cc: netdev, general, jeff The conversion to use netdevice internal stats left an unused variable in ipoib_neigh_free(), since there's no longer any reason to get netdev_priv() in order to increment dropped packets. Delete the unused priv variable. Signed-off-by: Roland Dreier <rolandd@cisco.com> --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6b1b4b2..855c9de 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -854,7 +854,6 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) { - struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; *to_ipoib_neigh(neigh->neighbour) = NULL; while ((skb = __skb_dequeue(&neigh->queue))) { ^ permalink raw reply related [flat|nested] 79+ messages in thread
* [ofa-general] [PATCH 2/4] ibm_emac: Convert to use napi_struct independent of struct net_device 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier @ 2007-10-09 22:47 ` Roland Dreier 2007-10-09 22:47 ` [PATCH 3/4] ibm_new_emac: Nuke SET_MODULE_OWNER() use Roland Dreier ` (3 subsequent siblings) 4 siblings, 0 replies; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:47 UTC (permalink / raw) To: David Miller; +Cc: netdev, jeff, general Commit da3dedd9 ("[NET]: Make NAPI polling independent of struct net_device objects.") changed the interface to NAPI polling. Fix up the ibm_emac driver so that it works with this new interface. This is actually a nice cleanup because ibm_emac is one of the drivers that wants to have multiple NAPI structures for a single net_device. Tested with the internal MAC of a PowerPC 440SPe SoC with an AMCC 'Yucca' evaluation board. Signed-off-by: Roland Dreier <rolandd@cisco.com> --- drivers/net/ibm_emac/ibm_emac_mal.c | 48 ++++++++++++---------------------- drivers/net/ibm_emac/ibm_emac_mal.h | 2 +- include/linux/netdevice.h | 10 +++++++ 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/drivers/net/ibm_emac/ibm_emac_mal.c b/drivers/net/ibm_emac/ibm_emac_mal.c index cabd984..cc3ddc9 100644 --- a/drivers/net/ibm_emac/ibm_emac_mal.c +++ b/drivers/net/ibm_emac/ibm_emac_mal.c @@ -207,10 +207,10 @@ static irqreturn_t mal_serr(int irq, void *dev_instance) static inline void mal_schedule_poll(struct ibm_ocp_mal *mal) { - if (likely(netif_rx_schedule_prep(&mal->poll_dev))) { + if (likely(napi_schedule_prep(&mal->napi))) { MAL_DBG2("%d: schedule_poll" NL, mal->def->index); mal_disable_eob_irq(mal); - __netif_rx_schedule(&mal->poll_dev); + __napi_schedule(&mal->napi); } else MAL_DBG2("%d: already in poll" NL, mal->def->index); } @@ -273,11 +273,11 @@ static irqreturn_t mal_rxde(int irq, void *dev_instance) return IRQ_HANDLED; } -static int mal_poll(struct net_device *ndev, int *budget) 
+static int mal_poll(struct napi_struct *napi, int budget) { - struct ibm_ocp_mal *mal = ndev->priv; + struct ibm_ocp_mal *mal = container_of(napi, struct ibm_ocp_mal, napi); struct list_head *l; - int rx_work_limit = min(ndev->quota, *budget), received = 0, done; + int received = 0; MAL_DBG2("%d: poll(%d) %d ->" NL, mal->def->index, *budget, rx_work_limit); @@ -295,38 +295,34 @@ static int mal_poll(struct net_device *ndev, int *budget) list_for_each(l, &mal->poll_list) { struct mal_commac *mc = list_entry(l, struct mal_commac, poll_list); - int n = mc->ops->poll_rx(mc->dev, rx_work_limit); + int n = mc->ops->poll_rx(mc->dev, budget); if (n) { received += n; - rx_work_limit -= n; - if (rx_work_limit <= 0) { - done = 0; + budget -= n; + if (budget <= 0) goto more_work; // XXX What if this is the last one ? - } } } /* We need to disable IRQs to protect from RXDE IRQ here */ local_irq_disable(); - __netif_rx_complete(ndev); + __napi_complete(napi); mal_enable_eob_irq(mal); local_irq_enable(); - done = 1; - /* Check for "rotting" packet(s) */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = list_entry(l, struct mal_commac, poll_list); if (unlikely(mc->ops->peek_rx(mc->dev) || mc->rx_stopped)) { MAL_DBG2("%d: rotting packet" NL, mal->def->index); - if (netif_rx_reschedule(ndev, received)) + if (napi_reschedule(napi)) mal_disable_eob_irq(mal); else MAL_DBG2("%d: already in poll list" NL, mal->def->index); - if (rx_work_limit > 0) + if (budget > 0) goto again; else goto more_work; @@ -335,12 +331,8 @@ static int mal_poll(struct net_device *ndev, int *budget) } more_work: - ndev->quota -= received; - *budget -= received; - - MAL_DBG2("%d: poll() %d <- %d" NL, mal->def->index, *budget, - done ? 0 : 1); - return done ? 
0 : 1; + MAL_DBG2("%d: poll() %d <- %d" NL, mal->def->index, budget, received); + return received; } static void mal_reset(struct ibm_ocp_mal *mal) @@ -425,11 +417,8 @@ static int __init mal_probe(struct ocp_device *ocpdev) mal->def = ocpdev->def; INIT_LIST_HEAD(&mal->poll_list); - set_bit(__LINK_STATE_START, &mal->poll_dev.state); - mal->poll_dev.weight = CONFIG_IBM_EMAC_POLL_WEIGHT; - mal->poll_dev.poll = mal_poll; - mal->poll_dev.priv = mal; - atomic_set(&mal->poll_dev.refcnt, 1); + mal->napi.weight = CONFIG_IBM_EMAC_POLL_WEIGHT; + mal->napi.poll = mal_poll; INIT_LIST_HEAD(&mal->list); @@ -520,11 +509,8 @@ static void __exit mal_remove(struct ocp_device *ocpdev) MAL_DBG("%d: remove" NL, mal->def->index); - /* Syncronize with scheduled polling, - stolen from net/core/dev.c:dev_close() - */ - clear_bit(__LINK_STATE_START, &mal->poll_dev.state); - netif_poll_disable(&mal->poll_dev); + /* Synchronize with scheduled polling */ + napi_disable(&mal->napi); if (!list_empty(&mal->list)) { /* This is *very* bad */ diff --git a/drivers/net/ibm_emac/ibm_emac_mal.h b/drivers/net/ibm_emac/ibm_emac_mal.h index 64bc338..8f54d62 100644 --- a/drivers/net/ibm_emac/ibm_emac_mal.h +++ b/drivers/net/ibm_emac/ibm_emac_mal.h @@ -195,7 +195,7 @@ struct ibm_ocp_mal { dcr_host_t dcrhost; struct list_head poll_list; - struct net_device poll_dev; + struct napi_struct napi; struct list_head list; u32 tx_chan_mask; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 91cd3f3..4848c7a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -349,6 +349,16 @@ static inline void napi_schedule(struct napi_struct *n) __napi_schedule(n); } +/* Try to reschedule poll. Called by dev->poll() after napi_complete(). 
*/ +static inline int napi_reschedule(struct napi_struct *napi) +{ + if (napi_schedule_prep(napi)) { + __napi_schedule(napi); + return 1; + } + return 0; +} + /** * napi_complete - NAPI processing complete * @n: napi context ^ permalink raw reply related [flat|nested] 79+ messages in thread
* [PATCH 3/4] ibm_new_emac: Nuke SET_MODULE_OWNER() use 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier 2007-10-09 22:47 ` [ofa-general] [PATCH 2/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier @ 2007-10-09 22:47 ` Roland Dreier 2007-10-09 22:48 ` [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier ` (2 subsequent siblings) 4 siblings, 0 replies; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:47 UTC (permalink / raw) To: David Miller; +Cc: netdev, general, jeff Signed-off-by: Roland Dreier <rolandd@cisco.com> --- drivers/net/ibm_newemac/core.c | 1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c index ce127b9..8ea5009 100644 --- a/drivers/net/ibm_newemac/core.c +++ b/drivers/net/ibm_newemac/core.c @@ -2549,7 +2549,6 @@ static int __devinit emac_probe(struct of_device *ofdev, dev->ndev = ndev; dev->ofdev = ofdev; dev->blist = blist; - SET_MODULE_OWNER(ndev); SET_NETDEV_DEV(ndev, &ofdev->dev); /* Initialize some embedded data structures */ ^ permalink raw reply related [flat|nested] 79+ messages in thread
* [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier 2007-10-09 22:47 ` [ofa-general] [PATCH 2/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier 2007-10-09 22:47 ` [PATCH 3/4] ibm_new_emac: Nuke SET_MODULE_OWNER() use Roland Dreier @ 2007-10-09 22:48 ` Roland Dreier 2007-10-09 22:51 ` [ofa-general] " Roland Dreier 2007-10-09 23:17 ` [PATCH 1/4] IPoIB: Fix unused variable warning David Miller 2007-10-10 0:47 ` [ofa-general] " Jeff Garzik 4 siblings, 1 reply; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:48 UTC (permalink / raw) To: David Miller; +Cc: netdev, general, jeff Commit da3dedd9 ("[NET]: Make NAPI polling independent of struct net_device objects.") changed the interface to NAPI polling. Fix up the ibm_newemac driver so that it works with this new interface. This is actually a nice cleanup because ibm_newemac is one of the drivers that wants to have multiple NAPI structures for a single net_device. Compile-tested only as I don't have a system that uses the ibm_newemac driver. This conversion matches the conversion for the ibm_emac driver that was tested on real PowerPC 440SPe hardware. 
Signed-off-by: Roland Dreier <rolandd@cisco.com> --- drivers/net/ibm_newemac/mal.c | 55 ++++++++++++++-------------------------- drivers/net/ibm_newemac/mal.h | 2 +- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/drivers/net/ibm_newemac/mal.c b/drivers/net/ibm_newemac/mal.c index c4335b7..5885411 100644 --- a/drivers/net/ibm_newemac/mal.c +++ b/drivers/net/ibm_newemac/mal.c @@ -235,10 +235,10 @@ static irqreturn_t mal_serr(int irq, void *dev_instance) static inline void mal_schedule_poll(struct mal_instance *mal) { - if (likely(netif_rx_schedule_prep(&mal->poll_dev))) { + if (likely(napi_schedule_prep(&mal->napi))) { MAL_DBG2(mal, "schedule_poll" NL); mal_disable_eob_irq(mal); - __netif_rx_schedule(&mal->poll_dev); + __napi_schedule(&mal->napi); } else MAL_DBG2(mal, "already in poll" NL); } @@ -318,8 +318,7 @@ void mal_poll_disable(struct mal_instance *mal, struct mal_commac *commac) msleep(1); /* Synchronize with the MAL NAPI poller. */ - while (test_bit(__LINK_STATE_RX_SCHED, &mal->poll_dev.state)) - msleep(1); + napi_disable(&mal->napi); } void mal_poll_enable(struct mal_instance *mal, struct mal_commac *commac) @@ -330,11 +329,11 @@ void mal_poll_enable(struct mal_instance *mal, struct mal_commac *commac) // XXX might want to kick a poll now... 
} -static int mal_poll(struct net_device *ndev, int *budget) +static int mal_poll(struct napi_struct *napi, int budget) { - struct mal_instance *mal = netdev_priv(ndev); + struct mal_instance *mal = container_of(napi, struct mal_instance, napi); struct list_head *l; - int rx_work_limit = min(ndev->quota, *budget), received = 0, done; + int received = 0; unsigned long flags; MAL_DBG2(mal, "poll(%d) %d ->" NL, *budget, @@ -358,26 +357,21 @@ static int mal_poll(struct net_device *ndev, int *budget) int n; if (unlikely(test_bit(MAL_COMMAC_POLL_DISABLED, &mc->flags))) continue; - n = mc->ops->poll_rx(mc->dev, rx_work_limit); + n = mc->ops->poll_rx(mc->dev, budget); if (n) { received += n; - rx_work_limit -= n; - if (rx_work_limit <= 0) { - done = 0; - // XXX What if this is the last one ? - goto more_work; - } + budget -= n; + if (budget <= 0) + goto more_work; // XXX What if this is the last one ? } } /* We need to disable IRQs to protect from RXDE IRQ here */ spin_lock_irqsave(&mal->lock, flags); - __netif_rx_complete(ndev); + __napi_complete(napi); mal_enable_eob_irq(mal); spin_unlock_irqrestore(&mal->lock, flags); - done = 1; - /* Check for "rotting" packet(s) */ list_for_each(l, &mal->poll_list) { struct mal_commac *mc = @@ -387,12 +381,12 @@ static int mal_poll(struct net_device *ndev, int *budget) if (unlikely(mc->ops->peek_rx(mc->dev) || test_bit(MAL_COMMAC_RX_STOPPED, &mc->flags))) { MAL_DBG2(mal, "rotting packet" NL); - if (netif_rx_reschedule(ndev, received)) + if (napi_reschedule(napi)) mal_disable_eob_irq(mal); else MAL_DBG2(mal, "already in poll list" NL); - if (rx_work_limit > 0) + if (budget > 0) goto again; else goto more_work; @@ -401,13 +395,8 @@ static int mal_poll(struct net_device *ndev, int *budget) } more_work: - ndev->quota -= received; - *budget -= received; - - MAL_DBG2(mal, "poll() %d <- %d" NL, *budget, - done ? 0 : 1); - - return done ? 
0 : 1; + MAL_DBG2(mal, "poll() %d <- %d" NL, budget, received); + return received; } static void mal_reset(struct mal_instance *mal) @@ -538,11 +527,8 @@ static int __devinit mal_probe(struct of_device *ofdev, } INIT_LIST_HEAD(&mal->poll_list); - set_bit(__LINK_STATE_START, &mal->poll_dev.state); - mal->poll_dev.weight = CONFIG_IBM_NEW_EMAC_POLL_WEIGHT; - mal->poll_dev.poll = mal_poll; - mal->poll_dev.priv = mal; - atomic_set(&mal->poll_dev.refcnt, 1); + mal->napi.weight = CONFIG_IBM_NEW_EMAC_POLL_WEIGHT; + mal->napi.poll = mal_poll; INIT_LIST_HEAD(&mal->list); spin_lock_init(&mal->lock); @@ -653,11 +639,8 @@ static int __devexit mal_remove(struct of_device *ofdev) MAL_DBG(mal, "remove" NL); - /* Syncronize with scheduled polling, - stolen from net/core/dev.c:dev_close() - */ - clear_bit(__LINK_STATE_START, &mal->poll_dev.state); - netif_poll_disable(&mal->poll_dev); + /* Synchronize with scheduled polling */ + napi_disable(&mal->napi); if (!list_empty(&mal->list)) { /* This is *very* bad */ diff --git a/drivers/net/ibm_newemac/mal.h b/drivers/net/ibm_newemac/mal.h index 57b69dc..cb1a16d 100644 --- a/drivers/net/ibm_newemac/mal.h +++ b/drivers/net/ibm_newemac/mal.h @@ -197,7 +197,7 @@ struct mal_instance { int serr_irq; /* MAL System Error IRQ */ struct list_head poll_list; - struct net_device poll_dev; + struct napi_struct napi; struct list_head list; u32 tx_chan_mask; ^ permalink raw reply related [flat|nested] 79+ messages in thread
* Re: [ofa-general] [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device 2007-10-09 22:48 ` [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier @ 2007-10-09 22:51 ` Roland Dreier 0 siblings, 0 replies; 79+ messages in thread From: Roland Dreier @ 2007-10-09 22:51 UTC (permalink / raw) To: David Miller; +Cc: netdev, general, jeff Sorry... wrong subject here; it should have been "ibm_newemac: ..." - R. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 1/4] IPoIB: Fix unused variable warning 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier ` (2 preceding siblings ...) 2007-10-09 22:48 ` [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier @ 2007-10-09 23:17 ` David Miller 2007-10-10 0:32 ` Jeff Garzik 2007-10-10 0:47 ` [ofa-general] " Jeff Garzik 4 siblings, 1 reply; 79+ messages in thread From: David Miller @ 2007-10-09 23:17 UTC (permalink / raw) To: rdreier; +Cc: jeff, general, netdev From: Roland Dreier <rdreier@cisco.com> Date: Tue, 09 Oct 2007 15:46:13 -0700 > The conversion to use netdevice internal stats left an unused variable > in ipoib_neigh_free(), since there's no longer any reason to get > netdev_priv() in order to increment dropped packets. Delete the > unused priv variable. > > Signed-off-by: Roland Dreier <rolandd@cisco.com> Jeff, do you want to merge in Roland's 4 patches to your tree then do a sync with me so I can pull it all in from you? Alternative I can merge in Roland's work directly if that's easier for you. Just let me know. ^ permalink raw reply [flat|nested] 79+ messages in thread
* Re: [PATCH 1/4] IPoIB: Fix unused variable warning 2007-10-09 23:17 ` [PATCH 1/4] IPoIB: Fix unused variable warning David Miller @ 2007-10-10 0:32 ` Jeff Garzik 0 siblings, 0 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-10 0:32 UTC (permalink / raw) To: David Miller; +Cc: rdreier, general, netdev David Miller wrote: > From: Roland Dreier <rdreier@cisco.com> > Date: Tue, 09 Oct 2007 15:46:13 -0700 > >> The conversion to use netdevice internal stats left an unused variable >> in ipoib_neigh_free(), since there's no longer any reason to get >> netdev_priv() in order to increment dropped packets. Delete the >> unused priv variable. >> >> Signed-off-by: Roland Dreier <rolandd@cisco.com> > > Jeff, do you want to merge in Roland's 4 patches to your tree then do > a sync with me so I can pull it all in from you? Grabbing them now... ^ permalink raw reply [flat|nested] 79+ messages in thread
* [ofa-general] Re: [PATCH 1/4] IPoIB: Fix unused variable warning 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier ` (3 preceding siblings ...) 2007-10-09 23:17 ` [PATCH 1/4] IPoIB: Fix unused variable warning David Miller @ 2007-10-10 0:47 ` Jeff Garzik 4 siblings, 0 replies; 79+ messages in thread From: Jeff Garzik @ 2007-10-10 0:47 UTC (permalink / raw) To: Roland Dreier; +Cc: netdev, David Miller, general Roland Dreier wrote: > The conversion to use netdevice internal stats left an unused variable > in ipoib_neigh_free(), since there's no longer any reason to get > netdev_priv() in order to increment dropped packets. Delete the > unused priv variable. > > Signed-off-by: Roland Dreier <rolandd@cisco.com> > --- > drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 - > 1 files changed, 0 insertions(+), 1 deletions(-) applied 1-4 ^ permalink raw reply [flat|nested] 79+ messages in thread
end of thread, other threads:[~2007-10-12 18:29 UTC | newest] Thread overview: 79+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2007-10-08 18:26 [PATCH 2/3][NET_BATCH] net core use batching jamal 2007-10-08 19:46 ` Waskiewicz Jr, Peter P 2007-10-08 20:48 ` jamal 2007-10-08 21:26 ` [ofa-general] " David Miller 2007-10-08 22:34 ` jamal 2007-10-08 22:36 ` [ofa-general] " Waskiewicz Jr, Peter P 2007-10-08 22:33 ` Waskiewicz Jr, Peter P 2007-10-08 23:40 ` jamal 2007-10-09 1:13 ` Jeff Garzik 2007-10-09 1:41 ` [ofa-general] " David Miller 2007-10-09 2:01 ` Herbert Xu 2007-10-09 2:03 ` Herbert Xu 2007-10-09 2:04 ` Herbert Xu 2007-10-09 2:15 ` jamal 2007-10-09 2:16 ` Herbert Xu 2007-10-09 2:19 ` [ofa-general] " jamal 2007-10-09 2:20 ` Herbert Xu 2007-10-09 2:45 ` [ofa-general] " David Miller 2007-10-09 2:43 ` David Miller 2007-10-09 2:46 ` Herbert Xu 2007-10-09 2:12 ` [ofa-general] " Jeff Garzik 2007-10-09 2:46 ` David Miller 2007-10-09 18:48 ` [ofa-general] " Waskiewicz Jr, Peter P 2007-10-09 19:04 ` Jeff Garzik 2007-10-09 19:07 ` Waskiewicz Jr, Peter P 2007-10-09 2:14 ` [ofa-general] " jamal 2007-10-09 2:16 ` Herbert Xu 2007-10-09 2:47 ` [ofa-general] " David Miller 2007-10-09 16:51 ` Andi Kleen 2007-10-09 18:22 ` Stephen Hemminger 2007-10-09 18:30 ` Andi Kleen 2007-10-09 20:43 ` David Miller 2007-10-09 20:53 ` Stephen Hemminger 2007-10-09 21:22 ` David Miller 2007-10-09 21:56 ` jamal 2007-10-10 0:04 ` David Miller 2007-10-10 0:37 ` Andi Kleen 2007-10-10 0:50 ` David Miller 2007-10-10 9:16 ` Andi Kleen 2007-10-10 9:25 ` David Miller 2007-10-10 10:23 ` Andi Kleen 2007-10-10 10:44 ` David Miller 2007-10-10 13:08 ` jamal 2007-10-10 22:37 ` David Miller 2007-10-10 15:35 ` Waskiewicz Jr, Peter P 2007-10-10 16:02 ` Andi Kleen 2007-10-10 16:42 ` Waskiewicz Jr, Peter P 2007-10-10 9:53 ` Herbert Xu 2007-10-12 16:08 ` Brandeburg, Jesse 2007-10-12 17:05 ` Stephen Hemminger 2007-10-12 18:29 ` Andi Kleen 2007-10-12 18:27 ` Andi Kleen 
2007-10-10 16:02 ` Bill Fink 2007-10-10 22:53 ` David Miller 2007-10-11 6:52 ` Krishna Kumar2 2007-10-09 1:31 ` Jeff Garzik 2007-10-09 10:58 ` [ofa-general] " Krishna Kumar2 2007-10-09 11:02 ` David Miller 2007-10-09 11:20 ` [ofa-general] " Krishna Kumar2 2007-10-09 11:21 ` Krishna Kumar2 2007-10-09 11:24 ` David Miller 2007-10-09 12:44 ` [ofa-general] " Jeff Garzik 2007-10-09 12:55 ` Herbert Xu 2007-10-09 13:00 ` Jeff Garzik 2007-10-09 20:14 ` David Miller 2007-10-09 20:20 ` [ofa-general] " Jeff Garzik 2007-10-09 21:25 ` David Miller 2007-10-09 20:22 ` [ofa-general] " Roland Dreier 2007-10-09 20:51 ` David Miller 2007-10-09 21:40 ` Roland Dreier 2007-10-09 22:44 ` [ofa-general] " Roland Dreier 2007-10-09 22:46 ` [ofa-general] [PATCH 1/4] IPoIB: Fix unused variable warning Roland Dreier 2007-10-09 22:47 ` [ofa-general] [PATCH 2/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier 2007-10-09 22:47 ` [PATCH 3/4] ibm_new_emac: Nuke SET_MODULE_OWNER() use Roland Dreier 2007-10-09 22:48 ` [PATCH 4/4] ibm_emac: Convert to use napi_struct independent of struct net_device Roland Dreier 2007-10-09 22:51 ` [ofa-general] " Roland Dreier 2007-10-09 23:17 ` [PATCH 1/4] IPoIB: Fix unused variable warning David Miller 2007-10-10 0:32 ` Jeff Garzik 2007-10-10 0:47 ` [ofa-general] " Jeff Garzik
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).