* [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
@ 2008-09-18 6:43 Alexander Duyck
2008-09-18 6:56 ` Alexander Duyck
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Alexander Duyck @ 2008-09-18 6:43 UTC (permalink / raw)
To: netdev; +Cc: jarkao2, herbert, davem, kaber
This this patch is mangled I appologize, this is my first try sending
a patch directly to netdev.
The patch below is my attempt to resolve the issue of qdisc_run
only checking the state of queue zero before running. This approach
essentially makes the qdisc layer smart enough to do its own check to
see if a hw queue is stopped instead of relying on other calls to check
beforehand.
I have been able to verify functionality for most qdiscs with the
exceptions of netem, red, sfq, and tbf. I am not familiar with the
operation of these qdiscs and so I am not certain how to avoid the high
drop rate I am currently seeing when using these qdiscs.
The main advantages of this patch can be seen using a netperf UDP_STREAM
test to a slow interface with multiple queues and a qdisc such as pfifo,
bfifo, or prio. For my testing I used an 82575 with 4 queues on a
system with 8 CPUs. With the old method, whenever any queue other than 0
was used the CPU utilization for one core would go to 100%; with this new
approach the CPU utilization for all queues stayed at the same level that
queue 0 showed with the old approach.
---
This patch changes the behavior of sch->dequeue so that a packet is only
dequeued if the hardware queue it is bound for is not currently
stopped. This functionality is provided via a new op called
smart_dequeue.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
include/net/pkt_sched.h | 5 --
include/net/sch_generic.h | 27 ++++++++++
net/sched/sch_atm.c | 22 ++++++++
net/sched/sch_blackhole.c | 1
net/sched/sch_cbq.c | 118 ++++++++++++++++++++++++++++++++++++++++++++-
net/sched/sch_dsmark.c | 47 ++++++++++++++++++
net/sched/sch_fifo.c | 2 +
net/sched/sch_generic.c | 30 +++++++++--
net/sched/sch_gred.c | 34 +++++++++++++
net/sched/sch_hfsc.c | 86 ++++++++++++++++++++++++++++++++-
net/sched/sch_htb.c | 82 ++++++++++++++++++++++++++++++-
net/sched/sch_multiq.c | 45 ++++++++++++++---
net/sched/sch_netem.c | 40 +++++++++++++++
net/sched/sch_prio.c | 23 +++++++++
net/sched/sch_red.c | 22 ++++++++
net/sched/sch_sfq.c | 46 ++++++++++++++++--
net/sched/sch_tbf.c | 66 +++++++++++++++++++++++++
net/sched/sch_teql.c | 49 ++++++++++++++++---
18 files changed, 706 insertions(+), 39 deletions(-)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b786a5b..4082f39 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,10 +90,7 @@ extern void __qdisc_run(struct Qdisc *q);
static inline void qdisc_run(struct Qdisc *q)
{
- struct netdev_queue *txq = q->dev_queue;
-
- if (!netif_tx_queue_stopped(txq) &&
- !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
+ if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
__qdisc_run(q);
}
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e556962..4400a18 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -45,6 +45,7 @@ struct Qdisc
#define TCQ_F_BUILTIN 1
#define TCQ_F_THROTTLED 2
#define TCQ_F_INGRESS 4
+#define TCQ_F_STOPPED 8
int padded;
struct Qdisc_ops *ops;
struct qdisc_size_table *stab;
@@ -110,6 +111,7 @@ struct Qdisc_ops
int (*enqueue)(struct sk_buff *, struct Qdisc *);
struct sk_buff * (*dequeue)(struct Qdisc *);
+ struct sk_buff * (*smart_dequeue)(struct Qdisc *);
int (*requeue)(struct sk_buff *, struct Qdisc *);
unsigned int (*drop)(struct Qdisc *);
@@ -399,6 +401,31 @@ static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
return __qdisc_enqueue_tail(skb, sch, &sch->q);
}
+static inline struct sk_buff *__qdisc_smart_dequeue(struct Qdisc *sch,
+ struct sk_buff_head *list)
+{
+ struct sk_buff *skb = skb_peek(list);
+ struct netdev_queue *txq;
+
+ if (!skb)
+ return NULL;
+
+ txq = netdev_get_tx_queue(qdisc_dev(sch), skb_get_queue_mapping(skb));
+ if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+ __skb_unlink(skb, list);
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ sch->flags &= ~TCQ_F_STOPPED;
+ return skb;
+}
+
+static inline struct sk_buff *qdisc_smart_dequeue(struct Qdisc *sch)
+{
+ return __qdisc_smart_dequeue(sch, &sch->q);
+}
+
static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch,
struct sk_buff_head *list)
{
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 43d3725..91a40b2 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -516,12 +516,31 @@ static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
tasklet_schedule(&p->task);
- skb = p->link.q->dequeue(p->link.q);
+ skb = p->link.q->ops->dequeue(p->link.q);
if (skb)
sch->q.qlen--;
return skb;
}
+static struct sk_buff *atm_tc_smart_dequeue(struct Qdisc *sch)
+{
+ struct atm_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ pr_debug("atm_tc_smart_dequeue(sch %p,[qdisc %p])\n", sch, p);
+ tasklet_schedule(&p->task);
+ skb = p->link.q->dequeue(p->link.q);
+ if (skb) {
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
+ } else {
+ if (p->link.q->flags & TCQ_F_STOPPED)
+ sch->flags |= TCQ_F_STOPPED;
+ }
+
+ return skb;
+}
+
static int atm_tc_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
@@ -694,6 +713,7 @@ static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct atm_qdisc_data),
.enqueue = atm_tc_enqueue,
.dequeue = atm_tc_dequeue,
+ .smart_dequeue = atm_tc_smart_dequeue,
.requeue = atm_tc_requeue,
.drop = atm_tc_drop,
.init = atm_tc_init,
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
index 507fb48..48e6909 100644
--- a/net/sched/sch_blackhole.c
+++ b/net/sched/sch_blackhole.c
@@ -33,6 +33,7 @@ static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
.priv_size = 0,
.enqueue = blackhole_enqueue,
.dequeue = blackhole_dequeue,
+ .smart_dequeue = blackhole_dequeue,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 8b06fa9..5ec6040 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -851,7 +851,7 @@ cbq_under_limit(struct cbq_class *cl)
}
static __inline__ struct sk_buff *
-cbq_dequeue_prio(struct Qdisc *sch, int prio)
+cbq_dequeue_prio(struct Qdisc *sch, int prio, int *stopped)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl_tail, *cl_prev, *cl;
@@ -881,7 +881,10 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
goto next_class;
}
- skb = cl->q->dequeue(cl->q);
+ if (stopped)
+ skb = cl->q->dequeue(cl->q);
+ else
+ skb = cl->q->ops->dequeue(cl->q);
/* Class did not give us any skb :-(
It could occur even if cl->q->q.qlen != 0
@@ -912,6 +915,11 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
return skb;
skip_class:
+ if (stopped && (cl->q->flags & TCQ_F_STOPPED)) {
+ *stopped = true;
+ return NULL;
+ }
+
if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
/* Class is empty or penalized.
Unlink it from active chain.
@@ -964,7 +972,7 @@ cbq_dequeue_1(struct Qdisc *sch)
while (activemask) {
int prio = ffz(~activemask);
activemask &= ~(1<<prio);
- skb = cbq_dequeue_prio(sch, prio);
+ skb = cbq_dequeue_prio(sch, prio, NULL);
if (skb)
return skb;
}
@@ -1048,6 +1056,109 @@ cbq_dequeue(struct Qdisc *sch)
return NULL;
}
+static __inline__ struct sk_buff *
+cbq_smart_dequeue_1(struct Qdisc *sch)
+{
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ unsigned activemask;
+ int stopped = false;
+
+ activemask = q->activemask&0xFF;
+ while (activemask) {
+ int prio = ffz(~activemask);
+ activemask &= ~(1<<prio);
+ skb = cbq_dequeue_prio(sch, prio, &stopped);
+ if (skb)
+ return skb;
+ if (stopped) {
+ sch->flags |= TCQ_F_STOPPED;
+ break;
+ }
+ }
+ return NULL;
+}
+
+static struct sk_buff *
+cbq_smart_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ struct cbq_sched_data *q = qdisc_priv(sch);
+ psched_time_t now;
+ psched_tdiff_t incr;
+
+ now = psched_get_time();
+ incr = now - q->now_rt;
+
+ if (q->tx_class) {
+ psched_tdiff_t incr2;
+ /* Time integrator. We calculate EOS time
+ by adding expected packet transmission time.
+ If real time is greater, we warp artificial clock,
+ so that:
+
+ cbq_time = max(real_time, work);
+ */
+ incr2 = L2T(&q->link, q->tx_len);
+ q->now += incr2;
+ cbq_update(q);
+ incr -= incr2;
+ if (incr < 0)
+ incr = 0;
+ }
+ q->now += incr;
+ q->now_rt = now;
+
+ for (;;) {
+ q->wd_expires = 0;
+
+ skb = cbq_smart_dequeue_1(sch);
+ if (skb) {
+ sch->q.qlen--;
+ sch->flags &= ~(TCQ_F_THROTTLED | TCQ_F_STOPPED);
+ return skb;
+ }
+
+ if (sch->flags & TCQ_F_STOPPED)
+ return NULL;
+
+ /* All the classes are overlimit.
+
+ It is possible, if:
+
+ 1. Scheduler is empty.
+ 2. Toplevel cutoff inhibited borrowing.
+ 3. Root class is overlimit.
+
+ Reset 2d and 3d conditions and retry.
+
+ Note, that NS and cbq-2.0 are buggy, peeking
+ an arbitrary class is appropriate for ancestor-only
+ sharing, but not for toplevel algorithm.
+
+ Our version is better, but slower, because it requires
+ two passes, but it is unavoidable with top-level sharing.
+ */
+
+ if (q->toplevel == TC_CBQ_MAXLEVEL &&
+ q->link.undertime == PSCHED_PASTPERFECT)
+ break;
+
+ q->toplevel = TC_CBQ_MAXLEVEL;
+ q->link.undertime = PSCHED_PASTPERFECT;
+ }
+
+ /* No packets in scheduler or nobody wants to give them to us :-(
+ Sigh... start watchdog timer in the last case. */
+
+ if (sch->q.qlen) {
+ sch->qstats.overlimits++;
+ if (q->wd_expires)
+ qdisc_watchdog_schedule(&q->watchdog,
+ now + q->wd_expires);
+ }
+ return NULL;
+}
/* CBQ class maintanance routines */
static void cbq_adjust_levels(struct cbq_class *this)
@@ -2065,6 +2176,7 @@ static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct cbq_sched_data),
.enqueue = cbq_enqueue,
.dequeue = cbq_dequeue,
+ .smart_dequeue = cbq_smart_dequeue,
.requeue = cbq_requeue,
.drop = cbq_drop,
.init = cbq_init,
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index edd1298..21da7af 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -313,6 +313,52 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
return skb;
}
+static struct sk_buff *dsmark_smart_dequeue(struct Qdisc *sch)
+{
+ struct dsmark_qdisc_data *p = qdisc_priv(sch);
+ struct sk_buff *skb;
+ u32 index;
+
+ pr_debug("dsmark_smart_dequeue(sch %p,[qdisc %p])\n", sch, p);
+
+ skb = p->q->dequeue(p->q);
+ if (skb == NULL) {
+ if (p->q->flags & TCQ_F_STOPPED)
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
+
+ index = skb->tc_index & (p->indices - 1);
+ pr_debug("index %d->%d\n", skb->tc_index, index);
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
+ p->value[index]);
+ break;
+ case __constant_htons(ETH_P_IPV6):
+ ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
+ p->value[index]);
+ break;
+ default:
+ /*
+ * Only complain if a change was actually attempted.
+ * This way, we can send non-IP traffic through dsmark
+ * and don't need yet another qdisc as a bypass.
+ */
+ if (p->mask[index] != 0xff || p->value[index])
+ printk(KERN_WARNING
+ "dsmark_smart_dequeue: unsupported protocol %d"
+ "\n", ntohs(skb->protocol));
+ break;
+ }
+
+ return skb;
+}
+
static int dsmark_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
@@ -496,6 +542,7 @@ static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct dsmark_qdisc_data),
.enqueue = dsmark_enqueue,
.dequeue = dsmark_dequeue,
+ .smart_dequeue = dsmark_smart_dequeue,
.requeue = dsmark_requeue,
.drop = dsmark_drop,
.init = dsmark_init,
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 23d258b..15f28f6 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -83,6 +83,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = pfifo_enqueue,
.dequeue = qdisc_dequeue_head,
+ .smart_dequeue = qdisc_smart_dequeue,
.requeue = qdisc_requeue,
.drop = qdisc_queue_drop,
.init = fifo_init,
@@ -98,6 +99,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = bfifo_enqueue,
.dequeue = qdisc_dequeue_head,
+ .smart_dequeue = qdisc_smart_dequeue,
.requeue = qdisc_requeue,
.drop = qdisc_queue_drop,
.init = fifo_init,
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index ec0a083..f32cb83 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -135,8 +135,7 @@ static inline int qdisc_restart(struct Qdisc *q)
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
HARD_TX_LOCK(dev, txq, smp_processor_id());
- if (!netif_tx_queue_stopped(txq) &&
- !netif_tx_queue_frozen(txq))
+ if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
@@ -163,10 +162,6 @@ static inline int qdisc_restart(struct Qdisc *q)
break;
}
- if (ret && (netif_tx_queue_stopped(txq) ||
- netif_tx_queue_frozen(txq)))
- ret = 0;
-
return ret;
}
@@ -313,6 +308,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = {
.priv_size = 0,
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
+ .smart_dequeue = noop_dequeue,
.requeue = noop_requeue,
.owner = THIS_MODULE,
};
@@ -337,6 +333,7 @@ static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.priv_size = 0,
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
+ .smart_dequeue = noop_dequeue,
.requeue = noop_requeue,
.owner = THIS_MODULE,
};
@@ -400,6 +397,24 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
return NULL;
}
+static struct sk_buff *pfifo_fast_smart_dequeue(struct Qdisc* qdisc)
+{
+ int prio;
+ struct sk_buff_head *list = qdisc_priv(qdisc);
+ struct sk_buff *skb;
+
+ for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
+ if (!skb_queue_empty(list + prio)) {
+ skb = __qdisc_smart_dequeue(qdisc, list + prio);
+ if (skb != NULL)
+ qdisc->q.qlen--;
+ return skb;
+ }
+ }
+
+ return NULL;
+}
+
static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
qdisc->q.qlen++;
@@ -446,6 +461,7 @@ static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
.enqueue = pfifo_fast_enqueue,
.dequeue = pfifo_fast_dequeue,
+ .smart_dequeue = pfifo_fast_smart_dequeue,
.requeue = pfifo_fast_requeue,
.init = pfifo_fast_init,
.reset = pfifo_fast_reset,
@@ -475,7 +491,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
skb_queue_head_init(&sch->q);
sch->ops = ops;
sch->enqueue = ops->enqueue;
- sch->dequeue = ops->dequeue;
+ sch->dequeue = ops->smart_dequeue;
sch->dev_queue = dev_queue;
dev_hold(qdisc_dev(sch));
atomic_set(&sch->refcnt, 1);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index c1ad6b8..5d1654f 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -292,6 +292,39 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
return NULL;
}
+static struct sk_buff *gred_smart_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct gred_sched *t = qdisc_priv(sch);
+
+ skb = qdisc_smart_dequeue(sch);
+
+ if (skb) {
+ struct gred_sched_data *q;
+ u16 dp = tc_index_to_dp(skb);
+
+ if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "GRED: Unable to relocate "
+ "VQ 0x%x after dequeue, screwing up "
+ "backlog.\n", tc_index_to_dp(skb));
+ } else {
+ q->backlog -= qdisc_pkt_len(skb);
+
+ if (!q->backlog && !gred_wred_mode(t))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return skb;
+ }
+
+ if (!(sch->flags & TCQ_F_STOPPED) && gred_wred_mode(t) &&
+ !red_is_idling(&t->wred_set))
+ red_start_of_idle_period(&t->wred_set);
+
+ return NULL;
+}
+
static unsigned int gred_drop(struct Qdisc* sch)
{
struct sk_buff *skb;
@@ -602,6 +635,7 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct gred_sched),
.enqueue = gred_enqueue,
.dequeue = gred_dequeue,
+ .smart_dequeue = gred_smart_dequeue,
.requeue = gred_requeue,
.drop = gred_drop,
.init = gred_init,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index c1e77da..2060250 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -889,7 +889,7 @@ qdisc_peek_len(struct Qdisc *sch)
struct sk_buff *skb;
unsigned int len;
- skb = sch->dequeue(sch);
+ skb = sch->ops->dequeue(sch);
if (skb == NULL) {
if (net_ratelimit())
printk("qdisc_peek_len: non work-conserving qdisc ?\n");
@@ -1642,7 +1642,7 @@ hfsc_dequeue(struct Qdisc *sch)
}
}
- skb = cl->qdisc->dequeue(cl->qdisc);
+ skb = cl->qdisc->ops->dequeue(cl->qdisc);
if (skb == NULL) {
if (net_ratelimit())
printk("HFSC: Non-work-conserving qdisc ?\n");
@@ -1674,6 +1674,87 @@ hfsc_dequeue(struct Qdisc *sch)
return skb;
}
+static struct sk_buff *
+hfsc_smart_dequeue(struct Qdisc *sch)
+{
+ struct hfsc_sched *q = qdisc_priv(sch);
+ struct hfsc_class *cl;
+ struct sk_buff *skb;
+ u64 cur_time;
+ unsigned int next_len;
+ int realtime = 0;
+
+ if (sch->q.qlen == 0)
+ return NULL;
+ skb = skb_peek(&q->requeue);
+ if (skb) {
+ struct netdev_queue *txq;
+ txq = netdev_get_tx_queue(qdisc_dev(sch),
+ skb_get_queue_mapping(skb));
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq)) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+ __skb_unlink(skb, &q->requeue);
+ goto out;
+ }
+
+ cur_time = psched_get_time();
+
+ /*
+ * if there are eligible classes, use real-time criteria.
+ * find the class with the minimum deadline among
+ * the eligible classes.
+ */
+ cl = eltree_get_mindl(q, cur_time);
+ if (cl != NULL) {
+ realtime = 1;
+ } else {
+ /*
+ * use link-sharing criteria
+ * get the class with the minimum vt in the hierarchy
+ */
+ cl = vttree_get_minvt(&q->root, cur_time);
+ if (cl == NULL) {
+ sch->qstats.overlimits++;
+ hfsc_schedule_watchdog(sch);
+ return NULL;
+ }
+ }
+
+ skb = cl->qdisc->dequeue(cl->qdisc);
+ if (skb == NULL) {
+ if (net_ratelimit())
+ printk("HFSC: Non-work-conserving qdisc ?\n");
+ return NULL;
+ }
+
+ update_vf(cl, qdisc_pkt_len(skb), cur_time);
+ if (realtime)
+ cl->cl_cumul += qdisc_pkt_len(skb);
+
+ if (cl->qdisc->q.qlen != 0) {
+ if (cl->cl_flags & HFSC_RSC) {
+ /* update ed */
+ next_len = qdisc_peek_len(cl->qdisc);
+ if (realtime)
+ update_ed(cl, next_len);
+ else
+ update_d(cl, next_len);
+ }
+ } else {
+ /* the class becomes passive */
+ set_passive(cl);
+ }
+
+ out:
+ sch->flags &= ~(TCQ_F_THROTTLED | TCQ_F_STOPPED);
+ sch->q.qlen--;
+
+ return skb;
+}
+
static int
hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
@@ -1735,6 +1816,7 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = {
.dump = hfsc_dump_qdisc,
.enqueue = hfsc_enqueue,
.dequeue = hfsc_dequeue,
+ .smart_dequeue = hfsc_smart_dequeue,
.requeue = hfsc_requeue,
.drop = hfsc_drop,
.cl_ops = &hfsc_class_ops,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index d14f020..4da1a85 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -803,7 +803,7 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
/* dequeues packet at given priority and level; call only if
you are sure that there is active class at prio/level */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
- int level)
+ int level, int *stopped)
{
struct sk_buff *skb = NULL;
struct htb_class *cl, *start;
@@ -840,9 +840,17 @@ next:
goto next;
}
- skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ if (stopped)
+ skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+ else
+ skb = cl->un.leaf.q->ops->dequeue(cl->un.leaf.q);
+
if (likely(skb != NULL))
break;
+ if (stopped && (cl->un.leaf.q->flags & TCQ_F_STOPPED)) {
+ *stopped = true;
+ break;
+ }
if (!cl->warned) {
printk(KERN_WARNING
"htb: class %X isn't work conserving ?!\n",
@@ -915,7 +923,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
while (m != (int)(-1)) {
int prio = ffz(m);
m |= 1 << prio;
- skb = htb_dequeue_tree(q, prio, level);
+ skb = htb_dequeue_tree(q, prio, level, NULL);
if (likely(skb != NULL)) {
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
@@ -929,6 +937,73 @@ fin:
return skb;
}
+static struct sk_buff *htb_smart_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb = NULL;
+ struct htb_sched *q = qdisc_priv(sch);
+ int level, stopped = false;
+ psched_time_t next_event;
+
+ /* try to dequeue direct packets as high prio (!) to minimize cpu work */
+ skb = skb_peek(&q->direct_queue);
+ if (skb) {
+ struct netdev_queue *txq;
+ txq = netdev_get_tx_queue(qdisc_dev(sch),
+ skb_get_queue_mapping(skb));
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq)) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+ __skb_unlink(skb, &q->direct_queue);
+ sch->flags &= ~(TCQ_F_THROTTLED | TCQ_F_STOPPED);
+ sch->q.qlen--;
+ return skb;
+ }
+
+ if (!sch->q.qlen)
+ goto fin;
+ q->now = psched_get_time();
+
+ next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
+ q->nwc_hit = 0;
+ for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
+ /* common case optimization - skip event handler quickly */
+ int m;
+ psched_time_t event;
+
+ if (q->now >= q->near_ev_cache[level]) {
+ event = htb_do_events(q, level);
+ if (!event)
+ event = q->now + PSCHED_TICKS_PER_SEC;
+ q->near_ev_cache[level] = event;
+ } else
+ event = q->near_ev_cache[level];
+
+ if (event && next_event > event)
+ next_event = event;
+
+ m = ~q->row_mask[level];
+ while (m != (int)(-1)) {
+ int prio = ffz(m);
+ m |= 1 << prio;
+ skb = htb_dequeue_tree(q, prio, level, &stopped);
+ if (likely(skb != NULL)) {
+ sch->q.qlen--;
+ sch->flags &= ~(TCQ_F_THROTTLED |
+ TCQ_F_STOPPED);
+ goto fin;
+ }
+ if (stopped)
+ goto fin;
+ }
+ }
+ sch->qstats.overlimits++;
+ qdisc_watchdog_schedule(&q->watchdog, next_event);
+fin:
+ return skb;
+}
+
/* try to drop from each class (by prio) until one succeed */
static unsigned int htb_drop(struct Qdisc *sch)
{
@@ -1565,6 +1640,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct htb_sched),
.enqueue = htb_enqueue,
.dequeue = htb_dequeue,
+ .smart_dequeue = htb_smart_dequeue,
.requeue = htb_requeue,
.drop = htb_drop,
.init = htb_init,
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 7f4dbf0..e201171 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -142,15 +142,45 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid excessive requeues
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
- qdisc = q->queues[q->curband];
- skb = qdisc->dequeue(qdisc);
- if (skb) {
- sch->q.qlen--;
- return skb;
- }
+ qdisc = q->queues[q->curband];
+ skb = qdisc->ops->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static struct sk_buff *multiq_smart_dequeue(struct Qdisc *sch)
+{
+ struct multiq_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc;
+ struct sk_buff *skb;
+ int band, stopped = 0;
+
+ for (band = 0; band < q->bands; band++) {
+ /* cycle through bands to ensure fairness */
+ q->curband++;
+ if (q->curband >= q->bands)
+ q->curband = 0;
+
+ /* Check that target subqueue is available before
+ * pulling an skb to avoid excessive requeues
+ */
+ qdisc = q->queues[q->curband];
+ skb = qdisc->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
+ return skb;
}
+ if (qdisc->flags & TCQ_F_STOPPED)
+ stopped++;
}
+ if (stopped)
+ sch->flags |= TCQ_F_STOPPED;
return NULL;
}
@@ -448,6 +478,7 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct multiq_sched_data),
.enqueue = multiq_enqueue,
.dequeue = multiq_dequeue,
+ .smart_dequeue = multiq_smart_dequeue,
.requeue = multiq_requeue,
.drop = multiq_drop,
.init = multiq_init,
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index a119599..47dfe8e 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -283,7 +283,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
if (sch->flags & TCQ_F_THROTTLED)
return NULL;
- skb = q->qdisc->dequeue(q->qdisc);
+ skb = q->qdisc->ops->dequeue(q->qdisc);
if (skb) {
const struct netem_skb_cb *cb = netem_skb_cb(skb);
psched_time_t now = psched_get_time();
@@ -308,6 +308,42 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
return NULL;
}
+static struct sk_buff *netem_smart_dequeue(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ smp_mb();
+ if (sch->flags & TCQ_F_THROTTLED)
+ return NULL;
+
+ skb = q->qdisc->dequeue(q->qdisc);
+ if (skb) {
+ const struct netem_skb_cb *cb = netem_skb_cb(skb);
+ psched_time_t now = psched_get_time();
+
+ /* if more time remaining? */
+ if (cb->time_to_send <= now) {
+ pr_debug("netem_dequeue: return skb=%p\n", skb);
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
+ return skb;
+ }
+
+ if (unlikely(q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS)) {
+ qdisc_tree_decrease_qlen(q->qdisc, 1);
+ sch->qstats.drops++;
+ printk(KERN_ERR "netem: %s could not requeue\n",
+ q->qdisc->ops->id);
+ }
+
+ qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
+ } else if (q->qdisc->flags & TCQ_F_STOPPED) {
+ sch->flags |= TCQ_F_STOPPED;
+ }
+
+ return NULL;
+}
static void netem_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
@@ -541,6 +577,7 @@ static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = tfifo_enqueue,
.dequeue = qdisc_dequeue_head,
+ .smart_dequeue = qdisc_smart_dequeue,
.requeue = qdisc_requeue,
.drop = qdisc_queue_drop,
.init = tfifo_init,
@@ -716,6 +753,7 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
+ .smart_dequeue = netem_smart_dequeue,
.requeue = netem_requeue,
.drop = netem_drop,
.init = netem_init,
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 504a78c..f085dbe 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -128,11 +128,33 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
for (prio = 0; prio < q->bands; prio++) {
struct Qdisc *qdisc = q->queues[prio];
+ struct sk_buff *skb = qdisc->ops->dequeue(qdisc);
+ if (skb) {
+ sch->q.qlen--;
+ return skb;
+ }
+ }
+ return NULL;
+
+}
+
+static struct sk_buff *prio_smart_dequeue(struct Qdisc* sch)
+{
+ struct prio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < q->bands; prio++) {
+ struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc->dequeue(qdisc);
if (skb) {
sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
return skb;
}
+ if (qdisc->flags & TCQ_F_STOPPED) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
}
return NULL;
@@ -421,6 +443,7 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct prio_sched_data),
.enqueue = prio_enqueue,
.dequeue = prio_dequeue,
+ .smart_dequeue = prio_smart_dequeue,
.requeue = prio_requeue,
.drop = prio_drop,
.init = prio_init,
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 5da0583..b8247cb 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -131,7 +131,7 @@ static struct sk_buff * red_dequeue(struct Qdisc* sch)
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
- skb = child->dequeue(child);
+ skb = child->ops->dequeue(child);
if (skb)
sch->q.qlen--;
else if (!red_is_idling(&q->parms))
@@ -140,6 +140,25 @@ static struct sk_buff * red_dequeue(struct Qdisc* sch)
return skb;
}
+static struct sk_buff * red_smart_dequeue(struct Qdisc* sch)
+{
+ struct sk_buff *skb;
+ struct red_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *child = q->qdisc;
+
+ skb = child->dequeue(child);
+ if (skb) {
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_STOPPED;
+ } else {
+ if (child->flags & TCQ_F_STOPPED)
+ sch->flags |= TCQ_F_STOPPED;
+ else if (!red_is_idling(&q->parms))
+ red_start_of_idle_period(&q->parms);
+ }
+
+ return skb;
+}
static unsigned int red_drop(struct Qdisc* sch)
{
struct red_sched_data *q = qdisc_priv(sch);
@@ -361,6 +380,7 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.cl_ops = &red_class_ops,
.enqueue = red_enqueue,
.dequeue = red_dequeue,
+ .smart_dequeue = red_smart_dequeue,
.requeue = red_requeue,
.drop = red_drop,
.init = red_init,
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 6e041d1..2a7ba8e 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -391,9 +391,6 @@ sfq_requeue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_CN;
}
-
-
-
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
@@ -431,6 +428,48 @@ sfq_dequeue(struct Qdisc *sch)
return skb;
}
+static struct sk_buff *
+sfq_smart_dequeue(struct Qdisc *sch)
+{
+ struct sfq_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ sfq_index a, old_a;
+ struct netdev_queue *txq;
+
+ /* No active slots */
+ if (q->tail == SFQ_DEPTH)
+ return NULL;
+
+ a = old_a = q->next[q->tail];
+
+ /* Grab packet */
+ skb = __qdisc_smart_dequeue(sch, &q->qs[a]);
+
+ if (!skb && (sch->flags & TCQ_F_STOPPED))
+ return NULL;
+
+ sfq_dec(q, a);
+ sch->q.qlen--;
+
+ /* Is the slot empty? */
+ if (q->qs[a].qlen == 0) {
+ q->ht[q->hash[a]] = SFQ_DEPTH;
+ a = q->next[a];
+ if (a == old_a) {
+ q->tail = SFQ_DEPTH;
+ return skb;
+ }
+ q->next[q->tail] = a;
+ q->allot[a] += q->quantum;
+ } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
+ q->tail = a;
+ a = q->next[a];
+ q->allot[a] += q->quantum;
+ }
+ sch->flags &= ~TCQ_F_STOPPED;
+ return skb;
+}
+
static void
sfq_reset(struct Qdisc *sch)
{
@@ -624,6 +663,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct sfq_sched_data),
.enqueue = sfq_enqueue,
.dequeue = sfq_dequeue,
+ .smart_dequeue = sfq_smart_dequeue,
.requeue = sfq_requeue,
.drop = sfq_drop,
.init = sfq_init,
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 94c6159..f65204c 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -169,6 +169,67 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+
+ if (skb) {
+ psched_time_t now;
+ long toks;
+ long ptoks = 0;
+ unsigned int len = qdisc_pkt_len(skb);
+
+ now = psched_get_time();
+ toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+
+ if (q->P_tab) {
+ ptoks = toks + q->ptokens;
+ if (ptoks > (long)q->mtu)
+ ptoks = q->mtu;
+ ptoks -= L2T_P(q, len);
+ }
+ toks += q->tokens;
+ if (toks > (long)q->buffer)
+ toks = q->buffer;
+ toks -= L2T(q, len);
+
+ if ((toks|ptoks) >= 0) {
+ q->t_c = now;
+ q->tokens = toks;
+ q->ptokens = ptoks;
+ sch->q.qlen--;
+ sch->flags &= ~TCQ_F_THROTTLED;
+ return skb;
+ }
+
+ qdisc_watchdog_schedule(&q->watchdog,
+ now + max_t(long, -toks, -ptoks));
+
+ /* Maybe we have a shorter packet in the queue,
+ which can be sent now. It sounds cool,
+ but, however, this is wrong in principle.
+ We MUST NOT reorder packets under these circumstances.
+
+ Really, if we split the flow into independent
+ subflows, it would be a very good solution.
+ This is the main idea of all FQ algorithms
+ (cf. CSZ, HPFQ, HFSC)
+ */
+
+ if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
+ /* When requeue fails skb is dropped */
+ qdisc_tree_decrease_qlen(q->qdisc, 1);
+ sch->qstats.drops++;
+ }
+
+ sch->qstats.overlimits++;
+ }
+ return NULL;
+}
+
+static struct sk_buff *tbf_smart_dequeue(struct Qdisc* sch)
+{
+ struct tbf_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
skb = q->qdisc->dequeue(q->qdisc);
if (skb) {
@@ -179,6 +240,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
now = psched_get_time();
toks = psched_tdiff_bounded(now, q->t_c, q->buffer);
+ sch->flags &= ~TCQ_F_STOPPED;
if (q->P_tab) {
ptoks = toks + q->ptokens;
@@ -221,7 +283,10 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
}
sch->qstats.overlimits++;
+ } else if (q->qdisc->flags & TCQ_F_STOPPED) {
+ sch->flags |= TCQ_F_STOPPED;
}
+
return NULL;
}
@@ -469,6 +534,7 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
.priv_size = sizeof(struct tbf_sched_data),
.enqueue = tbf_enqueue,
.dequeue = tbf_dequeue,
+ .smart_dequeue = tbf_smart_dequeue,
.requeue = tbf_requeue,
.drop = tbf_drop,
.init = tbf_init,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index d35ef05..fecb3f8 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -123,6 +123,40 @@ teql_dequeue(struct Qdisc* sch)
return skb;
}
+static struct sk_buff *
+teql_smart_dequeue(struct Qdisc* sch)
+{
+ struct teql_sched_data *dat = qdisc_priv(sch);
+ struct netdev_queue *dat_queue;
+ struct sk_buff *skb;
+ struct netdev_queue *txq;
+
+ skb = skb_peek(&dat->q);
+ if (skb) {
+ txq = netdev_get_tx_queue(qdisc_dev(sch),
+ skb_get_queue_mapping(skb));
+ if (netif_tx_queue_stopped(txq) ||
+ netif_tx_queue_frozen(txq)) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+ __skb_unlink(skb, &dat->q);
+ }
+ dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
+ if (skb == NULL) {
+ struct net_device *m = qdisc_dev(dat_queue->qdisc);
+ if (m) {
+ dat->m->slaves = sch;
+ netif_wake_queue(m);
+ }
+ } else {
+ sch->flags &= ~TCQ_F_STOPPED;
+ }
+ sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
+
+ return skb;
+}
+
static __inline__ void
teql_neigh_release(struct neighbour *n)
{
@@ -431,13 +465,14 @@ static __init void teql_master_setup(struct net_device *dev)
master->dev = dev;
ops->priv_size = sizeof(struct teql_sched_data);
- ops->enqueue = teql_enqueue;
- ops->dequeue = teql_dequeue;
- ops->requeue = teql_requeue;
- ops->init = teql_qdisc_init;
- ops->reset = teql_reset;
- ops->destroy = teql_destroy;
- ops->owner = THIS_MODULE;
+ ops->enqueue = teql_enqueue;
+ ops->dequeue = teql_dequeue;
+ ops->smart_dequeue = teql_smart_dequeue;
+ ops->requeue = teql_requeue;
+ ops->init = teql_qdisc_init;
+ ops->reset = teql_reset;
+ ops->destroy = teql_destroy;
+ ops->owner = THIS_MODULE;
dev->open = teql_master_open;
dev->hard_start_xmit = teql_master_xmit;
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 6:43 [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Alexander Duyck
@ 2008-09-18 6:56 ` Alexander Duyck
2008-09-18 9:46 ` David Miller
2008-09-18 19:44 ` Jarek Poplawski
2 siblings, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2008-09-18 6:56 UTC (permalink / raw)
To: Alexander Duyck; +Cc: netdev, jarkao2, herbert, davem, kaber
On Wed, Sep 17, 2008 at 11:43 PM, Alexander Duyck
<alexander.h.duyck@intel.com> wrote:
> This this patch is mangled I appologize, this is my first try sending
> a patch directly to netdev.
>
Already off to mangling things. I got Dave's email address wrong. Sorry to
all who reply and get a bad-address warning, and I meant to say "If
this patch is mangled...".
Anyway, if anyone decides to reply, please make note of the bad address.
Thanks,
Alex
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 6:43 [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Alexander Duyck
2008-09-18 6:56 ` Alexander Duyck
@ 2008-09-18 9:46 ` David Miller
2008-09-18 14:51 ` Alexander Duyck
2008-09-18 19:44 ` Jarek Poplawski
2 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2008-09-18 9:46 UTC (permalink / raw)
To: alexander.h.duyck; +Cc: netdev, jarkao2, herbert, kaber
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 17 Sep 2008 23:43:02 -0700
> diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
> index 43d3725..91a40b2 100644
> --- a/net/sched/sch_atm.c
> +++ b/net/sched/sch_atm.c
> @@ -516,12 +516,31 @@ static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
>
> pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
> tasklet_schedule(&p->task);
> - skb = p->link.q->dequeue(p->link.q);
> + skb = p->link.q->ops->dequeue(p->link.q);
> if (skb)
> sch->q.qlen--;
> return skb;
> }
>
So what is the difference between qdisc->dequeue and qdisc->ops->dequeue?
The same applies to ->enqueue.
qdisc->{dequeue,enqueue} are given the value of ops->{dequeue,enqueue}
at the time of qdisc creation. I can only see two reasons for their
existence:
1) We used to allow overriding ->enqueue and ->dequeue by certain
modules. I see no such use like this in the current tree.
2) For performance it's kept as a copy in the qdisc.
Either way, changing ->ops->dequeue into ->dequeue doesn't seem to be
correct, unless you have some explanation.
This is done in a few other places in your patch.
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 9:46 ` David Miller
@ 2008-09-18 14:51 ` Alexander Duyck
0 siblings, 0 replies; 20+ messages in thread
From: Alexander Duyck @ 2008-09-18 14:51 UTC (permalink / raw)
To: David Miller; +Cc: alexander.h.duyck, netdev, jarkao2, herbert, kaber
On Thu, Sep 18, 2008 at 2:46 AM, David Miller <davem@davemloft.net> wrote:
> So what is the difference between qdisc->dequeue and qdisc->ops->dequeue?
> The same applies to ->enqueue.
>
> qdisc->{dequeue,enqueue} are given the value of ops->{dequeue,enqueue}
> at the time of qdisc creation. I can only see two reasons for their
> existence:
>
> 1) We used to allow overriding ->enqueue and ->dequeue by certain
> modules. I see no such use like this in the current tree.
>
> 2) For performance it's kept as a copy in the qdisc.
>
> Either way, changing ->ops->dequeue into ->dequeue doesn't seem to be
> correct, unless you have some explanation.
>
> This is done in a few other places in your patch.
I redefined qdisc->dequeue to be set to smart_dequeue in sch_generic.c:
@@ -475,7 +491,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
skb_queue_head_init(&sch->q);
sch->ops = ops;
sch->enqueue = ops->enqueue;
- sch->dequeue = ops->dequeue;
+ sch->dequeue = ops->smart_dequeue;
sch->dev_queue = dev_queue;
dev_hold(qdisc_dev(sch));
atomic_set(&sch->refcnt, 1);
Most of the changes from qdisc->dequeue to qdisc->ops->dequeue are to have the
standard dequeue call use nothing but standard dequeue calls in its path.
I needed to maintain qdisc->ops->dequeue because there are several functions
throughout the qdisc code that require the ability to dequeue a packet
regardless of hw queue state.
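To restate the calling convention as a sketch (illustrative only, not a hunk
from the patch; example_pull is just a made-up name):
static struct sk_buff *example_pull(struct Qdisc *child, bool respect_hw_queue)
{
	if (respect_hw_queue)
		/* sch->dequeue now points at ops->smart_dequeue: it may
		 * return NULL and set TCQ_F_STOPPED when the target tx
		 * queue is stopped or frozen. */
		return child->dequeue(child);

	/* The classic path stays reachable through the ops table; helpers
	 * such as qdisc_peek_len() in sch_hfsc.c need it because they must
	 * pull a packet regardless of hw queue state. */
	return child->ops->dequeue(child);
}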
Thanks,
Alex
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 6:43 [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Alexander Duyck
2008-09-18 6:56 ` Alexander Duyck
2008-09-18 9:46 ` David Miller
@ 2008-09-18 19:44 ` Jarek Poplawski
2008-09-19 1:11 ` Alexander Duyck
2008-09-19 15:16 ` Jarek Poplawski
2 siblings, 2 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-18 19:44 UTC (permalink / raw)
To: Alexander Duyck; +Cc: netdev, herbert, davem, kaber
Alexander Duyck wrote, On 09/18/2008 08:43 AM:
...
> ---
> This patch changes the behavior of the sch->dequeue to only
> dequeue a packet if the queue it is bound for is not currently
> stopped. This functionality is provided via a new op called
> smart_dequeue.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> ---
I think these changes make sense only if they don't add more than they give,
and two dequeues (and still no way to kill requeue) is IMHO too much
(I mean the maintenance). As far as I can see it's mainly for HFSC's
qdisc_peek_len(); anyway, this looks like the main issue to me.
Below are a few small doubts. (I need to find some time for the details yet.)
BTW, this patch needs a checkpatch run.
---
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b786a5b..4082f39 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,10 +90,7 @@ extern void __qdisc_run(struct Qdisc *q);
static inline void qdisc_run(struct Qdisc *q)
{
- struct netdev_queue *txq = q->dev_queue;
-
- if (!netif_tx_queue_stopped(txq) &&
I think, there is no reason to do a full dequeue try each time instead
of this check, even if we save on requeuing now. We could try to save
the result of the last dequeue, e.g. a number or some mask of a few
tx_queues which prevented dequeuing, and check for the change of state
only. (Or alternatively, what I mentioned before: a flag set with the
full stop or freeze.)
- !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
+ if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
__qdisc_run(q);
}
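A purely hypothetical sketch of what I mean (blocked_txq is an invented
field; it exists in neither this patch nor the current tree):
static inline void qdisc_run(struct Qdisc *q)
{
	struct netdev_queue *txq = q->blocked_txq;	/* invented field */

	if (txq) {
		if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))
			return;			/* still blocked, nothing to do */
		q->blocked_txq = NULL;		/* state changed, try again */
	}

	if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
		__qdisc_run(q);
}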
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e556962..4400a18 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
...
+static inline struct sk_buff *__qdisc_smart_dequeue(struct Qdisc *sch,
+ struct sk_buff_head *list)
+{
+ struct sk_buff *skb = skb_peek(list);
Since success is much more likely here, __skb_dequeue() with
__skb_queue_head() on fail looks better to me.
Of course, it's a matter of taste, but (if we really need these two
dequeues) maybe qdisc_dequeue_smart() would be more in line with
qdisc_dequeue_head()? (And similarly smart names below.)
+ struct netdev_queue *txq;
+
+ if (!skb)
+ return NULL;
+
+ txq = netdev_get_tx_queue(qdisc_dev(sch), skb_get_queue_mapping(skb));
+ if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)) {
+ sch->flags |= TCQ_F_STOPPED;
+ return NULL;
+ }
+ __skb_unlink(skb, list);
+ sch->qstats.backlog -= qdisc_pkt_len(skb);
+ sch->flags &= ~TCQ_F_STOPPED;
This clearing is probably needed in ->reset() and in ->drop() too.
(Or above, after if (!skb))
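For reference, a rough sketch of the __skb_dequeue()/__skb_queue_head()
variant I mean (untested; same helpers and flags as your patch):
static inline struct sk_buff *__qdisc_smart_dequeue(struct Qdisc *sch,
						    struct sk_buff_head *list)
{
	struct sk_buff *skb = __skb_dequeue(list);
	struct netdev_queue *txq;

	if (!skb)
		return NULL;

	txq = netdev_get_tx_queue(qdisc_dev(sch), skb_get_queue_mapping(skb));
	if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)) {
		/* unlikely case: put the packet back and flag the stop */
		__skb_queue_head(list, skb);
		sch->flags |= TCQ_F_STOPPED;
		return NULL;
	}

	sch->qstats.backlog -= qdisc_pkt_len(skb);
	sch->flags &= ~TCQ_F_STOPPED;
	return skb;
}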
...
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index d14f020..4da1a85 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
...
+static struct sk_buff *htb_smart_dequeue(struct Qdisc *sch)
+{
+ struct sk_buff *skb = NULL;
+ struct htb_sched *q = qdisc_priv(sch);
+ int level, stopped = false;
+ psched_time_t next_event;
+
+ /* try to dequeue direct packets as high prio (!) to minimize cpu work */
+ skb = skb_peek(&q->direct_queue);
As above: __skb_dequeue()?
Thanks,
Jarek P.
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 19:44 ` Jarek Poplawski
@ 2008-09-19 1:11 ` Alexander Duyck
2008-09-19 9:17 ` Jarek Poplawski
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
2008-09-19 15:16 ` Jarek Poplawski
1 sibling, 2 replies; 20+ messages in thread
From: Alexander Duyck @ 2008-09-19 1:11 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: Alexander Duyck, netdev, herbert, davem, kaber
On Thu, Sep 18, 2008 at 12:44 PM, Jarek Poplawski <jarkao2@gmail.com> wrote:
> I think these changes make sense only if they don't add more than they give,
> and two dequeues (and still no way to kill requeue) is IMHO too much
> (I mean the maintenance). As far as I can see it's mainly for HFSC's
> qdisc_peek_len(); anyway, this looks like the main issue to me.
The thing is, this was just meant to be a proof of concept for the most
part, so I was doing a lot of cut-and-paste coding, and as a result the
size increased by a good amount. I admit this could be cleaned up a
lot; I just wanted to verify some things.
Also, my ultimate goal wasn't to kill requeue completely; you can't
do that since in the case of TSO/GSO you will end up with SKBs which
require multiple transmit descriptors, and therefore you will always
need an option to requeue. The advantage of this approach is that
you don't incur the CPU cost, which is a significant saving: the requeue
approach was generating 13% CPU while the smart dequeue was only
using 3%.
> Below a few small doubts. (I need to find some time for details yet.)
> BTW, this patch needs a checkpatch run.
I did run checkpatch on this. Most of the errors are inherited from
the cut and paste, and I didn't want to take the time to completely
rewrite the core qdisc functionality. Other than those errors, I
believe some were whitespace complaints, as I was using tabs up to the
start of my functions and then spaces to indent my function parameters
when having to wrap long lines.
> ---
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index b786a5b..4082f39 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -90,10 +90,7 @@ extern void __qdisc_run(struct Qdisc *q);
>
> static inline void qdisc_run(struct Qdisc *q)
> {
> - struct netdev_queue *txq = q->dev_queue;
> -
> - if (!netif_tx_queue_stopped(txq) &&
>
> I think, there is no reason to do a full dequeue try each time instead
> of this check, even if we save on requeuing now. We could try to save
> the result of the last dequeue, e.g. a number or some mask of a few
> tx_queues which prevented dequeuing, and check for the change of state
> only. (Or alternatively, what I mentioned before: a flag set with the
> full stop or freeze.)
Once again if you have a suggestion on approach feel free to modify
the patch and see how it works for you. My only concern is that there
are several qdiscs which won't give you the same packet twice and so
you don't know what is going to pop out until you go in and check.
>
> - !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
> + if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
> __qdisc_run(q);
> }
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index e556962..4400a18 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> ...
> +static inline struct sk_buff *__qdisc_smart_dequeue(struct Qdisc *sch,
> + struct sk_buff_head *list)
> +{
> + struct sk_buff *skb = skb_peek(list);
>
> Since success is much more likely here, __skb_dequeue() with
> __skb_queue_head() on fail looks better to me.
>
> Of course, it's a matter of taste, but (if we really need these two
> dequeues) maybe qdisc_dequeue_smart() would be more in line with
> qdisc_dequeue_head()? (And similarly smart names below.)
Right, but then we are getting back to the queue/requeue stuff. If
you want, feel free to make use of my patch to generate your own that
uses that approach. I just don't like changing things unless I
absolutely have to, and all I did was essentially tear apart
__skb_dequeue and place the bits inline with my check for
netif_tx_subqueue_stopped.
> + struct netdev_queue *txq;
> +
> + if (!skb)
> + return NULL;
> +
> + txq = netdev_get_tx_queue(qdisc_dev(sch), skb_get_queue_mapping(skb));
> + if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq)) {
> + sch->flags |= TCQ_F_STOPPED;
> + return NULL;
> + }
> + __skb_unlink(skb, list);
> + sch->qstats.backlog -= qdisc_pkt_len(skb);
> + sch->flags &= ~TCQ_F_STOPPED;
>
> This clearing is probably needed in ->reset() and in ->drop() too.
> (Or above, after if (!skb))
For the most part I would agree with you, but for now I was only using
the flag as a part of the smart_dequeue process to flag the upper
queues so I didn't give it much thought. It is yet another thing I
probably should have cleaned up but didn't get around to since this
was mostly proof of concept.
> ...
> diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
> index d14f020..4da1a85 100644
> --- a/net/sched/sch_htb.c
> +++ b/net/sched/sch_htb.c
> ...
> +static struct sk_buff *htb_smart_dequeue(struct Qdisc *sch)
> +{
> + struct sk_buff *skb = NULL;
> + struct htb_sched *q = qdisc_priv(sch);
> + int level, stopped = false;
> + psched_time_t next_event;
> +
> + /* try to dequeue direct packets as high prio (!) to minimize cpu work */
> + skb = skb_peek(&q->direct_queue);
>
> As above: __skb_dequeue()?
Actually I could probably replace most of the skb_peek stuff with
calls to __qdisc_smart_dequeue instead and then just check for the
flag on fail.
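Something like this for the direct queue in htb_smart_dequeue (just an
illustration, not tested; __qdisc_smart_dequeue also adjusts
sch->qstats.backlog, which the direct queue doesn't account for, so that
difference would need handling):
	skb = __qdisc_smart_dequeue(sch, &q->direct_queue);
	if (skb) {
		sch->flags &= ~(TCQ_F_THROTTLED | TCQ_F_STOPPED);
		sch->q.qlen--;
		return skb;
	}
	if (sch->flags & TCQ_F_STOPPED)
		return NULL;	/* target tx queue stopped: bail out now */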
> Thanks,
> Jarek P.
I probably won't be able to contribute to this for at least the next
two weeks since I am going to be out for vacation from Saturday until
the start of October.
In the meantime I also just threw a couple of patches out which may
help anyone who is trying to test this stuff. It turns out that if you try
to do a netperf UDP_STREAM test on a multiqueue-aware system with 2.6.27
you get horrible performance on the receive side. The root cause
appears to be that simple_tx_hash was doing a hash on fragmented
packets and as a result placing them in pseudo-random queues, which
caused issues with packet ordering.
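Just to illustrate the idea (this is not the actual patch I posted): only
the first IPv4 fragment carries the TCP/UDP ports, so the tx-queue hash
must not mix port bits in for fragments, or a single flow gets scattered
across queues and its fragments can go out on the wire out of order. The
gist of a fragment-safe port lookup would be something like:
static u32 example_flow_ports(const struct sk_buff *skb)
{
	/* illustration only -- not the posted patch */
	const struct iphdr *iph = ip_hdr(skb);

	switch (iph->protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
		/* the ports are only present in the first fragment */
		if (!(iph->frag_off & htons(IP_MF | IP_OFFSET)))
			return *(u32 *)(skb_network_header(skb) +
					(iph->ihl << 2));
		/* fall through: hash fragments on addresses only */
	default:
		return 0;
	}
}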
Thanks,
Alex
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 1:11 ` Alexander Duyck
@ 2008-09-19 9:17 ` Jarek Poplawski
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
1 sibling, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 9:17 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Alexander Duyck, netdev, herbert, davem, kaber
On Thu, Sep 18, 2008 at 06:11:45PM -0700, Alexander Duyck wrote:
> Once again if you have a suggestion on approach feel free to modify
> the patch and see how it works for you. My only concern is that there
> are several qdiscs which won't give you the same packet twice and so
> you don't know what is going to pop out until you go in and check.
...
Actually, here is my suggestion: I think that your obviously more
complex solution should be compared with something simple which IMHO
has a similar feature, i.e. limited overhead of requeuing.
My proposal is to partly use David's idea of killing ->ops->requeue(),
for now only the first two patches, so we don't requeue back to the
qdiscs, but leave qdisc->ops->requeue() for internal usage (like in
HFSC's qdisc_peek_len() hack). Additionally, I use your idea of checking
the state of the tx queue early to make this even lighter after
possibly removing the entry check from qdisc_run().
I attach my patch at the end, after David's original two patches.
Thanks,
Jarek P.
---------------> patch 1/3
Subject: [PATCH 1/9]: pkt_sched: Make qdisc->gso_skb a list.
Date: Mon, 18 Aug 2008 01:36:47 -0700 (PDT)
From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
CC: jarkao2@gmail.com
Newsgroups: gmane.linux.network
pkt_sched: Make qdisc->gso_skb a list.
The idea is that we can use this to get rid of
->requeue().
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/sch_generic.h | 2 +-
net/sched/sch_generic.c | 12 +++++++-----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 84d25f2..140c48b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -52,7 +52,7 @@ struct Qdisc
u32 parent;
atomic_t refcnt;
unsigned long state;
- struct sk_buff *gso_skb;
+ struct sk_buff_head requeue;
struct sk_buff_head q;
struct netdev_queue *dev_queue;
struct Qdisc *next_sched;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6f96b7b..39d969e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,7 +45,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
if (unlikely(skb->next))
- q->gso_skb = skb;
+ __skb_queue_head(&q->requeue, skb);
else
q->ops->requeue(skb, q);
@@ -57,9 +57,8 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
struct sk_buff *skb;
- if ((skb = q->gso_skb))
- q->gso_skb = NULL;
- else
+ skb = __skb_dequeue(&q->requeue);
+ if (!skb)
skb = q->dequeue(q);
return skb;
@@ -328,6 +327,7 @@ struct Qdisc noop_qdisc = {
.flags = TCQ_F_BUILTIN,
.ops = &noop_qdisc_ops,
.list = LIST_HEAD_INIT(noop_qdisc.list),
+ .requeue.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
};
@@ -353,6 +353,7 @@ static struct Qdisc noqueue_qdisc = {
.flags = TCQ_F_BUILTIN,
.ops = &noqueue_qdisc_ops,
.list = LIST_HEAD_INIT(noqueue_qdisc.list),
+ .requeue.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.dev_queue = &noqueue_netdev_queue,
};
@@ -473,6 +474,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->padded = (char *) sch - (char *) p;
INIT_LIST_HEAD(&sch->list);
+ skb_queue_head_init(&sch->requeue);
skb_queue_head_init(&sch->q);
sch->ops = ops;
sch->enqueue = ops->enqueue;
@@ -543,7 +545,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
- kfree_skb(qdisc->gso_skb);
+ __skb_queue_purge(&qdisc->requeue);
kfree((char *) qdisc - qdisc->padded);
}
-------------> patch 2/3
Subject: [PATCH 2/9]: pkt_sched: Always use q->requeue in dev_requeue_skb().
Date: Mon, 18 Aug 2008 01:36:50 -0700 (PDT)
From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
CC: jarkao2@gmail.com
Newsgroups: gmane.linux.network
pkt_sched: Always use q->requeue in dev_requeue_skb().
There is no reason to call into the complicated qdiscs
just to remember the last SKB where we found the device
blocked.
The SKB is outside of the qdiscs realm at this point.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/sched/sch_generic.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39d969e..96d7d08 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,10 +44,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- if (unlikely(skb->next))
- __skb_queue_head(&q->requeue, skb);
- else
- q->ops->requeue(skb, q);
+ __skb_queue_head(&q->requeue, skb);
__netif_schedule(q);
return 0;
----------------> patch 3/3
pkt_sched: Check the state of tx_queue in dequeue_skb()
Check in dequeue_skb() the state of tx_queue for requeued skb to save
on locking and re-requeuing, and possibly remove the current check in
qdisc_run(). Based on the idea of Alexander Duyck.
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
---
diff -Nurp net-next-2.6-/net/sched/sch_generic.c net-next-2.6+/net/sched/sch_generic.c
--- net-next-2.6-/net/sched/sch_generic.c 2008-09-19 07:21:44.000000000 +0000
+++ net-next-2.6+/net/sched/sch_generic.c 2008-09-19 07:49:15.000000000 +0000
@@ -52,11 +52,21 @@ static inline int dev_requeue_skb(struct
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = skb_peek(&q->requeue);
- skb = __skb_dequeue(&q->requeue);
- if (!skb)
+ if (unlikely(skb)) {
+ struct net_device *dev = qdisc_dev(q);
+ struct netdev_queue *txq;
+
+ /* check the reason of requeuing without tx lock first */
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
+ __skb_unlink(skb, &q->requeue);
+ else
+ skb = NULL;
+ } else {
skb = q->dequeue(q);
+ }
return skb;
}
* [take 2] Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 1:11 ` Alexander Duyck
2008-09-19 9:17 ` Jarek Poplawski
@ 2008-09-19 10:32 ` Jarek Poplawski
2008-09-19 13:07 ` [PATCH] pkt_sched: Fix TX state checking in qdisc_run() Jarek Poplawski
` (2 more replies)
1 sibling, 3 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 10:32 UTC (permalink / raw)
To: Alexander Duyck; +Cc: Alexander Duyck, netdev, herbert, davem, kaber
Take 2:
I missed __netif_schedule() in my patch, so the update is below.
But because of this, a little change of mind: there is no reason to
repeat __netif_schedule() if there is a full stop or freeze of all
queues, so it seems we need an additional check for this anyway. I'll
send my proposal for this test soon.
Jarek P.
On Thu, Sep 18, 2008 at 06:11:45PM -0700, Alexander Duyck wrote:
> Once again if you have a suggestion on approach feel free to modify
> the patch and see how it works for you. My only concern is that there
> are several qdiscs which won't give you the same packet twice and so
> you don't know what is going to pop out until you go in and check.
...
Actually, here is my suggestion: I think that your obviously more
complex solution should be compared with something simple which IMHO
has a similar feature, i.e. limited overhead of requeuing.
My proposal is to partly use David's idea of killing ->ops->requeue(),
for now only the first two patches, so we don't requeue back to the
qdiscs, but leave qdisc->ops->requeue() for internal usage (like in
HFSC's qdisc_peek_len() hack). Additionally, I use your idea of checking
the state of the tx queue early to make this even lighter after
possibly removing the entry check from qdisc_run().
I attach my patch at the end, after David's original two patches.
Thanks,
Jarek P.
---------------> patch 1/3
Subject: [PATCH 1/9]: pkt_sched: Make qdisc->gso_skb a list.
Date: Mon, 18 Aug 2008 01:36:47 -0700 (PDT)
From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
CC: jarkao2@gmail.com
Newsgroups: gmane.linux.network
pkt_sched: Make qdisc->gso_skb a list.
The idea is that we can use this to get rid of
->requeue().
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/sch_generic.h | 2 +-
net/sched/sch_generic.c | 12 +++++++-----
2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 84d25f2..140c48b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -52,7 +52,7 @@ struct Qdisc
u32 parent;
atomic_t refcnt;
unsigned long state;
- struct sk_buff *gso_skb;
+ struct sk_buff_head requeue;
struct sk_buff_head q;
struct netdev_queue *dev_queue;
struct Qdisc *next_sched;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6f96b7b..39d969e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,7 +45,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
if (unlikely(skb->next))
- q->gso_skb = skb;
+ __skb_queue_head(&q->requeue, skb);
else
q->ops->requeue(skb, q);
@@ -57,9 +57,8 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
struct sk_buff *skb;
- if ((skb = q->gso_skb))
- q->gso_skb = NULL;
- else
+ skb = __skb_dequeue(&q->requeue);
+ if (!skb)
skb = q->dequeue(q);
return skb;
@@ -328,6 +327,7 @@ struct Qdisc noop_qdisc = {
.flags = TCQ_F_BUILTIN,
.ops = &noop_qdisc_ops,
.list = LIST_HEAD_INIT(noop_qdisc.list),
+ .requeue.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
};
@@ -353,6 +353,7 @@ static struct Qdisc noqueue_qdisc = {
.flags = TCQ_F_BUILTIN,
.ops = &noqueue_qdisc_ops,
.list = LIST_HEAD_INIT(noqueue_qdisc.list),
+ .requeue.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.dev_queue = &noqueue_netdev_queue,
};
@@ -473,6 +474,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->padded = (char *) sch - (char *) p;
INIT_LIST_HEAD(&sch->list);
+ skb_queue_head_init(&sch->requeue);
skb_queue_head_init(&sch->q);
sch->ops = ops;
sch->enqueue = ops->enqueue;
@@ -543,7 +545,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
- kfree_skb(qdisc->gso_skb);
+ __skb_queue_purge(&qdisc->requeue);
kfree((char *) qdisc - qdisc->padded);
}
-------------> patch 2/3
Subject: [PATCH 2/9]: pkt_sched: Always use q->requeue in dev_requeue_skb().
Date: Mon, 18 Aug 2008 01:36:50 -0700 (PDT)
From: David Miller <davem@davemloft.net>
To: netdev@vger.kernel.org
CC: jarkao2@gmail.com
Newsgroups: gmane.linux.network
pkt_sched: Always use q->requeue in dev_requeue_skb().
There is no reason to call into the complicated qdiscs
just to remember the last SKB where we found the device
blocked.
The SKB is outside of the qdiscs realm at this point.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/sched/sch_generic.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39d969e..96d7d08 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,10 +44,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- if (unlikely(skb->next))
- __skb_queue_head(&q->requeue, skb);
- else
- q->ops->requeue(skb, q);
+ __skb_queue_head(&q->requeue, skb);
__netif_schedule(q);
return 0;
----------------> patch 3/3 (take 2)
pkt_sched: Check the state of tx_queue in dequeue_skb()
Check in dequeue_skb() the state of tx_queue for requeued skb to save
on locking and re-requeuing, and possibly remove the current check in
qdisc_run(). Based on the idea of Alexander Duyck.
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
---
diff -Nurp net-next-2.6-/net/sched/sch_generic.c net-next-2.6+/net/sched/sch_generic.c
--- net-next-2.6-/net/sched/sch_generic.c 2008-09-19 07:21:44.000000000 +0000
+++ net-next-2.6+/net/sched/sch_generic.c 2008-09-19 10:11:04.000000000 +0000
@@ -52,11 +52,24 @@ static inline int dev_requeue_skb(struct
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = skb_peek(&q->requeue);
- skb = __skb_dequeue(&q->requeue);
- if (!skb)
+ if (unlikely(skb)) {
+ struct net_device *dev = qdisc_dev(q);
+ struct netdev_queue *txq;
+
+ /* check the reason of requeuing without tx lock first */
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+ if (!netif_tx_queue_stopped(txq) &&
+ !netif_tx_queue_frozen(txq)) {
+ __skb_unlink(skb, &q->requeue);
+ } else {
+ skb = NULL;
+ __netif_schedule(q);
+ }
+ } else {
skb = q->dequeue(q);
+ }
return skb;
}
* [PATCH] pkt_sched: Fix TX state checking in qdisc_run()
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
@ 2008-09-19 13:07 ` Jarek Poplawski
2008-09-19 14:44 ` [PATCH take2] " Jarek Poplawski
2008-09-19 17:46 ` [take 2] Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Jarek Poplawski
2 siblings, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 13:07 UTC (permalink / raw)
To: David Miller; +Cc: Alexander Duyck, Alexander Duyck, netdev, herbert, kaber
pkt_sched: Fix TX state checking in qdisc_run()
Current check wrongly uses the state of the first tx queue for all tx
queues. This patch brings back per dev __LINK_STATE_XOFF flag, which
is set when all tx queues are stopped. This check is needed in
qdisc_run() to avoid useless __netif_schedule() calls.
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
---
include/linux/netdevice.h | 7 ++++++-
include/net/pkt_sched.h | 4 +---
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 488c56e..dc76e4b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -271,6 +271,7 @@ struct header_ops {
enum netdev_state_t
{
+ __LINK_STATE_XOFF,
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
@@ -1043,6 +1044,7 @@ static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
static inline void netif_wake_queue(struct net_device *dev)
{
netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
+ clear_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline void netif_tx_wake_all_queues(struct net_device *dev)
@@ -1053,6 +1055,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_wake_queue(txq);
}
+ clear_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
@@ -1069,6 +1072,7 @@ static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
*/
static inline void netif_stop_queue(struct net_device *dev)
{
+ set_bit(__LINK_STATE_XOFF, &dev->state);
netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}
@@ -1076,6 +1080,7 @@ static inline void netif_tx_stop_all_queues(struct net_device *dev)
{
unsigned int i;
+ set_bit(__LINK_STATE_XOFF, &dev->state);
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_stop_queue(txq);
@@ -1095,7 +1100,7 @@ static inline int netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
*/
static inline int netif_queue_stopped(const struct net_device *dev)
{
- return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
+ return test_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline int netif_tx_queue_frozen(const struct netdev_queue *dev_queue)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b786a5b..1718a60 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,9 +90,7 @@ extern void __qdisc_run(struct Qdisc *q);
static inline void qdisc_run(struct Qdisc *q)
{
- struct netdev_queue *txq = q->dev_queue;
-
- if (!netif_tx_queue_stopped(txq) &&
+ if (!netif_queue_stopped(qdisc_dev(q)) &&
!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
__qdisc_run(q);
}
* [PATCH take2] pkt_sched: Fix TX state checking in qdisc_run()
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
2008-09-19 13:07 ` [PATCH] pkt_sched: Fix TX state checking in qdisc_run() Jarek Poplawski
@ 2008-09-19 14:44 ` Jarek Poplawski
2008-09-19 17:49 ` Jarek Poplawski
2008-09-21 5:18 ` David Miller
2008-09-19 17:46 ` [take 2] Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Jarek Poplawski
2 siblings, 2 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 14:44 UTC (permalink / raw)
To: David Miller; +Cc: Alexander Duyck, Alexander Duyck, netdev, herbert, kaber
Alas this time the changelog needs more details.
Sorry,
Jarek P.
-----------------> (take 2)
pkt_sched: Fix TX state checking in qdisc_run()
Current check wrongly uses the state of the first tx queue for all tx
queues in case of non-default qdiscs. This patch brings back per dev
__LINK_STATE_XOFF flag, which is set when all tx queues are stopped.
This check is needed in qdisc_run() to avoid useless __netif_schedule()
calls. The wrongness of this check was first noticed by Herbert Xu.
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
---
include/linux/netdevice.h | 7 ++++++-
include/net/pkt_sched.h | 4 +---
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 488c56e..dc76e4b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -271,6 +271,7 @@ struct header_ops {
enum netdev_state_t
{
+ __LINK_STATE_XOFF,
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
@@ -1043,6 +1044,7 @@ static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
static inline void netif_wake_queue(struct net_device *dev)
{
netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
+ clear_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline void netif_tx_wake_all_queues(struct net_device *dev)
@@ -1053,6 +1055,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_wake_queue(txq);
}
+ clear_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
@@ -1069,6 +1072,7 @@ static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
*/
static inline void netif_stop_queue(struct net_device *dev)
{
+ set_bit(__LINK_STATE_XOFF, &dev->state);
netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}
@@ -1076,6 +1080,7 @@ static inline void netif_tx_stop_all_queues(struct net_device *dev)
{
unsigned int i;
+ set_bit(__LINK_STATE_XOFF, &dev->state);
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
netif_tx_stop_queue(txq);
@@ -1095,7 +1100,7 @@ static inline int netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
*/
static inline int netif_queue_stopped(const struct net_device *dev)
{
- return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
+ return test_bit(__LINK_STATE_XOFF, &dev->state);
}
static inline int netif_tx_queue_frozen(const struct netdev_queue *dev_queue)
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b786a5b..1718a60 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,9 +90,7 @@ extern void __qdisc_run(struct Qdisc *q);
static inline void qdisc_run(struct Qdisc *q)
{
- struct netdev_queue *txq = q->dev_queue;
-
- if (!netif_tx_queue_stopped(txq) &&
+ if (!netif_queue_stopped(qdisc_dev(q)) &&
!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
__qdisc_run(q);
}
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-18 19:44 ` Jarek Poplawski
2008-09-19 1:11 ` Alexander Duyck
@ 2008-09-19 15:16 ` Jarek Poplawski
2008-09-19 16:26 ` Duyck, Alexander H
2008-09-19 16:37 ` Jarek Poplawski
1 sibling, 2 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 15:16 UTC (permalink / raw)
To: Alexander Duyck; +Cc: netdev, herbert, davem, kaber
On Thu, Sep 18, 2008 at 09:44:19PM +0200, Jarek Poplawski wrote:
> Alexander Duyck wrote, On 09/18/2008 08:43 AM:
> ...
> > ---
> > This patch changes the behavior of the sch->dequeue to only
> > dequeue a packet if the queue it is bound for is not currently
> > stopped. This functionality is provided via a new op called
> > smart_dequeue.
> >
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> > ---
Alexander, I guess you've seen my last messages/patches in this thread
and noticed that this __netif_schedule() problem is present both in
this RFC patch and sch_multiq: if an skb isn't dequeued because of the
inner tx_queue stopped-state check, a __netif_schedule() is missing
before the exit from qdisc_run().
Jarek P.
* RE: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 15:16 ` Jarek Poplawski
@ 2008-09-19 16:26 ` Duyck, Alexander H
2008-09-19 17:35 ` Jarek Poplawski
2008-09-19 16:37 ` Jarek Poplawski
1 sibling, 1 reply; 20+ messages in thread
From: Duyck, Alexander H @ 2008-09-19 16:26 UTC (permalink / raw)
To: Jarek Poplawski
Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au,
davem@davemloft.net, kaber@trash.net
Jarek Poplawski wrote:
> On Thu, Sep 18, 2008 at 09:44:19PM +0200, Jarek Poplawski wrote:
>> Alexander Duyck wrote, On 09/18/2008 08:43 AM:
>> ...
>>> ---
>>> This patch changes the behavior of the sch->dequeue to only
>>> dequeue a packet if the queue it is bound for is not currently
>>> stopped. This functionality is provided via a new op called
>>> smart_dequeue.
>>>
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>>> ---
>
> Alexander, I guess you've seen my last messages/patches in this thread
> and noticed that this __netif_schedule() problem is present both in
> this RFC patch and sch_multiq: if skb isn't dequeued because of inner
> tx_queue stopped state check, there is missing __netif_schedule()
> before exit from qdisc_run().
>
> Jarek P.
Actually most of that is handled by the fact that netif_tx_wake_queue
will call __netif_schedule when it decides to wake up a queue that has
been stopped. Putting it in skb_dequeue is unnecessary, and the way
you did it would cause issues because you should be scheduling the root
qdisc, not the one currently doing the dequeue.
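For reference, here is roughly what netif_tx_wake_queue() looks like in
this kernel; this is a from-memory sketch rather than a verbatim quote,
and the netpoll special case is left out. The point is that the driver's
wake call already schedules the qdisc attached to that tx queue:

static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	/* Waking a stopped queue reschedules its qdisc, so another
	 * __netif_schedule() in dequeue_skb() would be redundant.
	 */
	if (test_and_clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state))
		__netif_schedule(dev_queue->qdisc);
}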
My bigger concern is the fact that one queue is back to stopping all
queues. By using one global XOFF state, you create head-of-line
blocking. I can see the merit in this approach, but the problem is that
for multiqueue the queues really should be independent. What your
change does is reduce the benefit of having multiqueue by throwing in a
new state which pretty much matches what netif_stop/wake_queue used to
do in the 2.6.26 version of tx multiqueue.
Also, changing dequeue_skb will likely cause additional issues for
several qdiscs, as it doesn't report anything up to parent qdiscs; as a
result you will end up with qdiscs like prio acting more like multiq,
because they won't know if a queue is empty, stopped, or throttled.
Also, I believe this will cause a panic on pfifo_fast and several other
qdiscs, because some of them check whether there are packets in the
queue and, if so, dequeue with the assumption that they will get a
packet out. You will need to add checks for this to resolve this issue.
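To illustrate the pattern in question, pfifo_fast's dequeue in this era
looks roughly like the following (a from-memory sketch, not a verbatim
quote of the tree); note that the qlen accounting assumes a non-empty
band always hands back a packet:

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			/* qlen is adjusted on the assumption that the
			 * dequeue below cannot come back empty-handed.
			 */
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}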
The one thing I liked about my approach was that after I was done you
could have prio as a parent of multiq leaves or multiq as the parent of
prio leaves and all the hardware queues would receive the packets in the
same order as the result.
Thanks,
Alex
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 15:16 ` Jarek Poplawski
2008-09-19 16:26 ` Duyck, Alexander H
@ 2008-09-19 16:37 ` Jarek Poplawski
1 sibling, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 16:37 UTC (permalink / raw)
To: Alexander Duyck; +Cc: netdev, herbert, davem, kaber
On Fri, Sep 19, 2008 at 05:16:32PM +0200, Jarek Poplawski wrote:
...
> Alexander, I guess you've seen my last messages/patches in this thread
> and noticed that this __netif_schedule() problem is present both in
> this RFC patch and sch_multiq: if skb isn't dequeued because of inner
> tx_queue stopped state check, there is missing __netif_schedule()
> before exit from qdisc_run().
Hmm... probably a false alarm. It seems there has to be some wake_queue
after all.
Sorry,
Jarek P.
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 16:26 ` Duyck, Alexander H
@ 2008-09-19 17:35 ` Jarek Poplawski
2008-09-19 18:01 ` Duyck, Alexander H
0 siblings, 1 reply; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 17:35 UTC (permalink / raw)
To: Duyck, Alexander H
Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au,
davem@davemloft.net, kaber@trash.net
On Fri, Sep 19, 2008 at 09:26:29AM -0700, Duyck, Alexander H wrote:
...
> Actually most of that is handled in the fact that netif_tx_wake_queue
> will call __netif_schedule when it decides to wake up a queue that has
> been stopped. Putting it in skb_dequeue is unnecessary,
You are right, I've just noticed this too, and I'll withdraw this change.
> and the way
> you did it would cause issues because you should be scheduling the root
> qdisc, not the one currently doing the dequeue.
I think, this is the right way (but useless here).
> My bigger concern is the fact one queue is back to stopping all queues.
> By using one global XOFF state, you create head-of-line blocking. I can
> see the merit in this approach but the problem is for multiqueue the
> queues really should be independent. What your change does is reduce the
> benefit of having multiqueue by throwing in a new state which pretty much
> matches what netif_stop/wake_queue used to do in the 2.6.26 version of tx
> multiqueue.
Do you mean netif_tx_stop_all_queues() and then netif_tx_wake_queue()
before netif_tx_wake_all_queues()? OK, I'll withdraw this patch too.
> Also changing dequeue_skb will likely cause additional issues for
> several qdiscs as it doesn't report anything up to parent queues, as a
> result you will end up with qdiscs like prio acting more like multiq
> because they won't know if a queue is empty, stopped, or throttled.
> Also I believe this will cause a panic on pfifo_fast and several other
> qdiscs because some check to see if there are packets in the queue and
> if so dequeue with the assumption that they will get a packet out. You
> will need to add checks for this to resolve this issue.
I really can't get your point. Don't you mean skb_dequeue()?
dequeue_skb() is used only by qdisc_restart()...
> The one thing I liked about my approach was that after I was done you
> could have prio as a parent of multiq leaves or multiq as the parent of
> prio leaves and all the hardware queues would receive the packets in the
> same order as the result.
I'm not against your approach, but I'd like to be sure these
complications are really worth it. Of course, if my proposal, the
first take of 3 patches, doesn't work as you predict (and I doubt it),
then we can forget about it.
Thanks,
Jarek P.
* Re: [take 2] Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
2008-09-19 13:07 ` [PATCH] pkt_sched: Fix TX state checking in qdisc_run() Jarek Poplawski
2008-09-19 14:44 ` [PATCH take2] " Jarek Poplawski
@ 2008-09-19 17:46 ` Jarek Poplawski
2 siblings, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 17:46 UTC (permalink / raw)
To: davem; +Cc: Alexander Duyck, Alexander Duyck, netdev, herbert, kaber
Jarek Poplawski wrote, On 09/19/2008 12:32 PM:
> Take 2:
> I missed __netif_schedule() in my patch, so the update below.
David,
After rethinking and reading Alexander's opinion I see this
change isn't needed, so I withdraw this take 2.
Please still consider the first version of this patch (with your
2 patches).
Thanks,
Jarek P.
* Re: [PATCH take2] pkt_sched: Fix TX state checking in qdisc_run()
2008-09-19 14:44 ` [PATCH take2] " Jarek Poplawski
@ 2008-09-19 17:49 ` Jarek Poplawski
2008-09-21 5:18 ` David Miller
1 sibling, 0 replies; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 17:49 UTC (permalink / raw)
To: David Miller; +Cc: Alexander Duyck, Alexander Duyck, netdev, herbert, kaber
Jarek Poplawski wrote, On 09/19/2008 04:44 PM:
> Alas this time the changelog needs more details.
>
> Sorry,
> Jarek P.
> -----------------> (take 2)
>
> pkt_sched: Fix TX state checking in qdisc_run()
David,
After reading Alexander's opinion I withdraw this patch (take 1 & 2)
as well.
Sorry,
Jarek P.
* RE: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 17:35 ` Jarek Poplawski
@ 2008-09-19 18:01 ` Duyck, Alexander H
2008-09-19 18:51 ` Jarek Poplawski
0 siblings, 1 reply; 20+ messages in thread
From: Duyck, Alexander H @ 2008-09-19 18:01 UTC (permalink / raw)
To: Jarek Poplawski
Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au,
davem@davemloft.net, kaber@trash.net
Jarek Poplawski wrote:
>> Also changing dequeue_skb will likely cause additional issues for
>> several qdiscs as it doesn't report anything up to parent queues, as
>> a result you will end up with qdiscs like prio acting more like
>> multiq because they won't know if a queue is empty, stopped, or
>> throttled.
>> Also I believe this will cause a panic on pfifo_fast and several
>> other qdiscs because some check to see if there are packets in the
>> queue and if so dequeue with the assumption that they will get a
>> packet out. You will need to add checks for this to resolve this
>> issue.
>
> I really can't get your point. Don't you mean skb_dequeue()?
> dequeue_skb() is used only by qdisc_restart()...
You're right. I misread it as skb_dequeue. The problem, though, is that
you are still relying on q->requeue, which last I knew was only being
used by a few qdiscs. In addition you will still be taking the cpu hit
for the dequeue/requeue on several qdiscs which can't use q->requeue
without violating the way they were supposed to work.
>> The one thing I liked about my approach was that after I was done you
>> could have prio as a parent of multiq leaves or multiq as the parent
>> of prio leaves and all the hardware queues would receive the packets
>> in the same order as the result.
>
> I'm not against your approach, but I'd like to be sure these
> complications are really worth of it. Of course, if my proposal, the
> first take of 3 patches, doesn't work as you predict (and I doubt),
> then we can forget about it.
Well, when you get some testing done let me know. The main areas I am
concerned with are:
1. CPU utilization stays the same regardless of which queue is used.
2. Maintain current qdisc behavior on a per hw queue basis.
3. Avoid head-of-line blocking where it applies,
   for example: prio band 0 not blocked by band 1, or 1 by 2, etc.,
   or multiq not blocked on any band because one band is blocked.
As long as all 3 criteria are met I would be happy with any solution
provided.
Thanks,
Alex
* Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 18:01 ` Duyck, Alexander H
@ 2008-09-19 18:51 ` Jarek Poplawski
2008-09-19 21:43 ` Duyck, Alexander H
0 siblings, 1 reply; 20+ messages in thread
From: Jarek Poplawski @ 2008-09-19 18:51 UTC (permalink / raw)
To: Duyck, Alexander H
Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au,
davem@davemloft.net, kaber@trash.net
On Fri, Sep 19, 2008 at 11:01:12AM -0700, Duyck, Alexander H wrote:
> Jarek Poplawski wrote:
>
> >> Also changing dequeue_skb will likely cause additional issues for
> >> several qdiscs as it doesn't report anything up to parent queues, as
> >> a result you will end up with qdiscs like prio acting more like
> >> multiq because they won't know if a queue is empty, stopped, or
> >> throttled.
> >> Also I believe this will cause a panic on pfifo_fast and several
> >> other qdiscs because some check to see if there are packets in the
> >> queue and if so dequeue with the assumption that they will get a
> >> packet out. You will need to add checks for this to resolve this
> >> issue.
> >
> > I really can't get your point. Don't you mean skb_dequeue()?
> > dequeue_skb() is used only by qdisc_restart()...
> You're right. I misread it as skb_dequeue. The problem is though you
> are still relying on q->requeue which last I knew was only being used
> a few qdiscs. In addition you will still be taking the cpu hit for the
> dequeue/requeue on several qdiscs which can't use q->requeue without
> violating the way they were supposed to work.
I'm not sure I understand your point. No qdisc will use any new
q->requeue. All qdiscs still dequeue as before, but on a tx failure the
skb isn't requeued back into the qdisc; it is queued by qdisc_restart()
on a private list (of the root qdisc) and then retried before any new
dequeuing. There will be no cpu hit, because each new try is only an
skb_peek() on this list until the tx queue of the skb at the head is
working again.
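In other words, the flow is roughly the following; this is a simplified
sketch of qdisc_restart() with the root-lock juggling and the
tx-lock-collision case omitted, so don't read it as the exact kernel
code:

static inline int qdisc_restart(struct Qdisc *q)
{
	struct net_device *dev = qdisc_dev(q);
	struct netdev_queue *txq;
	struct sk_buff *skb;
	int ret = NETDEV_TX_BUSY;

	/* dequeue_skb() peeks q->requeue first: it unlinks the head skb
	 * only if its tx queue is usable, returns NULL if that queue is
	 * still stopped, and falls back to q->dequeue() when the list
	 * is empty.
	 */
	skb = dequeue_skb(q);
	if (!skb)
		return 0;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);
	HARD_TX_UNLOCK(dev, txq);

	if (ret == NETDEV_TX_OK)
		/* Sent: tell __qdisc_run() whether more packets wait. */
		ret = qdisc_qlen(q);
	else
		/* Busy: park the skb on the root qdisc's q->requeue
		 * instead of requeuing it into the qdisc, and let
		 * dev_requeue_skb() call __netif_schedule() for a retry.
		 */
		ret = dev_requeue_skb(skb, q);

	return ret;
}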
> >> The one thing I liked about my approach was that after I was done you
> >> could have prio as a parent of multiq leaves or multiq as the parent
> >> of prio leaves and all the hardware queues would receive the packets
> >> in the same order as the result.
> >
> > I'm not against your approach, but I'd like to be sure these
> > complications are really worth of it. Of course, if my proposal, the
> > first take of 3 patches, doesn't work as you predict (and I doubt),
> > then we can forget about it.
> Well when you get some testing done let me know. The main areas I am
> concerned with are:
>
> 1. CPU utilization stays the same regardless of which queue used.
> 2. Maintain current qdisc behavior on a per hw queue basis.
> 3. Avoid head-of-line blocking where it applies
> for example: prio band 0 not blocked by band 1, or 1 by 2, etc..
> or
> multiq not blocked on any band due to 1 band blocked
>
> As long as all 3 criteria are met I would be happy with any solution
> provided.
Alas, this testing is both the weakest side of this solution (I'm not
going to do it) and its strongest, because it's mostly David's work
(my patch could actually be skipped without much impact). So, I hope
you aren't suggesting this could be insufficiently tested or otherwise
not the best?!
Anyway, I don't insist on "my" solution. I simply think it's reasonable,
and not for personal reasons, since I didn't even come up with it.
I think, if it's so wrong, you should have no problem showing this in
one little test. And maybe David has changed his mind in the meantime
and will choose your way even without this.
Thanks,
Jarek P.
* RE: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue.
2008-09-19 18:51 ` Jarek Poplawski
@ 2008-09-19 21:43 ` Duyck, Alexander H
0 siblings, 0 replies; 20+ messages in thread
From: Duyck, Alexander H @ 2008-09-19 21:43 UTC (permalink / raw)
To: Jarek Poplawski
Cc: netdev@vger.kernel.org, herbert@gondor.apana.org.au,
davem@davemloft.net, kaber@trash.net
Jarek Poplawski wrote:
> On Fri, Sep 19, 2008 at 11:01:12AM -0700, Duyck, Alexander H wrote:
> I'm not sure I understand your point. No qdisc will use any new
> q->requeue. All qdisc do dequeue, but on tx fail an skb isn't requeued
> back, but queued by qdisc_restart() in a private list (of root qdisc),
> and then tried before any new dequeuing. There will be no cpu hit,
> because each next try is only skb_peek() on this list, until tx queue
> of an skb at the head is working.
It was me missing a piece. I didn't look over the portion where Dave
changed requeue to always use the requeue list. This breaks multiq
and we are right back to head-of-line blocking. Also, I think patch
1 will break things, since queueing a packet with skb->next already set
will cause issues because, last I knew, the list uses the prev and next
pointers.
> Alas this testing is both the weekest (I'm not going to do this), and
> the strongest side of this solution, because it's mostly David's work
> (my patch could be actually skipped without much impact). So, I hope,
> you don't suggest this could be not enough tested or otherwise not the
> best?!
>
> Anyway, I don't insist on "my" solution. I simply think it's
> reasonable, and it's not for private reasons, because I didn't even
> find this out. I think, if it's so wrong you should have no problem
> to show this in one little test. And maybe David changed his mind in
> the meantime and he will choose your way even without this.
I didn't mean to insist on this, if that is how it came across. My main
concern is that I just don't want to break what I just got working.
Most of this requeue to q->requeue stuff will break multiq and
introduce head-of-line blocking. When that happens it will fall back
into my lap to try to resolve it, so that is why I want to avoid it in
the first place.
Anyway I'm out for the next two weeks with at least the first week of
that being without email access. I hope I provided some valuable
input so that this can head in the right direction.
Thanks,
Alex
* Re: [PATCH take2] pkt_sched: Fix TX state checking in qdisc_run()
2008-09-19 14:44 ` [PATCH take2] " Jarek Poplawski
2008-09-19 17:49 ` Jarek Poplawski
@ 2008-09-21 5:18 ` David Miller
1 sibling, 0 replies; 20+ messages in thread
From: David Miller @ 2008-09-21 5:18 UTC (permalink / raw)
To: jarkao2; +Cc: alexander.duyck, alexander.h.duyck, netdev, herbert, kaber
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Fri, 19 Sep 2008 16:44:50 +0200
> pkt_sched: Fix TX state checking in qdisc_run()
>
> Current check wrongly uses the state of the first tx queue for all tx
> queues in case of non-default qdiscs. This patch brings back per dev
> __LINK_STATE_XOFF flag, which is set when all tx queues are stopped.
> This check is needed in qdisc_run() to avoid useless __netif_schedule()
> calls. The wrongness of this check was first noticed by Herbert Xu.
>
> Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
I'm dropping this and the patches in the ->smart_dequeue thread as well.
It simply isn't crystal clear how to proceed at this point.
Thread overview: 20+ messages
2008-09-18 6:43 [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Alexander Duyck
2008-09-18 6:56 ` Alexander Duyck
2008-09-18 9:46 ` David Miller
2008-09-18 14:51 ` Alexander Duyck
2008-09-18 19:44 ` Jarek Poplawski
2008-09-19 1:11 ` Alexander Duyck
2008-09-19 9:17 ` Jarek Poplawski
2008-09-19 10:32 ` [take 2] " Jarek Poplawski
2008-09-19 13:07 ` [PATCH] pkt_sched: Fix TX state checking in qdisc_run() Jarek Poplawski
2008-09-19 14:44 ` [PATCH take2] " Jarek Poplawski
2008-09-19 17:49 ` Jarek Poplawski
2008-09-21 5:18 ` David Miller
2008-09-19 17:46 ` [take 2] Re: [RFC PATCH] sched: only dequeue if packet can be queued to hardware queue Jarek Poplawski
2008-09-19 15:16 ` Jarek Poplawski
2008-09-19 16:26 ` Duyck, Alexander H
2008-09-19 17:35 ` Jarek Poplawski
2008-09-19 18:01 ` Duyck, Alexander H
2008-09-19 18:51 ` Jarek Poplawski
2008-09-19 21:43 ` Duyck, Alexander H
2008-09-19 16:37 ` Jarek Poplawski