* Re: [PATCH] (3/3) netem: adjust parent qlen when duplicating
2005-05-04 1:54 ` Patrick McHardy
@ 2005-05-04 1:57 ` Patrick McHardy
2005-05-04 5:01 ` Stephen Hemminger
2005-05-04 17:09 ` [RFC] alternate way of handling netem duplication Stephen Hemminger
1 sibling, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2005-05-04 1:57 UTC (permalink / raw)
To: David S. Miller; +Cc: shemminger, netdev, netem
Patrick McHardy wrote:
> David S. Miller wrote:
>
>>On Wed, 04 May 2005 01:48:02 +0200
>>Patrick McHardy <kaber@trash.net> wrote:
>>
>>
>>
>>>That's what I already suggested, it should be pretty simple to do
>>>so. I'll send a patch once your tree appears on kernel.org.
>
>
> This one should work. It keeps a pointer to the parent qdisc in struct
> Qdisc and adjusts q.qlen of all parent qdiscs in netem. The __parent
> pointer already used by CBQ is renamed to parentq and reused for this.
> To avoid confusion, the parent classid is also renamed to parentcl. It
> should work with all currently included classful qdiscs except HFSC.
> Statistics are not correctly updated (and can't be without support from
> the qdisc, since classes are internal to it); we need to keep this in
> mind in case a qdisc like RED, which uses qstats.backlog for its
> calculations, is ever converted to a classful one. Fixing HFSC is very
> low priority; it could only use netem in work-conserving mode anyway.
>
> My favourite solution would be to avoid this hack entirely and let tc
> actions handle duplication, as Jamal suggested. My previous objection,
> that the duplicated packet does not necessarily get the same
> classification result as the original, is actually a plus since it
> provides more flexibility. Stephen, what do you think about it?
>
> Signed-off-by: Patrick McHardy <kaber@trash.net>
>
Oops, I've attached an old patch with a bug. This one is the
correct one.
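The idea is small; as a minimal sketch (using the names from the patch
below), whenever netem queues a duplicate it now walks the new parentq
chain so that every qdisc above it accounts for the extra packet:

	/* sketch: one duplicate was queued below, so every qdisc
	 * from netem up to the root holds one more packet */
	struct Qdisc *qp;

	for (qp = sch; qp != NULL; qp = qp->parentq)
		qp->q.qlen++;

This replaces the old qdisc_lookup() of the immediate parent, which
could only fix up a single level of the hierarchy.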
Index: include/net/sch_generic.h
===================================================================
--- 591ce19741741438606ab75a45ac9f973cbb4787/include/net/sch_generic.h (mode:100644 sha1:c57504b3b51819522dc9f868aa9a208d61dd3892)
+++ uncommitted/include/net/sch_generic.h (mode:100644)
@@ -35,7 +35,7 @@
int padded;
struct Qdisc_ops *ops;
u32 handle;
- u32 parent;
+ u32 parentcl;
atomic_t refcnt;
struct sk_buff_head q;
struct net_device *dev;
@@ -49,10 +49,11 @@
int (*reshape_fail)(struct sk_buff *skb,
struct Qdisc *q);
- /* This field is deprecated, but it is still used by CBQ
+ /* This field is deprecated, but it is still used by CBQ and netem
* and it will live until better solution will be invented.
+ * Valid only while qdisc is grafted to its parent.
*/
- struct Qdisc *__parent;
+ struct Qdisc *parentq;
};
struct Qdisc_class_ops
Index: net/sched/sch_api.c
===================================================================
--- 591ce19741741438606ab75a45ac9f973cbb4787/net/sched/sch_api.c (mode:100644 sha1:07977f8f2679b30cf93808f93fbbcce3c3dbc776)
+++ uncommitted/net/sched/sch_api.c (mode:100644)
@@ -378,9 +378,11 @@
if (cops) {
unsigned long cl = cops->get(parent, classid);
if (cl) {
+ if (new) {
+ new->parentcl = classid;
+ new->parentq = parent;
+ }
err = cops->graft(parent, cl, new, old);
- if (new)
- new->parent = classid;
cops->put(parent, cl);
}
}
@@ -855,7 +857,7 @@
q_idx++;
continue;
}
- if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+ if (tc_fill_qdisc(skb, q, q->parentcl, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
read_unlock_bh(&qdisc_tree_lock);
goto done;
@@ -1289,7 +1291,6 @@
subsys_initcall(pktsched_init);
-EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
Index: net/sched/sch_cbq.c
===================================================================
--- 591ce19741741438606ab75a45ac9f973cbb4787/net/sched/sch_cbq.c (mode:100644 sha1:d43e3b8cbf6af27a25ab7b9d2aee82a32f8010eb)
+++ uncommitted/net/sched/sch_cbq.c (mode:100644)
@@ -419,9 +419,6 @@
return ret;
}
-#ifdef CONFIG_NET_CLS_POLICE
- cl->q->__parent = sch;
-#endif
if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
sch->q.qlen++;
sch->bstats.packets++;
@@ -456,7 +453,6 @@
#ifdef CONFIG_NET_CLS_POLICE
q->rx_class = cl;
- cl->q->__parent = sch;
#endif
if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
sch->q.qlen++;
@@ -690,7 +686,7 @@
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
int len = skb->len;
- struct Qdisc *sch = child->__parent;
+ struct Qdisc *sch = child->parentq;
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl = q->rx_class;
@@ -701,7 +697,6 @@
cbq_mark_toplevel(q, cl);
q->rx_class = cl;
- cl->q->__parent = sch;
if (cl->q->enqueue(skb, cl->q) == 0) {
sch->q.qlen++;
Index: net/sched/sch_generic.c
===================================================================
--- 591ce19741741438606ab75a45ac9f973cbb4787/net/sched/sch_generic.c (mode:100644 sha1:87e48a4e105133ca3d0407b5c2d84f1b0e3a72c4)
+++ uncommitted/net/sched/sch_generic.c (mode:100644)
@@ -501,7 +501,7 @@
/* unlink inner qdiscs from dev->qdisc_list immediately */
list_for_each_entry(cq, &cql, list)
list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
- if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
+ if (TC_H_MAJ(q->parentcl) == TC_H_MAJ(cq->handle)) {
if (q->ops->cl_ops == NULL)
list_del_init(&q->list);
else
Index: net/sched/sch_netem.c
===================================================================
--- 591ce19741741438606ab75a45ac9f973cbb4787/net/sched/sch_netem.c (mode:100644 sha1:e0c9fbe73b15cee32b44f4469e1ac5eeb9849267)
+++ uncommitted/net/sched/sch_netem.c (mode:100644)
@@ -230,11 +230,9 @@
* queue, the parent's qlen accounting gets confused,
* so fix it.
*/
- qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
- if (qp)
+ for (qp = sch; qp; qp = qp->parentq)
qp->q.qlen++;
- sch->q.qlen++;
sch->bstats.bytes += skb2->len;
sch->bstats.packets++;
} else
* [RFC] alternate way of handling netem duplication
2005-05-04 1:54 ` Patrick McHardy
2005-05-04 1:57 ` Patrick McHardy
@ 2005-05-04 17:09 ` Stephen Hemminger
1 sibling, 0 replies; 10+ messages in thread
From: Stephen Hemminger @ 2005-05-04 17:09 UTC (permalink / raw)
To: Patrick McHardy; +Cc: David S. Miller, netdev, netem
Here is an alternate way of handling duplication.
What it does is mark each packet on enqueue with a repcount.
Then on dequeue it cheats and bypasses the other qdiscs, sending the
duplicate directly to the device.
When duplication takes place in the dequeue there is a harmless race:
qdisc_xmit (formerly part of qdisc_restart) drops queue_lock and
reacquires it, so netem's dequeue returns NULL, qdisc_restart returns
qlen > 0, and qdisc_run calls qdisc_restart again, which picks up the
same packet.
It is untested, and reordering needs fixing.
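In outline, the dequeue side looks like this (a simplified sketch of
the patch below, not the complete function; error handling trimmed):

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;

		if (cb->repcount > 1) {
			/* transmit a clone directly, bypassing the
			 * parent qdiscs so their qlen stays correct */
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

			if (skb2 && qdisc_xmit(sch->dev, skb2) == NETDEV_TX_OK)
				--cb->repcount;
			/* leave the original queued; the next
			 * qdisc_restart pass picks it up */
			q->qdisc->ops->requeue(skb, q->qdisc);
			return NULL;
		}
		sch->q.qlen--;
	}
	return skb;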
Index: netem-2.6.12-rc3/net/sched/sch_netem.c
===================================================================
--- netem-2.6.12-rc3.orig/net/sched/sch_netem.c
+++ netem-2.6.12-rc3/net/sched/sch_netem.c
@@ -59,7 +59,6 @@ struct netem_sched_data {
u32 latency;
u32 loss;
u32 limit;
- u32 counter;
u32 gap;
u32 jitter;
u32 duplicate;
@@ -78,6 +77,7 @@ struct netem_sched_data {
/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
psched_time_t time_to_send;
+ int repcount;
};
/* init_crandom - initialize correlated random number generator
@@ -137,35 +137,6 @@ static long tabledist(unsigned long mu,
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
-/* Put skb in the private delayed queue. */
-static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
-{
- struct netem_sched_data *q = qdisc_priv(sch);
- psched_tdiff_t td;
- psched_time_t now;
-
- PSCHED_GET_TIME(now);
- td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
-
- /* Always queue at tail to keep packets in order */
- if (likely(q->delayed.qlen < q->limit)) {
- struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
-
- PSCHED_TADD2(now, td, cb->time_to_send);
-
- pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
- now, cb->time_to_send);
-
- __skb_queue_tail(&q->delayed, skb);
- return NET_XMIT_SUCCESS;
- }
-
- pr_debug("netem_delay: queue over limit %d\n", q->limit);
- sch->qstats.overlimits++;
- kfree_skb(skb);
- return NET_XMIT_DROP;
-}
-
/*
* Move a packet that is ready to send from the delay holding
* list to the underlying qdisc.
@@ -206,64 +177,45 @@ static int netem_run(struct Qdisc *sch)
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
- int ret;
+ struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
+ psched_tdiff_t td;
+ psched_time_t now;
pr_debug("netem_enqueue skb=%p\n", skb);
+ cb->repcount = 1;
+ PSCHED_GET_TIME(now);
+ td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
+ PSCHED_TADD2(now, td, cb->time_to_send);
+
+ pr_debug("netem_enqueue: skb=%p now=%llu tosend=%llu\n", skb,
+ now, cb->time_to_send);
+
/* Random packet drop 0 => none, ~0 => all */
- if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
- pr_debug("netem_enqueue: random loss\n");
- sch->qstats.drops++;
- kfree_skb(skb);
- return 0; /* lie about loss so TCP doesn't know */
- }
+ if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+ --cb->repcount;
/* Random duplication */
- if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
- struct sk_buff *skb2;
+ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+ ++cb->repcount;
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
- struct Qdisc *qp;
-
- /* Since one packet can generate two packets in the
- * queue, the parent's qlen accounting gets confused,
- * so fix it.
- */
- qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
- if (qp)
- qp->q.qlen++;
-
- sch->q.qlen++;
- sch->bstats.bytes += skb2->len;
- sch->bstats.packets++;
- } else
- sch->qstats.drops++;
- }
-
- /* If doing simple delay then gap == 0 so all packets
- * go into the delayed holding queue
- * otherwise if doing out of order only "1 out of gap"
- * packets will be delayed.
- */
- if (q->counter < q->gap) {
- ++q->counter;
- ret = q->qdisc->enqueue(skb, q->qdisc);
- } else {
- q->counter = 0;
- ret = netem_delay(sch, skb);
- netem_run(sch);
+ if (cb->repcount == 0) {
+ kfree_skb(skb);
+ return 0;
}
- if (likely(ret == NET_XMIT_SUCCESS)) {
- sch->q.qlen++;
- sch->bstats.bytes += skb->len;
- sch->bstats.packets++;
- } else
+ if (q->delayed.qlen >= q->limit) {
sch->qstats.drops++;
-
- pr_debug("netem: enqueue ret %d\n", ret);
- return ret;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ __skb_queue_tail(&q->delayed, skb);
+ sch->q.qlen++;
+ sch->bstats.bytes += skb->len;
+ sch->bstats.packets++;
+
+ return NET_XMIT_SUCCESS;
}
/* Requeue packets but don't change time stamp */
@@ -302,7 +254,23 @@ static struct sk_buff *netem_dequeue(str
skb = q->qdisc->dequeue(q->qdisc);
if (skb) {
- pr_debug("netem_dequeue: return skb=%p\n", skb);
+ struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
+
+ pr_debug("netem_dequeue: skb=%p count=%d\n", skb, cb->repcount);
+ if (cb->repcount > 1) {
+ struct sk_buff *skb2;
+
+ /* Go direct to device to avoid problems
+ with parent qlen */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2) {
+ if (qdisc_xmit(sch->dev, skb2) == NETDEV_TX_OK)
+ --cb->repcount;
+ else
+ kfree_skb(skb2);
+ }
+ /* Always put the original back; the next
+ qdisc_restart pass picks it up again. */
+ q->qdisc->ops->requeue(skb, q->qdisc);
+ return NULL;
+ }
+
sch->q.qlen--;
sch->flags &= ~TCQ_F_THROTTLED;
}
@@ -459,7 +427,6 @@ static int netem_init(struct Qdisc *sch,
init_timer(&q->timer);
q->timer.function = netem_watchdog;
q->timer.data = (unsigned long) sch;
- q->counter = 0;
q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
if (!q->qdisc) {
Index: netem-2.6.12-rc3/include/net/pkt_sched.h
===================================================================
--- netem-2.6.12-rc3.orig/include/net/pkt_sched.h
+++ netem-2.6.12-rc3/include/net/pkt_sched.h
@@ -234,6 +234,8 @@ static inline void qdisc_run(struct net_
/* NOTHING */;
}
+extern int qdisc_xmit(struct net_device *dev, struct sk_buff *skb);
+
extern int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res);
Index: netem-2.6.12-rc3/net/sched/sch_api.c
===================================================================
--- netem-2.6.12-rc3.orig/net/sched/sch_api.c
+++ netem-2.6.12-rc3/net/sched/sch_api.c
@@ -1289,7 +1289,6 @@ static int __init pktsched_init(void)
subsys_initcall(pktsched_init);
-EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
Index: netem-2.6.12-rc3/net/sched/sch_generic.c
===================================================================
--- netem-2.6.12-rc3.orig/net/sched/sch_generic.c
+++ netem-2.6.12-rc3/net/sched/sch_generic.c
@@ -77,7 +77,78 @@ void qdisc_unlock_tree(struct net_device
dev->queue_lock and dev->xmit_lock are mutually exclusive,
if one is grabbed, another must be free.
*/
+int qdisc_xmit(struct net_device *dev, struct sk_buff *skb)
+{
+ unsigned need_lock = !(dev->features & NETIF_F_LLTX);
+
+ /*
+ * When the driver has LLTX set it does its own locking
+ * in start_xmit. No need to add additional overhead by
+ * locking again. These checks are worth it because
+ * even uncongested locks can be quite expensive.
+ * The driver can do trylock like here too, in case
+ * of lock congestion it should return -1 and the packet
+ * will be requeued.
+ */
+ if (need_lock) {
+ if (!spin_trylock(&dev->xmit_lock))
+ goto collision;
+ /* Remember that the driver is grabbed by us. */
+ dev->xmit_lock_owner = smp_processor_id();
+ }
+
+ /* And release queue */
+ spin_unlock(&dev->queue_lock);
+
+ if (likely(!netif_queue_stopped(dev))) {
+ int ret;
+ if (netdev_nit)
+ dev_queue_xmit_nit(skb, dev);
+
+ ret = dev->hard_start_xmit(skb, dev);
+ if (likely(ret == NETDEV_TX_OK)) {
+ if (need_lock) {
+ dev->xmit_lock_owner = -1;
+ spin_unlock(&dev->xmit_lock);
+ }
+ spin_lock(&dev->queue_lock);
+ return NETDEV_TX_OK;
+ }
+ if (ret == NETDEV_TX_LOCKED && !need_lock) {
+ spin_lock(&dev->queue_lock);
+ goto collision;
+ }
+
+ }
+
+ /* NETDEV_TX_BUSY - we need to requeue */
+ /* Release the driver */
+ if (need_lock) {
+ dev->xmit_lock_owner = -1;
+ spin_unlock(&dev->xmit_lock);
+ }
+ spin_lock(&dev->queue_lock);
+ return NETDEV_TX_BUSY;
+
+ collision:
+ /* So, someone grabbed the driver. */
+
+ /* It may be transient configuration error,
+ when hard_start_xmit() recurses. We detect
+ it by checking xmit owner and drop the
+ packet when deadloop is detected.
+ */
+ if (dev->xmit_lock_owner == smp_processor_id()) {
+ kfree_skb(skb);
+ if (net_ratelimit())
+ printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
+ return NETDEV_TX_OK;
+ }
+
+ __get_cpu_var(netdev_rx_stat).cpu_collision++;
+ return NETDEV_TX_BUSY;
+}
/* Kick device.
Note, that this procedure can be called by a watchdog timer, so that
@@ -96,91 +167,19 @@ int qdisc_restart(struct net_device *dev
struct sk_buff *skb;
/* Dequeue packet */
- if ((skb = q->dequeue(q)) != NULL) {
- unsigned nolock = (dev->features & NETIF_F_LLTX);
- /*
- * When the driver has LLTX set it does its own locking
- * in start_xmit. No need to add additional overhead by
- * locking again. These checks are worth it because
- * even uncongested locks can be quite expensive.
- * The driver can do trylock like here too, in case
- * of lock congestion it should return -1 and the packet
- * will be requeued.
- */
- if (!nolock) {
- if (!spin_trylock(&dev->xmit_lock)) {
- collision:
- /* So, someone grabbed the driver. */
-
- /* It may be transient configuration error,
- when hard_start_xmit() recurses. We detect
- it by checking xmit owner and drop the
- packet when deadloop is detected.
- */
- if (dev->xmit_lock_owner == smp_processor_id()) {
- kfree_skb(skb);
- if (net_ratelimit())
- printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
- return -1;
- }
- __get_cpu_var(netdev_rx_stat).cpu_collision++;
- goto requeue;
- }
- /* Remember that the driver is grabbed by us. */
- dev->xmit_lock_owner = smp_processor_id();
- }
-
- {
- /* And release queue */
- spin_unlock(&dev->queue_lock);
-
- if (!netif_queue_stopped(dev)) {
- int ret;
- if (netdev_nit)
- dev_queue_xmit_nit(skb, dev);
-
- ret = dev->hard_start_xmit(skb, dev);
- if (ret == NETDEV_TX_OK) {
- if (!nolock) {
- dev->xmit_lock_owner = -1;
- spin_unlock(&dev->xmit_lock);
- }
- spin_lock(&dev->queue_lock);
- return -1;
- }
- if (ret == NETDEV_TX_LOCKED && nolock) {
- spin_lock(&dev->queue_lock);
- goto collision;
- }
- }
-
- /* NETDEV_TX_BUSY - we need to requeue */
- /* Release the driver */
- if (!nolock) {
- dev->xmit_lock_owner = -1;
- spin_unlock(&dev->xmit_lock);
- }
- spin_lock(&dev->queue_lock);
- q = dev->qdisc;
- }
-
- /* Device kicked us out :(
- This is possible in three cases:
-
- 0. driver is locked
- 1. fastroute is enabled
- 2. device cannot determine busy state
- before start of transmission (f.e. dialout)
- 3. device is buggy (ppp)
- */
+ skb = q->dequeue(q);
+ if (likely(skb)) {
+ if (likely(qdisc_xmit(dev, skb) == NETDEV_TX_OK))
+ return -1;
-requeue:
+ q = dev->qdisc;
q->ops->requeue(skb, q);
netif_schedule(dev);
return 1;
+ } else {
+ BUG_ON((int) q->q.qlen < 0);
+ return q->q.qlen;
}
- BUG_ON((int) q->q.qlen < 0);
- return q->q.qlen;
}
static void dev_watchdog(unsigned long arg)
@@ -604,6 +603,7 @@ EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(noop_qdisc_ops);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
+EXPORT_SYMBOL(qdisc_xmit);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_restart);
EXPORT_SYMBOL(qdisc_lock_tree);