[PATCH 5/5 v4] net: add old_queue

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
@ 2010-12-16  4:56 Changli Gao
  2010-12-16  5:47 ` Eric Dumazet
  2010-12-17 13:09 ` jamal
  0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-16  4:56 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem, Changli Gao

For the skbs returned from ifb, we should use the queue_mapping
saved before ifb.

We save old queue_mapping in old_queue_mapping just before calling 
dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
just before reinjecting the ingress packets.

A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
member.

netem_skb_cb is changed to contain qdisc_skb_cb.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
v4: don't restore the old_queue_mapping for egress packets.
v3: use the method from Eric to allocate memory from skb->cb. Thank him.
v2: save old_queue_mapping in skb->cb
 drivers/net/ifb.c         |    1 +
 include/linux/netdevice.h |   10 ++++++++++
 include/net/sch_generic.h |    3 ++-
 net/core/dev.c            |   15 ++++++++++-----
 net/sched/act_mirred.c    |    1 +
 net/sched/sch_netem.c     |    8 ++++----
 6 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 918a38e..1632345 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -96,6 +96,7 @@ static void ri_tasklet(unsigned long arg)
 			dev_queue_xmit(skb);
 		} else if (from & AT_INGRESS) {
 			skb_pull(skb, skb->dev->hard_header_len);
+			skb->queue_mapping = dev_skb_cb(skb)->old_queue_mapping;
 			netif_rx(skb);
 		} else
 			BUG();
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d31bc3c..6f128e3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1295,6 +1295,16 @@ struct napi_gro_cb {
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
 
+struct dev_skb_cb {
+	u16		old_queue_mapping;
+};
+
+static inline struct dev_skb_cb *dev_skb_cb(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct dev_skb_cb));
+	return (struct dev_skb_cb *)skb->cb;
+}
+
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	struct net_device	*dev;	/* NULL is wildcarded here	     */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ea1f8a8..52c32e3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -198,8 +198,8 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
+	struct dev_skb_cb	dev_cb; /* must be the first */
 	unsigned int		pkt_len;
-	char			data[];
 };
 
 static inline int qdisc_qlen(struct Qdisc *q)
@@ -209,6 +209,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
 
 static inline struct qdisc_skb_cb *qdisc_skb_cb(struct sk_buff *skb)
 {
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct qdisc_skb_cb));
 	return (struct qdisc_skb_cb *)skb->cb;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..bf5ced5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1901,10 +1901,15 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 }
 
 struct dev_gso_cb {
-	void (*destructor)(struct sk_buff *skb);
+	struct dev_skb_cb	dev_cb; /* must be the first */
+	void			(*destructor)(struct sk_buff *skb);
 };
 
-#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+static inline struct dev_gso_cb *dev_gso_cb(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct dev_gso_cb));
+	return (struct dev_gso_cb *)skb->cb;
+}
 
 static void dev_gso_skb_destructor(struct sk_buff *skb)
 {
@@ -1918,7 +1923,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
 		kfree_skb(nskb);
 	} while (skb->next);
 
-	cb = DEV_GSO_CB(skb);
+	cb = dev_gso_cb(skb);
 	if (cb->destructor)
 		cb->destructor(skb);
 }
@@ -1947,7 +1952,7 @@ static int dev_gso_segment(struct sk_buff *skb)
 		return PTR_ERR(segs);
 
 	skb->next = segs;
-	DEV_GSO_CB(skb)->destructor = skb->destructor;
+	dev_gso_cb(skb)->destructor = skb->destructor;
 	skb->destructor = dev_gso_skb_destructor;
 
 	return 0;
@@ -2103,7 +2108,7 @@ gso:
 
 out_kfree_gso_skb:
 	if (likely(skb->next == NULL))
-		skb->destructor = DEV_GSO_CB(skb)->destructor;
+		skb->destructor = dev_gso_cb(skb)->destructor;
 out_kfree_skb:
 	kfree_skb(skb);
 out:
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be..419e82f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -197,6 +197,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 
 	skb2->skb_iif = skb->dev->ifindex;
 	skb2->dev = dev;
+	dev_skb_cb(skb2)->old_queue_mapping = skb->queue_mapping;
 	dev_queue_xmit(skb2);
 	err = 0;
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e5593c0..c2cf72f 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,14 +77,14 @@ struct netem_sched_data {
 
 /* Time stamp put into socket buffer control block */
 struct netem_skb_cb {
-	psched_time_t	time_to_send;
+	struct qdisc_skb_cb	qdisc_cb; /* must be the first */
+	psched_time_t		time_to_send;
 };
 
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
-	BUILD_BUG_ON(sizeof(skb->cb) <
-		sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
-	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct netem_skb_cb));
+	return (struct netem_skb_cb *)skb->cb;
 }
 
 /* init_crandom - initialize correlated random number generator

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-16  4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
@ 2010-12-16  5:47 ` Eric Dumazet
  2010-12-17 13:09 ` jamal
  1 sibling, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-12-16  5:47 UTC (permalink / raw)
  To: Changli Gao
  Cc: Jamal Hadi Salim, David S. Miller, Stephen Hemminger, Tom Herbert,
	Jiri Pirko, netdev, netem

Le jeudi 16 décembre 2010 à 12:56 +0800, Changli Gao a écrit :
> For the skbs returned from ifb, we should use the queue_mapping
> saved before ifb.
> 
> We save old queue_mapping in old_queue_mapping just before calling 
> dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
> just before reinjecting the ingress packets.
> 
> A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
> The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
> member.
> 
> netem_skb_cb is changed to contain qdisc_skb_cb.
> 
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-16  4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
  2010-12-16  5:47 ` Eric Dumazet
@ 2010-12-17 13:09 ` jamal
  2010-12-17 13:41   ` Changli Gao
  1 sibling, 1 reply; 10+ messages in thread
From: jamal @ 2010-12-17 13:09 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem

On Thu, 2010-12-16 at 12:56 +0800, Changli Gao wrote:
> For the skbs returned from ifb, we should use the queue_mapping
> saved before ifb.
> 
> We save old queue_mapping in old_queue_mapping just before calling 
> dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
> just before reinjecting the ingress packets.
> 
> A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
> The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
> member.
> 
> netem_skb_cb is changed to contain qdisc_skb_cb.

I am sorry Changli - I think we are talking past each other. I
a conflicted on the whole point of saving and restoring these
devqueue mappings. I understand that for ifb, saving and restoring the
original devs is fundamental for its operation- but i am not sure i see
it for the queues. As an example:

---
# For all packets arriving on ifb0, change mapping to 3 and
# redirect to to ifb1

tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
match u32 0 0 flowid 1:2 \
action skbedit queue_mapping 4 \
action mirred egress redirect dev ifb1
#
# redirect all packets arriving in eth0 to ifb0
$TC filter add eth0 parent 1:0 protocol ip prio 10 u32 \
match u32 0 0 flowid 1:2 action mirred egress redirect dev ifb0
----

what is the expected behavior?

cheers,
jamal


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-17 13:09 ` jamal
@ 2010-12-17 13:41   ` Changli Gao
  2010-12-21 13:07     ` jamal
  0 siblings, 1 reply; 10+ messages in thread
From: Changli Gao @ 2010-12-17 13:41 UTC (permalink / raw)
  To: hadi
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem

On Fri, Dec 17, 2010 at 9:09 PM, jamal <hadi@cyberus.ca> wrote:
> On Thu, 2010-12-16 at 12:56 +0800, Changli Gao wrote:
>
> I am sorry Changli - I think we are talking past each other. I
> a conflicted on the whole point of saving and restoring these
> devqueue mappings. I understand that for ifb, saving and restoring the
> original devs is fundamental for its operation- but i am not sure i see
> it for the queues. As an example:
>
> ---
> # For all packets arriving on ifb0, change mapping to 3 and
> # redirect to to ifb1
>
> tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
> match u32 0 0 flowid 1:2 \
> action skbedit queue_mapping 4 \
> action mirred egress redirect dev ifb1
> #
> # redirect all packets arriving in eth0 to ifb0
> $TC filter add eth0 parent 1:0 protocol ip prio 10 u32 \
> match u32 0 0 flowid 1:2 action mirred egress redirect dev ifb0
> ----
>
> what is the expected behavior?
>

I doubt it can work.

eth0 -> ifb0: skb->skb_iif = eth0.
ifb0 -> ifb1: skb->skb_iif = ifb0
ifb1 -> ifb0: as skb->skb_iif == ifb0, skb->skb_iif = ifb1
ifb0 -> ifb1...
...

Did you test it?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-17 13:41   ` Changli Gao
@ 2010-12-21 13:07     ` jamal
  2010-12-21 14:03       ` Changli Gao
  0 siblings, 1 reply; 10+ messages in thread
From: jamal @ 2010-12-21 13:07 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem

On Fri, 2010-12-17 at 21:41 +0800, Changli Gao wrote:

Sorry for the latency - I am a little swamped.

> I doubt it can work.

It should work - this used to be part of my regression tests.
If it doesnt work something is broken.

In any case, I shouldnt have used this example because
it distracted from the point i was trying to make:
You are restoring the old qmap when the point is we could change
it to a new one. A simpler example illustrating how
a qmap could be changed:

----
tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
 match u32 0 0 flowid 1:2 \
 action skbedit queue_mapping 4
---

cheers,
jamal

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-21 13:07     ` jamal
@ 2010-12-21 14:03       ` Changli Gao
  2010-12-21 15:24         ` Eric Dumazet
  2010-12-23 13:00         ` jamal
  0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-21 14:03 UTC (permalink / raw)
  To: hadi
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem

On Tue, Dec 21, 2010 at 9:07 PM, jamal <hadi@cyberus.ca> wrote:
> On Fri, 2010-12-17 at 21:41 +0800, Changli Gao wrote:
>
> Sorry for the latency - I am a little swamped.
>
>> I doubt it can work.
>
> It should work - this used to be part of my regression tests.
> If it doesnt work something is broken.

When I tested it, my OS got frozen.

>
> In any case, I shouldnt have used this example because
> it distracted from the point i was trying to make:
> You are restoring the old qmap when the point is we could change
> it to a new one. A simpler example illustrating how
> a qmap could be changed:
>
> ----
> tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
>  match u32 0 0 flowid 1:2 \
>  action skbedit queue_mapping 4
> ----

Currently, you can only change the rx queue mapping, because for tx,
dev_pick_tx() doesn't use skb->queue_mapping to choose tx queue.

However, I don't think change the rx queue mapping is a good idea.
When the skbs returned from ifb enter netif_receive_skb() again,
get_rps_cpu() may warn about the wrong rx queue, and my this patch is
used to solve this problem. Even though the rx queue is legal, a
different rps_cpus settings will be used, and the skbs may be
redirected to different CPUs. Is it expected?

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-21 14:03       ` Changli Gao
@ 2010-12-21 15:24         ` Eric Dumazet
  2010-12-22  0:08           ` Changli Gao
  2010-12-23 13:21           ` jamal
  2010-12-23 13:00         ` jamal
  1 sibling, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-12-21 15:24 UTC (permalink / raw)
  To: Changli Gao
  Cc: hadi, David S. Miller, Stephen Hemminger, Tom Herbert, Jiri Pirko,
	netdev, netem

Le mardi 21 décembre 2010 à 22:03 +0800, Changli Gao a écrit :
> However, I don't think change the rx queue mapping is a good idea.
> When the skbs returned from ifb enter netif_receive_skb() again,
> get_rps_cpu() may warn about the wrong rx queue, and my this patch is
> used to solve this problem. Even though the rx queue is legal, a
> different rps_cpus settings will be used, and the skbs may be
> redirected to different CPUs. Is it expected?
> 
> 

Do we really want a multi queue ifb at all ?

Why not use percpu data and LLTX, like we did for other virtual devices
(loopback, tunnels, vlans, ...)

I guess most ifb uses need to finaly deliver packets in a monoqueue
anyway, optimizing ifb might raise lock contention on this resource.

See what we did in commit 79640a4ca6955e3e (net: add additional lock to
qdisc to increase throughput) : Adding one spinlock actually helped a
lot ;)




^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-21 15:24         ` Eric Dumazet
@ 2010-12-22  0:08           ` Changli Gao
  2010-12-23 13:21           ` jamal
  1 sibling, 0 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-22  0:08 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: hadi, David S. Miller, Stephen Hemminger, Tom Herbert, Jiri Pirko,
	netdev, netem

On Tue, Dec 21, 2010 at 11:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Do we really want a multi queue ifb at all ?
>
> Why not use percpu data and LLTX, like we did for other virtual devices
> (loopback, tunnels, vlans, ...)
>
> I guess most ifb uses need to finaly deliver packets in a monoqueue
> anyway, optimizing ifb might raise lock contention on this resource.
>

For ingress shaping, the later processing should be in parallel, so
multiple ri_tasklets are needed.

For one queue ifb, the queue mapping should be saved and restored for
ingress skbs, as it is reset to 0 in dev_pick_tx(ifb).


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-21 15:24         ` Eric Dumazet
  2010-12-22  0:08           ` Changli Gao
@ 2010-12-23 13:21           ` jamal
  1 sibling, 0 replies; 10+ messages in thread
From: jamal @ 2010-12-23 13:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Changli Gao, David S. Miller, Stephen Hemminger, Tom Herbert,
	Jiri Pirko, netdev, netem

On Tue, 2010-12-21 at 16:24 +0100, Eric Dumazet wrote:

> 
> Do we really want a multi queue ifb at all ?
> 
> Why not use percpu data and LLTX, like we did for other virtual devices
> (loopback, tunnels, vlans, ...)
> 
> I guess most ifb uses need to finaly deliver packets in a monoqueue
> anyway, optimizing ifb might raise lock contention on this resource.

I guess once you start having hardware that is multiqueue on the ingress
side at least then something per cpu is needed on ifb. But i agree that
the optimizations may end up harming the simplicity that ifb intended.
It is already jumping a lot of hoops to work around things as is.

> See what we did in commit 79640a4ca6955e3e (net: add additional lock to
> qdisc to increase throughput) : Adding one spinlock actually helped a
> lot ;)

Yes, that was fascinating stuff;-> I am still scratching my head and
continuing to itch on when i can get more time to look closely with some
testing. But i dont see the connection with what Changli is attempting
with multiq ifb - what do you have in mind.

cheers,
jamal

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
  2010-12-21 14:03       ` Changli Gao
  2010-12-21 15:24         ` Eric Dumazet
@ 2010-12-23 13:00         ` jamal
  1 sibling, 0 replies; 10+ messages in thread
From: jamal @ 2010-12-23 13:00 UTC (permalink / raw)
  To: Changli Gao
  Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
	Jiri Pirko, netdev, netem

On Tue, 2010-12-21 at 22:03 +0800, Changli Gao wrote:

> When I tested it, my OS got frozen.

I will look into it the next opportunity i get. The example i showed
is on egress btw. A ping from outside that matches the filter
will be a good test.

> Currently, you can only change the rx queue mapping, because for tx,
> dev_pick_tx() doesn't use skb->queue_mapping to choose tx queue.

If skbedit is on egress, it will happen after (and override whatever
dev_pick_tx() chose), no? Thats the whole point for skbedits queuemap
editing.

> However, I don't think change the rx queue mapping is a good idea.

I agree for that as a default policy. But it is
policy that skbedit can and should be able to override.

> When the skbs returned from ifb enter netif_receive_skb() again,
> get_rps_cpu() may warn about the wrong rx queue, and my this patch is
> used to solve this problem. Even though the rx queue is legal, a
> different rps_cpus settings will be used, and the skbs may be
> redirected to different CPUs. Is it expected?

I am not sure without analyzing what performance impact would be, i.e i
think that the only reason i wouldnt do it is because it may have crazy
effect on performance but:
If i wanted to override the choice made by rps through some policy, why
shouldnt i be able to do it? Same thing if i wanted to bypass rps. tc
level seems appropriate.
I may be misreading the code: Quick glance at the code indicates users
have no choice on ingress: rps happens first then we can do tc level -
so it doesnt matter what changes we make to the queue map it will not
take effect in any case. Am i mistaken?

cheers,
jamal

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-12-23 13:21 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-16  4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
2010-12-16  5:47 ` Eric Dumazet
2010-12-17 13:09 ` jamal
2010-12-17 13:41   ` Changli Gao
2010-12-21 13:07     ` jamal
2010-12-21 14:03       ` Changli Gao
2010-12-21 15:24         ` Eric Dumazet
2010-12-22  0:08           ` Changli Gao
2010-12-23 13:21           ` jamal
2010-12-23 13:00         ` jamal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).