* [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
@ 2010-12-16 4:56 Changli Gao
2010-12-16 5:47 ` Eric Dumazet
2010-12-17 13:09 ` jamal
0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-16 4:56 UTC (permalink / raw)
To: Jamal Hadi Salim
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem, Changli Gao
For the skbs returned from ifb, we should use the queue_mapping
saved before ifb.
We save old queue_mapping in old_queue_mapping just before calling
dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
just before reinjecting the ingress packets.
A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
member.
netem_skb_cb is changed to contain qdisc_skb_cb.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
v4: don't restore the old_queue_mapping for egress packets.
v3: use the method from Eric to allocate memory from skb->cb. Thank him.
v2: save old_queue_mapping in skb->cb
drivers/net/ifb.c | 1 +
include/linux/netdevice.h | 10 ++++++++++
include/net/sch_generic.h | 3 ++-
net/core/dev.c | 15 ++++++++++-----
net/sched/act_mirred.c | 1 +
net/sched/sch_netem.c | 8 ++++----
6 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 918a38e..1632345 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -96,6 +96,7 @@ static void ri_tasklet(unsigned long arg)
dev_queue_xmit(skb);
} else if (from & AT_INGRESS) {
skb_pull(skb, skb->dev->hard_header_len);
+ skb->queue_mapping = dev_skb_cb(skb)->old_queue_mapping;
netif_rx(skb);
} else
BUG();
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d31bc3c..6f128e3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1295,6 +1295,16 @@ struct napi_gro_cb {
#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
+struct dev_skb_cb {
+ u16 old_queue_mapping;
+};
+
+static inline struct dev_skb_cb *dev_skb_cb(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct dev_skb_cb));
+ return (struct dev_skb_cb *)skb->cb;
+}
+
struct packet_type {
__be16 type; /* This is really htons(ether_type). */
struct net_device *dev; /* NULL is wildcarded here */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ea1f8a8..52c32e3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -198,8 +198,8 @@ struct tcf_proto {
};
struct qdisc_skb_cb {
+ struct dev_skb_cb dev_cb; /* must be the first */
unsigned int pkt_len;
- char data[];
};
static inline int qdisc_qlen(struct Qdisc *q)
@@ -209,6 +209,7 @@ static inline int qdisc_qlen(struct Qdisc *q)
static inline struct qdisc_skb_cb *qdisc_skb_cb(struct sk_buff *skb)
{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct qdisc_skb_cb));
return (struct qdisc_skb_cb *)skb->cb;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d28b3a0..bf5ced5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1901,10 +1901,15 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
}
struct dev_gso_cb {
- void (*destructor)(struct sk_buff *skb);
+ struct dev_skb_cb dev_cb; /* must be the first */
+ void (*destructor)(struct sk_buff *skb);
};
-#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
+static inline struct dev_gso_cb *dev_gso_cb(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct dev_gso_cb));
+ return (struct dev_gso_cb *)skb->cb;
+}
static void dev_gso_skb_destructor(struct sk_buff *skb)
{
@@ -1918,7 +1923,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
kfree_skb(nskb);
} while (skb->next);
- cb = DEV_GSO_CB(skb);
+ cb = dev_gso_cb(skb);
if (cb->destructor)
cb->destructor(skb);
}
@@ -1947,7 +1952,7 @@ static int dev_gso_segment(struct sk_buff *skb)
return PTR_ERR(segs);
skb->next = segs;
- DEV_GSO_CB(skb)->destructor = skb->destructor;
+ dev_gso_cb(skb)->destructor = skb->destructor;
skb->destructor = dev_gso_skb_destructor;
return 0;
@@ -2103,7 +2108,7 @@ gso:
out_kfree_gso_skb:
if (likely(skb->next == NULL))
- skb->destructor = DEV_GSO_CB(skb)->destructor;
+ skb->destructor = dev_gso_cb(skb)->destructor;
out_kfree_skb:
kfree_skb(skb);
out:
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be..419e82f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -197,6 +197,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
+ dev_skb_cb(skb2)->old_queue_mapping = skb->queue_mapping;
dev_queue_xmit(skb2);
err = 0;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e5593c0..c2cf72f 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -77,14 +77,14 @@ struct netem_sched_data {
/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
- psched_time_t time_to_send;
+ struct qdisc_skb_cb qdisc_cb; /* must be the first */
+ psched_time_t time_to_send;
};
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
- BUILD_BUG_ON(sizeof(skb->cb) <
- sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
- return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct netem_skb_cb));
+ return (struct netem_skb_cb *)skb->cb;
}
/* init_crandom - initialize correlated random number generator
^ permalink raw reply related [flat|nested] 10+ messages in thread* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-16 4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
@ 2010-12-16 5:47 ` Eric Dumazet
2010-12-17 13:09 ` jamal
1 sibling, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-12-16 5:47 UTC (permalink / raw)
To: Changli Gao
Cc: Jamal Hadi Salim, David S. Miller, Stephen Hemminger, Tom Herbert,
Jiri Pirko, netdev, netem
Le jeudi 16 décembre 2010 à 12:56 +0800, Changli Gao a écrit :
> For the skbs returned from ifb, we should use the queue_mapping
> saved before ifb.
>
> We save old queue_mapping in old_queue_mapping just before calling
> dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
> just before reinjecting the ingress packets.
>
> A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
> The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
> member.
>
> netem_skb_cb is changed to contain qdisc_skb_cb.
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-16 4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
2010-12-16 5:47 ` Eric Dumazet
@ 2010-12-17 13:09 ` jamal
2010-12-17 13:41 ` Changli Gao
1 sibling, 1 reply; 10+ messages in thread
From: jamal @ 2010-12-17 13:09 UTC (permalink / raw)
To: Changli Gao
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem
On Thu, 2010-12-16 at 12:56 +0800, Changli Gao wrote:
> For the skbs returned from ifb, we should use the queue_mapping
> saved before ifb.
>
> We save old queue_mapping in old_queue_mapping just before calling
> dev_queue_xmit, and restore the old_queue_mapping to queue_mapping
> just before reinjecting the ingress packets.
>
> A new struct dev_skb_cb is added, and valid in qdisc and gso layer.
> The original qdisc_skb_cb and DEV_GSO_CB use dev_skb_cb as the first
> member.
>
> netem_skb_cb is changed to contain qdisc_skb_cb.
I am sorry Changli - I think we are talking past each other. I
a conflicted on the whole point of saving and restoring these
devqueue mappings. I understand that for ifb, saving and restoring the
original devs is fundamental for its operation- but i am not sure i see
it for the queues. As an example:
---
# For all packets arriving on ifb0, change mapping to 3 and
# redirect to to ifb1
tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
match u32 0 0 flowid 1:2 \
action skbedit queue_mapping 4 \
action mirred egress redirect dev ifb1
#
# redirect all packets arriving in eth0 to ifb0
$TC filter add eth0 parent 1:0 protocol ip prio 10 u32 \
match u32 0 0 flowid 1:2 action mirred egress redirect dev ifb0
----
what is the expected behavior?
cheers,
jamal
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-17 13:09 ` jamal
@ 2010-12-17 13:41 ` Changli Gao
2010-12-21 13:07 ` jamal
0 siblings, 1 reply; 10+ messages in thread
From: Changli Gao @ 2010-12-17 13:41 UTC (permalink / raw)
To: hadi
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem
On Fri, Dec 17, 2010 at 9:09 PM, jamal <hadi@cyberus.ca> wrote:
> On Thu, 2010-12-16 at 12:56 +0800, Changli Gao wrote:
>
> I am sorry Changli - I think we are talking past each other. I
> a conflicted on the whole point of saving and restoring these
> devqueue mappings. I understand that for ifb, saving and restoring the
> original devs is fundamental for its operation- but i am not sure i see
> it for the queues. As an example:
>
> ---
> # For all packets arriving on ifb0, change mapping to 3 and
> # redirect to to ifb1
>
> tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
> match u32 0 0 flowid 1:2 \
> action skbedit queue_mapping 4 \
> action mirred egress redirect dev ifb1
> #
> # redirect all packets arriving in eth0 to ifb0
> $TC filter add eth0 parent 1:0 protocol ip prio 10 u32 \
> match u32 0 0 flowid 1:2 action mirred egress redirect dev ifb0
> ----
>
> what is the expected behavior?
>
I doubt it can work.
eth0 -> ifb0: skb->skb_iif = eth0.
ifb0 -> ifb1: skb->skb_iif = ifb0
ifb1 -> ifb0: as skb->skb_iif == ifb0, skb->skb_iif = ifb1
ifb0 -> ifb1...
...
Did you test it?
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-17 13:41 ` Changli Gao
@ 2010-12-21 13:07 ` jamal
2010-12-21 14:03 ` Changli Gao
0 siblings, 1 reply; 10+ messages in thread
From: jamal @ 2010-12-21 13:07 UTC (permalink / raw)
To: Changli Gao
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem
On Fri, 2010-12-17 at 21:41 +0800, Changli Gao wrote:
Sorry for the latency - I am a little swamped.
> I doubt it can work.
It should work - this used to be part of my regression tests.
If it doesnt work something is broken.
In any case, I shouldnt have used this example because
it distracted from the point i was trying to make:
You are restoring the old qmap when the point is we could change
it to a new one. A simpler example illustrating how
a qmap could be changed:
----
tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
match u32 0 0 flowid 1:2 \
action skbedit queue_mapping 4
---
cheers,
jamal
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-21 13:07 ` jamal
@ 2010-12-21 14:03 ` Changli Gao
2010-12-21 15:24 ` Eric Dumazet
2010-12-23 13:00 ` jamal
0 siblings, 2 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-21 14:03 UTC (permalink / raw)
To: hadi
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem
On Tue, Dec 21, 2010 at 9:07 PM, jamal <hadi@cyberus.ca> wrote:
> On Fri, 2010-12-17 at 21:41 +0800, Changli Gao wrote:
>
> Sorry for the latency - I am a little swamped.
>
>> I doubt it can work.
>
> It should work - this used to be part of my regression tests.
> If it doesnt work something is broken.
When I tested it, my OS got frozen.
>
> In any case, I shouldnt have used this example because
> it distracted from the point i was trying to make:
> You are restoring the old qmap when the point is we could change
> it to a new one. A simpler example illustrating how
> a qmap could be changed:
>
> ----
> tc filter add dev ifb0 parent 1:0 protocol ip prio 10 u32 \
> match u32 0 0 flowid 1:2 \
> action skbedit queue_mapping 4
> ----
Currently, you can only change the rx queue mapping, because for tx,
dev_pick_tx() doesn't use skb->queue_mapping to choose tx queue.
However, I don't think change the rx queue mapping is a good idea.
When the skbs returned from ifb enter netif_receive_skb() again,
get_rps_cpu() may warn about the wrong rx queue, and my this patch is
used to solve this problem. Even though the rx queue is legal, a
different rps_cpus settings will be used, and the skbs may be
redirected to different CPUs. Is it expected?
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-21 14:03 ` Changli Gao
@ 2010-12-21 15:24 ` Eric Dumazet
2010-12-22 0:08 ` Changli Gao
2010-12-23 13:21 ` jamal
2010-12-23 13:00 ` jamal
1 sibling, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2010-12-21 15:24 UTC (permalink / raw)
To: Changli Gao
Cc: hadi, David S. Miller, Stephen Hemminger, Tom Herbert, Jiri Pirko,
netdev, netem
Le mardi 21 décembre 2010 à 22:03 +0800, Changli Gao a écrit :
> However, I don't think change the rx queue mapping is a good idea.
> When the skbs returned from ifb enter netif_receive_skb() again,
> get_rps_cpu() may warn about the wrong rx queue, and my this patch is
> used to solve this problem. Even though the rx queue is legal, a
> different rps_cpus settings will be used, and the skbs may be
> redirected to different CPUs. Is it expected?
>
>
Do we really want a multi queue ifb at all ?
Why not use percpu data and LLTX, like we did for other virtual devices
(loopback, tunnels, vlans, ...)
I guess most ifb uses need to finaly deliver packets in a monoqueue
anyway, optimizing ifb might raise lock contention on this resource.
See what we did in commit 79640a4ca6955e3e (net: add additional lock to
qdisc to increase throughput) : Adding one spinlock actually helped a
lot ;)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-21 15:24 ` Eric Dumazet
@ 2010-12-22 0:08 ` Changli Gao
2010-12-23 13:21 ` jamal
1 sibling, 0 replies; 10+ messages in thread
From: Changli Gao @ 2010-12-22 0:08 UTC (permalink / raw)
To: Eric Dumazet
Cc: hadi, David S. Miller, Stephen Hemminger, Tom Herbert, Jiri Pirko,
netdev, netem
On Tue, Dec 21, 2010 at 11:24 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Do we really want a multi queue ifb at all ?
>
> Why not use percpu data and LLTX, like we did for other virtual devices
> (loopback, tunnels, vlans, ...)
>
> I guess most ifb uses need to finaly deliver packets in a monoqueue
> anyway, optimizing ifb might raise lock contention on this resource.
>
For ingress shaping, the later processing should be in parallel, so
multiple ri_tasklets are needed.
For one queue ifb, the queue mapping should be saved and restored for
ingress skbs, as it is reset to 0 in dev_pick_tx(ifb).
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-21 15:24 ` Eric Dumazet
2010-12-22 0:08 ` Changli Gao
@ 2010-12-23 13:21 ` jamal
1 sibling, 0 replies; 10+ messages in thread
From: jamal @ 2010-12-23 13:21 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David S. Miller, Stephen Hemminger, Tom Herbert,
Jiri Pirko, netdev, netem
On Tue, 2010-12-21 at 16:24 +0100, Eric Dumazet wrote:
>
> Do we really want a multi queue ifb at all ?
>
> Why not use percpu data and LLTX, like we did for other virtual devices
> (loopback, tunnels, vlans, ...)
>
> I guess most ifb uses need to finaly deliver packets in a monoqueue
> anyway, optimizing ifb might raise lock contention on this resource.
I guess once you start having hardware that is multiqueue on the ingress
side at least then something per cpu is needed on ifb. But i agree that
the optimizations may end up harming the simplicity that ifb intended.
It is already jumping a lot of hoops to work around things as is.
> See what we did in commit 79640a4ca6955e3e (net: add additional lock to
> qdisc to increase throughput) : Adding one spinlock actually helped a
> lot ;)
Yes, that was fascinating stuff;-> I am still scratching my head and
continuing to itch on when i can get more time to look closely with some
testing. But i dont see the connection with what Changli is attempting
with multiq ifb - what do you have in mind.
cheers,
jamal
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb
2010-12-21 14:03 ` Changli Gao
2010-12-21 15:24 ` Eric Dumazet
@ 2010-12-23 13:00 ` jamal
1 sibling, 0 replies; 10+ messages in thread
From: jamal @ 2010-12-23 13:00 UTC (permalink / raw)
To: Changli Gao
Cc: David S. Miller, Stephen Hemminger, Eric Dumazet, Tom Herbert,
Jiri Pirko, netdev, netem
On Tue, 2010-12-21 at 22:03 +0800, Changli Gao wrote:
> When I tested it, my OS got frozen.
I will look into it the next opportunity i get. The example i showed
is on egress btw. A ping from outside that matches the filter
will be a good test.
> Currently, you can only change the rx queue mapping, because for tx,
> dev_pick_tx() doesn't use skb->queue_mapping to choose tx queue.
If skbedit is on egress, it will happen after (and override whatever
dev_pick_tx() chose), no? Thats the whole point for skbedits queuemap
editing.
> However, I don't think change the rx queue mapping is a good idea.
I agree for that as a default policy. But it is
policy that skbedit can and should be able to override.
> When the skbs returned from ifb enter netif_receive_skb() again,
> get_rps_cpu() may warn about the wrong rx queue, and my this patch is
> used to solve this problem. Even though the rx queue is legal, a
> different rps_cpus settings will be used, and the skbs may be
> redirected to different CPUs. Is it expected?
I am not sure without analyzing what performance impact would be, i.e i
think that the only reason i wouldnt do it is because it may have crazy
effect on performance but:
If i wanted to override the choice made by rps through some policy, why
shouldnt i be able to do it? Same thing if i wanted to bypass rps. tc
level seems appropriate.
I may be misreading the code: Quick glance at the code indicates users
have no choice on ingress: rps happens first then we can do tc level -
so it doesnt matter what changes we make to the queue map it will not
take effect in any case. Am i mistaken?
cheers,
jamal
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2010-12-23 13:21 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-16 4:56 [PATCH 5/5 v4] net: add old_queue_mapping into skb->cb Changli Gao
2010-12-16 5:47 ` Eric Dumazet
2010-12-17 13:09 ` jamal
2010-12-17 13:41 ` Changli Gao
2010-12-21 13:07 ` jamal
2010-12-21 14:03 ` Changli Gao
2010-12-21 15:24 ` Eric Dumazet
2010-12-22 0:08 ` Changli Gao
2010-12-23 13:21 ` jamal
2010-12-23 13:00 ` jamal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).