* [PATCH] net: fix problem in dequeuing from input_pkt_queue
@ 2010-05-19 21:47 Tom Herbert
2010-05-19 23:45 ` Changli Gao
0 siblings, 1 reply; 7+ messages in thread
From: Tom Herbert @ 2010-05-19 21:47 UTC (permalink / raw)
To: davem; +Cc: eric.dumazet, xiaosuo, netdev
Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue it that the queue head must be incremented only
after a packet has been processed, that is only after
__netif_receive_skb has been called. This is needed for the mechanism
to prevent OOO packet in RFS. Also when flushing the input_pkt_queue
and process_queue, the process queue should be done first to prevent
OOO packets.
Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct. The correct value
would be head+input_pkt_queue->len+process_queue->len. To avoid
this calculation we added an explict input_queue_tail in softnet_data.
The tail value is simply incremented when queuing to input_pkt_queue.
In process_backlog the processing of the packet queue can be done
without irq's being disabled.
Made dropped in softnet_data to be "unsigned int" for consistency.
Signed-off-by: Tom Herbert <therbert@google.com>
---
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c3487a6..bc0bc85 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1403,17 +1403,25 @@ struct softnet_data {
struct softnet_data *rps_ipi_next;
unsigned int cpu;
unsigned int input_queue_head;
+ unsigned int input_queue_tail;
#endif
- unsigned dropped;
+ unsigned int dropped;
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
-static inline void input_queue_head_add(struct softnet_data *sd,
- unsigned int len)
+static inline void input_queue_head_incr(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- sd->input_queue_head += len;
+ sd->input_queue_head++;
+#endif
+}
+
+static inline void input_queue_tail_incr_save(struct softnet_data *sd,
+ unsigned int *qtail)
+{
+#ifdef CONFIG_RPS
+ *qtail = ++sd->input_queue_tail;
#endif
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..be7d475 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
- *qtail = sd->input_queue_head +
- skb_queue_len(&sd->input_pkt_queue);
-#endif
+ input_queue_tail_incr_save(sd, qtail);
rps_unlock(sd);
local_irq_restore(flags);
return NET_RX_SUCCESS;
@@ -2959,22 +2956,24 @@ static void flush_backlog(void *arg)
struct softnet_data *sd = &__get_cpu_var(softnet_data);
struct sk_buff *skb, *tmp;
- rps_lock(sd);
- skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+ skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
if (skb->dev == dev) {
- __skb_unlink(skb, &sd->input_pkt_queue);
+ __skb_unlink(skb, &sd->process_queue);
kfree_skb(skb);
- input_queue_head_add(sd, 1);
+ input_queue_head_incr(sd);
}
}
- rps_unlock(sd);
- skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ rps_lock(sd);
+ skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
if (skb->dev == dev) {
- __skb_unlink(skb, &sd->process_queue);
+ __skb_unlink(skb, &sd->input_pkt_queue);
kfree_skb(skb);
+ input_queue_head_incr(sd);
}
}
+ rps_unlock(sd);
+
}
static int napi_gro_complete(struct sk_buff *skb)
@@ -3320,26 +3319,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
}
#endif
napi->weight = weight_p;
- local_irq_disable();
while (work < quota) {
struct sk_buff *skb;
unsigned int qlen;
while ((skb = __skb_dequeue(&sd->process_queue))) {
- local_irq_enable();
__netif_receive_skb(skb);
+ input_queue_head_incr(sd);
if (++work >= quota)
return work;
- local_irq_disable();
}
+ local_irq_disable();
rps_lock(sd);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen) {
- input_queue_head_add(sd, qlen);
+ if (qlen)
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
- }
+
if (qlen < quota - work) {
/*
* Inline a custom version of __napi_complete().
@@ -3354,8 +3351,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
quota = work + qlen;
}
rps_unlock(sd);
+ local_irq_enable();
}
- local_irq_enable();
return work;
}
@@ -5679,12 +5676,14 @@ static int dev_cpu_callback(struct notifier_block *nfb,
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+ while ((skb = __skb_dequeue(&oldsd->process_queue))) {
netif_rx(skb);
- input_queue_head_add(oldsd, 1);
+ input_queue_head_incr(oldsd);
}
- while ((skb = __skb_dequeue(&oldsd->process_queue)))
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
+ input_queue_head_incr(oldsd);
+ }
return NOTIFY_OK;
}
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-19 21:47 [PATCH] net: fix problem in dequeuing from input_pkt_queue Tom Herbert
@ 2010-05-19 23:45 ` Changli Gao
2010-05-19 23:58 ` Tom Herbert
0 siblings, 1 reply; 7+ messages in thread
From: Changli Gao @ 2010-05-19 23:45 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, eric.dumazet, netdev
On Thu, May 20, 2010 at 5:47 AM, Tom Herbert <therbert@google.com> wrote:
> Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
> The primary issue it that the queue head must be incremented only
> after a packet has been processed, that is only after
> __netif_receive_skb has been called. This is needed for the mechanism
> to prevent OOO packet in RFS. Also when flushing the input_pkt_queue
> and process_queue, the process queue should be done first to prevent
> OOO packets.
>
> Because the input_pkt_queue has been effectively split into two queues,
> the calculation of the tail ptr is no longer correct. The correct value
> would be head+input_pkt_queue->len+process_queue->len. To avoid
> this calculation we added an explict input_queue_tail in softnet_data.
> The tail value is simply incremented when queuing to input_pkt_queue.
>
> In process_backlog the processing of the packet queue can be done
> without irq's being disabled.
>
> static int napi_gro_complete(struct sk_buff *skb)
> @@ -3320,26 +3319,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
> }
> #endif
> napi->weight = weight_p;
> - local_irq_disable();
> while (work < quota) {
> struct sk_buff *skb;
> unsigned int qlen;
>
> while ((skb = __skb_dequeue(&sd->process_queue))) {
> - local_irq_enable();
we need to keep local irq disabled. If not, flush_backlog may be
called, and it will access sd->process_queue.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-19 23:45 ` Changli Gao
@ 2010-05-19 23:58 ` Tom Herbert
2010-05-20 0:09 ` Changli Gao
0 siblings, 1 reply; 7+ messages in thread
From: Tom Herbert @ 2010-05-19 23:58 UTC (permalink / raw)
To: Changli Gao; +Cc: davem, eric.dumazet, netdev
>> napi->weight = weight_p;
>> - local_irq_disable();
>> while (work < quota) {
>> struct sk_buff *skb;
>> unsigned int qlen;
>>
>> while ((skb = __skb_dequeue(&sd->process_queue))) {
>> - local_irq_enable();
>
> we need to keep local irq disabled. If not, flush_backlog may be
> called, and it will access sd->process_queue.
>
It should be okay? process_backlog only runs in softirq so bottom
halves are already disabled, and I don't think flush_backlog runs out
of an interrupt.
>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)
>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-19 23:58 ` Tom Herbert
@ 2010-05-20 0:09 ` Changli Gao
2010-05-20 2:48 ` Tom Herbert
0 siblings, 1 reply; 7+ messages in thread
From: Changli Gao @ 2010-05-20 0:09 UTC (permalink / raw)
To: Tom Herbert; +Cc: davem, eric.dumazet, netdev
On Thu, May 20, 2010 at 7:58 AM, Tom Herbert <therbert@google.com> wrote:
>>> napi->weight = weight_p;
>>> - local_irq_disable();
>>> while (work < quota) {
>>> struct sk_buff *skb;
>>> unsigned int qlen;
>>>
>>> while ((skb = __skb_dequeue(&sd->process_queue))) {
>>> - local_irq_enable();
>>
>> we need to keep local irq disabled. If not, flush_backlog may be
>> called, and it will access sd->process_queue.
>>
>
> It should be okay? process_backlog only runs in softirq so bottom
> halves are already disabled, and I don't think flush_backlog runs out
> of an interrupt.
>
Oh no. It is an IRQ handler.
on_each_cpu(flush_backlog, dev, 1);
...
int on_each_cpu(void (*func) (void *info), void *info, int wait)
{
int ret = 0;
preempt_disable();
ret = smp_call_function(func, info, wait);
local_irq_disable();
func(info);
local_irq_enable();
preempt_enable();
return ret;
}
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-20 0:09 ` Changli Gao
@ 2010-05-20 2:48 ` Tom Herbert
2010-05-20 4:37 ` Eric Dumazet
0 siblings, 1 reply; 7+ messages in thread
From: Tom Herbert @ 2010-05-20 2:48 UTC (permalink / raw)
To: Changli Gao; +Cc: davem, eric.dumazet, netdev
>> It should be okay? process_backlog only runs in softirq so bottom
>> halves are already disabled, and I don't think flush_backlog runs out
>> of an interrupt.
>>
>
> Oh no. It is an IRQ handler.
>
Very well, I will fix that.
Now I'm wondering, though, what the purpose of flush_backlog is...
since __netif_receive_skb is called with interrupts enabled it's
obvious flush_backlog won't catch all the skb's that reference the
device go away. Is there a reason these packets need to be flushed
and can't just be processed?
> on_each_cpu(flush_backlog, dev, 1);
> ...
> int on_each_cpu(void (*func) (void *info), void *info, int wait)
> {
> int ret = 0;
>
> preempt_disable();
> ret = smp_call_function(func, info, wait);
> local_irq_disable();
> func(info);
> local_irq_enable();
> preempt_enable();
> return ret;
> }
>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)
>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-20 2:48 ` Tom Herbert
@ 2010-05-20 4:37 ` Eric Dumazet
2010-05-20 6:05 ` Tom Herbert
0 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2010-05-20 4:37 UTC (permalink / raw)
To: Tom Herbert; +Cc: Changli Gao, davem, netdev
Le mercredi 19 mai 2010 à 19:48 -0700, Tom Herbert a écrit :
> >> It should be okay? process_backlog only runs in softirq so bottom
> >> halves are already disabled, and I don't think flush_backlog runs out
> >> of an interrupt.
> >>
> >
> > Oh no. It is an IRQ handler.
> >
> Very well, I will fix that.
>
> Now I'm wondering, though, what the purpose of flush_backlog is...
> since __netif_receive_skb is called with interrupts enabled it's
> obvious flush_backlog won't catch all the skb's that reference the
> device go away. Is there a reason these packets need to be flushed
> and can't just be processed?
flush_backlog is called when device is dismantled.
No new packets should be generated by the device at this moment.
Could you please split your patch in units, I spent 20 minutes to review
it and come to same conclusion than Changli (need to disable interrupts
as they are currently disabled) and also :
input_queue_head_incr(sd); are _not_ needed in flush_backlog()
We are in the very last moments of the life of the device, in a very
unlikely situation (packets in flight, not already consumed by the cpu),
we are _dropping_ packets, so OOO means nothing at this point.
In dev_cpu_callback(), you reverse the order of input_pkt_queue /
process_queue.
Thats fine, but should be a single patch, because I am not sure the
input_queue_head_incr() are valid here, since we re-inject these packets
to netif_rx(). Could you clarify this point ?
Thanks !
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] net: fix problem in dequeuing from input_pkt_queue
2010-05-20 4:37 ` Eric Dumazet
@ 2010-05-20 6:05 ` Tom Herbert
0 siblings, 0 replies; 7+ messages in thread
From: Tom Herbert @ 2010-05-20 6:05 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Changli Gao, davem, netdev
On Wed, May 19, 2010 at 9:37 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mercredi 19 mai 2010 à 19:48 -0700, Tom Herbert a écrit :
>> >> It should be okay? process_backlog only runs in softirq so bottom
>> >> halves are already disabled, and I don't think flush_backlog runs out
>> >> of an interrupt.
>> >>
>> >
>> > Oh no. It is an IRQ handler.
>> >
>> Very well, I will fix that.
>>
>> Now I'm wondering, though, what the purpose of flush_backlog is...
>> since __netif_receive_skb is called with interrupts enabled it's
>> obvious flush_backlog won't catch all the skb's that reference the
>> device go away. Is there a reason these packets need to be flushed
>> and can't just be processed?
>
> flush_backlog is called when device is dismantled.
>
> No new packets should be generated by the device at this moment.
>
But again since __netif_receive_skb is called with interrupts disabled
there is still a hole that the device could be completely dismantled
but at least one packet from the device still will be processed. So
it seems like that's a bug, or maybe it's okay to process packets
after flush_backlog-- if the latter case were true why throw out
perfectly good packets? The only rationale I can think of for
flush_backlog is to eliminate skb's with references to device that has
gone away, but the mechanism does not seem sufficient to cover all
possible skb's with a reference.
> Could you please split your patch in units, I spent 20 minutes to review
> it and come to same conclusion than Changli (need to disable interrupts
> as they are currently disabled) and also :
>
> input_queue_head_incr(sd); are _not_ needed in flush_backlog()
>
I don't see why they wouldn't be needed. queue tail is incremented
when queuing to the input_pkt_queue, queue head is incremented when
dequeuing (after skb freed or processed). queue_tail-queue_head ==
input_pkt_queue.len+process_queue.len. These should be invariants.
> We are in the very last moments of the life of the device, in a very
> unlikely situation (packets in flight, not already consumed by the cpu),
> we are _dropping_ packets, so OOO means nothing at this point.
>
True. But still seems nice to handle process_queue to be consistent.
> In dev_cpu_callback(), you reverse the order of input_pkt_queue /
> process_queue.
>
> Thats fine, but should be a single patch, because I am not sure the
> input_queue_head_incr() are valid here, since we re-inject these packets
> to netif_rx(). Could you clarify this point ?
>
queue_head advances on every dequeue. See above...
Thanks for the great comments!
Tom
> Thanks !
>
>
>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2010-05-20 6:05 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-19 21:47 [PATCH] net: fix problem in dequeuing from input_pkt_queue Tom Herbert
2010-05-19 23:45 ` Changli Gao
2010-05-19 23:58 ` Tom Herbert
2010-05-20 0:09 ` Changli Gao
2010-05-20 2:48 ` Tom Herbert
2010-05-20 4:37 ` Eric Dumazet
2010-05-20 6:05 ` Tom Herbert
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).