* [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
@ 2010-09-10 13:22 Changli Gao
2010-09-10 14:26 ` Eric Dumazet
0 siblings, 1 reply; 6+ messages in thread
From: Changli Gao @ 2010-09-10 13:22 UTC (permalink / raw)
To: David S. Miller
Cc: Eric Dumazet, Oliver Hartkopp, Michael S. Tsirkin, netdev,
Changli Gao
Since skb->destructor() is used to account socket memory, and may be called
before the skb is sent out, a corrupted skb may end up being sent out.
A new destructor is added to struct skb_shared_info, and it won't
be called until the last reference to the data of a skb is put. af_packet
uses this destructor instead.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
include/linux/skbuff.h | 1 +
net/core/skbuff.c | 19 ++++++++++++++-----
net/packet/af_packet.c | 38 +++++++++++++++++++++++++-------------
3 files changed, 40 insertions(+), 18 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e8085a..f874c13 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -191,6 +191,7 @@ struct skb_shared_info {
__u8 tx_flags;
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
+ void (*destructor)(struct sk_buff *skb);
/*
* Warning : all fields before dataref are cleared in __alloc_skb()
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2d1bc76..ff37e54 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -332,10 +332,14 @@ static void skb_release_data(struct sk_buff *skb)
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
&skb_shinfo(skb)->dataref)) {
- if (skb_shinfo(skb)->nr_frags) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (shinfo->destructor)
+ shinfo->destructor(skb);
+ if (shinfo->nr_frags) {
int i;
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ for (i = 0; i < shinfo->nr_frags; i++)
+ put_page(shinfo->frags[i].page);
}
if (skb_has_frag_list(skb))
@@ -497,9 +501,12 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return false;
+ shinfo = skb_shinfo(skb);
+ if (shinfo->destructor)
+ return false;
+
skb_release_head_state(skb);
- shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
@@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb),
- offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+ offsetof(struct skb_shared_info,
+ frags[skb_shinfo(skb)->nr_frags]));
+ skb_shinfo(skb)->destructor = NULL;
/* Check if we can avoid taking references on fragments if we own
* the last reference on skb->head. (see skb_release_data())
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..7e16b55 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -823,22 +823,27 @@ ring_is_full:
goto drop_n_restore;
}
+struct tpacket_destructor_arg {
+ struct sock *sk;
+ void *ph;
+};
+
static void tpacket_destruct_skb(struct sk_buff *skb)
{
- struct packet_sock *po = pkt_sk(skb->sk);
- void *ph;
-
- BUG_ON(skb == NULL);
+ struct tpacket_destructor_arg *arg = skb_shinfo(skb)->destructor_arg;
+ struct packet_sock *po = pkt_sk(arg->sk);
+ void *ph = arg->ph;
if (likely(po->tx_ring.pg_vec)) {
- ph = skb_shinfo(skb)->destructor_arg;
BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
atomic_dec(&po->tx_ring.pending);
__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
}
+ skb->sk = arg->sk;
sock_wfree(skb);
+ kfree(arg);
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -862,7 +867,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
- skb_shinfo(skb)->destructor_arg = ph.raw;
switch (po->tp_version) {
case TPACKET_V2:
@@ -884,9 +888,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
to_write = tp_len;
if (sock->type == SOCK_DGRAM) {
- err = dev_hard_header(skb, dev, ntohs(proto), addr,
- NULL, tp_len);
- if (unlikely(err < 0))
+ if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
+ NULL, tp_len) < 0))
return -EINVAL;
} else if (dev->hard_header_len) {
/* net device doesn't like empty head */
@@ -897,8 +900,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
}
skb_push(skb, dev->hard_header_len);
- err = skb_store_bits(skb, 0, data,
- dev->hard_header_len);
+ err = skb_store_bits(skb, 0, data, dev->hard_header_len);
if (unlikely(err))
return err;
@@ -906,7 +908,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
to_write -= dev->hard_header_len;
}
- err = -EFAULT;
page = virt_to_page(data);
offset = offset_in_page(data);
len_max = PAGE_SIZE - offset;
@@ -994,6 +995,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
size_max = dev->mtu + reserve;
do {
+ struct tpacket_destructor_arg *arg;
+
ph = packet_current_frame(po, &po->tx_ring,
TP_STATUS_SEND_REQUEST);
@@ -1028,7 +1031,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
}
}
- skb->destructor = tpacket_destruct_skb;
+ arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+ if (unlikely(arg == NULL)) {
+ err = -ENOBUFS;
+ goto out_status;
+ }
+ arg->sk = &po->sk;
+ arg->ph = ph;
+ skb_shinfo(skb)->destructor_arg = arg;
+ skb->destructor = NULL;
+ skb_shinfo(skb)->destructor = tpacket_destruct_skb;
__packet_set_status(po, ph, TP_STATUS_SENDING);
atomic_inc(&po->tx_ring.pending);
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
2010-09-10 13:22 [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out Changli Gao
@ 2010-09-10 14:26 ` Eric Dumazet
2010-09-10 16:47 ` Changli Gao
0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2010-09-10 14:26 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Oliver Hartkopp, Michael S. Tsirkin, netdev
Le vendredi 10 septembre 2010 à 21:22 +0800, Changli Gao a écrit :
> Since skb->destructor() is used to account socket memory, and maybe called
> before the skb is sent out, a corrupt skb maybe sent out finally.
>
> A new destructor is added into structure skb_shared_info(), and it won't
> be called until the last reference to the data of a skb is put. af_packet
> uses this destructor instead.
>
Hi Changli
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
> include/linux/skbuff.h | 1 +
> net/core/skbuff.c | 19 ++++++++++++++-----
> net/packet/af_packet.c | 38 +++++++++++++++++++++++++-------------
> 3 files changed, 40 insertions(+), 18 deletions(-)
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 9e8085a..f874c13 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -191,6 +191,7 @@ struct skb_shared_info {
> __u8 tx_flags;
> struct sk_buff *frag_list;
> struct skb_shared_hwtstamps hwtstamps;
> + void (*destructor)(struct sk_buff *skb);
>
> /*
> * Warning : all fields before dataref are cleared in __alloc_skb()
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2d1bc76..ff37e54 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -332,10 +332,14 @@ static void skb_release_data(struct sk_buff *skb)
> if (!skb->cloned ||
> !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
> &skb_shinfo(skb)->dataref)) {
> - if (skb_shinfo(skb)->nr_frags) {
> + struct skb_shared_info *shinfo = skb_shinfo(skb);
> +
> + if (shinfo->destructor)
> + shinfo->destructor(skb);
> + if (shinfo->nr_frags) {
> int i;
> - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> - put_page(skb_shinfo(skb)->frags[i].page);
> + for (i = 0; i < shinfo->nr_frags; i++)
> + put_page(shinfo->frags[i].page);
> }
>
> if (skb_has_frag_list(skb))
> @@ -497,9 +501,12 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
> if (skb_shared(skb) || skb_cloned(skb))
> return false;
>
> + shinfo = skb_shinfo(skb);
> + if (shinfo->destructor)
> + return false;
> +
> skb_release_head_state(skb);
>
> - shinfo = skb_shinfo(skb);
> memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> atomic_set(&shinfo->dataref, 1);
>
> @@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>
> memcpy((struct skb_shared_info *)(data + size),
> skb_shinfo(skb),
> - offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
> + offsetof(struct skb_shared_info,
> + frags[skb_shinfo(skb)->nr_frags]));
> + skb_shinfo(skb)->destructor = NULL;
>
> /* Check if we can avoid taking references on fragments if we own
> * the last reference on skb->head. (see skb_release_data())
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 3616f27..7e16b55 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -823,22 +823,27 @@ ring_is_full:
> goto drop_n_restore;
> }
>
> +struct tpacket_destructor_arg {
> + struct sock *sk;
> + void *ph;
> +};
> +
> static void tpacket_destruct_skb(struct sk_buff *skb)
> {
> - struct packet_sock *po = pkt_sk(skb->sk);
> - void *ph;
> -
> - BUG_ON(skb == NULL);
> + struct tpacket_destructor_arg *arg = skb_shinfo(skb)->destructor_arg;
> + struct packet_sock *po = pkt_sk(arg->sk);
> + void *ph = arg->ph;
>
> if (likely(po->tx_ring.pg_vec)) {
> - ph = skb_shinfo(skb)->destructor_arg;
> BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
> BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
> atomic_dec(&po->tx_ring.pending);
> __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
> }
>
> + skb->sk = arg->sk;
> sock_wfree(skb);
Are you sure sock_wfree(skb) is still needed ?
> + kfree(arg);
This new kmalloc()/kfree() for each sent packet won't please the guys
using the af_packet/mmap interface...
> }
>
> static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> @@ -862,7 +867,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> skb->dev = dev;
> skb->priority = po->sk.sk_priority;
> skb->mark = po->sk.sk_mark;
> - skb_shinfo(skb)->destructor_arg = ph.raw;
>
> switch (po->tp_version) {
> case TPACKET_V2:
> @@ -884,9 +888,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> to_write = tp_len;
>
> if (sock->type == SOCK_DGRAM) {
> - err = dev_hard_header(skb, dev, ntohs(proto), addr,
> - NULL, tp_len);
> - if (unlikely(err < 0))
> + if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
> + NULL, tp_len) < 0))
> return -EINVAL;
> } else if (dev->hard_header_len) {
> /* net device doesn't like empty head */
> @@ -897,8 +900,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> }
>
> skb_push(skb, dev->hard_header_len);
> - err = skb_store_bits(skb, 0, data,
> - dev->hard_header_len);
> + err = skb_store_bits(skb, 0, data, dev->hard_header_len);
> if (unlikely(err))
> return err;
>
> @@ -906,7 +908,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
> to_write -= dev->hard_header_len;
> }
>
> - err = -EFAULT;
> page = virt_to_page(data);
> offset = offset_in_page(data);
> len_max = PAGE_SIZE - offset;
> @@ -994,6 +995,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> size_max = dev->mtu + reserve;
>
> do {
> + struct tpacket_destructor_arg *arg;
> +
> ph = packet_current_frame(po, &po->tx_ring,
> TP_STATUS_SEND_REQUEST);
>
> @@ -1028,7 +1031,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> }
> }
>
> - skb->destructor = tpacket_destruct_skb;
> + arg = kmalloc(sizeof(*arg), GFP_KERNEL);
> + if (unlikely(arg == NULL)) {
> + err = -ENOBUFS;
> + goto out_status;
> + }
> + arg->sk = &po->sk;
> + arg->ph = ph;
> + skb_shinfo(skb)->destructor_arg = arg;
> + skb->destructor = NULL;
why setting skb->destructor to NULL here ?
> + skb_shinfo(skb)->destructor = tpacket_destruct_skb;
> __packet_set_status(po, ph, TP_STATUS_SENDING);
> atomic_inc(&po->tx_ring.pending);
>
I don't yet understand how this can prevent the af_unix module from being
unloaded while packets are in flight.
I believe sock_wfree() should be avoided (since early orphaning occurs),
to reduce the number of atomic ops to the minimum.
af_packet/mmap users want fast operations; we should not use
sock_wfree() for them, because the max number of in-flight packets is known
(tx ring buffer).
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
2010-09-10 14:26 ` Eric Dumazet
@ 2010-09-10 16:47 ` Changli Gao
2010-09-10 16:58 ` Eric Dumazet
0 siblings, 1 reply; 6+ messages in thread
From: Changli Gao @ 2010-09-10 16:47 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, Oliver Hartkopp, Michael S. Tsirkin, netdev
On Fri, Sep 10, 2010 at 10:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le vendredi 10 septembre 2010 à 21:22 +0800, Changli Gao a écrit :
>> Since skb->destructor() is used to account socket memory, and maybe called
>> before the skb is sent out, a corrupt skb maybe sent out finally.
>>
>> A new destructor is added into structure skb_shared_info(), and it won't
>> be called until the last reference to the data of a skb is put. af_packet
>> uses this destructor instead.
>>
>
> Hi Changli
>
>> static void tpacket_destruct_skb(struct sk_buff *skb)
>> {
>> - struct packet_sock *po = pkt_sk(skb->sk);
>> - void *ph;
>> -
>> - BUG_ON(skb == NULL);
>> + struct tpacket_destructor_arg *arg = skb_shinfo(skb)->destructor_arg;
>> + struct packet_sock *po = pkt_sk(arg->sk);
>> + void *ph = arg->ph;
>>
>> if (likely(po->tx_ring.pg_vec)) {
>> - ph = skb_shinfo(skb)->destructor_arg;
>> BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
>> BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
>> atomic_dec(&po->tx_ring.pending);
>> __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
>> }
>>
>> + skb->sk = arg->sk;
>> sock_wfree(skb);
>
> Are you sure sock_wfree(skb) is still needed ?
sock_wfree(skb) is also used to wake up the users who sleep in
poll(2). If sock_wfree(skb) is moved into skb->destructor(), and
called before the skb is sent out, pollers will be woken up without
POLLOUT, and since the later skb_shinfo(skb)->destructor() doesn't
wake up the pollers, POLLOUT events will be lost, and the poller will
be blocked forever.
>
>
>> + kfree(arg);
>
> this new kmalloc()/kfree() for each sent packet wont please the guys
> using af_packet/mmap interface...
Embed these two pointers into skb_shared_info? It may slow the others.
>
>> }
>>
>> static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> @@ -862,7 +867,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> skb->dev = dev;
>> skb->priority = po->sk.sk_priority;
>> skb->mark = po->sk.sk_mark;
>> - skb_shinfo(skb)->destructor_arg = ph.raw;
>>
>> switch (po->tp_version) {
>> case TPACKET_V2:
>> @@ -884,9 +888,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> to_write = tp_len;
>>
>> if (sock->type == SOCK_DGRAM) {
>> - err = dev_hard_header(skb, dev, ntohs(proto), addr,
>> - NULL, tp_len);
>> - if (unlikely(err < 0))
>> + if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
>> + NULL, tp_len) < 0))
>> return -EINVAL;
>> } else if (dev->hard_header_len) {
>> /* net device doesn't like empty head */
>> @@ -897,8 +900,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> }
>>
>> skb_push(skb, dev->hard_header_len);
>> - err = skb_store_bits(skb, 0, data,
>> - dev->hard_header_len);
>> + err = skb_store_bits(skb, 0, data, dev->hard_header_len);
>> if (unlikely(err))
>> return err;
>>
>> @@ -906,7 +908,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
>> to_write -= dev->hard_header_len;
>> }
>>
>> - err = -EFAULT;
>> page = virt_to_page(data);
>> offset = offset_in_page(data);
>> len_max = PAGE_SIZE - offset;
>> @@ -994,6 +995,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> size_max = dev->mtu + reserve;
>>
>> do {
>> + struct tpacket_destructor_arg *arg;
>> +
>> ph = packet_current_frame(po, &po->tx_ring,
>> TP_STATUS_SEND_REQUEST);
>>
>> @@ -1028,7 +1031,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>> }
>> }
>>
>> - skb->destructor = tpacket_destruct_skb;
>> + arg = kmalloc(sizeof(*arg), GFP_KERNEL);
>> + if (unlikely(arg == NULL)) {
>> + err = -ENOBUFS;
>> + goto out_status;
>> + }
>> + arg->sk = &po->sk;
>> + arg->ph = ph;
>> + skb_shinfo(skb)->destructor_arg = arg;
>> + skb->destructor = NULL;
>
> why setting skb->destructor to NULL here ?
Let skb_shinfo(skb)->destructor() do all the things which used to be
done by skb->destructor().
>
>> + skb_shinfo(skb)->destructor = tpacket_destruct_skb;
>> __packet_set_status(po, ph, TP_STATUS_SENDING);
>> atomic_inc(&po->tx_ring.pending);
>>
>
> I dont yet understand how this can prevent af_unix module being unloaded
> while packets are in flight
This issue isn't addressed, and I think it should be fixed in a separate patch.
>
> I believe sock_wfree() should be avoided (since early orphaning occurs),
> to reduce number of atomic ops to the minimum.
>
> af_packet/mmap users want fast operations, we should not use
> sock_wfree() for them, because max number of in flight packets is known
> (tx ring buffer)
>
>
But the users rely on the kernel to inform them when a frame is
available for use.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
2010-09-10 16:47 ` Changli Gao
@ 2010-09-10 16:58 ` Eric Dumazet
2010-09-10 17:12 ` Eric Dumazet
0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2010-09-10 16:58 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Oliver Hartkopp, Michael S. Tsirkin, netdev
Le samedi 11 septembre 2010 à 00:47 +0800, Changli Gao a écrit :
> On Fri, Sep 10, 2010 at 10:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Are you sure sock_wfree(skb) is still needed ?
>
> sock_wfree(skb) is also used to wake up the users who sleep on
> poll(2). If sock_wfree(skb) is moved into skb->destructor(), and
> called before skb is sent out, pollers will be waked up without
> POLLOUT, and since the later skb_shinfo(skb)->destructor() doesn't
> wake up the pollers, POLLOUT events will be lost, and the poller will
> be blocked forever.
>
Then implement poll() to use the number of available slots
(rather than using the default poll() that relies on generic sk / inet
queues and counters).
Really, sock_wfree() cannot be used at all, or we also must disable
early orphaning of these skbs.
The goal is to replace the use of skb->destructor in af_packet with
shinfo->destructor, not to mix the two.
> >
> >
> >> + kfree(arg);
> >
> > this new kmalloc()/kfree() for each sent packet wont please the guys
> > using af_packet/mmap interface...
>
> Embed these two pointers into skb_shared_info? It may slow the others.
we have some room because of SKB_PAD alignment,
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
2010-09-10 16:58 ` Eric Dumazet
@ 2010-09-10 17:12 ` Eric Dumazet
2010-09-11 0:19 ` Changli Gao
0 siblings, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2010-09-10 17:12 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Oliver Hartkopp, Michael S. Tsirkin, netdev
Le vendredi 10 septembre 2010 à 18:58 +0200, Eric Dumazet a écrit :
> Le samedi 11 septembre 2010 à 00:47 +0800, Changli Gao a écrit :
> > On Fri, Sep 10, 2010 at 10:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
> > > Are you sure sock_wfree(skb) is still needed ?
> >
> > sock_wfree(skb) is also used to wake up the users who sleep on
> > poll(2). If sock_wfree(skb) is moved into skb->destructor(), and
> > called before skb is sent out, pollers will be waked up without
> > POLLOUT, and since the later skb_shinfo(skb)->destructor() doesn't
> > wake up the pollers, POLLOUT events will be lost, and the poller will
> > be blocked forever.
> >
>
> Then implement poll() to use the number of available slots.
> (not use the default poll() that relies on generic sk / inet queues and
> counters)
>
> Really, sock_wfree() cannot be used at all, or we also must disable
> early orphaning of these skbs.
>
> Goal is to replace skb->destructor use in af_packet by
> shinfo->destructor, not mix the two.
Thinking again about this, we also might avoid taking references on
pages and releasing references too.
shinfo->destructor should replace the skb_release_data() logic,
not complement it.
if (shinfo->destructor) {
shinfo->destructor(skb);
} else {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
put_page(skb_shinfo(skb)->frags[i].page);
if (skb_has_frag_list(skb))
....
kfree(skb->head);
}
As long as the mmap zone is correctly protected in af_packet code, of
course (not releasing it as long as some packets are still in flight)
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
2010-09-10 17:12 ` Eric Dumazet
@ 2010-09-11 0:19 ` Changli Gao
0 siblings, 0 replies; 6+ messages in thread
From: Changli Gao @ 2010-09-11 0:19 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, Oliver Hartkopp, Michael S. Tsirkin, netdev
On Sat, Sep 11, 2010 at 1:12 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le vendredi 10 septembre 2010 à 18:58 +0200, Eric Dumazet a écrit :
>> Le samedi 11 septembre 2010 à 00:47 +0800, Changli Gao a écrit :
>> > On Fri, Sep 10, 2010 at 10:26 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>> > > Are you sure sock_wfree(skb) is still needed ?
>> >
>> > sock_wfree(skb) is also used to wake up the users who sleep on
>> > poll(2). If sock_wfree(skb) is moved into skb->destructor(), and
>> > called before skb is sent out, pollers will be waked up without
>> > POLLOUT, and since the later skb_shinfo(skb)->destructor() doesn't
>> > wake up the pollers, POLLOUT events will be lost, and the poller will
>> > be blocked forever.
>> >
>>
>> Then implement poll() to use the number of available slots.
>> (not use the default poll() that relies on generic sk / inet queues and
>> counters)
>>
>> Really, sock_wfree() cannot be used at all, or we also must disable
>> early orphaning of these skbs.
>>
>> Goal is to replace skb->destructor use in af_packet by
>> shinfo->destructor, not mix the two.
>
> Thinking again about this, we also might avoid taking references on
> pages and releasing references too.
>
> shinfo->destructor should replace the skb_release_data() logic,
> not complement it.
>
> if (shinfo->destructor) {
> shinfo->destructor(skb);
> } else {
> for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> put_page(skb_shinfo(skb)->frags[i].page);
> if (skb_has_frag_list(skb))
> ....
> kfree(skb->head);
> }
>
>
> As long as the mmap zone is correctly protected in af_packet code, of
> course (not releasing it as long as some packets are still in flight)
>
It touches too much of the internal implementation.
I think most of your ideas are about optimizations, and should be
addressed in separate patches. I'll avoid kmalloc/kfree in the next
version. Thanks.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2010-09-11 0:20 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-09-10 13:22 [PATCH] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out Changli Gao
2010-09-10 14:26 ` Eric Dumazet
2010-09-10 16:47 ` Changli Gao
2010-09-10 16:58 ` Eric Dumazet
2010-09-10 17:12 ` Eric Dumazet
2010-09-11 0:19 ` Changli Gao
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).