* [PATCH] af_packet: add interframe drop cmsg
@ 2009-09-23 20:32 Neil Horman
2009-09-23 20:55 ` Eric Dumazet
0 siblings, 1 reply; 3+ messages in thread
From: Neil Horman @ 2009-09-23 20:32 UTC (permalink / raw)
To: netdev; +Cc: davem, nhorman
Add ancillary data to better represent loss information
I've had a few requests recently to provide more detail regarding frame loss
during an AF_PACKET packet capture session. Specifically the requestors want to
see where in a packet sequence frames were lost, i.e. they want to see that 40
frames were lost between frames 302 and 303 in a packet capture file. In order
to do this we need:
1) The kernel to export this data to user space
2) The applications to make use of it
This patch addresses item (1). It does this by doing the following:
A) attaching ancillary data to any skb enqueued to a socket receive queue for
which frames were lost between it and the previously enqueued frame. Note I use
a ring buffer with a correlator value (the skb pointer) to do this. This was
done because the skb->cb array is exhausted already for AF_PACKET
B) For any frame dequeued that has ancillary data in the ring buffer (as
determined by the correlator value), we add a cmsg structure to the msghdr that
gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
frames lost between the reception of the frame currently being received and the
frame most recently preceding it. Note this creates a situation in which if we
have packet loss followed immediately by a socket close operation we might miss
some gap information. This situation is covered by the use of the
PACKET_AUXINFO socket option, which provides total loss stats (from which the
final gap can be computed).
I've tested this patch myself, and it works well.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/if_packet.h | 2 +
net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index dea7d6b..e5d200f 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,11 +48,13 @@ struct sockaddr_ll
#define PACKET_RESERVE 12
#define PACKET_TX_RING 13
#define PACKET_LOSS 14
+#define PACKET_GAPDATA 15
struct tpacket_stats
{
unsigned int tp_packets;
unsigned int tp_drops;
+ unsigned int tp_gap;
};
struct tpacket_auxdata
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d3d52c6..b74a91c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
+struct packet_gap_record {
+ struct sk_buff *skb;
+ __u32 gap;
+};
+
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
@@ -197,6 +202,11 @@ struct packet_sock {
int ifindex; /* bound device */
__be16 num;
struct packet_mclist *mclist;
+ struct packet_gap_record *gaps;
+ unsigned int gap_head;
+ unsigned int gap_tail;
+ unsigned int gap_list_size;
+
#ifdef CONFIG_PACKET_MMAP
atomic_t mapped;
enum tpacket_versions tp_version;
@@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
}
/*
+ * If we've lost frames since the last time we queued one to the
+ * sk_receive_queue, we need to record it here.
+ * This must be called under the protection of the socket lock
+ * to prevent racing with other softirqs and user space
+ */
+static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ /*
+ * do nothing if there is no gap
+ */
+ if (!po->stats.tp_gap)
+ return;
+
+ /*
+ * If there is, check the gap list tail to make sure we
+ * have an open entry
+ */
+ if (po->gaps[po->gap_tail].skb != NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "packet socket gap list is full!\n");
+ return;
+ }
+
+ /*
+ * We have a free entry, record it
+ */
+ po->gaps[po->gap_tail].skb = skb;
+ po->gaps[po->gap_tail].gap = po->stats.tp_gap;
+ po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
+ po->stats.tp_gap = 0;
+ return;
+
+}
+
+static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ __u32 gap = 0;
+
+ if (po->gaps[po->gap_head].skb != skb)
+ return 0;
+
+ gap = po->gaps[po->gap_head].gap;
+ po->gaps[po->gap_head].skb = NULL;
+ po->gap_head = (po->gap_head + 1) % po->gap_list_size;
+ return gap;
+}
+
+
+/*
This function makes lazy skb cloning in hope that most of packets
are discarded by BPF.
@@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_packets++;
+ record_packet_gap(skb, po);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, skb->len);
@@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
@@ -811,6 +872,7 @@ drop:
ring_is_full:
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, 0);
@@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
skb_queue_purge(&sk->sk_receive_queue);
sk_refcnt_debug_release(sk);
+ kfree(po->gaps);
+
sock_put(sk);
return 0;
}
@@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
+ unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
if (!capable(CAP_NET_RAW))
return -EPERM;
@@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
+
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
if (sk == NULL)
goto out;
@@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sk->sk_family = PF_PACKET;
po->num = proto;
+ err = -ENOMEM;
+ po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
+ GFP_KERNEL);
+ if (!po->gaps)
+ goto out_free;
+ po->gap_tail = po->gap_head = 0;
+ po->gap_list_size = num_records;
+
+ for (num_records = 0; num_records < po->gap_list_size; num_records++) {
+ po->gaps[num_records].skb = NULL;
+ po->gaps[num_records].gap = 0;
+ }
+
sk->sk_destruct = packet_sock_destruct;
sk_refcnt_debug_inc(sk);
@@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock_prot_inuse_add(net, &packet_proto, 1);
write_unlock_bh(&net->packet.sklist_lock);
return 0;
+
+out_free:
+ sk_free(sk);
out:
return err;
}
@@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int copied, err;
struct sockaddr_ll *sll;
+ __u32 gap;
err = -EINVAL;
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
aux.tp_mac = 0;
aux.tp_net = skb_network_offset(skb);
aux.tp_vlan_tci = skb->vlan_tci;
-
put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
}
+ lock_sock(sk);
+ gap = check_packet_gap(skb, pkt_sk(sk));
+ release_sock(sk);
+ if (gap)
+ put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
+
/*
* Free or return the buffer as appropriate. Again this
* hides all the races and re-entrancy issues from us.
^ permalink raw reply related [flat|nested] 3+ messages in thread
* Re: [PATCH] af_packet: add interframe drop cmsg
2009-09-23 20:32 [PATCH] af_packet: add interframe drop cmsg Neil Horman
@ 2009-09-23 20:55 ` Eric Dumazet
2009-09-23 23:17 ` Neil Horman
0 siblings, 1 reply; 3+ messages in thread
From: Eric Dumazet @ 2009-09-23 20:55 UTC (permalink / raw)
To: Neil Horman; +Cc: netdev, davem
Neil Horman a écrit :
> Add Ancilliary data to better represent loss information
>
> I've had a few requests recently to provide more detail regarding frame loss
> during an AF_PACKET packet capture session. Specifically the requestors want to
> see where in a packet sequence frames were lost, i.e. they want to see that 40
> frames were lost between frames 302 and 303 in a packet capture file. In order
> to do this we need:
>
> 1) The kernel to export this data to user space
> 2) The applications to make use of it
>
> This patch addresses item (1). It does this by doing the following:
>
> A) attaching ancilliary data to any skb enqueued to a socket recieve queue for
> which frames were lost between it and the previously enqueued frame. Note I use
> a ring buffer with a correlator value (the skb pointer) to do this. This was
> done because the skb->cb array is exhausted already for AF_PACKET
Hmm, how mmap() users can find this information ? I thought recent libpcap were
using mmap(), in order to reduce losses :)
>
> B) For any frame dequeued that has ancilliary data in the ring buffer (as
> determined by the correlator value), we add a cmsg structure to the msghdr that
> gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
> cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
> frames lost between the reception of the frame being currently recevied and the
> frame most recently preceding it. Note this creates a situation in which if we
> have packet loss followed immediately by a socket close operation we might miss
> some gap information. This situation is covered by the use of the
> PACKET_AUXINFO socket option, which provides total loss stats (from which the
> final gap can be computed).
>
> I've tested this patch myself, and it works well.
Okay :)
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/if_packet.h | 2 +
> net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 91 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index dea7d6b..e5d200f 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -48,11 +48,13 @@ struct sockaddr_ll
> #define PACKET_RESERVE 12
> #define PACKET_TX_RING 13
> #define PACKET_LOSS 14
> +#define PACKET_GAPDATA 15
>
> struct tpacket_stats
> {
> unsigned int tp_packets;
> unsigned int tp_drops;
> + unsigned int tp_gap;
> };
>
> struct tpacket_auxdata
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d3d52c6..b74a91c 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
>
> static void packet_flush_mclist(struct sock *sk);
>
> +struct packet_gap_record {
> + struct sk_buff *skb;
> + __u32 gap;
> +};
> +
> struct packet_sock {
> /* struct sock has to be the first member of packet_sock */
> struct sock sk;
> @@ -197,6 +202,11 @@ struct packet_sock {
> int ifindex; /* bound device */
> __be16 num;
> struct packet_mclist *mclist;
> + struct packet_gap_record *gaps;
> + unsigned int gap_head;
> + unsigned int gap_tail;
> + unsigned int gap_list_size;
> +
> #ifdef CONFIG_PACKET_MMAP
> atomic_t mapped;
> enum tpacket_versions tp_version;
> @@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
> }
>
> /*
> + * If we've lost frames since the last time we queued one to the
> + * sk_receive_queue, we need to record it here.
> + * This must be called under the protection of the socket lock
> + * to prevent racing with other softirqs and user space
> + */
> +static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> +{
> + /*
> + * do nothing if there is no gap
> + */
> + if (!po->stats.tp_gap)
> + return;
> +
> + /*
> + * If there is, check the gap list tail to make sure we
> + * have an open entry
> + */
> + if (po->gaps[po->gap_tail].skb != NULL) {
> + if (net_ratelimit())
> + printk(KERN_WARNING "packet socket gap list is full!\n");
New code can use pr_warning() macro
> + return;
> + }
> +
> + /*
> + * We have a free entry, record it
> + */
> + po->gaps[po->gap_tail].skb = skb;
> + po->gaps[po->gap_tail].gap = po->stats.tp_gap;
> + po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
you could avoid this divide
if (++po->gap_tail == po->gap_list_size)
po->gap_tail = 0;
> + po->stats.tp_gap = 0;
> + return;
> +
> +}
> +
> +static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> +{
> + __u32 gap = 0;
> +
> + if (po->gaps[po->gap_head].skb != skb)
> + return 0;
> +
> + gap = po->gaps[po->gap_head].gap;
> + po->gaps[po->gap_head].skb = NULL;
> + po->gap_head = (po->gap_head + 1) % po->gap_list_size;
ditto
> + return gap;
> +}
> +
> +
> +/*
> This function makes lazy skb cloning in hope that most of packets
> are discarded by BPF.
>
> @@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
>
> spin_lock(&sk->sk_receive_queue.lock);
> po->stats.tp_packets++;
> + record_packet_gap(skb, po);
> __skb_queue_tail(&sk->sk_receive_queue, skb);
> spin_unlock(&sk->sk_receive_queue.lock);
> sk->sk_data_ready(sk, skb->len);
> @@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> drop_n_acct:
> spin_lock(&sk->sk_receive_queue.lock);
> po->stats.tp_drops++;
> + po->stats.tp_gap++;
> spin_unlock(&sk->sk_receive_queue.lock);
>
> drop_n_restore:
> @@ -811,6 +872,7 @@ drop:
>
> ring_is_full:
> po->stats.tp_drops++;
> + po->stats.tp_gap++;
> spin_unlock(&sk->sk_receive_queue.lock);
>
> sk->sk_data_ready(sk, 0);
> @@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
> skb_queue_purge(&sk->sk_receive_queue);
> sk_refcnt_debug_release(sk);
>
> + kfree(po->gaps);
> +
> sock_put(sk);
> return 0;
> }
> @@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> struct packet_sock *po;
> __be16 proto = (__force __be16)protocol; /* weird, but documented */
> int err;
> + unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
>
> if (!capable(CAP_NET_RAW))
> return -EPERM;
> @@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sock->state = SS_UNCONNECTED;
>
> err = -ENOBUFS;
> +
> sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
> if (sk == NULL)
> goto out;
> @@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sk->sk_family = PF_PACKET;
> po->num = proto;
>
> + err = -ENOMEM;
> + po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
> + GFP_KERNEL);
kzalloc(), and no need for some following lines
> + if (!po->gaps)
> + goto out_free;
> + po->gap_tail = po->gap_head = 0;
> + po->gap_list_size = num_records;
> +
> + for (num_records = 0; num_records < po->gap_list_size; num_records++) {
> + po->gaps[num_records].skb = NULL;
> + po->gaps[num_records].gap = 0;
> + }
> +
> sk->sk_destruct = packet_sock_destruct;
> sk_refcnt_debug_inc(sk);
>
> @@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sock_prot_inuse_add(net, &packet_proto, 1);
> write_unlock_bh(&net->packet.sklist_lock);
> return 0;
> +
> +out_free:
> + sk_free(sk);
> out:
> return err;
> }
> @@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> struct sk_buff *skb;
> int copied, err;
> struct sockaddr_ll *sll;
> + __u32 gap;
>
> err = -EINVAL;
> if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> @@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> aux.tp_mac = 0;
> aux.tp_net = skb_network_offset(skb);
> aux.tp_vlan_tci = skb->vlan_tci;
> -
Please dont mix cleanups
> put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
> }
>
> + lock_sock(sk);
strange locking here. this doesnt match locking used at record time.
( spin_lock(&sk->sk_receive_queue.lock);)
> + gap = check_packet_gap(skb, pkt_sk(sk));
> + release_sock(sk);
> + if (gap)
> + put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
> +
> /*
> * Free or return the buffer as appropriate. Again this
> * hides all the races and re-entrancy issues from us.
Thanks
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] af_packet: add interframe drop cmsg
2009-09-23 20:55 ` Eric Dumazet
@ 2009-09-23 23:17 ` Neil Horman
0 siblings, 0 replies; 3+ messages in thread
From: Neil Horman @ 2009-09-23 23:17 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem
On Wed, Sep 23, 2009 at 10:55:12PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
> > Add Ancilliary data to better represent loss information
> >
> > I've had a few requests recently to provide more detail regarding frame loss
> > during an AF_PACKET packet capture session. Specifically the requestors want to
> > see where in a packet sequence frames were lost, i.e. they want to see that 40
> > frames were lost between frames 302 and 303 in a packet capture file. In order
> > to do this we need:
> >
> > 1) The kernel to export this data to user space
> > 2) The applications to make use of it
> >
> > This patch addresses item (1). It does this by doing the following:
> >
> > A) attaching ancilliary data to any skb enqueued to a socket recieve queue for
> > which frames were lost between it and the previously enqueued frame. Note I use
> > a ring buffer with a correlator value (the skb pointer) to do this. This was
> > done because the skb->cb array is exhausted already for AF_PACKET
>
> Hmm, how mmap() users can find this information ? I thought recent libpcap were
> using mmap(), in order to reduce losses :)
>
Yeah, in some/most cases it does, but to be honest, I think any solution for
determining frame loss with sequence encoding is going to diverge between a
descriptor-based solution (i.e. recvmsg) and an mmap-based one. About the only
approach I could see being common to both would be the use of some sort of
marker frame inserted into the receive queue, and I'm pretty certain that's
going to be a hard sell.
> >
> > B) For any frame dequeued that has ancilliary data in the ring buffer (as
> > determined by the correlator value), we add a cmsg structure to the msghdr that
> > gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
> > cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
> > frames lost between the reception of the frame being currently recevied and the
> > frame most recently preceding it. Note this creates a situation in which if we
> > have packet loss followed immediately by a socket close operation we might miss
> > some gap information. This situation is covered by the use of the
> > PACKET_AUXINFO socket option, which provides total loss stats (from which the
> > final gap can be computed).
> >
> > I've tested this patch myself, and it works well.
>
> Okay :)
>
Thanks for the vote of confidence :). I augmented the patch to randomly drop
frames, then wrote an application to loop on an AF_PACKET frame receive, and
compared a printk showing the in-kernel drop rates with what the user space app
recorded.
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >
> >
> > include/linux/if_packet.h | 2 +
> > net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 91 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> > index dea7d6b..e5d200f 100644
> > --- a/include/linux/if_packet.h
> > +++ b/include/linux/if_packet.h
> > @@ -48,11 +48,13 @@ struct sockaddr_ll
> > #define PACKET_RESERVE 12
> > #define PACKET_TX_RING 13
> > #define PACKET_LOSS 14
> > +#define PACKET_GAPDATA 15
> >
> > struct tpacket_stats
> > {
> > unsigned int tp_packets;
> > unsigned int tp_drops;
> > + unsigned int tp_gap;
> > };
> >
> > struct tpacket_auxdata
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index d3d52c6..b74a91c 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
> >
> > static void packet_flush_mclist(struct sock *sk);
> >
> > +struct packet_gap_record {
> > + struct sk_buff *skb;
> > + __u32 gap;
> > +};
> > +
> > struct packet_sock {
> > /* struct sock has to be the first member of packet_sock */
> > struct sock sk;
> > @@ -197,6 +202,11 @@ struct packet_sock {
> > int ifindex; /* bound device */
> > __be16 num;
> > struct packet_mclist *mclist;
> > + struct packet_gap_record *gaps;
> > + unsigned int gap_head;
> > + unsigned int gap_tail;
> > + unsigned int gap_list_size;
> > +
> > #ifdef CONFIG_PACKET_MMAP
> > atomic_t mapped;
> > enum tpacket_versions tp_version;
> > @@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
> > }
> >
> > /*
> > + * If we've lost frames since the last time we queued one to the
> > + * sk_receive_queue, we need to record it here.
> > + * This must be called under the protection of the socket lock
> > + * to prevent racing with other softirqs and user space
> > + */
> > +static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> > +{
> > + /*
> > + * do nothing if there is no gap
> > + */
> > + if (!po->stats.tp_gap)
> > + return;
> > +
> > + /*
> > + * If there is, check the gap list tail to make sure we
> > + * have an open entry
> > + */
> > + if (po->gaps[po->gap_tail].skb != NULL) {
> > + if (net_ratelimit())
> > + printk(KERN_WARNING "packet socket gap list is full!\n");
>
> New code can use pr_warning() macro
>
good point.
> > + return;
> > + }
> > +
> > + /*
> > + * We have a free entry, record it
> > + */
> > + po->gaps[po->gap_tail].skb = skb;
> > + po->gaps[po->gap_tail].gap = po->stats.tp_gap;
> > + po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
>
> you could avoid this divide
>
> if (++po->gap_tail == po->gap_list_size)
> po->gap_tail = 0;
>
Yup, I can do that.
> > + po->stats.tp_gap = 0;
> > + return;
> > +
> > +}
> > +
> > +static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> > +{
> > + __u32 gap = 0;
> > +
> > + if (po->gaps[po->gap_head].skb != skb)
> > + return 0;
> > +
> > + gap = po->gaps[po->gap_head].gap;
> > + po->gaps[po->gap_head].skb = NULL;
> > + po->gap_head = (po->gap_head + 1) % po->gap_list_size;
>
> ditto
>
ditto :)
> > + return gap;
> > +}
> > +
> > +
> > +/*
> > This function makes lazy skb cloning in hope that most of packets
> > are discarded by BPF.
> >
> > @@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> >
> > spin_lock(&sk->sk_receive_queue.lock);
> > po->stats.tp_packets++;
> > + record_packet_gap(skb, po);
> > __skb_queue_tail(&sk->sk_receive_queue, skb);
> > spin_unlock(&sk->sk_receive_queue.lock);
> > sk->sk_data_ready(sk, skb->len);
> > @@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> > drop_n_acct:
> > spin_lock(&sk->sk_receive_queue.lock);
> > po->stats.tp_drops++;
> > + po->stats.tp_gap++;
> > spin_unlock(&sk->sk_receive_queue.lock);
> >
> > drop_n_restore:
> > @@ -811,6 +872,7 @@ drop:
> >
> > ring_is_full:
> > po->stats.tp_drops++;
> > + po->stats.tp_gap++;
> > spin_unlock(&sk->sk_receive_queue.lock);
> >
> > sk->sk_data_ready(sk, 0);
> > @@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
> > skb_queue_purge(&sk->sk_receive_queue);
> > sk_refcnt_debug_release(sk);
> >
> > + kfree(po->gaps);
> > +
> > sock_put(sk);
> > return 0;
> > }
> > @@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > struct packet_sock *po;
> > __be16 proto = (__force __be16)protocol; /* weird, but documented */
> > int err;
> > + unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
> >
> > if (!capable(CAP_NET_RAW))
> > return -EPERM;
> > @@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sock->state = SS_UNCONNECTED;
> >
> > err = -ENOBUFS;
> > +
> > sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
> > if (sk == NULL)
> > goto out;
> > @@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sk->sk_family = PF_PACKET;
> > po->num = proto;
> >
> > + err = -ENOMEM;
> > + po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
> > + GFP_KERNEL);
>
> kzalloc(), and no need for some following lines
>
Will do.
> > + if (!po->gaps)
> > + goto out_free;
> > + po->gap_tail = po->gap_head = 0;
> > + po->gap_list_size = num_records;
> > +
> > + for (num_records = 0; num_records < po->gap_list_size; num_records++) {
> > + po->gaps[num_records].skb = NULL;
> > + po->gaps[num_records].gap = 0;
> > + }
> > +
> > sk->sk_destruct = packet_sock_destruct;
> > sk_refcnt_debug_inc(sk);
> >
> > @@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sock_prot_inuse_add(net, &packet_proto, 1);
> > write_unlock_bh(&net->packet.sklist_lock);
> > return 0;
> > +
> > +out_free:
> > + sk_free(sk);
> > out:
> > return err;
> > }
> > @@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > struct sk_buff *skb;
> > int copied, err;
> > struct sockaddr_ll *sll;
> > + __u32 gap;
> >
> > err = -EINVAL;
> > if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> > @@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > aux.tp_mac = 0;
> > aux.tp_net = skb_network_offset(skb);
> > aux.tp_vlan_tci = skb->vlan_tci;
> > -
>
> Please dont mix cleanups
>
Doh, I thought I'd removed that. Thanks
> > put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
> > }
> >
> > + lock_sock(sk);
>
> strange locking here. this doesnt match locking used at record time.
>
> ( spin_lock(&sk->sk_receive_queue.lock);)
>
Not sure why AF_PACKET didn't use sock_lock_bh there; I think that probably
requires a cleanup. The intent was to protect the ring buffer using the socket
lock. I think we likely need to change the existing lock usage in af_packet.c
to use sock_lock_bh in this case.
> > + gap = check_packet_gap(skb, pkt_sk(sk));
> > + release_sock(sk);
> > + if (gap)
> > + put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
> > +
> > /*
> > * Free or return the buffer as appropriate. Again this
> > * hides all the races and re-entrancy issues from us.
>
> Thanks
>
Thanks for the notes Eric, I'll cleanup and repost in the AM.
Regards
Neil
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2009-09-23 23:17 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-09-23 20:32 [PATCH] af_packet: add interframe drop cmsg Neil Horman
2009-09-23 20:55 ` Eric Dumazet
2009-09-23 23:17 ` Neil Horman
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).