* Re: [PATCH] af_packet: add interframe drop cmsg
From: Neil Horman @ 2009-09-23 23:17 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev, davem
In-Reply-To: <4ABA8B30.9060904@gmail.com>
On Wed, Sep 23, 2009 at 10:55:12PM +0200, Eric Dumazet wrote:
> Neil Horman a écrit :
> > Add Ancilliary data to better represent loss information
> >
> > I've had a few requests recently to provide more detail regarding frame loss
> > during an AF_PACKET packet capture session. Specifically the requestors want to
> > see where in a packet sequence frames were lost, i.e. they want to see that 40
> > frames were lost between frames 302 and 303 in a packet capture file. In order
> > to do this we need:
> >
> > 1) The kernel to export this data to user space
> > 2) The applications to make use of it
> >
> > This patch addresses item (1). It does this by doing the following:
> >
> > A) attaching ancilliary data to any skb enqueued to a socket recieve queue for
> > which frames were lost between it and the previously enqueued frame. Note I use
> > a ring buffer with a correlator value (the skb pointer) to do this. This was
> > done because the skb->cb array is exhausted already for AF_PACKET
>
> Hmm, how mmap() users can find this information ? I thought recent libpcap were
> using mmap(), in order to reduce losses :)
>
Yeah, in some/most cases it does, but to be honest, I think any solution for
determining frame loss with sequence encoding is going to diverge between a
descriptor based solution (i.e. recvmsg) and an mmap solution. About the only
solution I could see that could be common would be the use of some sort of
marker frame getting inserted into the receive queue, and I'm pretty certain
that's going to be a hard sell.
> >
> > B) For any frame dequeued that has ancilliary data in the ring buffer (as
> > determined by the correlator value), we add a cmsg structure to the msghdr that
> > gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
> > cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
> > frames lost between the reception of the frame being currently recevied and the
> > frame most recently preceding it. Note this creates a situation in which if we
> > have packet loss followed immediately by a socket close operation we might miss
> > some gap information. This situation is covered by the use of the
> > PACKET_AUXINFO socket option, which provides total loss stats (from which the
> > final gap can be computed).
> >
> > I've tested this patch myself, and it works well.
>
> Okay :)
>
Thanks for the vote of confidence :). I augmented the patch to randomly drop
frames, then wrote an application to loop on an AF_PACKET frame receive, and
compared a printk showing the in-kernel drop rates with what the user space app
recorded.
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >
> >
> > include/linux/if_packet.h | 2 +
> > net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 91 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> > index dea7d6b..e5d200f 100644
> > --- a/include/linux/if_packet.h
> > +++ b/include/linux/if_packet.h
> > @@ -48,11 +48,13 @@ struct sockaddr_ll
> > #define PACKET_RESERVE 12
> > #define PACKET_TX_RING 13
> > #define PACKET_LOSS 14
> > +#define PACKET_GAPDATA 15
> >
> > struct tpacket_stats
> > {
> > unsigned int tp_packets;
> > unsigned int tp_drops;
> > + unsigned int tp_gap;
> > };
> >
> > struct tpacket_auxdata
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index d3d52c6..b74a91c 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
> >
> > static void packet_flush_mclist(struct sock *sk);
> >
> > +struct packet_gap_record {
> > + struct sk_buff *skb;
> > + __u32 gap;
> > +};
> > +
> > struct packet_sock {
> > /* struct sock has to be the first member of packet_sock */
> > struct sock sk;
> > @@ -197,6 +202,11 @@ struct packet_sock {
> > int ifindex; /* bound device */
> > __be16 num;
> > struct packet_mclist *mclist;
> > + struct packet_gap_record *gaps;
> > + unsigned int gap_head;
> > + unsigned int gap_tail;
> > + unsigned int gap_list_size;
> > +
> > #ifdef CONFIG_PACKET_MMAP
> > atomic_t mapped;
> > enum tpacket_versions tp_version;
> > @@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
> > }
> >
> > /*
> > + * If we've lost frames since the last time we queued one to the
> > + * sk_receive_queue, we need to record it here.
> > + * This must be called under the protection of the socket lock
> > + * to prevent racing with other softirqs and user space
> > + */
> > +static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> > +{
> > + /*
> > + * do nothing if there is no gap
> > + */
> > + if (!po->stats.tp_gap)
> > + return;
> > +
> > + /*
> > + * If there is, check the gap list tail to make sure we
> > + * have an open entry
> > + */
> > + if (po->gaps[po->gap_tail].skb != NULL) {
> > + if (net_ratelimit())
> > + printk(KERN_WARNING "packet socket gap list is full!\n");
>
> New code can use pr_warning() macro
>
good point.
> > + return;
> > + }
> > +
> > + /*
> > + * We have a free entry, record it
> > + */
> > + po->gaps[po->gap_tail].skb = skb;
> > + po->gaps[po->gap_tail].gap = po->stats.tp_gap;
> > + po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
>
> you could avoid this divide
>
> if (++po->gap_tail == po->gap_list_size)
> po->gap_tail = 0;
>
Yup, I can do that.
> > + po->stats.tp_gap = 0;
> > + return;
> > +
> > +}
> > +
> > +static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> > +{
> > + __u32 gap = 0;
> > +
> > + if (po->gaps[po->gap_head].skb != skb)
> > + return 0;
> > +
> > + gap = po->gaps[po->gap_head].gap;
> > + po->gaps[po->gap_head].skb = NULL;
> > + po->gap_head = (po->gap_head + 1) % po->gap_list_size;
>
> ditto
>
ditto :)
> > + return gap;
> > +}
> > +
> > +
> > +/*
> > This function makes lazy skb cloning in hope that most of packets
> > are discarded by BPF.
> >
> > @@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> >
> > spin_lock(&sk->sk_receive_queue.lock);
> > po->stats.tp_packets++;
> > + record_packet_gap(skb, po);
> > __skb_queue_tail(&sk->sk_receive_queue, skb);
> > spin_unlock(&sk->sk_receive_queue.lock);
> > sk->sk_data_ready(sk, skb->len);
> > @@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> > drop_n_acct:
> > spin_lock(&sk->sk_receive_queue.lock);
> > po->stats.tp_drops++;
> > + po->stats.tp_gap++;
> > spin_unlock(&sk->sk_receive_queue.lock);
> >
> > drop_n_restore:
> > @@ -811,6 +872,7 @@ drop:
> >
> > ring_is_full:
> > po->stats.tp_drops++;
> > + po->stats.tp_gap++;
> > spin_unlock(&sk->sk_receive_queue.lock);
> >
> > sk->sk_data_ready(sk, 0);
> > @@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
> > skb_queue_purge(&sk->sk_receive_queue);
> > sk_refcnt_debug_release(sk);
> >
> > + kfree(po->gaps);
> > +
> > sock_put(sk);
> > return 0;
> > }
> > @@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > struct packet_sock *po;
> > __be16 proto = (__force __be16)protocol; /* weird, but documented */
> > int err;
> > + unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
> >
> > if (!capable(CAP_NET_RAW))
> > return -EPERM;
> > @@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sock->state = SS_UNCONNECTED;
> >
> > err = -ENOBUFS;
> > +
> > sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
> > if (sk == NULL)
> > goto out;
> > @@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sk->sk_family = PF_PACKET;
> > po->num = proto;
> >
> > + err = -ENOMEM;
> > + po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
> > + GFP_KERNEL);
>
> kzalloc(), and no need for some following lines
>
Will do.
> > + if (!po->gaps)
> > + goto out_free;
> > + po->gap_tail = po->gap_head = 0;
> > + po->gap_list_size = num_records;
> > +
> > + for (num_records = 0; num_records < po->gap_list_size; num_records++) {
> > + po->gaps[num_records].skb = NULL;
> > + po->gaps[num_records].gap = 0;
> > + }
> > +
> > sk->sk_destruct = packet_sock_destruct;
> > sk_refcnt_debug_inc(sk);
> >
> > @@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> > sock_prot_inuse_add(net, &packet_proto, 1);
> > write_unlock_bh(&net->packet.sklist_lock);
> > return 0;
> > +
> > +out_free:
> > + sk_free(sk);
> > out:
> > return err;
> > }
> > @@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > struct sk_buff *skb;
> > int copied, err;
> > struct sockaddr_ll *sll;
> > + __u32 gap;
> >
> > err = -EINVAL;
> > if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> > @@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > aux.tp_mac = 0;
> > aux.tp_net = skb_network_offset(skb);
> > aux.tp_vlan_tci = skb->vlan_tci;
> > -
>
> Please dont mix cleanups
>
Doh, I thought I'd removed that. Thanks
> > put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
> > }
> >
> > + lock_sock(sk);
>
> strange locking here. this doesnt match locking used at record time.
>
> ( spin_lock(&sk->sk_receive_queue.lock);)
>
Not sure why AF_PACKET didn't use sock_lock_bh there; I think that probably
requires a cleanup. The intent was to protect the ring buffer using the socket
lock. I think we likely need to change the existing lock usage in af_packet.c to
use sock_lock_bh in this case.
> > + gap = check_packet_gap(skb, pkt_sk(sk));
> > + release_sock(sk);
> > + if (gap)
> > + put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
> > +
> > /*
> > * Free or return the buffer as appropriate. Again this
> > * hides all the races and re-entrancy issues from us.
>
> Thanks
>
Thanks for the notes Eric, I'll cleanup and repost in the AM.
Regards
Neil
>
^ permalink raw reply
* ixgbe patch to provide NIC's tx/rx counters via ethtool
From: Ben Greear @ 2009-09-23 22:36 UTC (permalink / raw)
To: NetDev
[-- Attachment #1: Type: text/plain, Size: 1104 bytes --]
When LRO is enabled, the received packet and byte counters represent the
LRO'd packets, not the packets/bytes on the wire. The Intel 82599 NIC has
registers that keep count of the physical packets. Add these counters to
the ethtool stats. The byte counters are 36-bit, but the high 4 bits were
being ignored in the 2.6.31 ixgbe driver: Read those as well to allow
longer time between polling the stats to detect wraps.
Signed-off-by: Ben Greear <greearb@candelatech.com>
Please do not apply this until the ixgbe authors ACK it. There may
have been reasons for not reading the high 4 bits, or they may dislike
this approach entirely.
Here is ethtool stats output with LRO enabled, with patch applied:
#ethtool -S eth20
NIC statistics:
rx_packets: 15944000
tx_packets: 12339293
rx_bytes: 272306022656
tx_bytes: 940244184
rx_pkts_nic: 187747191
tx_pkts_nic: 12340822
rx_bytes_nic: 284695533402
tx_bytes_nic: 989725050
lsc_int: 3
...
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
[-- Attachment #2: ixgbe_stats.patch --]
[-- Type: text/plain, Size: 1713 bytes --]
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index dff8dfa..da3cba3 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -53,6 +53,10 @@ static struct ixgbe_stats ixgbe_gstrings_stats[] = {
{"tx_packets", IXGBE_STAT(net_stats.tx_packets)},
{"rx_bytes", IXGBE_STAT(net_stats.rx_bytes)},
{"tx_bytes", IXGBE_STAT(net_stats.tx_bytes)},
+ {"rx_pkts_nic", IXGBE_STAT(stats.gprc)},
+ {"tx_pkts_nic", IXGBE_STAT(stats.gptc)},
+ {"rx_bytes_nic", IXGBE_STAT(stats.gorc)},
+ {"tx_bytes_nic", IXGBE_STAT(stats.gotc)},
{"lsc_int", IXGBE_STAT(lsc_int)},
{"tx_busy", IXGBE_STAT(tx_busy)},
{"non_eop_descs", IXGBE_STAT(non_eop_descs)},
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 77b0381..929a847 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4377,10 +4377,13 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
/* 82598 hardware only has a 32 bit counter in the high register */
if (hw->mac.type == ixgbe_mac_82599EB) {
+ u64 tmp;
adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCL);
- IXGBE_READ_REG(hw, IXGBE_GORCH); /* to clear */
+ tmp = IXGBE_READ_REG(hw, IXGBE_GORCH) & 0xF; /* 4 high bits of GORC */
+ adapter->stats.gorc += (tmp << 32);
adapter->stats.gotc += IXGBE_READ_REG(hw, IXGBE_GOTCL);
- IXGBE_READ_REG(hw, IXGBE_GOTCH); /* to clear */
+ tmp = IXGBE_READ_REG(hw, IXGBE_GOTCH) & 0xF; /* 4 high bits of GOTC */
+ adapter->stats.gotc += (tmp << 32);
adapter->stats.tor += IXGBE_READ_REG(hw, IXGBE_TORL);
IXGBE_READ_REG(hw, IXGBE_TORH); /* to clear */
adapter->stats.lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXCNT);
^ permalink raw reply related
* Re: [PATCH 5/8] [RFC] CAIF Protocol Stack
From: Randy Dunlap @ 2009-09-23 22:24 UTC (permalink / raw)
To: sjur.brandeland; +Cc: netdev, Kim.xx.Lilliestierna
In-Reply-To: <1253727086-10353-1-git-send-email-sjur.brandeland@stericsson.com>
sjur.brandeland@stericsson.com wrote:
> From: Kim Lilliestierna <Kim.xx.Lilliestierna@ericsson.com>
>
> Signed-off-by: sjur.brandeland@stericsson.com
>
> ---
> net/caif/Kconfig | 61 +++
> net/caif/Makefile | 62 +++
> net/caif/caif_chnlif.c | 219 ++++++++
> net/caif/caif_chr.c | 378 ++++++++++++++
> net/caif/caif_config_util.c | 167 +++++++
> net/caif/chnl_chr.c | 1161 +++++++++++++++++++++++++++++++++++++++++++
> net/caif/chnl_net.c | 464 +++++++++++++++++
> 7 files changed, 2512 insertions(+), 0 deletions(-)
> create mode 100644 net/caif/Kconfig
> create mode 100644 net/caif/Makefile
> create mode 100644 net/caif/caif_chnlif.c
> create mode 100644 net/caif/caif_chr.c
> create mode 100644 net/caif/caif_config_util.c
> create mode 100644 net/caif/chnl_chr.c
> create mode 100644 net/caif/chnl_net.c
>
> diff --git a/net/caif/Kconfig b/net/caif/Kconfig
> new file mode 100644
> index 0000000..24151c1
> --- /dev/null
> +++ b/net/caif/Kconfig
> @@ -0,0 +1,61 @@
> +#
> +# CAIF net configurations
> +#
> +
> +#menu "Caif Support"
> +comment "CAIF Support"
> +
> +menuconfig CAIF
> + tristate "Enable Caif support"
> + default m
Does having CONFIG_CAIF=m only control kconfig menus or does it
cause some code to be built? If the latter, then it should default
to N.
> + ---help---
> + Say Y here if you need to a phone modem that uses CAIF as transport
to use (?) as transport.
> + You will also need to say yes to anny caif physical devices that your platform
any
> + supports.
> + This can be built as either built in or as a module, if you select to build it as module
This can be either built-in or as a loadable module. If you ...
> + the other CAIF parts also needs to built as modules
also need to be built as modules.
> + See Documentation/CAIF for a further explanation on how to use and configure.
> +
> +if CAIF
> +
> +config CAIF_CHARDEV
> + tristate "CAIF character device"
> + default CAIF
> + ---help---
> + Say Y if you will be using the CAIF character devices,
devices.
> + This is needed for AT type channels
channels.
> + If you select to build it as a built in then the main caif device must also be a builtin
<end above with period>
> + If unsure say Y
<end above with period>
> +
> +config CAIF_NETDEV
> + tristate "CAIF Network device"
> + default CAIF
> + ---help---
> + If you select to build it as a built in then the main caif device must also be a builtin
> + Say Y if you will be using the CAIF based network device
> + If unsure say Y
Please end all of those sentences with a period ('.').
> +
> +
> +config CAIF_USE_PLAIN
> + bool "Use plain buffers instead of SKB in caif"
> + default n
> + ---help---
> + Use plain buffer to transport data
> + Select what type of internal buffering CAIF should use,
> + skb or plain,
> + If unsure say y
ditto.
> +
> +config CAIF_DEBUG
> + bool "Enable Debug"
> + default n
> + --- help ---
> + Enable the inclusion of debug code in the caif stack
> + be aware that doing this will impact performance
ditto.
> + If unsure say n here.
> +
> +# Include physical drivers
> +# should be broken out into its own config file
> +# source "drivers/net/caif/Kconfig"
> +source "drivers/net/caif/Kconfig"
> +endif
> +#endmenu
> diff --git a/net/caif/Makefile b/net/caif/Makefile
> new file mode 100644
> index 0000000..49696ab
> --- /dev/null
> +++ b/net/caif/Makefile
> @@ -0,0 +1,62 @@
> +ifeq ($(CONFIG_CAIF_USE_PLAIN),1)
> +CFPKT:=plain
> +else
> +CFPKT:=skbuff
> +CAIF_FLAGS+=-DCAIF_USE_SKB
> +endif
> +
> +ifeq ($(CONFIG_CAIF_DEBUG),1)
> +CAIF_FLAGS+=-DCAIF_DEBUG_ON
> +endif
> +
> +
> +ccflags-y := -DCAIF_KERNEL $(CAIF_FLAGS) -Iinclude/net/caif/ -Iinclude/net/caif/generic/ -Iinclude/linux/caif/
> +
> +
> +caif-objs := caif_chr.o caif_chnlif.o caif_config_util.o \
> + generic/cfcnfg.o generic/cfmuxl.o generic/cfctrl.o \
> + generic/cffrml.o generic/cfveil.o generic/cflist.o \
> + generic/fcs.o generic/cfserl.o generic/cfdgml.o \
> + generic/cfspil.o generic/cfrfml.o generic/cfvidl.o \
> + generic/cfmsll.o generic/cfutill.o generic/cfshml.o \
> + generic/cfloopcfg.o generic/cflooplayer.o generic/cfsrvl.o \
> + generic/cfpkt_$(CFPKT).o
> +
> +
> +clean-dirs:= .tmp_versions
> +
> +clean-files:= Module.symvers modules.order *.cmd *~ \
> + generic/loopback/Module.symvers \
> + generic/loopback/modules.order \
> + generic/loopback/*.cmd \
> + generic/loopback/*.o \
> + generic/loopback/*~ \
> + generic/Module.symvers \
> + generic/modules.order \
> + generic/*.cmd \
> + generic/*.o \
> + generic/*~
> +
> +
> +# Main caif module
> +obj-$(CONFIG_CAIF) += caif.o
> +
> +# Character device
> +obj-$(CAIF_CHRDEV) += chnl_chr.o
> +
> +# Net device
> +obj-$(CAIF_NETDEV) += chnl_net.o
> +
> +export-objs := caif_chr.o
> +
> +## Indent.. to remove all DOS cruft like CRLF and trailing spaces, and to standardize the indenting style
This should be done one time only, before it is merged into the kernel source tree.
> +indent:
> + ${MAKE} -C generic indent
> + sh -c 'for F in *.[ch]; do cat $$F | tr -d "\r" |tr -c '\015'| sed "s/[ \t]*$$//;" > $$F.tmp; mv $$F.tmp $$F; done'
> + ${INDENT} -kr -i8 *.[ch]
> +
> +clean:
> + ${MAKE} -C generic clean
> + rm generic/modules.order generic/Module.symvers generic/*.cmd generic/*~ \
> + generic/modules.order generic/Module.symvers \
> + generic/*.o generic/loopback/*.o
> diff --git a/net/caif/caif_chnlif.c b/net/caif/caif_chnlif.c
> new file mode 100644
> index 0000000..e53ca7a
> --- /dev/null
> +++ b/net/caif/caif_chnlif.c
> @@ -0,0 +1,219 @@
> +/*
> +* Copyright (C) ST-Ericsson AB 2009
> +*
> +* Author: Daniel Martensson / Daniel.Martensson@stericsson.com
> +*
> +* License terms: GNU General Public License (GPL), version 2.
> +*
> +*/
> +
> +#include <linux/skbuff.h>
> +#include "caif_kernel.h"
> +#include "caif_layer.h"
> +#include "caif_config_util.h"
> +#include "caif_log.h"
> +#include "cfpkt.h"
> +#include "cfcnfg.h"
> +#include "cfglue.h"
> +
> +
> +
> +struct caif_kernelif {
> + layer_t layer;
> + struct caif_device *dev;
> + cfctrl_link_param_t param;
> +};
> +static cfcnfg_t *cnfg;
> +/**
> + * func chnlif_set_cnfg - Set the global config
> + * @cfg: Config structure to set
> +*/
> +
> +void chnlif_set_cnfg(cfcnfg_t *cfg)
> +{
> + cnfg = cfg;
> +}
> +EXPORT_SYMBOL(chnlif_set_cnfg);
> +
> +/**
> + * func caif_create_skb - Creates a caif skb buffer
> + * @data: data to add to buffer
> + * @data_length: lenht of data
length
---
~Randy
^ permalink raw reply
* Re: fanotify as syscalls
From: hch @ 2009-09-23 21:58 UTC (permalink / raw)
To: Davide Libenzi
Cc: hch@infradead.org, Tvrtko Ursulin, Andreas Gruenbacher,
Jamie Lokier, Eric Paris, Linus Torvalds, Evgeniy Polyakov,
David Miller, Linux Kernel Mailing List,
linux-fsdevel@vger.kernel.org, netdev@vger.kernel.org,
viro@zeniv.linux.org.uk, alan@linux.intel.com
In-Reply-To: <alpine.DEB.2.00.0909230827100.21515@makko.or.mcafeemobile.com>
On Wed, Sep 23, 2009 at 08:35:18AM -0700, Davide Libenzi wrote:
> The fear is that this becomes a trojan horse (no pun intended) for more
> and more hooks and "stuff", driven by we-really-need-this-too and
> we-really-need-that-too. And once something it's in, it's harder to say no,
> under the pressure of offering a "limited solution".
> This ws the reason I threw the syscall tracing thing in, so they have a
> low level generic hook, and they cam knock themselves out in their module
> (might need a few exports, but that's about it).
Replacing idiotify with a saner interface is a good goal. I just don't
think we should take the stakes of the snake oil industry too seriously in
it.
^ permalink raw reply
* Re: fanotify as syscalls
From: hch @ 2009-09-23 21:56 UTC (permalink / raw)
To: Eric Paris
Cc: Arjan van de Ven, Tvrtko Ursulin, Davide Libenzi,
Andreas Gruenbacher, Jamie Lokier, Linus Torvalds,
Evgeniy Polyakov, David Miller, Linux Kernel Mailing List,
linux-fsdevel@vger.kernel.org, netdev@vger.kernel.org,
viro@zeniv.linux.org.uk, alan@linux.intel.com, hch@infradead.org
In-Reply-To: <1253721097.2890.3.camel@dhcp231-106.rdu.redhat.com>
On Wed, Sep 23, 2009 at 11:51:37AM -0400, Eric Paris wrote:
> And users would be left in a situation between choosing an LSM which
> actually does in provable ways increase security and using an AV
> scanner.
Sounds like a good thing, no?
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Ben Greear @ 2009-09-23 21:56 UTC (permalink / raw)
To: Peter P Waskiewicz Jr; +Cc: NetDev
In-Reply-To: <4ABA9509.7000008@candelatech.com>
I'm poking at the ixgbe_main code in 2.6.31.
It seems from the spec sheet that the 82599 supports the GORCH.
/* 82598 hardware only has a 32 bit counter in the high register */
if (hw->mac.type == ixgbe_mac_82599EB) {
adapter->stats.gorc += IXGBE_READ_REG(hw, IXGBE_GORCL);
IXGBE_READ_REG(hw, IXGBE_GORCH); /* to clear */
stats.gorc is 64-bit, so any reason not to grab the 4 high-bits out of GORCH
and add them to stats.gorc instead of just clearing them as this
code seems to do?
That gives us some precious extra seconds to read counters before they
wrap :)
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Ben Greear @ 2009-09-23 21:37 UTC (permalink / raw)
To: Peter P Waskiewicz Jr; +Cc: NetDev
In-Reply-To: <1253730766.2538.28.camel@localhost.localdomain>
I noticed that 'ethtool -d' has some counters that may be right,
including bytes.
(I'm looking at the 82599 chipset currently)
0x04074: gprc (Good Packets Received Count) 0xF611E2C2
0x04078: bprc (Broadcast Packets Rx Count) 0x00000000
0x0407C: mprc (Multicast Packets Rx Count) 0x0000000C
0x04080: gptc (Good Packets Transmitted Count) 0xC9962C10
0x04088: gorcl (Good Octets Rx Count Low) 0x9A23F0B0
0x0408C: gorch (Good Octets Rx Count High) 0x00000000
0x04090: gotcl (Good Octets Tx Count Low) 0x5DFAACD4
0x04094: gotch (Good Octets Tx Count High) 0x00000000
Any suggestions on a more efficient way to get these than dumping 'ethtool -d' and
searching all that data? Maybe put it in 'ethtool -S' as well ?
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: [AX25] kernel panic
From: Bernard Pidoux F6BVP @ 2009-09-23 21:17 UTC (permalink / raw)
To: Jarek Poplawski
Cc: Bernard Pidoux, Ralf Baechle DL5RB, Linux Netdev List, linux-hams
In-Reply-To: <20090921201157.GA5460@del.dom.local>
Hi Jarek,
After applying your second patch I had to wait until today before I caught
these kernel messages.
The last three ones were single lines.
There was no kernel panic.
Sep 23 10:13:53 f6bvp-11 klogd: ------------[ cut here ]------------
Sep 23 10:13:53 f6bvp-11 klogd: WARNING: at net/ax25/af_ax25.c:241 ax25_find_cb+0x170/0x1a0 [ax25]()
Sep 23 10:13:53 f6bvp-11 klogd: Hardware name: MS-7258
Sep 23 10:13:53 f6bvp-11 klogd: Modules linked in: netconsole netrom mkiss rose ax25 nfsd exportfs nfs l
ockd nfs_acl auth_rpcgss sunrpc af_packet ipv6 snd_via82xx snd_ac97_codec ac97_bus snd_mpu401_uart snd_r
awmidi snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_pcm snd_timer
snd_page_alloc 8139cp snd_mixer_oss 8139too snd i2c_viapro i2c_core soundcore shpchp sr_mod mii pci_hot
plug binfmt_misc ext3 jbd cpufreq_ondemand cpufreq_conservative cpufreq_powersave acpi_cpufreq freq_tabl
e processor floppy sg rtc_cmos evdev button thermal via_agp pata_via ata_generic ide_pci_generic pata_ac
pi sata_via libata sd_mod scsi_mod crc_t10dif
Sep 23 10:13:53 f6bvp-11 klogd: Pid: 5, comm: events/0 Not tainted 2.6.31-nosmp #3
Sep 23 10:13:53 f6bvp-11 klogd: Call Trace:
Sep 23 10:13:53 f6bvp-11 klogd: <IRQ> [<ffffffffa03b40d0>] ? ax25_find_cb+0x170/0x1a0 [ax25]
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff81053b90>] warn_slowpath_common+0x80/0xe0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff81053c12>] warn_slowpath_null+0x22/0x40
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffffa03b40d0>] ax25_find_cb+0x170/0x1a0 [ax25]
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffffa03aced5>] ? ax25_listen_mine+0x95/0xb0 [ax25]
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffffa03ad299>] ax25_kiss_rcv+0x199/0xac0 [ax25]
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8132b4bd>] ? sock_def_readable+0x3d/0x80
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8132c96d>] ? sock_queue_rcv_skb+0x12d/0x160
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8133a831>] netif_receive_skb+0x351/0x5f0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff81061699>] ? run_timer_softirq+0x179/0x250
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8133ab50>] process_backlog+0x80/0xe0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8133b4a4>] net_rx_action+0xf4/0x220
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8105b242>] __do_softirq+0xe2/0x1d0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8101414a>] call_softirq+0x1a/0x30
Sep 23 10:13:53 f6bvp-11 klogd: <EOI> [<ffffffff810162c5>] do_softirq+0x75/0xc0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8105b094>] local_bh_enable+0xc4/0xd0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffffa03cf798>] mkiss_receive_buf+0x3a8/0x460 [mkiss]
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8104b864>] ? finish_task_switch+0x44/0xe0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff812af800>] flush_to_ldisc+0x110/0x1f0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8106c71c>] ? schedule_delayed_work+0x2c/0x50
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff812af6f0>] ? flush_to_ldisc+0x0/0x1f0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8106c263>] worker_thread+0x173/0x260
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff81071be0>] ? autoremove_wake_function+0x0/0x60
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8106c0f0>] ? worker_thread+0x0/0x260
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8107149e>] kthread+0xae/0xc0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff8101404a>] child_rip+0xa/0x20
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff810713f0>] ? kthread+0x0/0xc0
Sep 23 10:13:53 f6bvp-11 klogd: [<ffffffff81014040>] ? child_rip+0x0/0x20
Sep 23 10:13:53 f6bvp-11 klogd: ---[ end trace 422da9fe354a7ce3 ]---
Sep 23 10:13:53 f6bvp-11 klogd: AX25_DBG: 1 ffff880003c80000 0 ax25_find_cb
Sep 23 10:13:53 f6bvp-11 last message repeated 2 times
-----
Sep 23 14:26:56 f6bvp-11 klogd: AX25_DBG: 1 ffff88005247b000 0 ax25_find_cb
Sep 23 14:26:56 f6bvp-11 klogd: AX25_DBG: 1 ffff88005247b000 0 ax25_find_cb
Sep 23 14:26:56 f6bvp-11 klogd: AX25_DBG: 1 ffff88005247b000 0 ax25_find_cb
-----
Regards,
Bernard
Jarek Poplawski a e'crit :
> <20090910142436.GB10547@linux-mips.org> <4AA9288B.2070205@upmc.fr>
> <20090911120557.GA12175@linux-mips.org> <4AB5EAE5.6070605@upmc.fr>
> <20090920210242.GA9804@del.dom.local> <4AB73CDE.4030709@upmc.fr>
> In-Reply-To: <4AB73CDE.4030709@upmc.fr>
>
> Bernard Pidoux wrote, On 09/21/2009 10:44 AM:
>
>> Hi Jarek,
>>
>> Good fishing !
>>
>> During the night I catched the following two identical AX25_DBG
>> messages with netconsole
>> sending already reported message: kernel BUG at kernel/timer.c:913!
>> and followed by kernel
>> panics and the machine rebooting.
>>
>>
>> Sep 21 03:24:06 f6bvp-11 klogd: ------------[ cut here ]------------
>> Sep 21 03:24:06 f6bvp-11 klogd: WARNING: at include/net/ax25.h:260
>> ax25_kiss_rcv+0x650/0xab0 [ax25]()
>
> Thanks for testing. Alas I don't get how it's possible at this place
> (unless I miss the place), especially with a nosmp kernel. So here is
> take 2 (to apply after reverting the previous one).
>
> Regards,
> Jarek P.
> --- (debugging patch, take 2)
>
> include/net/ax25.h | 36 ++++++++++++++++++++++++++++++++++++
> net/ax25/af_ax25.c | 12 ++++++++++++
> 2 files changed, 48 insertions(+), 0 deletions(-)
>
> diff --git a/include/net/ax25.h b/include/net/ax25.h
> index 717e219..7fefbb0 100644
> --- a/include/net/ax25.h
> +++ b/include/net/ax25.h
> @@ -252,9 +252,45 @@ typedef struct ax25_cb {
> #define ax25_cb_hold(__ax25) \
> atomic_inc(&((__ax25)->refcount))
>
> +static __inline__ int ax25_timers_warn(ax25_cb *ax25)
> +{
> + int err = 0;
> +
> + if (del_timer(&ax25->timer)) {
> + WARN_ON_ONCE(1);
> + err = 1;
> + }
> + if (del_timer(&ax25->t1timer)) {
> + WARN_ON_ONCE(1);
> + err += 2;
> + }
> + if (del_timer(&ax25->t2timer)) {
> + WARN_ON_ONCE(1);
> + err += 4;
> + }
> + if (del_timer(&ax25->t3timer)) {
> + WARN_ON_ONCE(1);
> + err += 8;
> + }
> + if (del_timer(&ax25->idletimer)) {
> + WARN_ON_ONCE(1);
> + err += 16;
> + }
> + if (del_timer(&ax25->dtimer)) {
> + WARN_ON_ONCE(1);
> + err += 32;
> + }
> + if (err)
> + printk(KERN_WARNING "AX25_DBG: %d %p %u %s %d\n", err, ax25,
> + ax25->state, __func__, __LINE__);
> +
> + return err;
> +}
> +
> static __inline__ void ax25_cb_put(ax25_cb *ax25)
> {
> if (atomic_dec_and_test(&ax25->refcount)) {
> + ax25_timers_warn(ax25);
> kfree(ax25->digipeat);
> kfree(ax25);
> }
> diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
> index da0f64f..f1f515c 100644
> --- a/net/ax25/af_ax25.c
> +++ b/net/ax25/af_ax25.c
> @@ -58,6 +58,9 @@ static const struct proto_ops ax25_proto_ops;
>
> static void ax25_free_sock(struct sock *sk)
> {
> + if (ax25_timers_warn(ax25_sk(sk)))
> + printk(KERN_WARNING "AX25_DBG: %p %u %u %u\n", sk,
> + sk->sk_family, sk->sk_type, sk->sk_protocol);
> ax25_cb_put(ax25_sk(sk));
> }
>
> @@ -222,6 +225,8 @@ ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
> if (s->ax25_dev == NULL)
> continue;
> if (ax25cmp(&s->source_addr, src_addr) == 0 && ax25cmp(&s->dest_addr, dest_addr) == 0 && s->ax25_dev->dev == dev) {
> + int ref;
> +
> if (digi != NULL && digi->ndigi != 0) {
> if (s->digipeat == NULL)
> continue;
> @@ -231,6 +236,13 @@ ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
> if (s->digipeat != NULL && s->digipeat->ndigi != 0)
> continue;
> }
> + ref = atomic_read(&s->refcount);
> + if (ref < 2) {
> + WARN_ON_ONCE(1);
> + printk(KERN_WARNING "AX25_DBG: %d %p %d %s\n",
> + ref, s, s->state, __func__);
> + }
> +
> ax25_cb_hold(s);
> spin_unlock_bh(&ax25_list_lock);
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-hams" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Gregory Haskins @ 2009-09-23 21:15 UTC (permalink / raw)
To: Avi Kivity
Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm,
linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze,
alacrityvm-devel
In-Reply-To: <4ABA78DC.7070604@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 13736 bytes --]
Avi Kivity wrote:
> On 09/23/2009 08:58 PM, Gregory Haskins wrote:
>>>
>>>> It also pulls parts of the device model into the host kernel.
>>>>
>>> That is the point. Most of it needs to be there for performance.
>>>
>> To clarify this point:
>>
>> There are various aspects about designing high-performance virtual
>> devices such as providing the shortest paths possible between the
>> physical resources and the consumers. Conversely, we also need to
>> ensure that we meet proper isolation/protection guarantees at the same
>> time. What this means is there are various aspects to any
>> high-performance PV design that require to be placed in-kernel to
>> maximize the performance yet properly isolate the guest.
>>
>> For instance, you are required to have your signal-path (interrupts and
>> hypercalls), your memory-path (gpa translation), and
>> addressing/isolation model in-kernel to maximize performance.
>>
>
> Exactly. That's what vhost puts into the kernel and nothing more.
Actually, no. Generally, _KVM_ puts those things into the kernel, and
vhost consumes them. Without KVM (or something equivalent), vhost is
incomplete. One of my goals with vbus is to generalize the "something
equivalent" part here.
I know you may not care about non-kvm use cases, and that's fine. No one
says you have to. However, note that some of us do care about these
non-kvm cases, and thus it's a distinction I am making here as a benefit
of the vbus framework.
>
>> Vbus accomplishes its in-kernel isolation model by providing a
>> "container" concept, where objects are placed into this container by
>> userspace. The host kernel enforces isolation/protection by using a
>> namespace to identify objects that is only relevant within a specific
>> container's context (namely, a "u32 dev-id"). The guest addresses the
>> objects by its dev-id, and the kernel ensures that the guest can't
>> access objects outside of its dev-id namespace.
>>
>
> vhost manages to accomplish this without any kernel support.
No, vhost manages to accomplish this because of KVM's kernel support
(ioeventfd, etc). Without KVM-like in-kernel support, vhost is
merely a kind of "tuntap"-like clone signalled by eventfds.
vbus on the other hand, generalizes one more piece of the puzzle
(namely, the function of pio+ioeventfd and userspace's programming of
it) by presenting the devid namespace and container concept.
This goes directly to my rebuttal of your claim that vbus places too
much in the kernel. I state that, one way or the other, address decode
and isolation _must_ be in the kernel for performance. Vbus does this
with a devid/container scheme. vhost+virtio-pci+kvm does it with
pci+pio+ioeventfd.
> The guest
> simply has not access to any vhost resources other than the guest->host
> doorbell, which is handed to the guest outside vhost (so it's somebody
> else's problem, in userspace).
You mean _controlled_ by userspace, right? Obviously, the other side of
the kernel still needs to be programmed (ioeventfd, etc). Otherwise,
vhost would be pointless: e.g. just use vanilla tuntap if you don't need
fast in-kernel decoding.
>
>> All that is required is a way to transport a message with a "devid"
>> attribute as an address (such as DEVCALL(devid)) and the framework
>> provides the rest of the decode+execute function.
>>
>
> vhost avoids that.
No, it doesn't avoid it. It just doesn't specify how it's done, and
relies on something else to do it on its behalf.
Conversely, vbus specifies how it's done, but not how to transport the
verb "across the wire". That is the role of the vbus-connector abstraction.
>
>> Contrast this to vhost+virtio-pci (called simply "vhost" from here).
>>
>
> It's the wrong name. vhost implements only the data path.
Understood, but vhost+virtio-pci is what I am contrasting, and I use
"vhost" for short from that point on because I am too lazy to type the
whole name over and over ;)
>
>> It is not immune to requiring in-kernel addressing support either, but
>> rather it just does it differently (and its not as you might expect via
>> qemu).
>>
>> Vhost relies on QEMU to render PCI objects to the guest, which the guest
>> assigns resources (such as BARs, interrupts, etc).
>
> vhost does not rely on qemu. It relies on its user to handle
> configuration. In one important case it's qemu+pci. It could just as
> well be the lguest launcher.
I meant vhost=vhost+virtio-pci here. Sorry for the confusion.
The point I am making specifically is that vhost in general relies on
other in-kernel components to function. I.e. it cannot function without
having something like the PCI model to build an IO namespace. That
namespace (in this case, pio addresses+data tuples) is used for the
in-kernel addressing function under KVM + virtio-pci.
The case of the lguest launcher is a good one to highlight. Yes, you
can presumably also use lguest with vhost, if the requisite facilities
are exposed to lguest-bus, and some eventfd based thing like ioeventfd
is written for the host (if it doesnt exist already).
And when the next virt design "foo" comes out, it can make a "foo-bus"
model, and implement foo-eventfd on the backend, etc, etc.
Ira can make ira-bus, and ira-eventfd, etc, etc.
Each iteration will invariably introduce duplicated parts of the stack.
Vbus tries to generalize some of those pieces so we can reuse them.
I chose the very non-specific name "virtual-bus" for the design
intentionally to decouple it from any one particular "hypervisor" (e.g.
xenbus, lguest-bus, etc) and promote it as a general purpose bus for
hopefully any hypervisor (or physical systems too, e.g. Iras). I assume
"virtio" was chosen to reflect a similar positioning at the device-model
layer.
Had vbus come out before lguest, I would have proposed that lguest
should use it natively instead of creating lguest-bus. While its
probably too late in that specific case, perhaps going forward this is
the direction we can take, just like perhaps virtio is the device model
direction we can take.
Likewise, the backend is generalized so one model can be written that
works in all environments that support vbus. The connector takes care
of the "wire" details, and the other stuff functions to serve the bus
portion of the stack (signal-routing, memory-routing,
isolation/addressing, etc).
>
>> A PCI-BAR in this
>> example may represent a PIO address for triggering some operation in the
>> device-model's fast-path. For it to have meaning in the fast-path, KVM
>> has to have in-kernel knowledge of what a PIO-exit is, and what to do
>> with it (this is where pio-bus and ioeventfd come in). The programming
>> of the PIO-exit and the ioeventfd are likewise controlled by some
>> userspace management entity (i.e. qemu). The PIO address and value
>> tuple form the address, and the ioeventfd framework within KVM provide
>> the decode+execute function.
>>
>
> Right.
>
>> This idea seemingly works fine, mind you, but it rides on top of a *lot*
>> of stuff including but not limited to: the guests pci stack, the qemu
>> pci emulation, kvm pio support, and ioeventfd. When you get into
>> situations where you don't have PCI or even KVM underneath you (e.g. a
>> userspace container, Ira's rig, etc) trying to recreate all of that PCI
>> infrastructure for the sake of using PCI is, IMO, a lot of overhead for
>> little gain.
>>
>
> For the N+1th time, no. vhost is perfectly usable without pci. Can we
> stop raising and debunking this point?
Again, I understand vhost is decoupled from PCI, and I don't mean to
imply anything different. I use PCI as an example here because a) it's
the only working example of vhost today (to my knowledge), and b) you
have stated in the past that PCI is the only "right" way here, to
paraphrase. Perhaps you no longer feel that way, so I apologize if you
feel you already recanted your position on PCI and I missed it.
I digress. My point here isn't PCI. The point here is the missing
component for when PCI is not present. The component that is partially
satisfied by vbus's devid addressing scheme. If you are going to use
vhost, and you don't have PCI, you've gotta build something to replace it.
>
>> All you really need is a simple decode+execute mechanism, and a way to
>> program it from userspace control. vbus tries to do just that:
>> commoditize it so all you need is the transport of the control messages
>> (like DEVCALL()), but the decode+execute itself is reuseable, even
>> across various environments (like KVM or Iras rig).
>>
>
> If you think it should be "commodotized", write libvhostconfig.so.
I know you are probably being facetious here, but what do you propose
for the parts that must be in-kernel?
>
>> And your argument, I believe, is that vbus allows both to be implemented
>> in the kernel (though to reiterate, its optional) and is therefore a bad
>> design, so lets discuss that.
>>
>> I believe the assertion is that things like config-space are best left
>> to userspace, and we should only relegate fast-path duties to the
>> kernel. The problem is that, in my experience, a good deal of
>> config-space actually influences the fast-path and thus needs to
>> interact with the fast-path mechanism eventually anyway.
>> Whats left
>> over that doesn't fall into this category may cheaply ride on existing
>> plumbing, so its not like we created something new or unnatural just to
>> support this subclass of config-space.
>>
>
> Flexibility is reduced, because changing code in the kernel is more
> expensive than in userspace, and kernel/user interfaces aren't typically
> as wide as pure userspace interfaces. Security is reduced, since a bug
> in the kernel affects the host, while a bug in userspace affects just on
> guest.
For a mac-address attribute? Thats all we are really talking about
here. These points you raise, while true of any kernel code I suppose,
are a bit of a stretch in this context.
>
> Example: feature negotiation. If it happens in userspace, it's easy to
> limit what features we expose to the guest.
Its not any harder in the kernel. I do this today.
And when you are done negotiating said features, you will generally have
to turn around and program the feature into the backend anyway (e.g.
ioctl() to vhost module). Now you have to maintain some knowledge of
that particular feature and how to program it in two places.
Conversely, I am eliminating the (unnecessary) middleman by letting the
feature negotiating take place directly between the two entities that
will consume it.
> If it happens in the
> kernel, we need to add an interface to let the kernel know which
> features it should expose to the guest.
You need this already either way for both models anyway. As an added
bonus, vbus has generalized that interface using sysfs attributes, so
all models are handled in a similar and community accepted way.
> We also need to add an
> interface to let userspace know which features were negotiated, if we
> want to implement live migration. Something fairly trivial bloats rapidly.
Can you elaborate on the requirements for live-migration? Wouldn't an
opaque save/restore model work here? (E.g. why does userspace need to be
able to interpret the in-kernel state? Just pass it along as a blob to
the new instance.)
>
>> For example: take an attribute like the mac-address assigned to a NIC.
>> This clearly doesn't need to be in-kernel and could go either way (such
>> as a PCI config-space register).
>>
>> As another example: consider an option bit that enables a new feature
>> that affects the fast-path, like RXBUF merging. If we use the split
>> model where config space is handled by userspace and fast-path is
>> in-kernel, the userspace component is only going to act as a proxy.
>> I.e. it will pass the option down to the kernel eventually. Therefore,
>> there is little gain in trying to split this type of slow-path out to
>> userspace. In fact, its more work.
>>
>
> As you can see above, userspace needs to be involved in this, and the
> number of interfaces required is smaller if it's in userspace:
Actually, no. My experience has been the opposite. Anytime I sat down
and tried to satisfy your request to move things to the userspace,
things got ugly and duplicative really quick. I suspect part of the
reason you may think its easier because you already have part of
virtio-net in userspace and its surrounding support, but that is not the
case moving forward for new device types.
> you only
> need to know which features the kernel supports (they can be enabled
> unconditionally, just not exposed).
>
> Further, some devices are perfectly happy to be implemented in
> userspace, so we need userspace configuration support anyway. Why
> reimplement it in the kernel?
That's fine. vbus is targeted for high-performance IO. So if you have
a robust userspace (like KVM+QEMU) and low-performance constraints (say,
for a console or something), put it in userspace and vbus is not
involved. I don't care.
However, if you are coming from somewhere else (like Ira's rig) where
you don't necessarily have a robust userspace module, vbus provides a
model that allows you to choose whether you want to do a vhost-like
model, or a full resource container with the isolation guarantees, etc,
built in.
Kind Regards,
-Greg
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 267 bytes --]
^ permalink raw reply
* PATCH 1/1: rt2x00dev.c / rt2x00lib.h fixes build breakage
From: Ken Lewis @ 2009-09-23 20:59 UTC (permalink / raw)
To: LKML, linux-next, netdev
[-- Attachment #1: Type: text/plain, Size: 450 bytes --]
The headers in drivers/net/wireless/rt2x00/rt2x00lib.h don't match the
use of the function in rt2x00dev.c. The build fails as a result.
This has been a problem in linux-next since early September. I've
e-mailed a patch to linux-next and to linux-net, but the 2.6.32 merge
window has brought the problem to the mainline and so I'm re-sending
my patch. I've opened a bug on bugzilla:
http://bugzilla.kernel.org/show_bug.cgi?id=14217
Take care.
Ken
[-- Attachment #2: diff-fix-for-rt2x00lib.h.patch --]
[-- Type: text/x-diff, Size: 544 bytes --]
diff --git a/drivers/net/wireless/rt2x00/rt2x00lib.h b/drivers/net/wireless/rt2x00/rt2x00lib.h
index 5462cb5..567f029 100644
--- a/drivers/net/wireless/rt2x00/rt2x00lib.h
+++ b/drivers/net/wireless/rt2x00/rt2x00lib.h
@@ -380,7 +380,7 @@ static inline void rt2x00crypto_tx_insert_iv(struct sk_buff *skb,
{
}
-static inline void rt2x00crypto_rx_insert_iv(struct sk_buff *skb, bool l2pad,
+static inline void rt2x00crypto_rx_insert_iv(struct sk_buff *skb,
unsigned int header_length,
struct rxdone_entry_desc *rxdesc)
{
^ permalink raw reply related
* PATCH 0/1: rt2x00dev.c / rt2x00lib.h fixes build breakage
From: Ken Lewis @ 2009-09-23 20:58 UTC (permalink / raw)
To: linux-next, LKML, netdev
The headers in drivers/net/wireless/rt2x00/rt2x00lib.h don't match the
use of the function in rt2x00dev.c. The build fails as a result.
This has been a problem in linux-next since early September. I've
e-mailed a patch to linux-next and to linux-net, but the 2.6.32 merge
window has brought the problem to the mainline and so I'm re-sending
my patch. I've opened a bug on bugzilla:
http://bugzilla.kernel.org/show_bug.cgi?id=14217
Take care.
Ken
^ permalink raw reply
* Re: [PATCH] af_packet: add interframe drop cmsg
From: Eric Dumazet @ 2009-09-23 20:55 UTC (permalink / raw)
To: Neil Horman; +Cc: netdev, davem
In-Reply-To: <20090923203202.GA13805@hmsreliant.think-freely.org>
Neil Horman a écrit :
> Add Ancilliary data to better represent loss information
>
> I've had a few requests recently to provide more detail regarding frame loss
> during an AF_PACKET packet capture session. Specifically the requestors want to
> see where in a packet sequence frames were lost, i.e. they want to see that 40
> frames were lost between frames 302 and 303 in a packet capture file. In order
> to do this we need:
>
> 1) The kernel to export this data to user space
> 2) The applications to make use of it
>
> This patch addresses item (1). It does this by doing the following:
>
> A) attaching ancilliary data to any skb enqueued to a socket recieve queue for
> which frames were lost between it and the previously enqueued frame. Note I use
> a ring buffer with a correlator value (the skb pointer) to do this. This was
> done because the skb->cb array is exhausted already for AF_PACKET
Hmm, how can mmap() users find this information? I thought recent libpcap
versions were using mmap() in order to reduce losses :)
>
> B) For any frame dequeued that has ancilliary data in the ring buffer (as
> determined by the correlator value), we add a cmsg structure to the msghdr that
> gets copied to user space, this cmsg structure is of cmsg_level AF_PACKET, and
> cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
> frames lost between the reception of the frame being currently recevied and the
> frame most recently preceding it. Note this creates a situation in which if we
> have packet loss followed immediately by a socket close operation we might miss
> some gap information. This situation is covered by the use of the
> PACKET_AUXINFO socket option, which provides total loss stats (from which the
> final gap can be computed).
>
> I've tested this patch myself, and it works well.
Okay :)
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>
>
> include/linux/if_packet.h | 2 +
> net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 91 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index dea7d6b..e5d200f 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -48,11 +48,13 @@ struct sockaddr_ll
> #define PACKET_RESERVE 12
> #define PACKET_TX_RING 13
> #define PACKET_LOSS 14
> +#define PACKET_GAPDATA 15
>
> struct tpacket_stats
> {
> unsigned int tp_packets;
> unsigned int tp_drops;
> + unsigned int tp_gap;
> };
>
> struct tpacket_auxdata
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d3d52c6..b74a91c 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
>
> static void packet_flush_mclist(struct sock *sk);
>
> +struct packet_gap_record {
> + struct sk_buff *skb;
> + __u32 gap;
> +};
> +
> struct packet_sock {
> /* struct sock has to be the first member of packet_sock */
> struct sock sk;
> @@ -197,6 +202,11 @@ struct packet_sock {
> int ifindex; /* bound device */
> __be16 num;
> struct packet_mclist *mclist;
> + struct packet_gap_record *gaps;
> + unsigned int gap_head;
> + unsigned int gap_tail;
> + unsigned int gap_list_size;
> +
> #ifdef CONFIG_PACKET_MMAP
> atomic_t mapped;
> enum tpacket_versions tp_version;
> @@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
> }
>
> /*
> + * If we've lost frames since the last time we queued one to the
> + * sk_receive_queue, we need to record it here.
> + * This must be called under the protection of the socket lock
> + * to prevent racing with other softirqs and user space
> + */
> +static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> +{
> + /*
> + * do nothing if there is no gap
> + */
> + if (!po->stats.tp_gap)
> + return;
> +
> + /*
> + * If there is, check the gap list tail to make sure we
> + * have an open entry
> + */
> + if (po->gaps[po->gap_tail].skb != NULL) {
> + if (net_ratelimit())
> + printk(KERN_WARNING "packet socket gap list is full!\n");
New code can use pr_warning() macro
> + return;
> + }
> +
> + /*
> + * We have a free entry, record it
> + */
> + po->gaps[po->gap_tail].skb = skb;
> + po->gaps[po->gap_tail].gap = po->stats.tp_gap;
> + po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
you could avoid this divide
if (++po->gap_tail == po->gap_list_size)
po->gap_tail = 0;
> + po->stats.tp_gap = 0;
> + return;
> +
> +}
> +
> +static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
> +{
> + __u32 gap = 0;
> +
> + if (po->gaps[po->gap_head].skb != skb)
> + return 0;
> +
> + gap = po->gaps[po->gap_head].gap;
> + po->gaps[po->gap_head].skb = NULL;
> + po->gap_head = (po->gap_head + 1) % po->gap_list_size;
ditto
> + return gap;
> +}
> +
> +
> +/*
> This function makes lazy skb cloning in hope that most of packets
> are discarded by BPF.
>
> @@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
>
> spin_lock(&sk->sk_receive_queue.lock);
> po->stats.tp_packets++;
> + record_packet_gap(skb, po);
> __skb_queue_tail(&sk->sk_receive_queue, skb);
> spin_unlock(&sk->sk_receive_queue.lock);
> sk->sk_data_ready(sk, skb->len);
> @@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
> drop_n_acct:
> spin_lock(&sk->sk_receive_queue.lock);
> po->stats.tp_drops++;
> + po->stats.tp_gap++;
> spin_unlock(&sk->sk_receive_queue.lock);
>
> drop_n_restore:
> @@ -811,6 +872,7 @@ drop:
>
> ring_is_full:
> po->stats.tp_drops++;
> + po->stats.tp_gap++;
> spin_unlock(&sk->sk_receive_queue.lock);
>
> sk->sk_data_ready(sk, 0);
> @@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
> skb_queue_purge(&sk->sk_receive_queue);
> sk_refcnt_debug_release(sk);
>
> + kfree(po->gaps);
> +
> sock_put(sk);
> return 0;
> }
> @@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> struct packet_sock *po;
> __be16 proto = (__force __be16)protocol; /* weird, but documented */
> int err;
> + unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
>
> if (!capable(CAP_NET_RAW))
> return -EPERM;
> @@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sock->state = SS_UNCONNECTED;
>
> err = -ENOBUFS;
> +
> sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
> if (sk == NULL)
> goto out;
> @@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sk->sk_family = PF_PACKET;
> po->num = proto;
>
> + err = -ENOMEM;
> + po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
> + GFP_KERNEL);
kzalloc(), and no need for some following lines
> + if (!po->gaps)
> + goto out_free;
> + po->gap_tail = po->gap_head = 0;
> + po->gap_list_size = num_records;
> +
> + for (num_records = 0; num_records < po->gap_list_size; num_records++) {
> + po->gaps[num_records].skb = NULL;
> + po->gaps[num_records].gap = 0;
> + }
> +
> sk->sk_destruct = packet_sock_destruct;
> sk_refcnt_debug_inc(sk);
>
> @@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
> sock_prot_inuse_add(net, &packet_proto, 1);
> write_unlock_bh(&net->packet.sklist_lock);
> return 0;
> +
> +out_free:
> + sk_free(sk);
> out:
> return err;
> }
> @@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> struct sk_buff *skb;
> int copied, err;
> struct sockaddr_ll *sll;
> + __u32 gap;
>
> err = -EINVAL;
> if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> @@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> aux.tp_mac = 0;
> aux.tp_net = skb_network_offset(skb);
> aux.tp_vlan_tci = skb->vlan_tci;
> -
Please dont mix cleanups
> put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
> }
>
> + lock_sock(sk);
Strange locking here. This doesn't match the locking used at record time
( spin_lock(&sk->sk_receive_queue.lock); ).
> + gap = check_packet_gap(skb, pkt_sk(sk));
> + release_sock(sk);
> + if (gap)
> + put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
> +
> /*
> * Free or return the buffer as appropriate. Again this
> * hides all the races and re-entrancy issues from us.
Thanks
^ permalink raw reply
* Re: r8169 chips on some Intel D945GSEJT boards fail to work after PXE boot
From: Francois Romieu @ 2009-09-23 20:57 UTC (permalink / raw)
To: Simon Farnsworth; +Cc: netdev
In-Reply-To: <4ABA535E.2010801@onelan.com>
Simon Farnsworth <simon.farnsworth@onelan.com> :
[...]
> Some boards are good, and just work, whether I boot via PXE or boot from
> the local disk; dmesg.working and lspci.working are from a good board.
>
> Some boards are bad; they work fine if I boot from local disk (including
> network), but the kernel cannot detect link, or send or receive data if
> I PXE boot. dmesg.broken and lspci.broken are from a bad board.
No cunning theory in sight but does reducing the amount of memory on a
bad board from 1 GB to 512 MB turn it into a good one ?
The failing board exhibits a correctable error status bit. Clearing it
is the least we can do.
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 50c6a3c..79bc4ab 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -2200,6 +2200,11 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
tp->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
if (!tp->pcie_cap && netif_msg_probe(tp))
dev_info(&pdev->dev, "no PCI Express capability\n");
+ else {
+ pci_write_config_word(pdev, tp->pcie_cap + PCI_EXP_DEVSTA,
+ PCI_EXP_DEVSTA_CED | PCI_EXP_DEVSTA_NFED |
+ PCI_EXP_DEVSTA_FED | PCI_EXP_DEVSTA_URD);
+ }
RTL_W16(IntrMask, 0x0000);
--
Ueimor
^ permalink raw reply related
* [PATCH] af_packet: add interframe drop cmsg
From: Neil Horman @ 2009-09-23 20:32 UTC (permalink / raw)
To: netdev; +Cc: davem, nhorman
Add Ancillary data to better represent loss information
I've had a few requests recently to provide more detail regarding frame loss
during an AF_PACKET packet capture session. Specifically the requestors want to
see where in a packet sequence frames were lost, i.e. they want to see that 40
frames were lost between frames 302 and 303 in a packet capture file. In order
to do this we need:
1) The kernel to export this data to user space
2) The applications to make use of it
This patch addresses item (1). It does this by doing the following:
A) attaching ancillary data to any skb enqueued to a socket receive queue for
which frames were lost between it and the previously enqueued frame. Note I use
a ring buffer with a correlator value (the skb pointer) to do this. This was
done because the skb->cb array is exhausted already for AF_PACKET.
B) For any frame dequeued that has ancillary data in the ring buffer (as
determined by the correlator value), we add a cmsg structure to the msghdr that
gets copied to user space; this cmsg structure is of cmsg_level AF_PACKET, and
cmsg_type PACKET_GAPDATA. It contains a u32 value which counts the number of
frames lost between the reception of the frame being currently received and the
frame most recently preceding it. Note this creates a situation in which if we
have packet loss followed immediately by a socket close operation we might miss
some gap information. This situation is covered by the use of the
PACKET_AUXINFO socket option, which provides total loss stats (from which the
final gap can be computed).
I've tested this patch myself, and it works well.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
include/linux/if_packet.h | 2 +
net/packet/af_packet.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 91 insertions(+), 1 deletion(-)
diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index dea7d6b..e5d200f 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -48,11 +48,13 @@ struct sockaddr_ll
#define PACKET_RESERVE 12
#define PACKET_TX_RING 13
#define PACKET_LOSS 14
+#define PACKET_GAPDATA 15
struct tpacket_stats
{
unsigned int tp_packets;
unsigned int tp_drops;
+ unsigned int tp_gap;
};
struct tpacket_auxdata
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d3d52c6..b74a91c 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -179,6 +179,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
+struct packet_gap_record {
+ struct sk_buff *skb;
+ __u32 gap;
+};
+
struct packet_sock {
/* struct sock has to be the first member of packet_sock */
struct sock sk;
@@ -197,6 +202,11 @@ struct packet_sock {
int ifindex; /* bound device */
__be16 num;
struct packet_mclist *mclist;
+ struct packet_gap_record *gaps;
+ unsigned int gap_head;
+ unsigned int gap_tail;
+ unsigned int gap_list_size;
+
#ifdef CONFIG_PACKET_MMAP
atomic_t mapped;
enum tpacket_versions tp_version;
@@ -524,6 +534,55 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
}
/*
+ * If we've lost frames since the last time we queued one to the
+ * sk_receive_queue, we need to record it here.
+ * This must be called under the protection of the socket lock
+ * to prevent racing with other softirqs and user space
+ */
+static void record_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ /*
+ * do nothing if there is no gap
+ */
+ if (!po->stats.tp_gap)
+ return;
+
+ /*
+ * If there is, check the gap list tail to make sure we
+ * have an open entry
+ */
+ if (po->gaps[po->gap_tail].skb != NULL) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "packet socket gap list is full!\n");
+ return;
+ }
+
+ /*
+ * We have a free entry, record it
+ */
+ po->gaps[po->gap_tail].skb = skb;
+ po->gaps[po->gap_tail].gap = po->stats.tp_gap;
+ po->gap_tail = (po->gap_tail+1) % po->gap_list_size;
+ po->stats.tp_gap = 0;
+ return;
+
+}
+
+static __u32 check_packet_gap(struct sk_buff *skb, struct packet_sock *po)
+{
+ __u32 gap = 0;
+
+ if (po->gaps[po->gap_head].skb != skb)
+ return 0;
+
+ gap = po->gaps[po->gap_head].gap;
+ po->gaps[po->gap_head].skb = NULL;
+ po->gap_head = (po->gap_head + 1) % po->gap_list_size;
+ return gap;
+}
+
+
+/*
This function makes lazy skb cloning in hope that most of packets
are discarded by BPF.
@@ -626,6 +685,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_packets++;
+ record_packet_gap(skb, po);
__skb_queue_tail(&sk->sk_receive_queue, skb);
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, skb->len);
@@ -634,6 +694,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
spin_lock(&sk->sk_receive_queue.lock);
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
@@ -811,6 +872,7 @@ drop:
ring_is_full:
po->stats.tp_drops++;
+ po->stats.tp_gap++;
spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk, 0);
@@ -1223,6 +1285,8 @@ static int packet_release(struct socket *sock)
skb_queue_purge(&sk->sk_receive_queue);
sk_refcnt_debug_release(sk);
+ kfree(po->gaps);
+
sock_put(sk);
return 0;
}
@@ -1350,6 +1414,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
struct packet_sock *po;
__be16 proto = (__force __be16)protocol; /* weird, but documented */
int err;
+ unsigned int num_records = PAGE_SIZE/sizeof(struct packet_gap_record);
if (!capable(CAP_NET_RAW))
return -EPERM;
@@ -1360,6 +1425,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock->state = SS_UNCONNECTED;
err = -ENOBUFS;
+
sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
if (sk == NULL)
goto out;
@@ -1374,6 +1440,19 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sk->sk_family = PF_PACKET;
po->num = proto;
+ err = -ENOMEM;
+ po->gaps = kmalloc(sizeof(struct packet_gap_record)*num_records,
+ GFP_KERNEL);
+ if (!po->gaps)
+ goto out_free;
+ po->gap_tail = po->gap_head = 0;
+ po->gap_list_size = num_records;
+
+ for (num_records = 0; num_records < po->gap_list_size; num_records++) {
+ po->gaps[num_records].skb = NULL;
+ po->gaps[num_records].gap = 0;
+ }
+
sk->sk_destruct = packet_sock_destruct;
sk_refcnt_debug_inc(sk);
@@ -1402,6 +1481,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
sock_prot_inuse_add(net, &packet_proto, 1);
write_unlock_bh(&net->packet.sklist_lock);
return 0;
+
+out_free:
+ sk_free(sk);
out:
return err;
}
@@ -1418,6 +1500,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
struct sk_buff *skb;
int copied, err;
struct sockaddr_ll *sll;
+ __u32 gap;
err = -EINVAL;
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1492,10 +1575,15 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
aux.tp_mac = 0;
aux.tp_net = skb_network_offset(skb);
aux.tp_vlan_tci = skb->vlan_tci;
-
put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
}
+ lock_sock(sk);
+ gap = check_packet_gap(skb, pkt_sk(sk));
+ release_sock(sk);
+ if (gap)
+ put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(u32), &gap);
+
/*
* Free or return the buffer as appropriate. Again this
* hides all the races and re-entrancy issues from us.
^ permalink raw reply related
* Re: [PATCH] tunnel: eliminate recursion field
From: Eric Dumazet @ 2009-09-23 20:28 UTC (permalink / raw)
To: David S. Miller; +Cc: Linux Netdev List
In-Reply-To: <4ABA8254.9000701@gmail.com>
Eric Dumazet a écrit :
> It seems recursion field from "struct ip_tunnel" is not anymore needed.
> recursion prevention is done at the upper level (in dev_queue_xmit()),
> since we use HARD_TX_LOCK protection for tunnels.
>
> This avoids a cache line ping pong on "struct ip_tunnel" : This structure
> should be now mostly read on xmit and receive paths.
Oops I forgot ipv6 tunnels, silly me, here is an updated version.
Thanks
[PATCH] tunnel: eliminate recursion field
It seems the recursion field of "struct ip_tunnel" is no longer needed.
Recursion prevention is done at the upper level (in dev_queue_xmit()),
since we use HARD_TX_LOCK protection for tunnels.
This avoids cache line ping-pong on "struct ip_tunnel": this structure
should now be mostly read on the xmit and receive paths.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/ipip.h | 1 -
net/ipv4/ip_gre.c | 13 +------------
net/ipv4/ipip.c | 8 --------
net/ipv6/ip6_tunnel.c | 7 -------
net/ipv6/sit.c | 8 --------
5 files changed, 1 insertion(+), 36 deletions(-)
diff --git a/include/net/ipip.h b/include/net/ipip.h
index 5d3036f..76e3ea6 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -12,7 +12,6 @@ struct ip_tunnel
struct ip_tunnel *next;
struct net_device *dev;
- int recursion; /* Depth of hard_start_xmit recursion */
int err_count; /* Number of arrived ICMP errors */
unsigned long err_time; /* Time when the last ICMP error arrived */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d9645c9..41ada99 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -66,10 +66,7 @@
solution, but it supposes maintaing new variable in ALL
skb, even if no tunneling is used.
- Current solution: t->recursion lock breaks dead loops. It looks
- like dev->tbusy flag, but I preferred new variable, because
- the semantics is different. One day, when hard_start_xmit
- will be multithreaded we will have to use skb->encapsulation.
+ Current solution: HARD_TX_LOCK lock breaks dead loops.
@@ -678,11 +675,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
__be32 dst;
int mtu;
- if (tunnel->recursion++) {
- stats->collisions++;
- goto tx_error;
- }
-
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
@@ -820,7 +812,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
ip_rt_put(rt);
stats->tx_dropped++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
if (skb->sk)
@@ -888,7 +879,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
nf_reset(skb);
IPTUNNEL_XMIT();
- tunnel->recursion--;
return NETDEV_TX_OK;
tx_error_icmp:
@@ -897,7 +887,6 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 62548cb..08ccd34 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -402,11 +402,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
__be32 dst = tiph->daddr;
int mtu;
- if (tunnel->recursion++) {
- stats->collisions++;
- goto tx_error;
- }
-
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
@@ -485,7 +480,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
ip_rt_put(rt);
stats->tx_dropped++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
if (skb->sk)
@@ -523,7 +517,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
IPTUNNEL_XMIT();
- tunnel->recursion--;
return NETDEV_TX_OK;
tx_error_icmp:
@@ -531,7 +524,6 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 7d25bbe..c595bbe 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1043,11 +1043,6 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
struct net_device_stats *stats = &t->dev->stats;
int ret;
- if (t->recursion++) {
- stats->collisions++;
- goto tx_err;
- }
-
switch (skb->protocol) {
case htons(ETH_P_IP):
ret = ip4ip6_tnl_xmit(skb, dev);
@@ -1062,14 +1057,12 @@ ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (ret < 0)
goto tx_err;
- t->recursion--;
return NETDEV_TX_OK;
tx_err:
stats->tx_errors++;
stats->tx_dropped++;
kfree_skb(skb);
- t->recursion--;
return NETDEV_TX_OK;
}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0ae4f64..fcb5396 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -626,11 +626,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct in6_addr *addr6;
int addr_type;
- if (tunnel->recursion++) {
- stats->collisions++;
- goto tx_error;
- }
-
if (skb->protocol != htons(ETH_P_IPV6))
goto tx_error;
@@ -753,7 +748,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
ip_rt_put(rt);
stats->tx_dropped++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
if (skb->sk)
@@ -794,7 +788,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
nf_reset(skb);
IPTUNNEL_XMIT();
- tunnel->recursion--;
return NETDEV_TX_OK;
tx_error_icmp:
@@ -802,7 +795,6 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
^ permalink raw reply related
* [PATCH] tunnel: eliminate recursion field
From: Eric Dumazet @ 2009-09-23 20:17 UTC (permalink / raw)
To: David S. Miller; +Cc: Linux Netdev List
It seems the recursion field of "struct ip_tunnel" is no longer needed.
Recursion prevention is done at the upper level (in dev_queue_xmit()),
since we use HARD_TX_LOCK protection for tunnels.
This avoids cache line ping-pong on "struct ip_tunnel": this structure
should now be mostly read on the xmit and receive paths.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/ipip.h | 1 -
net/ipv4/ip_gre.c | 13 +------------
net/ipv4/ipip.c | 8 --------
3 files changed, 1 insertion(+), 21 deletions(-)
diff --git a/include/net/ipip.h b/include/net/ipip.h
index 5d3036f..76e3ea6 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -12,7 +12,6 @@ struct ip_tunnel
struct ip_tunnel *next;
struct net_device *dev;
- int recursion; /* Depth of hard_start_xmit recursion */
int err_count; /* Number of arrived ICMP errors */
unsigned long err_time; /* Time when the last ICMP error arrived */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d9645c9..41ada99 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -66,10 +66,7 @@
solution, but it supposes maintaing new variable in ALL
skb, even if no tunneling is used.
- Current solution: t->recursion lock breaks dead loops. It looks
- like dev->tbusy flag, but I preferred new variable, because
- the semantics is different. One day, when hard_start_xmit
- will be multithreaded we will have to use skb->encapsulation.
+ Current solution: HARD_TX_LOCK lock breaks dead loops.
@@ -678,11 +675,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
__be32 dst;
int mtu;
- if (tunnel->recursion++) {
- stats->collisions++;
- goto tx_error;
- }
-
if (dev->type == ARPHRD_ETHER)
IPCB(skb)->flags = 0;
@@ -820,7 +812,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
ip_rt_put(rt);
stats->tx_dropped++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
if (skb->sk)
@@ -888,7 +879,6 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
nf_reset(skb);
IPTUNNEL_XMIT();
- tunnel->recursion--;
return NETDEV_TX_OK;
tx_error_icmp:
@@ -897,7 +887,6 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 62548cb..08ccd34 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -402,11 +402,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
__be32 dst = tiph->daddr;
int mtu;
- if (tunnel->recursion++) {
- stats->collisions++;
- goto tx_error;
- }
-
if (skb->protocol != htons(ETH_P_IP))
goto tx_error;
@@ -485,7 +480,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
ip_rt_put(rt);
stats->tx_dropped++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
if (skb->sk)
@@ -523,7 +517,6 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
nf_reset(skb);
IPTUNNEL_XMIT();
- tunnel->recursion--;
return NETDEV_TX_OK;
tx_error_icmp:
@@ -531,7 +524,6 @@ tx_error_icmp:
tx_error:
stats->tx_errors++;
dev_kfree_skb(skb);
- tunnel->recursion--;
return NETDEV_TX_OK;
}
^ permalink raw reply related
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server
From: Avi Kivity @ 2009-09-23 19:37 UTC (permalink / raw)
To: Gregory Haskins
Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm,
linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze,
alacrityvm-devel
In-Reply-To: <4ABA61D1.80703@gmail.com>
On 09/23/2009 08:58 PM, Gregory Haskins wrote:
>>
>>> It also pulls parts of the device model into the host kernel.
>>>
>> That is the point. Most of it needs to be there for performance.
>>
> To clarify this point:
>
> There are various aspects about designing high-performance virtual
> devices such as providing the shortest paths possible between the
> physical resources and the consumers. Conversely, we also need to
> ensure that we meet proper isolation/protection guarantees at the same
> time. What this means is there are various aspects to any
> high-performance PV design that require to be placed in-kernel to
> maximize the performance yet properly isolate the guest.
>
> For instance, you are required to have your signal-path (interrupts and
> hypercalls), your memory-path (gpa translation), and
> addressing/isolation model in-kernel to maximize performance.
>
Exactly. That's what vhost puts into the kernel and nothing more.
> Vbus accomplishes its in-kernel isolation model by providing a
> "container" concept, where objects are placed into this container by
> userspace. The host kernel enforces isolation/protection by using a
> namespace to identify objects that is only relevant within a specific
> container's context (namely, a "u32 dev-id"). The guest addresses the
> objects by its dev-id, and the kernel ensures that the guest can't
> access objects outside of its dev-id namespace.
>
vhost manages to accomplish this without any kernel support. The guest
simply has no access to any vhost resources other than the guest->host
doorbell, which is handed to the guest outside vhost (so it's somebody
else's problem, in userspace).
> All that is required is a way to transport a message with a "devid"
> attribute as an address (such as DEVCALL(devid)) and the framework
> provides the rest of the decode+execute function.
>
vhost avoids that.
> Contrast this to vhost+virtio-pci (called simply "vhost" from here).
>
It's the wrong name. vhost implements only the data path.
> It is not immune to requiring in-kernel addressing support either, but
> rather it just does it differently (and its not as you might expect via
> qemu).
>
> Vhost relies on QEMU to render PCI objects to the guest, which the guest
> assigns resources (such as BARs, interrupts, etc).
vhost does not rely on qemu. It relies on its user to handle
configuration. In one important case it's qemu+pci. It could just as
well be the lguest launcher.
> A PCI-BAR in this
> example may represent a PIO address for triggering some operation in the
> device-model's fast-path. For it to have meaning in the fast-path, KVM
> has to have in-kernel knowledge of what a PIO-exit is, and what to do
> with it (this is where pio-bus and ioeventfd come in). The programming
> of the PIO-exit and the ioeventfd are likewise controlled by some
> userspace management entity (i.e. qemu). The PIO address and value
> tuple form the address, and the ioeventfd framework within KVM provide
> the decode+execute function.
>
Right.
> This idea seemingly works fine, mind you, but it rides on top of a *lot*
> of stuff including but not limited to: the guests pci stack, the qemu
> pci emulation, kvm pio support, and ioeventfd. When you get into
> situations where you don't have PCI or even KVM underneath you (e.g. a
> userspace container, Ira's rig, etc) trying to recreate all of that PCI
> infrastructure for the sake of using PCI is, IMO, a lot of overhead for
> little gain.
>
For the N+1th time, no. vhost is perfectly usable without pci. Can we
stop raising and debunking this point?
> All you really need is a simple decode+execute mechanism, and a way to
> program it from userspace control. vbus tries to do just that:
> commoditize it so all you need is the transport of the control messages
> (like DEVCALL()), but the decode+execute itself is reuseable, even
> across various environments (like KVM or Iras rig).
>
If you think it should be "commoditized", write libvhostconfig.so.
> And your argument, I believe, is that vbus allows both to be implemented
> in the kernel (though to reiterate, its optional) and is therefore a bad
> design, so lets discuss that.
>
> I believe the assertion is that things like config-space are best left
> to userspace, and we should only relegate fast-path duties to the
> kernel. The problem is that, in my experience, a good deal of
> config-space actually influences the fast-path and thus needs to
> interact with the fast-path mechanism eventually anyway.
> Whats left
> over that doesn't fall into this category may cheaply ride on existing
> plumbing, so its not like we created something new or unnatural just to
> support this subclass of config-space.
>
Flexibility is reduced, because changing code in the kernel is more
expensive than in userspace, and kernel/user interfaces aren't typically
as wide as pure userspace interfaces. Security is reduced, since a bug
in the kernel affects the host, while a bug in userspace affects just one
guest.
Example: feature negotiation. If it happens in userspace, it's easy to
limit what features we expose to the guest. If it happens in the
kernel, we need to add an interface to let the kernel know which
features it should expose to the guest. We also need to add an
interface to let userspace know which features were negotiated, if we
want to implement live migration. Something fairly trivial bloats rapidly.
> For example: take an attribute like the mac-address assigned to a NIC.
> This clearly doesn't need to be in-kernel and could go either way (such
> as a PCI config-space register).
>
> As another example: consider an option bit that enables a new feature
> that affects the fast-path, like RXBUF merging. If we use the split
> model where config space is handled by userspace and fast-path is
> in-kernel, the userspace component is only going to act as a proxy.
> I.e. it will pass the option down to the kernel eventually. Therefore,
> there is little gain in trying to split this type of slow-path out to
> userspace. In fact, its more work.
>
As you can see above, userspace needs to be involved in this, and the
number of interfaces required is smaller if it's in userspace: you only
need to know which features the kernel supports (they can be enabled
unconditionally, just not exposed).
Further, some devices are perfectly happy to be implemented in
userspace, so we need userspace configuration support anyway. Why
reimplement it in the kernel?
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Is perfectly
From: Paul Martin @ 2009-09-23 19:05 UTC (permalink / raw)
To: srucnoc, alan, support, contact, contact, cris_mihai, daniel,
hydrogin
http://becomtec.com/THBedc9IZ6.html
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Rick Jones @ 2009-09-23 19:03 UTC (permalink / raw)
To: Peter P Waskiewicz Jr; +Cc: Ben Greear, NetDev
In-Reply-To: <1253731834.2538.32.camel@localhost.localdomain>
>>Next time you guys re-compile your hardware, please consider adding byte counters :)
>
>
> On 10G adapters, byte counters can skyrocket quickly, so we'd need to
> read them often to avoid them wrapping.
10G Ethernet is ~1.16 GB/s (GiB/s for purists I guess) for simplicity, call that
2GB/s or 2^31 bytes per second. If the counter is 64 bits, that would suggest
wrap in 2^64/2^31 or 2^33 seconds right? Or have I made some nasty math error?
I'm having quite a difficult time imagining that someone would have 32 bit
counters in a 10G NIC.
rick jones
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Ben Greear @ 2009-09-23 18:56 UTC (permalink / raw)
To: Peter P Waskiewicz Jr; +Cc: NetDev
In-Reply-To: <1253731834.2538.32.camel@localhost.localdomain>
On 09/23/2009 11:50 AM, Peter P Waskiewicz Jr wrote:
>> That's a bummer. I'm guessing you might get close to right on average with some
>> trivial math, but if someone is sending you pkts with size of 1000 and
>> your MTU is 1500, would there be any way to tell that the pkts were originally
>> 1000 bytes instead of 1500?
>
> Good point.
>
>> Next time you guys re-compile your hardware, please consider adding byte counters :)
>
> On 10G adapters, byte counters can skyrocket quickly, so we'd need to
> read them often to avoid them wrapping. But I will forward your request
> to our HW design folks and see if they have other ideas to implement
> these counters and make them efficient.
It still takes a while to wrap 64-bit counters :)
But, you do have to read every 3 secs or so if you're using 32-bit counters.
This can be dealt with in user-space easily enough as long as polling
the NIC for counters is efficient.
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Peter P Waskiewicz Jr @ 2009-09-23 18:50 UTC (permalink / raw)
To: Ben Greear; +Cc: NetDev
In-Reply-To: <4ABA6D0D.8010200@candelatech.com>
On Wed, 2009-09-23 at 11:46 -0700, Ben Greear wrote:
> On 09/23/2009 11:32 AM, Peter P Waskiewicz Jr wrote:
> > On Wed, 2009-09-23 at 09:40 -0700, Ben Greear wrote:
> >> I notice that with LRO enabled, the interface stats count the LRO'd pkts,
> >> not the physical ones on the wire.
> >>
> >> I also tried using ethtool -S, but it seems those counters are the same.
> >>
> >> Is there any way to get the actual rx/tx packet count on the wire?
> >>
> >
> > Depending on which device you're using ixgbe with, there are slightly
> > different registers to get what you want.
> >
> > The only suggestion I have for you though is to refer to the datasheets
> > for each device on our SourceForge site (e1000.sf.net). Some of the
> > relevant counters to look at are PRC64, PRC127, etc, and GPRC/GPTC. For
> > the per-queue stuff, you'll need to look at the TQSMR and RQSMR mapping
> > registers. Let me know if you need assistance in using these registers.
>
> Thanks, I'll look at the data-sheet.
>
> I don't care about per-queue stats at this time, just
> over-all NIC stats.
>
> >> Also, for the rx/tx bytes, I assume that isn't counting the protocol headers
> >> for the physical pkts that have been merged into a single LRO packet. Is
> >> there any way to get the wire stats for bytes as well?
> >
> > The counters for per-byte are purely software-based, so if the packet is
> > LRO'd, you can probably do some somewhat trivial math with the MTU to
> > find the actual wire stats. But we only compute what we pass to the
> > stack, so it'd be the LRO'd packet.
>
> That's a bummer. I'm guessing you might get close to right on average with some
> trivial math, but if someone is sending you pkts with size of 1000 and
> your MTU is 1500, would there be any way to tell that the pkts were originally
> 1000 bytes instead of 1500?
Good point.
> Next time you guys re-compile your hardware, please consider adding byte counters :)
On 10G adapters, byte counters can skyrocket quickly, so we'd need to
read them often to avoid them wrapping. But I will forward your request
to our HW design folks and see if they have other ideas to implement
these counters and make them efficient.
Cheers,
-PJ
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Ben Greear @ 2009-09-23 18:46 UTC (permalink / raw)
To: Peter P Waskiewicz Jr; +Cc: NetDev
In-Reply-To: <1253730766.2538.28.camel@localhost.localdomain>
On 09/23/2009 11:32 AM, Peter P Waskiewicz Jr wrote:
> On Wed, 2009-09-23 at 09:40 -0700, Ben Greear wrote:
>> I notice that with LRO enabled, the interface stats count the LRO'd pkts,
>> not the physical ones on the wire.
>>
>> I also tried using ethtool -S, but it seems those counters are the same.
>>
>> Is there any way to get the actual rx/tx packet count on the wire?
>>
>
> Depending on which device you're using ixgbe with, there are slightly
> different registers to get what you want.
>
> The only suggestion I have for you though is to refer to the datasheets
> for each device on our SourceForge site (e1000.sf.net). Some of the
> relevant counters to look at are PRC64, PRC127, etc, and GPRC/GPTC. For
> the per-queue stuff, you'll need to look at the TQSMR and RQSMR mapping
> registers. Let me know if you need assistance in using these registers.
Thanks, I'll look at the data-sheet.
I don't care about per-queue stats at this time, just
over-all NIC stats.
>> Also, for the rx/tx bytes, I assume that isn't counting the protocol headers
>> for the physical pkts that have been merged into a single LRO packet. Is
>> there any way to get the wire stats for bytes as well?
>
> The counters for per-byte are purely software-based, so if the packet is
> LRO'd, you can probably do some somewhat trivial math with the MTU to
> find the actual wire stats. But we only compute what we pass to the
> stack, so it'd be the LRO'd packet.
That's a bummer. I'm guessing you might get close to right on average with some
trivial math, but if someone is sending you pkts with size of 1000 and
your MTU is 1500, would there be any way to tell that the pkts were originally
1000 bytes instead of 1500?
Next time you guys re-compile your hardware, please consider adding byte counters :)
Thanks,
Ben
--
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc http://www.candelatech.com
^ permalink raw reply
* Re: Getting physical packet counts with LRO enabled with ixgbe?
From: Peter P Waskiewicz Jr @ 2009-09-23 18:32 UTC (permalink / raw)
To: Ben Greear; +Cc: NetDev
In-Reply-To: <4ABA4F8B.6040504@candelatech.com>
On Wed, 2009-09-23 at 09:40 -0700, Ben Greear wrote:
> I notice that with LRO enabled, the interface stats count the LRO'd pkts,
> not the physical ones on the wire.
>
> I also tried using ethtool -S, but it seems those counters are the same.
>
> Is there any way to get the actual rx/tx packet count on the wire?
>
Depending on which device you're using ixgbe with, there are slightly
different registers to get what you want.
The only suggestion I have for you though is to refer to the datasheets
for each device on our SourceForge site (e1000.sf.net). Some of the
relevant counters to look at are PRC64, PRC127, etc, and GPRC/GPTC. For
the per-queue stuff, you'll need to look at the TQSMR and RQSMR mapping
registers. Let me know if you need assistance in using these registers.
> Also, for the rx/tx bytes, I assume that isn't counting the protocol headers
> for the physical pkts that have been merged into a single LRO packet. Is
> there any way to get the wire stats for bytes as well?
The counters for per-byte are purely software-based, so if the packet is
LRO'd, you can probably do some somewhat trivial math with the MTU to
find the actual wire stats. But we only compute what we pass to the
stack, so it'd be the LRO'd packet.
Cheers,
-PJ Waskiewicz
^ permalink raw reply
* [PATCH 8/8] [RFC] CAIF Protocol Stack
From: sjur.brandeland @ 2009-09-23 17:31 UTC (permalink / raw)
To: netdev; +Cc: Kim.xx.Lilliestierna, sjur.brandeland
From: Kim Lilliestierna <Kim.xx.Lilliestierna@ericsson.com>
Signed-off-by: sjur.brandeland@stericsson.com
---
drivers/net/Makefile | 1 +
net/Kconfig | 1 +
net/Makefile | 1 +
3 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7629c90..d8441a8 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -255,4 +255,5 @@ obj-$(CONFIG_NETXEN_NIC) += netxen/
obj-$(CONFIG_NIU) += niu.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
obj-$(CONFIG_SFC) += sfc/
+obj-$(CONFIG_CAIF) += caif/
diff --git a/net/Kconfig b/net/Kconfig
index 0210002..51e3eaa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -253,5 +253,6 @@ endmenu
source "net/rfkill/Kconfig"
source "net/9p/Kconfig"
+source "net/caif/Kconfig"
endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ba324ae..449dd91 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -57,6 +57,7 @@ obj-$(CONFIG_NETLABEL) += netlabel/
obj-$(CONFIG_IUCV) += iucv/
obj-$(CONFIG_RFKILL) += rfkill/
obj-$(CONFIG_NET_9P) += 9p/
+obj-$(CONFIG_CAIF) += caif/
ifneq ($(CONFIG_DCB),)
obj-y += dcb/
endif
--
1.6.0.4
^ permalink raw reply related
* [PATCH 7/8] [RFC] CAIF Protocol Stack
From: sjur.brandeland @ 2009-09-23 17:31 UTC (permalink / raw)
To: netdev; +Cc: Kim.xx.Lilliestierna, sjur.brandeland
From: Kim Lilliestierna <Kim.xx.Lilliestierna@ericsson.com>
Signed-off-by: sjur.brandeland@stericsson.com
---
Documentation/CAIF/Linux-CAIF.txt | 319 +++++++++++++++++
Documentation/CAIF/README | 60 ++++
Documentation/CAIF/chardevconfig/Makefile | 11 +
Documentation/CAIF/chardevconfig/README | 39 ++
Documentation/CAIF/chardevconfig/caif_cmd_parse.c | 365 ++++++++++++++++++++
Documentation/CAIF/chardevconfig/chardevconfig.c | 111 ++++++
.../CAIF/chardevconfig/create_devices.config | 12 +
.../CAIF/chardevconfig/delete_devices.config | 12 +
Documentation/CAIF/ldiscd/ldiscd.c | 123 +++++++
9 files changed, 1052 insertions(+), 0 deletions(-)
create mode 100644 Documentation/CAIF/Linux-CAIF.txt
create mode 100644 Documentation/CAIF/README
create mode 100644 Documentation/CAIF/chardevconfig/Makefile
create mode 100644 Documentation/CAIF/chardevconfig/README
create mode 100644 Documentation/CAIF/chardevconfig/caif_cmd_parse.c
create mode 100644 Documentation/CAIF/chardevconfig/chardevconfig.c
create mode 100644 Documentation/CAIF/chardevconfig/create_devices.config
create mode 100644 Documentation/CAIF/chardevconfig/delete_devices.config
create mode 100644 Documentation/CAIF/ldiscd/ldiscd.c
diff --git a/Documentation/CAIF/Linux-CAIF.txt b/Documentation/CAIF/Linux-CAIF.txt
new file mode 100644
index 0000000..42fd66e
--- /dev/null
+++ b/Documentation/CAIF/Linux-CAIF.txt
@@ -0,0 +1,319 @@
+Linux CAIF
+===========
+
+Introduction
+------------
+CAIF is a MUX protocol used by ST-Ericsson cellular modems for
+communication
+between Modem and host. The host processes can open virtual AT
+channels, initiate GPRS Data connections, Video channels and
+Utility Channels.
+The Utility Channels are general purpose pipes between modem
+and host.
+
+ST-Ericsson modems support a number of transports between modem
+and host,
+currently Uart and Shared Memory are available for Linux.
+
+Architecture:
+------------
+The Implementation of CAIF is divided into:
+* CAIF Drivers: Character Device, Net Device and Kernel API.
+* CAIF Generic Protocol Implementation
+* CAIF Link Layer
+
+CAIF is using IOCTLs to manage the CAIF Drivers.
+
+
+ IOCTL
+ !
+ ! +------+ +------+ +------+
+ ! +------+! +------+! +------+!
+ ! ! Char !! !Kernel!! ! Net !!
+ ! ! Dev !+ ! API !+ ! Dev !+ <- CAIF Drivers
+ ! +------+ +------! +------+
+ ! ! ! !
+ ! +----------!----------+
+ ! +------+ <- CAIF Protocol Implementation
+ +-------> ! CAIF ! /dev/caifconfig
+ +------+
+ +--------!--------+
+ ! !
+ +------+ +-----+
+ !ShMem ! ! TTY ! <- Link Layer
+ +------+ +-----+
+
+
+
+Using CAIF Character Device
+-----------------------------
+CAIF character devices are configured by use of IOCTLs on the
+node "/dev/caifconfig". E.g. the following code will create a
+CAIF Character Device that will make an AT channel accessible:
+
+ struct caif_channel_create_action at = {
+ .name = {
+ .name = "chnlat1",
+ .type = CAIF_DEV_CHR
+ },
+ .config = {
+ .channel = CAIF_CHTY_AT,
+ }};
+ fd = open("/dev/caifconfig",..);
+ ioctl(fd, CAIF_IOC_CONFIG_DEVICE, &at);
+
+A configuration tool, chardevconfig, exists in order to simplify
+creation of CAIF Channels (typically used from init scripts).
+E.g:
+
+ $chardevconfig /dev/caifconfig -
+ CREATE TYPE=AT NAME=chnlat1 DEVTYPE=CHAR ^D
+
+This will result in creation of the device node "/dev/chnlat1".
+"/dev/chnlat1" can be used to read and write AT commands and
+responses
+from the modem:
+
+ $cat /dev/chnlat1 &
+ $printf "AT\r" > /dev/chnlat1
+ OK
+
+
+
+Using CAIF Net Device
+----------------------
+CAIF Net device can be created similarly as the character
+device.
+E.g:
+
+ $chardevconfig /dev/caifconfig -
+ CREATE TYPE=DGM NAME=caif0 DEVTYPE=NET CONNID=1 ^D
+
+ $ifconfig caif0 <ip address> up
+
+
+Using the Kernel API
+----------------------
+The Kernel API is used for accessing CAIF channels from the
+kernel.
+The user of the API has to implement two callbacks for receive
+and control.
+The receive callback delivers a CAIF packet as an SKB. The control
+callback will
+notify about completed channel initialization and about flow-on/
+flow-off events.
+
+
+ struct caif_device caif_dev = {
+ .caif_config = {
+ .name = "MYDEV",
+ .type = CAIF_CHTY_AT
+ },
+ .receive_cb = my_receive,
+ .control_cb = my_control,
+ };
+
+ caif_add_device(&caif_dev);
+
+ caif_transmit(&caif_dev, skb);
+
+
+See the caif_kernel.h for details about the CAIF kernel API.
+
+
+
+
+
+
+
+
+I M P L E M E N T A T I O N
+===========================
+===========================
+
+
+
+
+GenCAIF - The Generic CAIF Protocol Layer
+=========================================
+
+
+GenCaif is a generic CAIF protocol implementation. It implements the CAIF
+protocol as specified in "CAIF Protocol Specification" (155 19-CRH 109 913).
+GenCaif implements the CAIF protocol stack in a layered approach, where
+each layer described in the specification is implemented as a separate layer.
+The architecture is inspired by the design patterns "Protocol Layer" and
+"Protocol Packet".
+
+== CAIF structure ==
+
+The goal is to have caif as system independent as possible.
+All caif code can be found under GenCaif/src and GenCaif/inc.
+The actual linux module implementation is under src/kernel.
+There is also a user space program (currently not up to date) that runs the
+stack in user space for testing.
+
+We have tested the kernel implementation on the emulator with a modem and we
+are able to enumerate and make a link setup.
+
+GenCAIF is:
+ - Simple implementation of CAIF.
+ - Layered architecture (ala Streams), each layer specified in the CAIF
+ specification is implemented in a separate c-file.
+ - Client of GenCaif must implement PHY layer to access physical HW
+ with receive and transmit functions.
+ - Client of GenCaif must call configuration function to add PHY layer.
+ - Client of GenCaif must implement adaptation layer to consume/produce
+ CAIF payload with receive and transmit functions.
+ - Client of GenCaif must call configuration function to add adaptation
+ layer.
+ - When receiving / transmitting CAIF Packets (cfpkt) ownership is passed
+ to the called function (except Framinglayer's receive function).
+
+
+
+Layered Architecture
+--------------------
+The CAIF protocol can be divided into two parts Support functions and Protocol
+Implementation. The support functions include:
+
+ - CFPKT CAIF Packet. Implementation of CAIF Protocol Packet. The
+ CAIF Packet has functions for creating, destroying, adding content, and
+ adding/extracting headers and trailers to protocol packets.
+
+ - CFLST CAIF list implementation.
+
+ - CFGLUE CAIF Glue. Contains OS Specifics such as memory
+ allocation, endianness etc.
+
+
+The CAIF Protocol implementation contains:
+
+ - CFCNFG CAIF Configuration layer. Configures the CAIF Protocol
+ Stack, and has Client interface for adding Link-Layer and
+ Driver interfaces on top of the CAIF Stack.
+
+ - CFCTRL CAIF Control layer. Encodes and Decodes control messages
+ such as enumeration, and channel setup. And matches request and
+ response messages.
+
+ - CFSERVL General CAIF Service Layer functionality, handle flow
+ control and remote shutdown requests.
+
+ - CFVEI CAIF VEI layer. Handles CAIF VEI layer (AT-Channel),
+ encodes/decodes VEI frames.
+
+ - CFDGML CAIF Data-gram layer. Handles CAIF Data-gram layer (IP
+ traffic), encodes/decodes Datagram frames.
+
+ - CFMUX CAIF Mux layer. Handles multiplexing between multiple
+ physical bearers and multiple channels such as VEI, Data-gram etc.
+ The MUX is keeping track of the existing CAIF Channels and
+ Physical Instances and selects the appropriate instance based
+ on Channel-Id and Physical-ID.
+
+ - CFFRML CAIF Framing layer. Handles Framing i.e. Frame length
+ and frame checksum.
+
+ - CFSERL CAIF Serial layer. Handles concatenation/split of frames
+ into CAIF Frames with correct length.
+
+ - CFSHML CAIF Shared Memory layer.
+
+
+
+ +---------+
+ | Config |
+ | CFCNFG |
+ +---------+
+ !
+ +---------+ +---------+ +---------+
+ | AT | | Control | | Datagram|
+ | CFVEIL | | CFCTRL | | CFDGML |
+ +---------+ +---------+ +---------+
+ \_____________!______________/
+ !
+ +---------+
+ | MUX |
+ | |
+ +---------+
+ _____!_____
+ / \
+ +---------+ +---------+
+ | CFFRML | | CFFRML |
+ | Framing | | Framing |
+ +---------+ +---------+
+ ! !
+ +---------+ +---------+
+ | Sh-Mem | | Serial |
+ | CFSHML | | CFSERL |
+ +---------+ +---------+
+
+
+
+In this layered approach the following "rules" apply.
+ - All layers embed the same structure 'struct layer'
+ - Layers do not depend on any other layer's private data.
+ - Layers are stacked by setting the pointers
+ layer->up , layer->dn
+ - In order to send data upwards each layer should do
+ layer->up->receive(layer->up, packet);
+ - In order to send data downwards each layer should do
+ layer->dn->transmit(layer->dn, packet);
+
+
+
+Linux Driver Implementation
+===========================
+
+Linux GPRS Net Device and Character Devices are implemented on top of the
+Generic CAIF protocol. The Net device and Chr device each have an instance of
+'struct layer', just like the layers of the generic CAIF protocol stack.
+The Net and Chr devices implement the 'receive()' function defined by
+'struct layer', like the rest of the CAIF stack. In this way transmission and
+reception of packets are handled as in the other layers: the 'dn->transmit()'
+function is called in order to transmit data.
+
+The layer on top of the Generic CAIF is called an "adaptation layer".
+
+
+Configuration of Drivers
+------------------------
+
+Configuration is the most complex part of the CAIF protocol.
+Configuration is controlled by the Misc device 'caifconfig'
+implemented in caif_chr. A device is created when an IOCTL
+command for creation is received containing information about
+the CAIF Channel type to be created and the type of device to instantiate
+(Net Device or Character Device).
+
+The Net Device and Character Device will register into the 'caifconfig'
+device by calling 'caif_register_netdev' and 'caif_register_chrdev'.
+When registered the 'caifconfig' module will keep function pointers
+to the devices used when IOCTL creates new devices.
+
+
+The CAIF Configuration module CFCNFG is responsible for connecting and
+setting up the entire CAIF stack.
+
+The function 'cfcnfg_add_adapt_layer' is used to connect a Linux Driver
+to the ST-Ericsson modem. This function will trigger the setup of CAIF
+Channel by sending a "LinkSetup" message to the modem. When the
+"LinkSetupResponse" is received the CAIF protocol for the requested
+CAIF Service will be set up.
+
+The CAIF Channel configuration parameters will be given as input.
+
+
+
+Configuration of Link Layer
+---------------------------
+The Link Layer (or Phy Layer) must implement the 'transmit' function
+defined by 'struct layer' in order to send payload. When data is received
+the Link Layer calls 'up->receive()'.
+Configuring the link layer is done by the function 'cfcnfg_add_phy_layer'.
+This function will set up the CAIF Layers for the new Link Layer.
+
+
+The physical Link Layers register themselves into 'caifconfig' by
+calling the function 'caif_phy_register()'.
diff --git a/Documentation/CAIF/README b/Documentation/CAIF/README
new file mode 100644
index 0000000..aa04150
--- /dev/null
+++ b/Documentation/CAIF/README
@@ -0,0 +1,60 @@
+copyright (C) ST-Ericsson AB 2009
+Author: Sjur Brendeland/ sjur.brandeland@stericsson.com
+ Kim Lilliestierna Kim.xx.Lilliestierna@ericsson.com
+License terms: GNU General Public License (GPL), version 2.
+
+=== Start ===
+Copy the .ko files onto the board, and do insmod:
+
+insmod caif.ko
+insmod phyif_msl.ko
+insmod chnl_chr.ko
+insmod chnl_net.ko
+ifconfig caif0 <your-home-address> up
+
+
+=== Test Loopback on net device ===
+insmod chnl_net.ko loop=yes
+ifconfig caif0 192.168.0.1 up
+ping -c 10 -s 1000 192.168.0.2
+
+=== Preparing the setup.===
+
+Make sure that the kernel is built with module support.
+
+There are some things that need to be
+tweaked to get the host TTY correctly setup to talk to the modem.
+Since the CAIF stack is running in the kernel and we want to use the existing
+TTY we are installing our physical serial driver as a line discipline above
+the TTY device.
+
+To achieve this we need the help of a daemon program called ldiscd.
+The benefit is that we can hook up to any TTY, the downside is that we need
+an extra operation in order to install the line discipline.
+
+Getting the host TTY to behave (This should only be necessary when running
+on the emulator, otherwise the ldiscd should correctly configure the UART).
+
+Retrieve the current settings:
+
+$ stty -a -F /dev/ttyUSB1
+
+Make sure that we are having 115200, 8n1, CTS/RTS (for example if CTS/RTS is missing):
+$ stty -F /dev/ttyUSB1 ctsrts
+
+Build the line discipline daemon. (You need to change CAIF_LDISC_TTY
+if you're not using /dev/ttyS0.)
+
+$ gcc ldiscd.c -o ldisc
+
+Install the line discipline (daemon)
+$ ldisc
+
+Install the VEI channel (this will enumerate and do the linksetup of the first VEI channel. If this goes well you should see /dev/chn*) (There are printks logging all buffers that can be checked with dmesg):
+$ modprobe chnl_chr
+
+The AT (VEI) channel is ready to use (you can now send AT commands on it):
+$ echo -e "AT\r\n" > /dev/chnlat10
+
+Verify that you got an OK response (There are printks logging all buffers that can be checked with dmesg):
+$ cat /dev/chnlat10
diff --git a/Documentation/CAIF/chardevconfig/Makefile b/Documentation/CAIF/chardevconfig/Makefile
new file mode 100644
index 0000000..5bd7d90
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/Makefile
@@ -0,0 +1,11 @@
+CFLAGS=-g -Wall -I ../../../include -I../../../include/linux/caif
+
+PROGS=chardevconfig
+OBJS=chardevconfig.o caif_cmd_parse.o
+all: $(PROGS)
+
+chardevconfig: chardevconfig.o caif_cmd_parse.o
+ $(CC) $(CFLAGS) -o chardevconfig $(OBJS)
+
+clean:
+ rm -f $(PROGS) $(OBJS)
diff --git a/Documentation/CAIF/chardevconfig/README b/Documentation/CAIF/chardevconfig/README
new file mode 100644
index 0000000..bc7013b
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/README
@@ -0,0 +1,39 @@
+Usage: chardevconfig configdevice configfile
+Usage: chardevconfig configdevice -
+
+The program will read commands from the configfile (or stdin), parse them and
+do ioctl calls to the configdevice. One command per line. Lines with syntax
+errors (e.g. starting with a #) will be skipped.
+
+Examples:
+
+CREATE TYPE=AT NAME=chnlat10 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat11 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat12 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat13 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+
+CREATE TYPE=RFM NAME=chn_rfm DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=rfm
+CREATE TYPE=RFM NAME=chn_afs DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/afs
+CREATE TYPE=RFM NAME=chn_ifs DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/ifs
+CREATE TYPE=RFM NAME=chn_sys DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/sys
+
+CREATE TYPE=UTIL NAME=chn_psocktest DEVTYPE=CHAR PHYPREF=LAT SOCK=CAIF_PSOC_TEST PARAM=01
+
+CREATE TYPE=DGM NAME=chn_datagram DEVTYPE=CHAR PHYPREF=BW CONNID=1
+CREATE TYPE=DGM NAME=datagram_raw_ip DEVTYPE=CHAR PHYPREF=BW CONNID=1
+
+CREATE TYPE=DGMLOOP NAME=datagram_loop DEVTYPE=CHAR PHYPREF=BW
+
+
+DELETE NAME=chnlat10 DEVTYPE=CHAR
+DELETE NAME=chnlat11 DEVTYPE=CHAR
+DELETE NAME=chnlat12 DEVTYPE=CHAR
+DELETE NAME=chnlat13 DEVTYPE=CHAR
+DELETE NAME=chn_rfm DEVTYPE=CHAR
+DELETE NAME=chn_afs DEVTYPE=CHAR
+DELETE NAME=chn_ifs DEVTYPE=CHAR
+DELETE NAME=chn_sys DEVTYPE=CHAR
+DELETE NAME=chn_psocktest DEVTYPE=CHAR
+DELETE NAME=chn_datagram DEVTYPE=CHAR
+DELETE NAME=datagram_loop DEVTYPE=CHAR
+DELETE NAME=datagram_raw_ip DEVTYPE=CHAR
diff --git a/Documentation/CAIF/chardevconfig/caif_cmd_parse.c b/Documentation/CAIF/chardevconfig/caif_cmd_parse.c
new file mode 100644
index 0000000..a52ded0
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/caif_cmd_parse.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2009
+ *
+ * Author: Daniel Martensson / Daniel.Martensson@stericsson.com
+ *
+ * License terms: GNU General Public License (GPL), version 2.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <caif_config.h>
+#include <caif_ioctl.h>
+
+
+#define CFLOG_TRACE(a)
+
+
+/** Advances *c past any leading space (' ') characters; other whitespace such as tabs is not skipped. */
+void skip_blanks(char **c)
+{
+ while (**c == ' ')
+ (*c)++;
+
+}
+
+/** Parses the specified command.
+ * @param[in/out] in Pointer to where to start parsing; if parsing is successful it is set to the position behind the parsed text and any blanks following it
+ * @param[in] cmd Command keyword expected at the start of the input (after leading blanks)
+ * @param[out] err Not written by this function
+ * @return 1 if the input starts with cmd, 0 otherwise
+ */
+int cmd_parse(char **in, char *cmd, int *err)
+{
+
+ char *pos = *in;
+ skip_blanks(&pos);
+ if (strncmp(pos, cmd, strlen(cmd)) == 0) { /* prefix match only: no delimiter is required after cmd */
+ pos += strlen(cmd);
+ skip_blanks(&pos);
+ *in = pos;
+ CFLOG_TRACE(("arg_parse: Match '%s'\n", cmd));
+
+ return 1;
+ }
+ return 0;
+}
+
+/** Parses a value pair on the format <CMD> = <arg>
+ * @param[in/out] in Pointer to where to start parsing; if parsing is
+ *	successful it is set to the position behind the parsed text
+ * @param[in] cmd Keyword expected on the left side of '='
+ * @param[out] arg Buffer receiving the NUL-terminated argument found on
+ *	the right side of '='
+ * @param[in] arglen Size of the arg buffer in bytes, terminator included
+ * @param[out] err Set to 1 if cmd matched but '=' is missing or the
+ *	argument does not fit into arg
+ * @return 1 on success, 0 on error or if cmd does not match
+ */
+int arg_parse(char **in, char *cmd, char *arg, int arglen, int *err)
+{
+	size_t cmdlen = strlen(cmd);
+	char *pos = *in;
+	int n = 0;
+
+	skip_blanks(&pos);
+	if (strncmp(pos, cmd, cmdlen) != 0)
+		return 0;
+
+	pos += cmdlen;
+	skip_blanks(&pos);
+	if (*pos != '=') {
+		*err = 1;
+		return 0;
+	}
+	pos++;
+
+	/* Without room for at least the terminator nothing can be stored. */
+	if (arglen <= 0) {
+		*err = 1;
+		return 0;
+	}
+
+	/* The argument extends up to the next blank or the end of the string. */
+	while (*pos && *pos != ' ') {
+		/* Keep one byte free for the terminating NUL instead of
+		 * writing past the end of the caller's buffer. */
+		if (n >= arglen - 1) {
+			arg[n] = 0;
+			*err = 1;
+			return 0;
+		}
+		arg[n++] = *pos++;
+	}
+	arg[n] = 0;
+
+	skip_blanks(&pos);
+	*in = pos;
+	CFLOG_TRACE(("arg_parse: Match '%s' Arg: '%s'\n", cmd, arg));
+	return 1;
+}
+
+/** Parses a value pair on the format <CMD> = <int>
+ * @param[in/out] in Pointer to where to start parsing; if parsing is
+ *	successful it is set to the position behind the parsed text
+ * @param[in] cmd Keyword expected on the left side of '='
+ * @param[out] val Parsed integer value (decimal)
+ * @param[out] err Set to 1 if cmd matched but the argument is malformed
+ * @return 1 on success, 0 on error or if cmd does not match
+ */
+int int_parse(char **in, char *cmd, int *val, int *err)
+{
+	char *pos = *in;
+	char arg[100];
+
+	if (!arg_parse(&pos, cmd, arg, sizeof(arg), err))
+		return 0;
+
+	/* Do not report success with *val untouched when the argument
+	 * does not start with a decimal integer. */
+	if (sscanf(arg, "%d", val) != 1) {
+		*err = 1;
+		return 0;
+	}
+
+	*in = pos;
+	CFLOG_TRACE(("int_parse: Match '%s' '%d'\n", cmd, *val));
+
+	return 1;
+}
+
+/** Converts one hexadecimal digit to its numeric value.
+ * @param[in] c Character to convert
+ * @return 0..15 for a valid hexadecimal digit, -1 otherwise
+ */
+static int hex_nibble(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+	return -1;
+}
+
+/** Parses a value pair on the format <CMD> = <hex-string>
+ * @param[in/out] in Pointer to where to start parsing; if parsing is
+ *	successful it is set to the position behind the parsed text
+ * @param[in] cmd Keyword expected on the left side of '='
+ * @param[out] tobuf Receives the decoded bytes in the order they appear
+ *	in the hex string
+ * @param[in] maxlen Size of tobuf in bytes
+ * @param[out] buflen Length of the parsed hex string (in bytes)
+ * @param[out] err Set to 1 if cmd matched but the hex string is
+ *	malformed or decodes to more than maxlen bytes
+ * @return 1 on success, 0 on error or if cmd does not match
+ */
+
+int
+hex_parse(char **in, char *cmd, unsigned char *tobuf, int maxlen,
+	  int *buflen, int *err)
+{
+	char *pos = *in;
+	char hexstr[100];
+	size_t ndigits;
+	size_t i;
+	int len = 0;
+
+	if (!arg_parse(&pos, cmd, hexstr, sizeof(hexstr), err))
+		return 0;
+
+	ndigits = strlen(hexstr);
+
+	/* Two digits make one byte; an odd leading digit needs one more. */
+	if (maxlen < 0 || (ndigits + 1) / 2 > (size_t) maxlen) {
+		*err = 1;
+		return 0;
+	}
+
+	/* Validate everything first so tobuf is only written on success. */
+	for (i = 0; i < ndigits; i++) {
+		if (hex_nibble(hexstr[i]) < 0) {
+			*err = 1;
+			return 0;
+		}
+	}
+
+	i = 0;
+	/* With an odd number of digits the first digit stands alone as the
+	 * low nibble of the first byte (e.g. "123" -> 0x01 0x23). */
+	if (ndigits & 1)
+		tobuf[len++] = (unsigned char) hex_nibble(hexstr[i++]);
+
+	while (i < ndigits) {
+		int hi = hex_nibble(hexstr[i]);
+		int lo = hex_nibble(hexstr[i + 1]);
+
+		tobuf[len++] = (unsigned char) ((hi << 4) | lo);
+		i += 2;
+	}
+
+	*buflen = len;
+	*in = pos;
+	CFLOG_TRACE(("hex_parse: Match '%s' '%s'\n", cmd, hexstr));
+	return 1;
+}
+
+/** Parses a value pair on the format <CMD> = <token>
+ * @param[in/out] in Pointer to where to start parsing; if parsing is successful it is set to the position behind the parsed text
+ * @param[in] cmd Keyword expected on the left side of '='
+ * @param[in] toktype Class of tokens to accept; compared against the toktype column of the table below
+ * @param[out] val Receives the value (in enums) corresponding to the matched token
+ * @return 1 on success, 0 if cmd does not match or the token is not found in the requested class (err is only passed through to arg_parse)
+ */
+
+int tok_parse(char **in, char *cmd, char *toktype, int *val, int *err)
+{
+ struct {
+ char *tok;
+ char *toktype;
+ int val;
+ } tokens[] = {
+ {
+ "LAT", "PHYPREF", CAIF_PHYPREF_LOW_LAT}, {
+ "BW", "PHYPREF", CAIF_PHYPREF_HIGH_BW}, {
+ "LOOP", "PHYPREF", _CAIF_PHYPREF_LOOP}, {
+ "LOW", "PRIO", CAIF_PRIO_LOW}, {
+ "NORM", "PRIO", CAIF_PRIO_NORMAL}, {
+ "HI", "PRIO", CAIF_PRIO_HIGH}, {
+ "AT", "CHTY", CAIF_CHTY_AT}, {
+ "DGM", "CHTY", CAIF_CHTY_DATAGRAM}, {
+ "DGMLOOP", "CHTY", CAIF_CHTY_DATAGRAM_LOOP}, {
+ "VIDEO", "CHTY", CAIF_CHTY_VIDEO}, {
+ "DEBUG", "CHTY", CAIF_CHTY_DEBUG}, {
+ "TRACE", "CHTY", CAIF_CHTY_DEBUG_TRACE}, {
+ "IDEBUG", "CHTY", CAIF_CHTY_DEBUG_INTERACT}, {
+ "RFM", "CHTY", CAIF_CHTY_RFM}, {
+ "UTIL", "CHTY", CAIF_CHTY_UTILITY}, {
+ "YES", "BOOL", 1}, {
+ "NO", "BOOL", 0}, {
+ "CHAR", "DEVTY", CAIF_DEV_CHR}, {
+ "NET", "DEVTY", CAIF_DEV_NET}, {
+
+ NULL, 0} /* sentinel: the lookup loop stops at the entry whose tok is NULL */
+ };
+
+ char tok[100];
+ char *pos = *in;
+
+ if (arg_parse(&pos, cmd, tok, sizeof(tok), err)) {
+ int i;
+ for (i = 0; tokens[i].tok != NULL; i++) {
+ if (strcmp(tokens[i].toktype, toktype) == 0
+ && strcmp(tokens[i].tok, tok) == 0) { /* exact match on both class and token text */
+
+ *val = tokens[i].val;
+ *in = pos; /* input is only consumed when the token is recognized */
+ CFLOG_TRACE(("tok_parse:"
+ " Match '%s' '%s' (%s)->%d\n",
+ cmd, tok, toktype, *val));
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+/** Parses a command string from user.
+ *
+ * Recognized commands are HELP, DELETE and CREATE. The arguments of a
+ * command must appear in the order in which they are parsed below:
+ *   DELETE NAME= DEVTYPE=
+ *   CREATE TYPE= NAME= DEVTYPE= [PHYNAME=] [PHYPREF=] [PRIO=]
+ *          [CONNID=] [VOLUME=] [SOCK=] [PARAM=]
+ * CREATE requires at least one of PHYNAME and PHYPREF. CONNID is only
+ * parsed for DGM and RFM channels, VOLUME only for RFM, SOCK and PARAM
+ * only for UTIL.
+ *
+ * @param[in] cmd The command string to be parsed
+ * @param[out] action The type of action of this string:
+ *	CAIF_IOC_REMOVE_DEVICE for DELETE, CAIF_IOC_CONFIG_DEVICE for
+ *	CREATE. Not written for HELP or for unrecognized commands.
+ * @param[out] param The action parameters for this command; always
+ *	zeroed before parsing starts
+ * @return 0 on success, < 0 on error.
+ */
+int caif_cmd_parse(char *cmd, int *action, union caif_action *param)
+{
+
+	int err = 0;
+	char *pos = cmd;
+	int val;
+	int len;
+	unsigned char *u;
+	int phy_specified = 0;
+
+	memset(param, 0, sizeof(*param));
+
+	/* NOTE(review): HELP reports success without setting *action, so
+	 * the caller must not rely on *action in that case. */
+	if (cmd_parse(&pos, "HELP", &err))
+		return 0;
+
+	if (cmd_parse(&pos, "DELETE", &err)) {
+		struct caif_device_name *cf = &param->delete_channel;
+		*action = CAIF_IOC_REMOVE_DEVICE;
+
+		CFLOG_TRACE(("DELETE - pos='%s'\n", pos));
+
+		if (arg_parse(&pos, "NAME", cf->name, sizeof(cf->name), &err)) {
+			CFLOG_TRACE(("NAME - arg='%s' pos='%s'\n", cf->name,
+				     pos));
+
+		} else {
+			CFLOG_TRACE(("Parse Error for DELETE: '%s'\n", pos));
+			return -1;
+		}
+
+		if (tok_parse(&pos, "DEVTYPE", "DEVTY", &val, &err)) {
+			cf->devtype = val;
+			CFLOG_TRACE(("DEVTYPE - arg='%d' pos='%s'\n", val,
+				     pos));
+
+		} else {
+			CFLOG_TRACE(("DEVTYPE REQUIRED\n"));
+			return -1;
+		}
+
+		/* Anything left over is an argument we do not understand. */
+		if (strlen(pos) > 0) {
+			CFLOG_TRACE(("Could not parse string '%s'\n", pos));
+			return -1;
+		}
+	} else if (cmd_parse(&pos, "CREATE", &err)) {
+		struct caif_channel_create_action *cf = &param->create_channel;
+		*action = CAIF_IOC_CONFIG_DEVICE;
+		CFLOG_TRACE(("CREATE - pos='%s'\n", pos));
+
+		if (tok_parse(&pos, "TYPE", "CHTY", &val, &err)) {
+			cf->config.type = val;
+			CFLOG_TRACE(("TYPE - arg='%d' pos='%s'\n", val, pos));
+
+		} else {
+			CFLOG_TRACE(("TYPE REQUIRED\n"));
+			return -1;
+		}
+
+		if (arg_parse
+		    (&pos, "NAME", cf->name.name, sizeof(cf->name.name),
+		     &err)) {
+			CFLOG_TRACE(("NAME - arg='%s' pos='%s'\n",
+				     cf->name.name, pos));
+
+		} else {
+			CFLOG_TRACE(("NAME REQUIRED\n"));
+			return -1;
+		}
+
+		if (tok_parse(&pos, "DEVTYPE", "DEVTY", &val, &err)) {
+			cf->name.devtype = val;
+			CFLOG_TRACE(("DEVTYPE - arg='%d' pos='%s'\n", val,
+				     pos));
+
+		} else {
+			CFLOG_TRACE(("DEVTYPE REQUIRED\n"));
+			return -1;
+		}
+
+		/* At least one of PHYNAME and PHYPREF has to be given. */
+		if (arg_parse(&pos, "PHYNAME", cf->config.phy_name,
+			      sizeof(cf->config.phy_name), &err)) {
+			phy_specified = 1;
+			CFLOG_TRACE(("PHYNAME - arg='%s' pos='%s'\n",
+				     cf->config.phy_name, pos));
+
+		}
+		if (tok_parse(&pos, "PHYPREF", "PHYPREF", &val, &err)) {
+			cf->config.phy_pref = val;
+			phy_specified = 1;
+			CFLOG_TRACE(("PHYPREF - val='%d' pos='%s'\n", val,
+				     pos));
+
+		}
+
+		if (!phy_specified) {
+			CFLOG_TRACE(("PHYNAME or PHYPREF REQUIRED\n"));
+			return -1;
+		}
+
+		if (tok_parse(&pos, "PRIO", "PRIO", &val, &err)) {
+			cf->config.priority = val;
+			CFLOG_TRACE(("PRIO - val='%d' pos='%s'\n", val, pos));
+
+		}
+		if (cf->config.type == CAIF_CHTY_DATAGRAM
+		    && int_parse(&pos, "CONNID", &val, &err)) {
+			cf->config.u.dgm.connection_id = val;
+			CFLOG_TRACE(("CONNID - val='%d' pos='%s'\n", val,
+				     pos));
+
+		}
+		if (cf->config.type == CAIF_CHTY_RFM
+		    && int_parse(&pos, "CONNID", &val, &err)) {
+			cf->config.u.rfm.connection_id = val;
+			CFLOG_TRACE(("CONNID - val=%d pos='%s'\n", val, pos));
+
+		}
+		/* VOLUME and SOCK only fill in their own string; they must
+		 * not touch rfm.connection_id, because 'val' still holds
+		 * whatever was parsed last (possibly PRIO or DEVTYPE). */
+		if (cf->config.type == CAIF_CHTY_RFM
+		    && arg_parse(&pos, "VOLUME", cf->config.u.rfm.volume,
+				 sizeof(cf->config.u.rfm.volume), &err)) {
+			CFLOG_TRACE(("VOLUME - arg='%s' pos='%s'\n",
+				     cf->config.u.rfm.volume, pos));
+
+		}
+		if (cf->config.type == CAIF_CHTY_UTILITY
+		    && arg_parse(&pos, "SOCK", cf->config.u.utility.name,
+				 sizeof(cf->config.u.utility.name), &err)) {
+			CFLOG_TRACE(("SOCK - arg='%s' pos='%s'\n",
+				     cf->config.u.utility.name, pos));
+
+		}
+
+		if (cf->config.type == CAIF_CHTY_UTILITY
+		    && hex_parse(&pos, "PARAM",
+				 cf->config.u.utility.params,
+				 sizeof(cf->config.u.utility.params), &len,
+				 &err)) {
+			cf->config.u.utility.paramlen = len;
+			/* 'u' is only read by the trace below. */
+			u = cf->config.u.utility.params;
+			CFLOG_TRACE(("PARAM %02x,%02x,%02x,%02x- pos='%s'\n",
+				     u[0], u[1], u[2], u[3], pos));
+
+		}
+
+		/* Anything left over is an argument we do not understand
+		 * (or one given out of order). */
+		if (strlen(pos) > 0) {
+			CFLOG_TRACE(("Could not parse string '%s'\n", pos));
+			return -1;
+		}
+
+	} else {
+		CFLOG_TRACE(("UNRECOGNIZED COMMAND '%s'\n", pos));
+		return -1;
+	}
+	return 0;
+
+}
diff --git a/Documentation/CAIF/chardevconfig/chardevconfig.c b/Documentation/CAIF/chardevconfig/chardevconfig.c
new file mode 100644
index 0000000..cb09b0d
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/chardevconfig.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2009
+ *
+ * Author: Per Sigmond / Per.Sigmond@stericsson.com
+ *
+ * License terms: GNU General Public License (GPL), version 2.
+ *
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <ctype.h>
+#include <caif_ioctl.h>
+
+
+#define BUFFERLENGTH 512
+
+
+int caif_cmd_parse(char *cmd, int *action, union caif_action *param);
+
+void usage(char *argv0) /* Prints the two accepted command-line forms to stderr; argv0 is shown as the program name. */
+{
+ fprintf(stderr, "Usage: %s device configfile\n", argv0);
+ fprintf(stderr, "Usage: %s device -\n", argv0);
+}
+
+/*
+ * Reads configuration commands line by line from argv[2] (or stdin when
+ * argv[2] starts with "-"), parses each one with caif_cmd_parse() and
+ * issues the resulting ioctl on the config device named by argv[1].
+ * Lines that fail to parse are reported on stderr and skipped.
+ * Exits with -1 on usage, open or allocation errors; returns 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+
+	int fd;
+	FILE *configfile;
+	char *config_devname;
+	union caif_action param;
+	int request;
+	int ret;
+	char *cmd;
+	size_t bytes_read;
+
+	if (argc < 3) {
+		usage(argv[0]);
+		exit(-1);
+	}
+	config_devname = argv[1];
+
+	if (!strncmp(argv[2], "-", 1)) {
+		/* stdin */
+		configfile = stdin;
+	} else {
+		configfile = fopen(argv[2], "r");
+	}
+	if (configfile == NULL) {
+		fprintf(stderr, "fopen %s: %s\n", argv[2], strerror(errno));
+		exit(-1);
+	}
+
+	cmd = malloc(BUFFERLENGTH);
+	if (cmd == NULL) {
+		fprintf(stderr, "malloc error: %s\n", strerror(errno));
+		exit(-1);
+	}
+
+	while (fgets(cmd, BUFFERLENGTH, configfile) != NULL) {
+
+		/* Strip trailing whitespace including the newline. Stop at
+		 * the start of the buffer so that an all-blank line does
+		 * not make us read cmd[-1]. */
+		bytes_read = strlen(cmd);
+		while (bytes_read > 0 &&
+		       isspace((unsigned char) cmd[bytes_read - 1])) {
+			cmd[bytes_read - 1] = 0;
+			bytes_read--;
+		}
+
+		/* caif_cmd_parse() leaves the request untouched for HELP;
+		 * give it a defined value before it is read below. */
+		request = 0;
+		ret = caif_cmd_parse(cmd, &request, &param);
+
+		if (ret != 0) {
+			fprintf(stderr, "'%s'\n", cmd);
+			fprintf(stderr, "Error parsing config string.\n");
+			continue;
+		}
+
+		fd = open(config_devname, O_RDWR);
+		if (fd < 0) {
+			fprintf(stderr, "open %s: %s\n", config_devname,
+				strerror(errno));
+			exit(-1);
+		}
+
+		ret = ioctl(fd, request, &param);
+		if (ret < 0) {
+			fprintf(stderr, "'%s'\n", cmd);
+			fprintf(stderr, "ioctl: %s\n", strerror(errno));
+		}
+		close(fd);
+
+		/* Do not announce a device the kernel refused to create. */
+		if (ret < 0)
+			continue;
+
+		if (request == CAIF_IOC_CONFIG_DEVICE) {
+			printf
+			    ("Create device: name = %s, "
+			     "major = %d, minor = %d\n",
+			     param.create_channel.name.name,
+			     param.create_channel.major,
+			     param.create_channel.minor);
+		}
+	}
+
+	fclose(configfile);
+	free(cmd);
+
+	return 0;
+}
diff --git a/Documentation/CAIF/chardevconfig/create_devices.config b/Documentation/CAIF/chardevconfig/create_devices.config
new file mode 100644
index 0000000..645ba8d
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/create_devices.config
@@ -0,0 +1,12 @@
+CREATE TYPE=AT NAME=chnlat10 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat11 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat12 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=AT NAME=chnlat13 DEVTYPE=CHAR PHYPREF=LAT PRIO=HI
+CREATE TYPE=RFM NAME=chn_rfm DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=rfm
+CREATE TYPE=RFM NAME=chn_afs DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/afs
+CREATE TYPE=RFM NAME=chn_ifs DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/ifs
+CREATE TYPE=RFM NAME=chn_sys DEVTYPE=CHAR PHYPREF=BW CONNID=1 VOLUME=/sys
+CREATE TYPE=UTIL NAME=chn_psocktest DEVTYPE=CHAR PHYPREF=LAT SOCK=CAIF_PSOCK_TEST PARAM=01
+CREATE TYPE=DGM NAME=chn_datagram DEVTYPE=CHAR PHYPREF=BW CONNID=1
+CREATE TYPE=DGMLOOP NAME=datagram_loop DEVTYPE=CHAR PHYPREF=BW
+CREATE TYPE=DGM NAME=datagram_raw_ip DEVTYPE=CHAR PHYPREF=BW CONNID=1
diff --git a/Documentation/CAIF/chardevconfig/delete_devices.config b/Documentation/CAIF/chardevconfig/delete_devices.config
new file mode 100644
index 0000000..220c40a
--- /dev/null
+++ b/Documentation/CAIF/chardevconfig/delete_devices.config
@@ -0,0 +1,12 @@
+DELETE NAME=chnlat10 DEVTYPE=CHAR
+DELETE NAME=chnlat11 DEVTYPE=CHAR
+DELETE NAME=chnlat12 DEVTYPE=CHAR
+DELETE NAME=chnlat13 DEVTYPE=CHAR
+DELETE NAME=chn_rfm DEVTYPE=CHAR
+DELETE NAME=chn_afs DEVTYPE=CHAR
+DELETE NAME=chn_ifs DEVTYPE=CHAR
+DELETE NAME=chn_sys DEVTYPE=CHAR
+DELETE NAME=chn_psocktest DEVTYPE=CHAR
+DELETE NAME=chn_datagram DEVTYPE=CHAR
+DELETE NAME=datagram_loop DEVTYPE=CHAR
+DELETE NAME=datagram_raw_ip DEVTYPE=CHAR
diff --git a/Documentation/CAIF/ldiscd/ldiscd.c b/Documentation/CAIF/ldiscd/ldiscd.c
new file mode 100644
index 0000000..9e3483d
--- /dev/null
+++ b/Documentation/CAIF/ldiscd/ldiscd.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2009
+ *
+ * Author: Daniel Martensson / Daniel.Martensson@stericsson.com
+ *
+ * License terms: GNU General Public License (GPL), version 2.
+ *
+ */
+
+
+
+
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <string.h>
+#include <termios.h>
+#include <asm/ioctls.h>
+
+#define CAIF_LDISC_TTY "/dev/ttyS0"
+
+int main(void) /* Detaches from the terminal, configures CAIF_LDISC_TTY, sets its line discipline and then sleeps forever with the tty held open. */
+{
+
+ /* Our process ID and Session ID */
+ pid_t pid, sid;
+
+ /* Termios structure for UART configuration. */
+ struct termios tio;
+
+ /* File handle to the tty device node */
+ int fd;
+
+ /* Result */
+ int result;
+
+ /* Line discipline number to use (N_MOUSE) */
+ int ldiscnr = 2;
+
+ /* Fork off the parent process */
+ pid = fork();
+ if (pid < 0)
+ exit(EXIT_FAILURE);
+
+ /* If we got a good PID, then
+ we can exit the parent process; only the child continues below. */
+ if (pid > 0)
+ exit(EXIT_SUCCESS);
+
+
+ /* Change the file mode mask */
+ umask(0);
+
+ /* Open any logs here (none are opened at present) */
+
+ /* Create a new SID for the child process */
+ sid = setsid();
+ if (sid < 0)
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+
+
+ /* Change the current working directory */
+ if ((chdir("/")) < 0)
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+
+
+ /* Close out the standard file descriptors */
+ close(STDIN_FILENO);
+ close(STDOUT_FILENO);
+ close(STDERR_FILENO);
+
+ /* Daemon-specific initialization goes here */
+
+ /* Open the tty device node */
+ fd = open(CAIF_LDISC_TTY, O_RDWR);
+ if (fd < 0) {
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+ }
+
+ /* Configure UART settings: start from an all-zero termios so only the flags set below are enabled. */
+ memset(&tio, 0, sizeof(tio));
+
+ /* 115200 baud, 8n1, CTS/RTS flow control. */
+ tio.c_cflag = B115200 | CRTSCTS | CS8 | CLOCAL | CREAD;
+
+ /* Flush TTY and set new termios. */
+ result = tcflush(fd, TCIOFLUSH);
+ if (result)
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+
+
+ result = tcsetattr(fd, TCSANOW, &tio);
+ if (result)
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+
+
+ /* Change line discipline for the tty device and keep it open */
+ result = ioctl(fd, TIOCSETD, &ldiscnr);
+ if (result < 0)
+ /* Nothing is logged; the daemon just exits with a failure status */
+ exit(EXIT_FAILURE);
+
+
+ /* The Big Loop: do nothing but keep the process, and thereby fd, alive */
+ while (1)
+ sleep(0x7FFFFFFF);
+
+
+ close(fd); /* NOTE(review): not reached, the loop above never terminates */
+
+ exit(EXIT_SUCCESS);
+}
--
1.6.0.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox