Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v13 01/16] Add a new structure for skb buffer from external.
From: xiaohui.xin @ 2010-10-15  9:12 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <1287133937-5538-1-git-send-email-xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/skbuff.h |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 77eb60d..696e690 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -211,6 +211,15 @@ struct skb_shared_info {
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
+/* The structure is for a skb which pages may point to
+ * an external buffer, which is not allocated from kernel space.
+ * It also contains a destructor for itself.
+ */
+struct skb_ext_page {
+	struct		page *page;
+	void		(*dtor)(struct skb_ext_page *);
+};
+
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
-- 
1.7.3

^ permalink raw reply related

* Re: [PATCH net-next] can-raw: add msg_flags to distinguish local traffic
From: Daniel Baluta @ 2010-10-15  9:09 UTC (permalink / raw)
  To: Kurt Van Dijck
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w, netdev, Oliver Hartkopp
In-Reply-To: <20101015073709.GA387-MxZ6Iy/zr/UdbCeoMzGj59i2O/JbrIOy@public.gmane.org>

On Fri, Oct 15, 2010 at 10:37 AM, Kurt Van Dijck <kurt.van.dijck-/BeEPy95v10@public.gmane.org> wrote:
> CAN has no addressing scheme. It is currently impossible
> for userspace to tell is a received CAN frame comes from
> another process on the local host, or from a remote CAN
> device.
> This patch add support for userspace applications to distinguish
> between 'own', 'local' and 'remote' CAN traffic.
> Distinction is made by returning some flags in msg->msg_flags
> in the call to recvmsg.
> MSG_CONFIRM flag means 'own', as in 'transmission confirmation'
> MSG_DONTROUTE flag means 'local', not routed.
> Obviously, msgs with MSG_CONFIRM will have MSG_DONTROUTE set too.
>
> Please note that on SocketCAN mailing list, different opinions
> exist on the exact meaning of MSG_DONTROUTE. Better (=more
> intuitive) alternatives are appreciated.
>
> Signed-off-by: Kurt Van Dijck <kurt.van.dijck-AgBVmzD5pcezQB+pC5nmwQ@public.gmane.org>
> ---
>  net/can/raw.c |   33 ++++++++++++++++++++++++++++++---
>  1 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/net/can/raw.c b/net/can/raw.c
> index 7d77e67..f98709e 100644
> --- a/net/can/raw.c
> +++ b/net/can/raw.c
> @@ -90,23 +90,39 @@ struct raw_sock {
>        can_err_mask_t err_mask;
>  };
>
> +/*
> + * return some space to store extra msg flags in.
> + * We use 1 int beyond the 'struct sockaddr_can' in skb->cb
> + * to store those.
> + * These flags will be use in raw_recvmsg()
> + */
> +static inline int *raw_flags(struct sk_buff *skb)
> +{
> +       BUILD_BUG_ON(sizeof(skb->cb)
> +                       <= (sizeof(struct sockaddr_can) + sizeof(int)));
> +       /* return pointer after struct sockaddr_can */
> +       return (int *)(&((struct sockaddr_can *)skb->cb)[1]);

Since msg_flags is unsigned, it would be nice if this function returns unsigned.

> +}
> +
>  static inline struct raw_sock *raw_sk(const struct sock *sk)
>  {
>        return (struct raw_sock *)sk;
>  }
>
> -static void raw_rcv(struct sk_buff *skb, void *data)
> +static void raw_rcv(struct sk_buff *oskb, void *data)
>  {
>        struct sock *sk = (struct sock *)data;
>        struct raw_sock *ro = raw_sk(sk);
>        struct sockaddr_can *addr;
> +       struct sk_buff *skb;
> +       int *pflags;
>
>        /* check the received tx sock reference */
> -       if (!ro->recv_own_msgs && skb->sk == sk)
> +       if (!ro->recv_own_msgs && oskb->sk == sk)
>                return;
>
>        /* clone the given skb to be able to enqueue it into the rcv queue */
> -       skb = skb_clone(skb, GFP_ATOMIC);
> +       skb = skb_clone(oskb, GFP_ATOMIC);
>        if (!skb)
>                return;
>
> @@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data)
>        addr->can_family  = AF_CAN;
>        addr->can_ifindex = skb->dev->ifindex;
>
> +       /* prepare the flags for raw_recvmsg() */
> +       pflags = raw_flags(skb);
> +       *pflags = 0;
> +       if (oskb->sk)
> +               *pflags |= MSG_DONTROUTE;
> +       if (oskb->sk == sk)
> +               *pflags |= MSG_CONFIRM;
> +
>        if (sock_queue_rcv_skb(sk, skb) < 0)
>                kfree_skb(skb);
>  }
> @@ -707,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
>                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
>        }
>
> +       /* assign the flags that have been recorded in in raw_rcv() */
small typo: double in
> +       msg->msg_flags |= *(raw_flags(skb));
> +
>        skb_free_datagram(sk, skb);
>
>        return size;

thanks,
Daniel.

^ permalink raw reply

* Re: VLAN packets silently dropped in promiscuous mode
From: Guillaume Gaudonville @ 2010-10-15  9:16 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Roger Luethi, netdev, Patrick McHardy
In-Reply-To: <AANLkTikrfx1T3ay0AGWkS6Ab28J4O_er2TsbRinBSGen@mail.gmail.com>

Jesse Gross wrote:
> On Thu, Sep 30, 2010 at 1:07 AM, Roger Luethi <rl@hellgate.ch> wrote:
>   
>> On Wed, 29 Sep 2010 10:44:26 -0700, Jesse Gross wrote:
>>     
>>> On Wed, Sep 29, 2010 at 4:37 AM, Roger Luethi <rl@hellgate.ch> wrote:
>>>       
>>>> I noticed packets for unknown VLANs getting silently dropped even in
>>>> promiscuous mode (this is true only for the hardware accelerated path).
>>>> netif_nit_deliver was introduced specifically to prevent that, but the
>>>> function gets called only _after_ packets from unknown VLANs have been
>>>> dropped.
>>>>         
>>> Some drivers are fixing this on a case by case basis by disabling
>>> hardware accelerated VLAN stripping when in promiscuous mode, i.e.:
>>> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=5f6c01819979afbfec7e0b15fe52371b8eed87e8
>>>
>>> However, at this point it is more or less random which drivers do
>>> this.  It would obviously be much better if it were consistent.
>>>       
>> My understanding is this. Hardware VLAN tagging and stripping can always be
>> enabled. The kernel passes 802.1Q information along with the stripped
>> header to libpcap which reassembles the original header where necessary.
>> Works for me.
>>     
>
> Sorry, I misread your original post as saying that the VLAN header
> gets dropped, rather than the entire packet.  I agree that this is how
> it should work but not necessarily how it does work (again, depending
> on the driver).  Here's the problem that I was talking about:
>
> Most drivers have a snippet of code that looks something like this
> (taken from ixgbe):
>
> if (adapter->vlgrp && is_vlan && (tag & VLAN_VID_MASK))
> 	vlan_gro_receive(napi, adapter->vlgrp, tag, skb);
> else
> 	napi_gro_receive(napi, skb);
>
> At this point the VLAN has already been stripped in hardware.  If
> there is no VLAN group configured on the device then we hit the second
> case.  The VLAN header was removed from the SKB and the tag variable
> is unused.  It is no longer possible for libpcap to reconstruct the
> header because the information was thrown away (even the fact that
> there was a VLAN tag at all).
>
> There are a couple ways to fix this:
>
> * Turn off VLAN stripping when in promiscuous mode (as done by the ixgbe driver)
>   
This is not totally true: if changing the MTU ixgbe_change_mtu will call:
 ixgbe_reinit_locked--> ixgbe_up --> ixgbe_configure:
                 --> ixgbe_set_rx_mode: flag IFF_PROMISC is tested 
ixgbe_vlan_filter_enable is not called
                 --> ixgbe_restore_vlan --> ixgbe_vlan_rx_register: flag 
IFF_PROMISC is not tested ixgbe_vlan_filter_enable
                      will be called.

In fact it should happen each time we configure something which needs a 
reset of the device. Why don't add a test
on flag promiscuous directly in ixgbe_vlan_filter_enable? Or do it on 
each call, if we want to allow a device in promiscuous
mode to enable this feature.

What do you think?

> * Reconstruct the VLAN header when there is no VLAN group (as done by
> the tg3 driver)
>   
> A bunch of drivers do neither (bnx2x, for example) and exhibit this
> problem.  It's getting better but it seems like a common issue.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   


-- 
Guillaume Gaudonville
6WIND
Software Engineer

Tel: +33 1 39 30 92 63
Mob: +33 6 47 85 34 33
Fax: +33 1 39 30 92 11
guillaume.gaudonville@6wind.com
www.6wind.com
Join the Multicore Packet Processing Forum: www.multicorepacketprocessing.com

Ce courriel ainsi que toutes les pièces jointes, est uniquement destiné à son ou ses destinataires. Il contient des informations confidentielles qui sont la propriété de 6WIND. Toute révélation, distribution ou copie des informations qu'il contient est strictement interdite. Si vous avez reçu ce message par erreur, veuillez immédiatement le signaler à l'émetteur et détruire toutes les données reçues

This e-mail message, including any attachments, is for the sole use of the intended recipient(s) and contains information that is confidential and proprietary to 6WIND. All unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply e-mail and destroy all copies of the original message.


^ permalink raw reply

* Re: [Bugme-new] [Bug 19692] New: linux-2.6.36-rc5 crash with gianfar ethernet at full line rate traffic
From: Jarek Poplawski @ 2010-10-15  8:58 UTC (permalink / raw)
  To: emin ak
  Cc: Andrew Morton, netdev, bugzilla-daemon, bugme-daemon,
	Anton Vorontsov
In-Reply-To: <20101010103236.GA1919@del.dom.local>

On 2010-10-10 12:32, Jarek Poplawski wrote:
> On Sat, Oct 09, 2010 at 03:10:25PM +0300, emin ak wrote:
>> Hi Jarek,
>> My test setup is at my office so that I will try your patch on Monday
>> immediately and turn you the results.
> 
> No need to hurry, especially on Mondays, but thanks for the info.

Hi Emin,
Are there any problems with this testing?

Jarek P.

^ permalink raw reply

* (unknown), 
From: WESTERN UNION TRANSFER @ 2010-10-15  8:56 UTC (permalink / raw)





-- 
My working partner has helped me to send your
first payment of US$7,500 to you as
instructed by Mr. David Cameron and will
keep sending you US$7,500 twice a week until
the payment of (US$360,000) is completed
within six months and here is the information
below:

MONEY TRANSFER CONTROL NUMBER (MTCN):
291-371-8010

SENDER'S NAME:Solomon Daniel
AMOUNT: US$7,500

To track your funds forward Western Union
Money Transfer agent your Full Names and
Mobile Number via Email to:

Mr Gary Moore
E-mail:western-union.transfer02@w.cn
D/L: +447024044997

Please direct all enquiring to:
western-union.transfer02@w.cn

Best Regards,
Mr Gary Moore.





^ permalink raw reply

* [PATCH v13 12/16] Add mp(mediate passthru) device.
From: xiaohui.xin @ 2010-10-15  9:12 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1287132437.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The patch add mp(mediate passthru) device, which now
based on vhost-net backend driver and provides proto_ops
to send/receive guest buffers data from/to guest vitio-net
driver.
It also exports async functions which can be used by other
drivers like macvtap to utilize zero-copy too.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 drivers/vhost/mpassthru.c | 1380 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 1380 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 0000000..5389f3e
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1380 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME        "mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include <linux/compat.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/poll.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/aio.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/miscdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/crc32.h>
+#include <linux/nsproxy.h>
+#include <linux/uaccess.h>
+#include <linux/virtio_net.h>
+#include <linux/mpassthru.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+#include <asm/system.h>
+
+struct mp_struct {
+	struct mp_file		*mfile;
+	struct net_device       *dev;
+	struct page_pool	*pool;
+	struct socket           socket;
+	struct socket_wq	wq;
+	struct mm_struct	*mm;
+};
+
+struct mp_file {
+	atomic_t count;
+	struct mp_struct *mp;
+	struct net *net;
+};
+
+struct mp_sock {
+	struct sock		sk;
+	struct mp_struct	*mp;
+};
+
+/* The main function to allocate external buffers */
+static struct skb_ext_page *page_ctor(struct mp_port *port,
+				      struct sk_buff *skb,
+				      int npages)
+{
+	int i;
+	unsigned long flags;
+	struct page_pool *pool;
+	struct page_info *info = NULL;
+
+	if (npages != 1)
+		BUG();
+	pool = container_of(port, struct page_pool, port);
+
+	spin_lock_irqsave(&pool->read_lock, flags);
+	if (!list_empty(&pool->readq)) {
+		info = list_first_entry(&pool->readq, struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&pool->read_lock, flags);
+	if (!info)
+		return NULL;
+
+	for (i = 0; i < info->pnum; i++)
+		get_page(info->pages[i]);
+	info->skb = skb;
+	return &info->ext_page;
+}
+
+static struct page_info *mp_hash_lookup(struct page_pool *pool,
+					struct page *page);
+static struct page_info *mp_hash_delete(struct page_pool *pool,
+					struct page_info *info);
+
+static struct skb_ext_page *mp_lookup(struct net_device *dev,
+				      struct page *page)
+{
+	struct mp_struct *mp =
+		container_of(dev->mp_port->sock->sk, struct mp_sock, sk)->mp;
+	struct page_pool *pool = mp->pool;
+	struct page_info *info;
+
+	info = mp_hash_lookup(pool, page);
+	if (!info)
+		return NULL;
+	return &info->ext_page;
+}
+
+struct page_pool *page_pool_create(struct net_device *dev,
+				  struct socket *sock)
+{
+	struct page_pool *pool;
+	int rc;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return NULL;
+	rc = netdev_mp_port_prep(dev, &pool->port);
+	if (rc)
+		goto fail;
+
+	INIT_LIST_HEAD(&pool->readq);
+	spin_lock_init(&pool->read_lock);
+	pool->hash_table =
+		kzalloc(sizeof(struct page_info *) * HASH_BUCKETS, GFP_KERNEL);
+	if (!pool->hash_table)
+		goto fail;
+	dev_hold(dev);
+	pool->dev = dev;
+	pool->port.ctor = page_ctor;
+	pool->port.sock = sock;
+	pool->port.hash = mp_lookup;
+	pool->locked_pages = 0;
+	pool->cur_pages = 0;
+	pool->orig_locked_vm = 0;
+	/* should be protected by rtnl_lock() */
+	dev->mp_port = &pool->port;
+	return pool;
+fail:
+	kfree(pool);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(page_pool_create);
+
+void dev_change_state(struct net_device *dev)
+{
+	dev_change_flags(dev, dev->flags & (~IFF_UP));
+	dev_change_flags(dev, dev->flags | IFF_UP);
+}
+EXPORT_SYMBOL_GPL(dev_change_state);
+
+static int mp_page_pool_attach(struct mp_struct *mp, struct page_pool *pool)
+{
+	int rc = 0;
+	/* should be protected by mp_mutex */
+	if (mp->pool) {
+		rc = -EBUSY;
+		goto fail;
+	}
+	if (mp->dev != pool->dev) {
+		rc = -EFAULT;
+		goto fail;
+	}
+	mp->pool = pool;
+	return 0;
+fail:
+	kfree(pool->hash_table);
+	kfree(pool);
+	dev_put(mp->dev);
+	return rc;
+}
+
+struct page_info *info_dequeue(struct page_pool *pool)
+{
+	unsigned long flags;
+	struct page_info *info = NULL;
+	spin_lock_irqsave(&pool->read_lock, flags);
+	if (!list_empty(&pool->readq)) {
+		info = list_first_entry(&pool->readq,
+				struct page_info, list);
+		list_del(&info->list);
+	}
+	spin_unlock_irqrestore(&pool->read_lock, flags);
+	return info;
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+	struct page_info *info = (struct page_info *)(iocb->private);
+	int i;
+
+	if (info->flags == INFO_READ) {
+		for (i = 0; i < info->pnum; i++) {
+			if (info->pages[i]) {
+				set_page_dirty_lock(info->pages[i]);
+				put_page(info->pages[i]);
+			}
+		}
+		mp_hash_delete(info->pool, info);
+		if (info->skb) {
+			info->skb->destructor = NULL;
+			kfree_skb(info->skb);
+		}
+	}
+	/* Decrement the number of locked pages */
+	info->pool->cur_pages -= info->pnum;
+	kmem_cache_free(ext_page_info_cache, info);
+
+	return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+	struct kiocb *iocb = NULL;
+
+	iocb = info->iocb;
+	if (!iocb)
+		return iocb;
+	iocb->ki_flags = 0;
+	iocb->ki_users = 1;
+	iocb->ki_key = 0;
+	iocb->ki_ctx = NULL;
+	iocb->ki_cancel = NULL;
+	iocb->ki_retry = NULL;
+	iocb->ki_eventfd = NULL;
+	iocb->ki_pos = info->desc_pos;
+	iocb->ki_nbytes = size;
+	iocb->ki_dtor(iocb);
+	iocb->private = (void *)info;
+	iocb->ki_dtor = mp_ki_dtor;
+
+	return iocb;
+}
+
+void page_pool_destroy(struct mm_struct *mm, struct page_pool *pool)
+{
+	struct page_info *info;
+	int i;
+
+	while ((info = info_dequeue(pool))) {
+		for (i = 0; i < info->pnum; i++)
+			if (info->pages[i])
+				put_page(info->pages[i]);
+		create_iocb(info, 0);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	down_write(&mm->mmap_sem);
+	mm->locked_vm -= pool->locked_pages;
+	up_write(&mm->mmap_sem);
+
+	/* should be locked by rtnl_lock */
+	pool->dev->mp_port = NULL;
+	dev_put(pool->dev);
+	kfree(pool->hash_table);
+	kfree(pool);
+}
+EXPORT_SYMBOL_GPL(page_pool_destroy);
+
+static void mp_page_pool_detach(struct mp_struct *mp)
+{
+	/* locked by mp_mutex */
+	if (mp->pool) {
+		page_pool_destroy(mp->mm, mp->pool);
+		mp->pool = NULL;
+	}
+}
+
+static void __mp_detach(struct mp_struct *mp)
+{
+	mp->mfile = NULL;
+
+	dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
+	mp_page_pool_detach(mp);
+	dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
+
+	/* Drop the extra count on the net device */
+	dev_put(mp->dev);
+}
+
+static DEFINE_MUTEX(mp_mutex);
+
+static void mp_detach(struct mp_struct *mp)
+{
+	mutex_lock(&mp_mutex);
+	__mp_detach(mp);
+	mutex_unlock(&mp_mutex);
+}
+
+static struct mp_struct *mp_get(struct mp_file *mfile)
+{
+	struct mp_struct *mp = NULL;
+	if (atomic_inc_not_zero(&mfile->count))
+		mp = mfile->mp;
+
+	return mp;
+}
+
+static void mp_put(struct mp_file *mfile)
+{
+	if (atomic_dec_and_test(&mfile->count)) {
+		if (!rtnl_is_locked()) {
+			rtnl_lock();
+			mp_detach(mfile->mp);
+			rtnl_unlock();
+		} else
+			mp_detach(mfile->mp);
+	}
+}
+
+static void iocb_tag(struct kiocb *iocb)
+{
+	iocb->ki_flags = 1;
+}
+
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_ext_page *ext_page)
+{
+	struct page_info *info;
+	struct page_pool *pool;
+	struct sock *sk;
+	struct sk_buff *skb;
+
+	if (!ext_page)
+		return;
+	info = container_of(ext_page, struct page_info, ext_page);
+	if (!info)
+		return;
+	pool = info->pool;
+	skb = info->skb;
+
+	if (info->flags == INFO_READ) {
+		create_iocb(info, 0);
+		return;
+	}
+
+	/* For transmit, we should wait for the DMA finish by hardware.
+	 * Queue the notifier to wake up the backend driver
+	 */
+
+	iocb_tag(info->iocb);
+	sk = pool->port.sock->sk;
+	sk->sk_write_space(sk);
+
+	return;
+}
+
+/* For small exteranl buffers transmit, we don't need to call
+ * get_user_pages().
+ */
+static struct page_info *alloc_small_page_info(struct page_pool *pool,
+		struct kiocb *iocb, int total)
+{
+	struct page_info *info =
+		kmem_cache_alloc(ext_page_info_cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	info->ext_page.dtor = page_dtor;
+	info->pool = pool;
+	info->flags = INFO_WRITE;
+	info->iocb = iocb;
+	info->pnum = 0;
+	return info;
+}
+
+typedef u32 key_mp_t;
+static inline key_mp_t mp_hash(struct page *page, int buckets)
+{
+	key_mp_t k;
+#if BITS_PER_LONG == 64
+	k = ((((unsigned long)page << 32UL) >> 32UL) /
+			sizeof(struct page)) % buckets ;
+#elif BITS_PER_LONG == 32
+	k = ((unsigned long)page / sizeof(struct page)) % buckets;
+#endif
+
+	return k;
+}
+
+static void mp_hash_insert(struct page_pool *pool,
+		struct page *page, struct page_info *page_info)
+{
+	struct page_info *tmp;
+	key_mp_t key = mp_hash(page, HASH_BUCKETS);
+	if (!pool->hash_table[key]) {
+		pool->hash_table[key] = page_info;
+		return;
+	}
+
+	tmp = pool->hash_table[key];
+	while (tmp->next)
+		tmp = tmp->next;
+
+	tmp->next = page_info;
+	page_info->prev = tmp;
+	return;
+}
+
+static struct page_info *mp_hash_delete(struct page_pool *pool,
+					struct page_info *info)
+{
+	key_mp_t key = mp_hash(info->pages[0], HASH_BUCKETS);
+	struct page_info *tmp = NULL;
+
+	tmp = pool->hash_table[key];
+	while (tmp) {
+		if (tmp == info) {
+			if (!tmp->prev) {
+				pool->hash_table[key] = tmp->next;
+				if (tmp->next)
+					tmp->next->prev = NULL;
+			} else {
+				tmp->prev->next = tmp->next;
+				if (tmp->next)
+					tmp->next->prev = tmp->prev;
+			}
+			return tmp;
+		}
+		tmp = tmp->next;
+	}
+	return tmp;
+}
+
+static struct page_info *mp_hash_lookup(struct page_pool *pool,
+					struct page *page)
+{
+	key_mp_t key = mp_hash(page, HASH_BUCKETS);
+	struct page_info *tmp = NULL;
+
+	int i;
+	tmp = pool->hash_table[key];
+	while (tmp) {
+		for (i = 0; i < tmp->pnum; i++) {
+			if (tmp->pages[i] == page)
+				return tmp;
+		}
+		tmp = tmp->next;
+	}
+	return tmp;
+}
+
+/* The main function to transform the guest user space address
+ * to host kernel address via get_user_pages(). Thus the hardware
+ * can do DMA directly to the external buffer address.
+ */
+static struct page_info *alloc_page_info(struct page_pool *pool,
+		struct kiocb *iocb, struct iovec *iov,
+		int count, struct frag *frags,
+		int npages, int total)
+{
+	int rc;
+	int i, j, n = 0;
+	int len;
+	unsigned long base;
+	struct page_info *info = NULL;
+
+	if (pool->cur_pages + count > pool->locked_pages) {
+		printk(KERN_INFO "Exceed memory lock rlimt.");
+		return NULL;
+	}
+
+	info = kmem_cache_alloc(ext_page_info_cache, GFP_KERNEL);
+
+	if (!info)
+		return NULL;
+	info->skb = NULL;
+	info->next = info->prev = NULL;
+
+	for (i = j = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+
+		if (!len)
+			continue;
+		n = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+
+		rc = get_user_pages_fast(base, n, npages ? 1 : 0,
+				&info->pages[j]);
+		if (rc != n)
+			goto failed;
+
+		while (n--) {
+			frags[j].offset = base & ~PAGE_MASK;
+			frags[j].size = min_t(int, len,
+					PAGE_SIZE - frags[j].offset);
+			len -= frags[j].size;
+			base += frags[j].size;
+			j++;
+		}
+	}
+
+#ifdef CONFIG_HIGHMEM
+	if (npages && !(dev->features & NETIF_F_HIGHDMA)) {
+		for (i = 0; i < j; i++) {
+			if (PageHighMem(info->pages[i]))
+				goto failed;
+		}
+	}
+#endif
+
+	info->ext_page.dtor = page_dtor;
+	info->ext_page.page = info->pages[0];
+	info->pool = pool;
+	info->pnum = j;
+	info->iocb = iocb;
+	if (!npages)
+		info->flags = INFO_WRITE;
+	else
+		info->flags = INFO_READ;
+
+	if (info->flags == INFO_READ) {
+		if (frags[0].offset == 0 && iocb->ki_iovec[0].iov_len) {
+			frags[0].offset = iocb->ki_iovec[0].iov_len;
+			pool->port.vnet_hlen = iocb->ki_iovec[0].iov_len;
+		}
+		for (i = 0; i < j; i++)
+			mp_hash_insert(pool, info->pages[i], info);
+	}
+	/* increment the number of locked pages */
+	pool->cur_pages += j;
+	return info;
+
+failed:
+	for (i = 0; i < j; i++)
+		put_page(info->pages[i]);
+
+	kmem_cache_free(ext_page_info_cache, info);
+
+	return NULL;
+}
+
+static void mp_sock_destruct(struct sock *sk)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+	wait_queue_head_t *wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_sync_poll(wqueue, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+	wait_queue_head_t *wqueue = sk_sleep(sk);
+	if (wqueue && waitqueue_active(wqueue))
+		wake_up_interruptible_sync_poll(wqueue, POLLOUT);
+}
+
+void async_data_ready(struct sock *sk, struct page_pool *pool)
+{
+	struct sk_buff *skb = NULL;
+	struct page_info *info = NULL;
+	int len;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		struct page *page;
+		int off;
+		int size = 0, i = 0;
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+		struct skb_ext_page *ext_page =
+			(struct skb_ext_page *)(shinfo->destructor_arg);
+		struct virtio_net_hdr_mrg_rxbuf hdr = {
+			.hdr.flags = 0,
+			.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+		};
+
+		if (skb->ip_summed == CHECKSUM_COMPLETE)
+			printk(KERN_INFO "Complete checksum occurs\n");
+
+		if (shinfo->frags[0].page == ext_page->page) {
+			info = container_of(ext_page,
+					    struct page_info,
+					    ext_page);
+			if (shinfo->nr_frags)
+				hdr.num_buffers = shinfo->nr_frags;
+			else
+				hdr.num_buffers = shinfo->nr_frags + 1;
+		} else {
+			info = container_of(ext_page,
+					    struct page_info,
+					    ext_page);
+			hdr.num_buffers = shinfo->nr_frags + 1;
+		}
+		skb_push(skb, ETH_HLEN);
+
+		if (skb_is_gso(skb)) {
+			hdr.hdr.hdr_len = skb_headlen(skb);
+			hdr.hdr.gso_size = shinfo->gso_size;
+			if (shinfo->gso_type & SKB_GSO_TCPV4)
+				hdr.hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (shinfo->gso_type & SKB_GSO_TCPV6)
+				hdr.hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (shinfo->gso_type & SKB_GSO_UDP)
+				hdr.hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (shinfo->gso_type & SKB_GSO_TCP_ECN)
+				hdr.hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+
+		} else
+			hdr.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			hdr.hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			hdr.hdr.csum_start =
+				skb->csum_start - skb_headroom(skb);
+			hdr.hdr.csum_offset = skb->csum_offset;
+		}
+
+		off = info->hdr[0].iov_len;
+		len = memcpy_toiovec(info->iov, (unsigned char *)&hdr, off);
+		if (len) {
+			pr_debug("Unable to write vnet_hdr at addr '%p': '%d'\n",
+				info->iov, len);
+			goto clean;
+		}
+
+		memcpy_toiovec(info->iov, skb->data, skb_headlen(skb));
+
+		info->iocb->ki_left = hdr.num_buffers;
+		if (shinfo->frags[0].page == ext_page->page) {
+			size = shinfo->frags[0].size +
+				shinfo->frags[0].page_offset - off;
+			i = 1;
+		} else {
+			size = skb_headlen(skb);
+			i = 0;
+		}
+		create_iocb(info, off + size);
+		for (i = i; i < shinfo->nr_frags; i++) {
+			page = shinfo->frags[i].page;
+			info = mp_hash_lookup(pool, shinfo->frags[i].page);
+			create_iocb(info, shinfo->frags[i].size);
+		}
+		info->skb = skb;
+		shinfo->nr_frags = 0;
+		shinfo->destructor_arg = NULL;
+		continue;
+clean:
+		kfree_skb(skb);
+		for (i = 0; i < info->pnum; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	return;
+}
+EXPORT_SYMBOL_GPL(async_data_ready);
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+	struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+	struct page_pool *pool = NULL;
+
+	pool = mp->pool;
+	if (!pool)
+		return;
+	return async_data_ready(sk, pool);
+}
+
+static inline struct sk_buff *mp_alloc_skb(struct sock *sk, size_t prepad,
+					   size_t len, size_t linear,
+					   int noblock, int *err)
+{
+	struct sk_buff *skb;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (prepad + len < PAGE_SIZE || !linear)
+		linear = len;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+			err);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, prepad);
+	skb_put(skb, linear);
+	skb->data_len = len - linear;
+	skb->len += len - linear;
+
+	return skb;
+}
+
+static int mp_skb_from_vnet_hdr(struct sk_buff *skb,
+		struct virtio_net_hdr *vnet_hdr)
+{
+	unsigned short gso_type = 0;
+	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+		case VIRTIO_NET_HDR_GSO_TCPV4:
+			gso_type = SKB_GSO_TCPV4;
+			break;
+		case VIRTIO_NET_HDR_GSO_TCPV6:
+			gso_type = SKB_GSO_TCPV6;
+			break;
+		case VIRTIO_NET_HDR_GSO_UDP:
+			gso_type = SKB_GSO_UDP;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+			gso_type |= SKB_GSO_TCP_ECN;
+
+		if (vnet_hdr->gso_size == 0)
+			return -EINVAL;
+	}
+
+	if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		if (!skb_partial_csum_set(skb, vnet_hdr->csum_start,
+					vnet_hdr->csum_offset))
+			return -EINVAL;
+	}
+
+	if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		skb_shinfo(skb)->gso_size = vnet_hdr->gso_size;
+		skb_shinfo(skb)->gso_type = gso_type;
+
+		/* Header must be checked, and gso_segs computed. */
+		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+		skb_shinfo(skb)->gso_segs = 0;
+	}
+	return 0;
+}
+
+int async_sendmsg(struct sock *sk, struct kiocb *iocb, struct page_pool *pool,
+		  struct iovec *iov, int count)
+{
+	struct virtio_net_hdr vnet_hdr = {0};
+	int hdr_len = 0;
+	struct page_info *info = NULL;
+	struct frag frags[MAX_SKB_FRAGS];
+	struct sk_buff *skb;
+	int total = 0, header, n, i, len, rc;
+	unsigned long base;
+
+	total = iov_length(iov, count);
+
+	if (total < ETH_HLEN)
+		return -EINVAL;
+
+	if (total <= COPY_THRESHOLD)
+		goto copy;
+
+	n = 0;
+	for (i = 0; i < count; i++) {
+		base = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		if (!len)
+			continue;
+		n += ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		if (n > MAX_SKB_FRAGS)
+			return -EINVAL;
+	}
+
+copy:
+	hdr_len = sizeof(vnet_hdr);
+	if ((total - iocb->ki_iovec[0].iov_len) < 0)
+		return -EINVAL;
+
+	rc = memcpy_fromiovecend((void *)&vnet_hdr, iocb->ki_iovec, 0, hdr_len);
+	if (rc < 0)
+		return -EINVAL;
+
+	if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+			vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
+			vnet_hdr.hdr_len)
+		vnet_hdr.hdr_len = vnet_hdr.csum_start +
+			vnet_hdr.csum_offset + 2;
+
+	if (vnet_hdr.hdr_len > total)
+		return -EINVAL;
+
+	header = total > COPY_THRESHOLD ? COPY_HDR_LEN : total;
+
+	skb = mp_alloc_skb(sk, NET_IP_ALIGN, header,
+			iocb->ki_iovec[0].iov_len, 1, &rc);
+	if (!skb)
+		goto drop;
+
+	skb_set_network_header(skb, ETH_HLEN);
+	memcpy_fromiovec(skb->data, iov, header);
+
+	skb_reset_mac_header(skb);
+	skb->protocol = eth_hdr(skb)->h_proto;
+
+	rc = mp_skb_from_vnet_hdr(skb, &vnet_hdr);
+	if (rc)
+		goto drop;
+
+	if (header == total) {
+		rc = total;
+		info = alloc_small_page_info(pool, iocb, total);
+	} else {
+		info = alloc_page_info(pool, iocb, iov, count, frags, 0, total);
+		if (info)
+			for (i = 0; i < info->pnum; i++) {
+				skb_add_rx_frag(skb, i, info->pages[i],
+						frags[i].offset, frags[i].size);
+				info->pages[i] = NULL;
+			}
+	}
+	if (!pool->cur_pages)
+		sk->sk_state_change(sk);
+
+	if (info != NULL) {
+		info->desc_pos = iocb->ki_pos;
+		info->skb = skb;
+		skb_shinfo(skb)->destructor_arg = &info->ext_page;
+		skb->dev = pool->dev;
+		create_iocb(info, total);
+		dev_queue_xmit(skb);
+		return 0;
+	}
+drop:
+	kfree_skb(skb);
+	if (info) {
+		for (i = 0; i < info->pnum; i++)
+			put_page(info->pages[i]);
+		kmem_cache_free(ext_page_info_cache, info);
+	}
+	pool->dev->stats.tx_dropped++;
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(async_sendmsg);
+
+static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_pool *pool;
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+
+	pool = mp->pool;
+	if (!pool)
+		return -ENODEV;
+	return async_sendmsg(sock->sk, iocb, pool, iov, count);
+}
+
+int async_recvmsg(struct kiocb *iocb, struct page_pool *pool,
+		  struct iovec *iov, int count, int flags)
+{
+	int npages, payload;
+	struct page_info *info;
+	struct frag frags[MAX_SKB_FRAGS];
+	unsigned long base;
+	int i, len;
+	unsigned long flag;
+
+	if (!(flags & MSG_DONTWAIT))
+		return -EINVAL;
+
+	if (!pool)
+		return -EINVAL;
+
+	/* Error detections in case invalid external buffer */
+	if (count > 2 && iov[1].iov_len < pool->port.hdr_len &&
+			pool->dev->features & NETIF_F_SG) {
+		return -EINVAL;
+	}
+
+	npages = pool->port.npages;
+	payload = pool->port.data_len;
+
+	/* If KVM guest virtio-net FE driver use SG feature */
+	if (count > 2) {
+		for (i = 2; i < count; i++) {
+			base = (unsigned long)iov[i].iov_base & ~PAGE_MASK;
+			len = iov[i].iov_len;
+			if (npages == 1)
+				len = min_t(int, len, PAGE_SIZE - base);
+			else if (base)
+				break;
+			payload -= len;
+			if (payload <= 0)
+				goto proceed;
+			if (npages == 1 || (len & ~PAGE_MASK))
+				break;
+		}
+	}
+
+	if ((((unsigned long)iov[1].iov_base & ~PAGE_MASK)
+				- NET_SKB_PAD - NET_IP_ALIGN) >= 0)
+		goto proceed;
+
+	return -EINVAL;
+proceed:
+	/* skip the virtnet head */
+	if (count > 1) {
+		iov++;
+		count--;
+	}
+
+	/* Translate address to kernel */
+	info = alloc_page_info(pool, iocb, iov, count, frags, npages, 0);
+	if (!info)
+		return -ENOMEM;
+	info->hdr[0].iov_base = iocb->ki_iovec[0].iov_base;
+	info->hdr[0].iov_len = iocb->ki_iovec[0].iov_len;
+	iocb->ki_iovec[0].iov_len = 0;
+	iocb->ki_left = 0;
+	info->desc_pos = iocb->ki_pos;
+
+	if (count > 1) {
+		iov--;
+		count++;
+	}
+
+	memcpy(info->iov, iov, sizeof(struct iovec) * count);
+
+	spin_lock_irqsave(&pool->read_lock, flag);
+	list_add_tail(&info->list, &pool->readq);
+	spin_unlock_irqrestore(&pool->read_lock, flag);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(async_recvmsg);
+
+static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
+		struct msghdr *m, size_t total_len,
+		int flags)
+{
+	struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+	struct page_pool *pool;
+	struct iovec *iov = m->msg_iov;
+	int count = m->msg_iovlen;
+
+	pool = mp->pool;
+	if (!pool)
+		return -EINVAL;
+
+	return async_recvmsg(iocb, pool, iov, count, flags);
+}
+
+/* Ops structure to mimic raw sockets with mp device */
+static const struct proto_ops mp_socket_ops = {
+	.sendmsg = mp_sendmsg,
+	.recvmsg = mp_recvmsg,
+};
+
+static struct proto mp_proto = {
+	.name           = "mp",
+	.owner          = THIS_MODULE,
+	.obj_size       = sizeof(struct mp_sock),
+};
+
+static int mp_chr_open(struct inode *inode, struct file * file)
+{
+	struct mp_file *mfile;
+	cycle_kernel_lock();
+
+	pr_debug("mp: mp_chr_open\n");
+	mfile = kzalloc(sizeof(*mfile), GFP_KERNEL);
+	if (!mfile)
+		return -ENOMEM;
+	atomic_set(&mfile->count, 0);
+	mfile->mp = NULL;
+	mfile->net = get_net(current->nsproxy->net_ns);
+	file->private_data = mfile;
+	return 0;
+}
+
+static int mp_attach(struct mp_struct *mp, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	int err;
+
+	netif_tx_lock_bh(mp->dev);
+
+	err = -EINVAL;
+
+	if (mfile->mp)
+		goto out;
+
+	err = -EBUSY;
+	if (mp->mfile)
+		goto out;
+
+	err = 0;
+	mfile->mp = mp;
+	mp->mfile = mfile;
+	mp->socket.file = file;
+	dev_hold(mp->dev);
+	sock_hold(mp->socket.sk);
+	atomic_inc(&mfile->count);
+
+out:
+	netif_tx_unlock_bh(mp->dev);
+	return err;
+}
+
+static int do_unbind(struct mp_file *mfile)
+{
+	struct mp_struct *mp = mp_get(mfile);
+
+	if (!mp)
+		return -EINVAL;
+
+	mp_detach(mp);
+	sock_put(mp->socket.sk);
+	mp_put(mfile);
+	return 0;
+}
+
+static long mp_chr_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+	struct net_device *dev;
+	struct page_pool *pool;
+	void __user* argp = (void __user *)arg;
+	unsigned long  __user *limitp = argp;
+	struct ifreq ifr;
+	struct sock *sk;
+	unsigned long limit, locked, lock_limit;
+	int ret;
+
+	ret = -EINVAL;
+
+	switch (cmd) {
+	case MPASSTHRU_BINDDEV:
+		ret = -EFAULT;
+		if (copy_from_user(&ifr, argp, sizeof ifr))
+			break;
+
+		ifr.ifr_name[IFNAMSIZ-1] = '\0';
+
+		ret = -ENODEV;
+
+		rtnl_lock();
+		dev = dev_get_by_name(mfile->net, ifr.ifr_name);
+		if (!dev) {
+			rtnl_unlock();
+			break;
+		}
+
+		mutex_lock(&mp_mutex);
+
+		ret = -EBUSY;
+
+		/* the device can be only bind once */
+		if (dev_is_mpassthru(dev))
+			goto err_dev_put;
+
+		ret = -EFAULT;
+
+		if (!(dev->features & NETIF_F_SG)) {
+			pr_debug("The device has no SG features.\n");
+			goto err_dev_put;
+		}
+		mp = mfile->mp;
+		if (mp)
+			goto err_dev_put;
+
+		mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+		if (!mp) {
+			ret = -ENOMEM;
+			goto err_dev_put;
+		}
+		mp->dev = dev;
+		mp->mm = get_task_mm(current);
+		ret = -ENOMEM;
+
+		sk = sk_alloc(mfile->net, AF_UNSPEC, GFP_KERNEL, &mp_proto);
+		if (!sk)
+			goto err_free_mp;
+
+		mp->socket.wq = &mp->wq;
+		init_waitqueue_head(&mp->wq.wait);
+		mp->socket.ops = &mp_socket_ops;
+		sock_init_data(&mp->socket, sk);
+		sk->sk_sndbuf = INT_MAX;
+		container_of(sk, struct mp_sock, sk)->mp = mp;
+
+		sk->sk_destruct = mp_sock_destruct;
+		sk->sk_data_ready = mp_sock_data_ready;
+		sk->sk_write_space = mp_sock_write_space;
+		sk->sk_state_change = mp_sock_state_change;
+		ret = mp_attach(mp, file);
+		if (ret < 0)
+			goto err_free_sk;
+		pool = page_pool_create(dev, &mp->socket);
+		if (!pool)
+			goto err_free_sk;
+
+		ret = mp_page_pool_attach(mp, pool);
+		if (ret < 0)
+			goto err_free_sk;
+		dev_change_state(dev);
+out:
+		mutex_unlock(&mp_mutex);
+		rtnl_unlock();
+		break;
+err_free_sk:
+		sk_free(sk);
+err_free_mp:
+		kfree(mp);
+err_dev_put:
+		dev_put(dev);
+		goto out;
+
+	case MPASSTHRU_UNBINDDEV:
+		rtnl_lock();
+		ret = do_unbind(mfile);
+		rtnl_unlock();
+		break;
+
+	case MPASSTHRU_SET_MEM_LOCKED:
+		ret = copy_from_user(&limit, limitp, sizeof limit);
+		if (ret < 0)
+			return ret;
+
+		mp = mp_get(mfile);
+		if (!mp)
+			return -ENODEV;
+
+		mutex_lock(&mp_mutex);
+		if (mp->mm != current->mm) {
+			mutex_unlock(&mp_mutex);
+			return -EPERM;
+		}
+
+		limit = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+		down_write(&mp->mm->mmap_sem);
+		if (!mp->pool->locked_pages)
+			mp->pool->orig_locked_vm = mp->mm->locked_vm;
+		locked = limit + mp->pool->orig_locked_vm;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+		if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+			up_write(&mp->mm->mmap_sem);
+			mutex_unlock(&mp_mutex);
+			mp_put(mfile);
+			return -ENOMEM;
+		}
+		mp->mm->locked_vm = locked;
+		up_write(&mp->mm->mmap_sem);
+
+		mp->pool->locked_pages = limit;
+		mutex_unlock(&mp_mutex);
+
+		mp_put(mfile);
+		return 0;
+
+	case MPASSTHRU_GET_MEM_LOCKED_NEED:
+		limit = DEFAULT_NEED;
+		return copy_to_user(limitp, &limit, sizeof limit);
+
+
+	default:
+		break;
+	}
+	return ret;
+}
+
+static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp = mp_get(mfile);
+	struct sock *sk;
+	unsigned int mask = 0;
+
+	if (!mp)
+		return POLLERR;
+
+	sk = mp->socket.sk;
+
+	poll_wait(file, &mp->wq.wait, wait);
+
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		mask |= POLLIN | POLLRDNORM;
+
+	if (sock_writeable(sk) ||
+		(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+			 sock_writeable(sk)))
+		mask |= POLLOUT | POLLWRNORM;
+
+	if (mp->dev->reg_state != NETREG_REGISTERED)
+		mask = POLLERR;
+
+	mp_put(mfile);
+	return mask;
+}
+
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct mp_struct *mp = mp_get(file->private_data);
+	struct sock *sk = mp->socket.sk;
+	struct sk_buff *skb;
+	int len, err;
+	ssize_t result = 0;
+
+	if (!mp)
+		return -EBADFD;
+
+	/* currently, async is not supported.
+	 * but we may support real async aio from user application,
+	 * maybe qemu virtio-net backend.
+	 */
+	if (!is_sync_kiocb(iocb))
+		return -EFAULT;
+
+	len = iov_length(iov, count);
+
+	if (unlikely(len < ETH_HLEN))
+		return -EINVAL;
+
+	skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+				  file->f_flags & O_NONBLOCK, &err);
+
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reserve(skb, NET_IP_ALIGN);
+	skb_put(skb, len);
+
+	if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+		kfree_skb(skb);
+		return -EAGAIN;
+	}
+
+	skb->protocol = eth_type_trans(skb, mp->dev);
+	skb->dev = mp->dev;
+
+	dev_queue_xmit(skb);
+
+	mp_put(file->private_data);
+	return result;
+}
+
+static int mp_chr_close(struct inode *inode, struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+
+	/*
+	 * Ignore return value since an error only means there was nothing to
+	 * do
+	 */
+	rtnl_lock();
+	do_unbind(mfile);
+	rtnl_unlock();
+	put_net(mfile->net);
+	kfree(mfile);
+
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+static long mp_chr_compat_ioctl(struct file *f, unsigned int ioctl,
+				unsigned long arg)
+{
+	return mp_chr_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations mp_fops = {
+	.owner  = THIS_MODULE,
+	.llseek = no_llseek,
+	.write  = do_sync_write,
+	.aio_write = mp_chr_aio_write,
+	.poll   = mp_chr_poll,
+	.unlocked_ioctl = mp_chr_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = mp_chr_compat_ioctl,
+#endif
+	.open   = mp_chr_open,
+	.release = mp_chr_close,
+};
+
+static struct miscdevice mp_miscdev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "mp",
+	.nodename = "net/mp",
+	.fops = &mp_fops,
+};
+
+static int mp_device_event(struct notifier_block *unused,
+		unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct mp_port *port;
+	struct mp_struct *mp = NULL;
+	struct socket *sock = NULL;
+	struct sock *sk;
+
+	port = dev->mp_port;
+	if (port == NULL)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		sock = dev->mp_port->sock;
+		mp = container_of(sock->sk, struct mp_sock, sk)->mp;
+		do_unbind(mp->mfile);
+		break;
+	case NETDEV_CHANGE:
+		sk = dev->mp_port->sock->sk;
+		sk->sk_state_change(sk);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mp_notifier_block __read_mostly = {
+	.notifier_call  = mp_device_event,
+};
+
+static int mp_init(void)
+{
+	int err = 0;
+
+	ext_page_info_cache = kmem_cache_create("skb_page_info",
+						sizeof(struct page_info),
+						0, SLAB_HWCACHE_ALIGN, NULL);
+	if (!ext_page_info_cache)
+		return -ENOMEM;
+
+	err = misc_register(&mp_miscdev);
+	if (err) {
+		printk(KERN_ERR "mp: Can't register misc device\n");
+		kmem_cache_destroy(ext_page_info_cache);
+	} else {
+		printk(KERN_INFO "Registering mp misc device - minor = %d\n",
+				mp_miscdev.minor);
+		register_netdevice_notifier(&mp_notifier_block);
+	}
+	return err;
+}
+
+void mp_exit(void)
+{
+	unregister_netdevice_notifier(&mp_notifier_block);
+	misc_deregister(&mp_miscdev);
+	kmem_cache_destroy(ext_page_info_cache);
+}
+
+/* Get an underlying socket object from mp file.  Returns error unless file is
+ * attached to a device.  The returned object works like a packet socket, it
+ * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
+ * holding a reference to the file for as long as the socket is in use. */
+struct socket *mp_get_socket(struct file *file)
+{
+	struct mp_file *mfile = file->private_data;
+	struct mp_struct *mp;
+
+	if (file->f_op != &mp_fops)
+		return ERR_PTR(-EINVAL);
+	mp = mp_get(mfile);
+	if (!mp)
+		return ERR_PTR(-EBADFD);
+	mp_put(mfile);
+	return &mp->socket;
+}
+EXPORT_SYMBOL_GPL(mp_get_socket);
+
+module_init(mp_init);
+module_exit(mp_exit);
+MODULE_AUTHOR(DRV_COPYRIGHT);
+MODULE_DESCRIPTION(DRV_DESCRIPTION);
+MODULE_LICENSE("GPL v2");
-- 
1.7.3


^ permalink raw reply related

* [PATCH v13 10/16] Add a hook to intercept external buffers from NIC driver.
From: xiaohui.xin @ 2010-10-15  9:12 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1287132437.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The hook is called in __netif_receive_skb().

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhao81new@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 net/core/dev.c |   37 +++++++++++++++++++++++++++++++++++++
 1 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index e48639d..235eaab 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2814,6 +2814,37 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
 }
 EXPORT_SYMBOL(__skb_bond_should_drop);
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Add a hook to intercept mediate passthru(zero-copy) packets,
+ * and insert it to the socket queue owned by mp_port specially.
+ */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+					       struct packet_type **pt_prev,
+					       int *ret,
+					       struct net_device *orig_dev)
+{
+	struct mp_port *mp_port = NULL;
+	struct sock *sk = NULL;
+
+	if (!dev_is_mpassthru(skb->dev))
+		return skb;
+	mp_port = skb->dev->mp_port;
+
+	if (*pt_prev) {
+		*ret = deliver_skb(skb, *pt_prev, orig_dev);
+		*pt_prev = NULL;
+	}
+
+	sk = mp_port->sock->sk;
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+	sk->sk_state_change(sk);
+
+	return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev)     (skb)
+#endif
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -2891,6 +2922,11 @@ static int __netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+	/* To intercept mediate passthru(zero-copy) packets here */
+	skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+	if (!skb)
+		goto out;
+
 	/* Handle special case of bridge or macvlan */
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (rx_handler) {
@@ -2991,6 +3027,7 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
-- 
1.7.3


^ permalink raw reply related

* [PATCH v13 04/16] Add a function make external buffer owner to query capability.
From: xiaohui.xin @ 2010-10-15  9:12 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike; +Cc: Xin Xiaohui
In-Reply-To: <fc6e95d63a2c62aaf77f8ded22fc43ccefcdbbff.1287132437.git.xiaohui.xin@intel.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

The external buffer owner can use the functions to get
the capability of the underlying NIC driver.

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Zhao Yu <yzhaonew@gmail.com>
Reviewed-by: Jeff Dike <jdike@linux.intel.com>
---
 include/linux/netdevice.h |    2 +
 net/core/dev.c            |   49 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 575777f..8dcf6de 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1736,6 +1736,8 @@ extern gro_result_t	napi_frags_finish(struct napi_struct *napi,
 					  gro_result_t ret);
 extern struct sk_buff *	napi_frags_skb(struct napi_struct *napi);
 extern gro_result_t	napi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_prep(struct net_device *dev,
+				struct mp_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 660dd41..e48639d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2942,6 +2942,55 @@ out:
 	return ret;
 }
 
+/* To support meidate passthru(zero-copy) with NIC driver,
+ * we'd better query NIC driver for the capability it can
+ * provide, especially for packet split mode, now we only
+ * query for the header size, and the payload a descriptor
+ * may carry. If a driver does not use the API to export,
+ * then we may try to use a default value, currently,
+ * we use the default value from an IGB driver. Now,
+ * it's only called by mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+		struct mp_port *port)
+{
+	int rc;
+	int npages, data_len;
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_mp_port_prep) {
+		rc = ops->ndo_mp_port_prep(dev, port);
+		if (rc)
+			return rc;
+	} else {
+		/* If the NIC driver did not report this,
+		 * then we try to use default value.
+		 */
+		port->hdr_len = 128;
+		port->data_len = 2048;
+		port->npages = 1;
+	}
+
+	if (port->hdr_len <= 0)
+		goto err;
+
+	npages = port->npages;
+	data_len = port->data_len;
+	if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+			(data_len < PAGE_SIZE * (npages - 1) ||
+			 data_len > PAGE_SIZE * npages))
+		goto err;
+
+	return 0;
+err:
+	dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  *	netif_receive_skb - process receive buffer from network
  *	@skb: buffer to process
-- 
1.7.3


^ permalink raw reply related

* [PATCH v13 00/16] Provide a zero-copy method on KVM virtio-net.
From: xiaohui.xin @ 2010-10-15  9:12 UTC (permalink / raw)
  To: netdev, kvm, linux-kernel, mst, mingo, davem, herbert, jdike

We provide an zero-copy method which driver side may get external
buffers to DMA. Here external means driver don't use kernel space
to allocate skb buffers. Currently the external buffer can be from
guest virtio-net driver.

The idea is simple, just to pin the guest VM user space and then
let host NIC driver has the chance to directly DMA to it. 
The patches are based on vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. KVM guest who use the
vhost-net backend may bind any ethX interface in the host side to
get copyless data transfer thru guest virtio-net frontend.

patch 01-10:  	net core and kernel changes.
patch 11-13:  	new device as interface to mantpulate external buffers.
patch 14: 	for vhost-net.
patch 15:	An example on modifying NIC driver to using napi_gro_frags().
patch 16:	An example how to get guest buffers based on driver
		who using napi_gro_frags().

The guest virtio-net driver submits multiple requests thru vhost-net
backend driver to the kernel. And the requests are queued and then
completed after corresponding actions in h/w are done.

For read, user space buffers are dispensed to NIC driver for rx when
a page constructor API is invoked. Means NICs can allocate user buffers
from a page constructor. We add a hook in netif_receive_skb() function
to intercept the incoming packets, and notify the zero-copy device.

For write, the zero-copy deivce may allocates a new host skb and puts
payload on the skb_shinfo(skb)->frags, and copied the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

We provide multiple submits and asynchronous notifiicaton to 
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later.

What we have not done yet:
	Performance tuning

what we have done in v1:
	polish the RCU usage
	deal with write logging in asynchroush mode in vhost
	add notifier block for mp device
	rename page_ctor to mp_port in netdevice.h to make it looks generic
	add mp_dev_change_flags() for mp device to change NIC state
	add CONIFG_VHOST_MPASSTHRU to limit the usage when module is not load
	a small fix for missing dev_put when fail
	using dynamic minor instead of static minor number
	a __KERNEL__ protect to mp_get_sock()

what we have done in v2:
	
	remove most of the RCU usage, since the ctor pointer is only
	changed by BIND/UNBIND ioctl, and during that time, NIC will be
	stopped to get good cleanup(all outstanding requests are finished),
	so the ctor pointer cannot be raced into wrong situation.

	Remove the struct vhost_notifier with struct kiocb.
	Let vhost-net backend to alloc/free the kiocb and transfer them
	via sendmsg/recvmsg.

	use get_user_pages_fast() and set_page_dirty_lock() when read.

	Add some comments for netdev_mp_port_prep() and handle_mpassthru().

what we have done in v3:
	the async write logging is rewritten 
	a drafted synchronous write function for qemu live migration
	a limit for locked pages from get_user_pages_fast() to prevent Dos
	by using RLIMIT_MEMLOCK
	

what we have done in v4:
	add iocb completion callback from vhost-net to queue iocb in mp device
	replace vq->receiver by mp_sock_data_ready()
	remove stuff in mp device which access structures from vhost-net
	modify skb_reserve() to ignore host NIC driver reserved space
	rebase to the latest vhost tree
	split large patches into small pieces, especially for net core part.
	

what we have done in v5:
	address Arnd Bergmann's comments
		-remove IFF_MPASSTHRU_EXCL flag in mp device
		-Add CONFIG_COMPAT macro
		-remove mp_release ops
	move dev_is_mpassthru() as inline func
	fix a bug in memory relinquish
	Apply to current git (2.6.34-rc6) tree.

what we have done in v6:
	move create_iocb() out of page_dtor which may happen in interrupt context
	-This remove the potential issues which lock called in interrupt context
	make the cache used by mp, vhost as static, and created/destoryed during
	modules init/exit functions.
	-This makes multiple mp guest created at the same time.

what we have done in v7:
	some cleanup prepared to suppprt PS mode

what we have done in v8:
	discarding the modifications to point skb->data to guest buffer directly.
	Add code to modify driver to support napi_gro_frags() with Herbert's comments.
	To support PS mode.
	Add mergeable buffer support in mp device.
	Add GSO/GRO support in mp deice.
	Address comments from Eric Dumazet about cache line and rcu usage.

what we have done in v9:
	v8 patch is based on a fix in dev_gro_receive().
	But Herbert did not agree with the fix we have sent out.
	And he suggest another fix. v9 is modified to base on that fix.
	

what we have done in v10:
	Fix a partial csum error.
	Cleanup some unused fields with struct page_info{} in mp device.
	Modify kmem_cache_zalloc() to kmem_cache_alloc() based on Michael S. Thirkin.

what we have done in v11:
	Address comments from Michael S. Thirkin to add two new ioctls in mp device.
	But still need to revise.

what we have done in v12:
	Address most comments from Ben Hutchings, except the compat ioctls.
	As the comments are sparse, so do not make a split patch.
	Change struct mpassthru_port to struct mp_port, and struct page_ctor
	to struct page_pool.

what we have done in v13:
	Export functions to other drivers like macvtap, in case it want to reuse it to
	get zero-copy.
	Rebase on 2.6.36-rc7.
 
Performance:
	We have seen the performance data request from mailling-list.
	And we are now looking into this.

^ permalink raw reply

* Re: tbf/htb qdisc limitations
From: Jarek Poplawski @ 2010-10-15  8:18 UTC (permalink / raw)
  To: Bill Fink; +Cc: Rick Jones, Steven Brudenell, netdev
In-Reply-To: <20101015023749.f085006b.billfink@mindspring.com>

On Fri, Oct 15, 2010 at 02:37:49AM -0400, Bill Fink wrote:
> On Thu, 14 Oct 2010, Jarek Poplawski wrote:
> 
> > On Thu, Oct 14, 2010 at 08:09:39AM +0000, Jarek Poplawski wrote:
> > > On Thu, Oct 14, 2010 at 03:13:54AM -0400, Bill Fink wrote:
> > > > TSO/GSO was disabled and was using 9000-byte jumbo frames
> > > > (and specified mtu 9000 to tc command).
> > > > 
> > > > Here was one attempt I made using tbf:
> > > > 
> > > > tc qdisc add dev eth2 root handle 1: prio
> > > > tc qdisc add dev eth2 parent 1:1 handle 10: tbf rate 8900mbit buffer 1112500 limit 10000 mtu 9000
> > > > tc filter add dev eth2 protocol ip parent 1: prio 1 u32 match ip dst 192.168.1.23 flowid 10:1
> > > > 
> > > > I tried many variations of the above, all without success.
> > > 
> > > The main problem are smaller packets. If you had (almost) only 9000b
> > > frames this probably could work. [...]
> > 
> > On the other hand, e.g. the limit above seems too low wrt mtu & rate.
> 
> Actually, I discovered my commands above work just fine on
> a 2.6.35 box:
> 
> i7test7% nuttcp -T10 -i1 192.168.1.17
>  1045.3125 MB /   1.00 sec = 8768.3573 Mbps     0 retrans
>  1045.6875 MB /   1.00 sec = 8772.0292 Mbps     0 retrans
>  1049.5625 MB /   1.00 sec = 8804.2627 Mbps     0 retrans
>  1043.1875 MB /   1.00 sec = 8750.9960 Mbps     0 retrans
>  1048.6875 MB /   1.00 sec = 8796.3246 Mbps     0 retrans
>  1033.4375 MB /   1.00 sec = 8669.3188 Mbps     0 retrans
>  1040.7500 MB /   1.00 sec = 8730.7057 Mbps     0 retrans
>  1047.0000 MB /   1.00 sec = 8783.2063 Mbps     0 retrans
>  1040.0000 MB /   1.00 sec = 8724.0564 Mbps     0 retrans
>  1037.4375 MB /   1.00 sec = 8702.5434 Mbps     0 retrans
> 
> 10431.5608 MB /  10.00 sec = 8749.7542 Mbps 25 %TX 35 %RX 0 retrans 0.11 msRTT
> 
> The problems I encountered were on a field system running
> 2.6.30.10.  I will investigate upgrading the field system
> to 2.6.35.

This change from 2.6.31 should matter here:
http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.35.y.git;a=commit;h=a4a710c4a7490587406462bf1d54504b7783d7d7

Jarek P.

^ permalink raw reply

* Re: -j MARK in raw vs. mangle (was Re: xfrm by MARK: tcp problems when mark for in and out differ)
From: Gerd v. Egidy @ 2010-10-15  8:05 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: hadi, netfilter-devel, netdev
In-Reply-To: <4CB7FCEB.5070804@trash.net>

Hi Patrick,

> > So it seems this has nothing to do with xfrm, but that the MARK target
> > has different effects when used in raw than in mangle. I was using raw
> > because I had to set conntrack zones too and it was more conveniant to
> > do both in one place.
> > 
> > Can one of the netfilter guys comment on this? Is using MARK in raw not
> > fully supported or has known deficiencies?
> 
> No, the problem is most likely that for outgoing packets, the XFRM
> lookup is done with the route lookup before the packet is even sent,
> so once it hits the raw or mangle table, it is too late. mangle however
> performs rerouting when the mark value changes, at which point a new
> XFRM lookup is performed.

ah, this would explain it. Thanks for the explanation. I'll just stick with 
mangle for marking.

Kind regards,

Gerd

-- 
Address (better: trap) for people I really don't want to get mail from:
jonas@cactusamerica.com

^ permalink raw reply

* [PATCH 1/1] ARC vmac ethernet driver.
From: Andreas Fenkart @ 2010-10-15  7:54 UTC (permalink / raw)
  To: netdev; +Cc: Andreas Fenkart
In-Reply-To: <1287129254-18078-1-git-send-email-andreas.fenkart@streamunlimited.com>


Signed-off-by: Andreas Fenkart <andreas.fenkart@streamunlimited.com>
---
 drivers/net/Kconfig   |    9 +
 drivers/net/Makefile  |    1 +
 drivers/net/arcvmac.c | 1411 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/arcvmac.h |  268 ++++++++++
 4 files changed, 1689 insertions(+), 0 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5db667c..f534587 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -248,6 +248,15 @@ config AX88796_93CX6
 	help
 	  Select this if your platform comes with an external 93CX6 eeprom.
 
+config ARCVMAC
+	tristate "ARC VMAC ethernet support"
+	depends on HAS_DMA
+	select MII
+	select PHYLIB
+	select CRC32
+	help
+	  MAC device present on Zoran Quatro43XX
+
 config MACE
 	tristate "MACE (Power Mac ethernet) support"
 	depends on PPC_PMAC && PPC32
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 1d05ea5..da41896 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -137,6 +137,7 @@ obj-$(CONFIG_ULTRA32) += smc-ultra32.o 8390.o
 obj-$(CONFIG_E2100) += e2100.o 8390.o
 obj-$(CONFIG_ES3210) += es3210.o 8390.o
 obj-$(CONFIG_LNE390) += lne390.o 8390.o
+obj-$(CONFIG_ARCVMAC) += arcvmac.o
 obj-$(CONFIG_NE3210) += ne3210.o 8390.o
 obj-$(CONFIG_SB1250_MAC) += sb1250-mac.o
 obj-$(CONFIG_B44) += b44.o
diff --git a/drivers/net/arcvmac.c b/drivers/net/arcvmac.c
new file mode 100644
index 0000000..e49e1c1
--- /dev/null
+++ b/drivers/net/arcvmac.c
@@ -0,0 +1,1411 @@
+/*
+ * linux/arch/arc/drivers/arcvmac.c
+ *
+ * Copyright (C) 2003-2006 Codito Technologies, for linux-2.4 port
+ * Copyright (C) 2006-2007 Celunite Inc, for linux-2.6 port
+ * Copyright (C) 2007-2008 Sagem Communications, Fehmi HAFSI
+ * Copyright (C) 2009 Sagem Communications, Andreas Fenkart
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * external PHY support based on dnet.c
+ * ring management based on bcm63xx_enet.c
+ *
+ * Authors: amit.bhor@celunite.com, sameer.dhavale@celunite.com
+ */
+
+#undef DEBUG
+
+#include <linux/clk.h>
+#include <linux/crc32.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "arcvmac.h"
+
+/* Register access macros */
+#define vmac_writel(port, value, reg)	\
+	writel((value), (port)->regs + reg##_OFFSET)
+#define vmac_readl(port, reg)	readl((port)->regs + reg##_OFFSET)
+
+static unsigned char *read_mac_reg(struct net_device *dev,
+		unsigned char hwaddr[ETH_ALEN])
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned mac_lo, mac_hi;
+
+	WARN_ON(!hwaddr);
+	mac_lo = vmac_readl(ap, ADDRL);
+	mac_hi = vmac_readl(ap, ADDRH);
+
+	hwaddr[0] = (mac_lo >> 0) & 0xff;
+	hwaddr[1] = (mac_lo >> 8) & 0xff;
+	hwaddr[2] = (mac_lo >> 16) & 0xff;
+	hwaddr[3] = (mac_lo >> 24) & 0xff;
+	hwaddr[4] = (mac_hi >> 0) & 0xff;
+	hwaddr[5] = (mac_hi >> 8) & 0xff;
+	return hwaddr;
+}
+
+static void write_mac_reg(struct net_device *dev, unsigned char* hwaddr)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned mac_lo, mac_hi;
+
+	mac_lo = hwaddr[3] << 24 | hwaddr[2] << 16 | hwaddr[1] << 8 | hwaddr[0];
+	mac_hi = hwaddr[5] << 8 | hwaddr[4];
+
+	vmac_writel(ap, mac_lo, ADDRL);
+	vmac_writel(ap, mac_hi, ADDRH);
+}
+
+static void vmac_mdio_xmit(struct vmac_priv *ap, unsigned val)
+{
+	init_completion(&ap->mdio_complete);
+	vmac_writel(ap, val, MDIO_DATA);
+	wait_for_completion(&ap->mdio_complete);
+}
+
+static int vmac_mdio_read(struct mii_bus *bus, int phy_id, int phy_reg)
+{
+	struct vmac_priv *vmac = bus->priv;
+	unsigned int val;
+	/* only 5 bits allowed for phy-addr and reg_offset */
+	WARN_ON(phy_id & ~0x1f || phy_reg & ~0x1f);
+
+	val = MDIO_BASE | MDIO_OP_READ;
+	val |= phy_id << 23 | phy_reg << 18;
+	vmac_mdio_xmit(vmac, val);
+
+	val = vmac_readl(vmac, MDIO_DATA);
+	return val & MDIO_DATA_MASK;
+}
+
+static int vmac_mdio_write(struct mii_bus *bus, int phy_id, int phy_reg,
+			 u16 value)
+{
+	struct vmac_priv *vmac = bus->priv;
+	unsigned int val;
+	/* only 5 bits allowed for phy-addr and reg_offset */
+	WARN_ON(phy_id & ~0x1f || phy_reg & ~0x1f);
+
+	val = MDIO_BASE | MDIO_OP_WRITE;
+	val |= phy_id << 23 | phy_reg << 18;
+	val |= (value & MDIO_DATA_MASK);
+	vmac_mdio_xmit(vmac, val);
+	return 0;
+}
+
+static void vmac_handle_link_change(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+	unsigned long flags;
+	int report_change = 0;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	if (phydev->duplex != ap->duplex) {
+		unsigned tmp;
+
+		tmp = vmac_readl(ap, ENABLE);
+
+		if (phydev->duplex)
+			tmp |= ENFL_MASK;
+		else
+			tmp &= ~ENFL_MASK;
+
+		vmac_writel(ap, tmp, ENABLE);
+
+		ap->duplex = phydev->duplex;
+		report_change = 1;
+	}
+
+	if (phydev->speed != ap->speed) {
+		ap->speed = phydev->speed;
+		report_change = 1;
+	}
+
+	if (phydev->link != ap->link) {
+		ap->link = phydev->link;
+		report_change = 1;
+	}
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	if (report_change)
+		phy_print_status(ap->phy_dev);
+}
+
+static int __devinit vmac_mii_probe(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = NULL;
+	struct clk *sys_clk;
+	unsigned long clock_rate;
+	int phy_addr, err;
+
+	/* find the first phy */
+	for (phy_addr = 0; phy_addr < PHY_MAX_ADDR; phy_addr++) {
+		if (ap->mii_bus->phy_map[phy_addr]) {
+			phydev = ap->mii_bus->phy_map[phy_addr];
+			break;
+		}
+	}
+
+	if (!phydev) {
+		dev_err(&dev->dev, "no PHY found\n");
+		return -ENODEV;
+	}
+
+	/* add pin_irq, if avail */
+
+	phydev = phy_connect(dev, dev_name(&phydev->dev),
+			&vmac_handle_link_change, 0,
+			PHY_INTERFACE_MODE_MII);
+
+	if (IS_ERR(phydev)) {
+		err = PTR_ERR(phydev);
+		dev_err(&dev->dev, "could not attach to PHY %d\n", err);
+		goto err_out;
+	}
+
+	phydev->supported &= PHY_BASIC_FEATURES;
+	phydev->supported |= SUPPORTED_Asym_Pause | SUPPORTED_Pause;
+
+	sys_clk = clk_get(&ap->pdev->dev, "arcvmac");
+	if (IS_ERR(sys_clk)) {
+		err = PTR_ERR(sys_clk);
+		goto err_disconnect;
+	}
+
+	clock_rate = clk_get_rate(sys_clk);
+	clk_put(sys_clk);
+
+	dev_dbg(&ap->pdev->dev, "clk_get: dev_name : %s %lu\n",
+			dev_name(&ap->pdev->dev),
+			clock_rate);
+
+	if (clock_rate < 25000000)
+		phydev->supported &= ~(SUPPORTED_100baseT_Half |
+				SUPPORTED_100baseT_Full);
+
+	phydev->advertising = phydev->supported;
+
+	ap->link = 0;
+	ap->speed = 0;
+	ap->duplex = -1;
+	ap->phy_dev = phydev;
+
+	return 0;
+
+err_disconnect:
+	phy_disconnect(phydev);
+err_out:
+	return err;
+}
+
+static int __devinit vmac_mii_init(struct vmac_priv *ap)
+{
+	int err, i;
+
+	ap->mii_bus = mdiobus_alloc();
+	if (ap->mii_bus == NULL)
+		return -ENOMEM;
+
+	ap->mii_bus->name = "vmac_mii_bus";
+	ap->mii_bus->read = &vmac_mdio_read;
+	ap->mii_bus->write = &vmac_mdio_write;
+
+	snprintf(ap->mii_bus->id, MII_BUS_ID_SIZE, "%x", 0);
+
+	ap->mii_bus->priv = ap;
+
+	err = -ENOMEM;
+	ap->mii_bus->irq = kmalloc(sizeof(int) * PHY_MAX_ADDR, GFP_KERNEL);
+	if (!ap->mii_bus->irq)
+		goto err_out;
+
+	for (i = 0; i < PHY_MAX_ADDR; i++)
+		ap->mii_bus->irq[i] = PHY_POLL;
+
+#if 0
+	/* FIXME: what is it used for? */
+	platform_set_drvdata(ap->dev, ap->mii_bus);
+#endif
+
+	err = mdiobus_register(ap->mii_bus);
+	if (err)
+		goto err_out_free_mdio_irq;
+
+	err = vmac_mii_probe(ap->dev);
+	if (err)
+		goto err_out_unregister_bus;
+
+	return 0;
+
+err_out_unregister_bus:
+	mdiobus_unregister(ap->mii_bus);
+err_out_free_mdio_irq:
+	kfree(ap->mii_bus->irq);
+err_out:
+	mdiobus_free(ap->mii_bus);
+	return err;
+}
+
+static void vmac_mii_exit(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	if (ap->phy_dev)
+		phy_disconnect(ap->phy_dev);
+
+	mdiobus_unregister(ap->mii_bus);
+	kfree(ap->mii_bus->irq);
+	mdiobus_free(ap->mii_bus);
+}
+
+static int vmacether_get_settings(struct net_device *dev,
+		struct ethtool_cmd *cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_gset(phydev, cmd);
+}
+
+static int vmacether_set_settings(struct net_device *dev,
+		struct ethtool_cmd *cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_sset(phydev, cmd);
+}
+
+static int vmac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!netif_running(dev))
+		return -EINVAL;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_mii_ioctl(phydev, rq, cmd);
+}
+
+static void vmacether_get_drvinfo(struct net_device *dev,
+		struct ethtool_drvinfo *info)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+	snprintf(info->bus_info, sizeof(info->bus_info),
+			"platform 0x%x", ap->mem_base);
+}
+
+static int update_error_counters(struct net_device *dev, int status)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	dev_dbg(&ap->pdev->dev, "rx error counter overrun. status = 0x%x\n",
+			status);
+
+	/* programming error */
+	WARN_ON(status & TXCH_MASK);
+	WARN_ON(!(status & (MSER_MASK | RXCR_MASK | RXFR_MASK | RXFL_MASK)));
+
+	if (status & MSER_MASK)
+		ap->stats.rx_over_errors += 256; /* ran out of BD */
+	if (status & RXCR_MASK)
+		ap->stats.rx_crc_errors += 256;
+	if (status & RXFR_MASK)
+		ap->stats.rx_frame_errors += 256;
+	if (status & RXFL_MASK)
+		ap->stats.rx_fifo_errors += 256;
+
+	return 0;
+}
+
+static void update_tx_errors(struct net_device *dev, int status)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	if (status & UFLO)
+		ap->stats.tx_fifo_errors++;
+
+	if (ap->duplex)
+		return;
+
+	/* half duplex flags */
+	if (status & LTCL)
+		ap->stats.tx_window_errors++;
+	if (status & RETRY_CT)
+		ap->stats.collisions += (status & RETRY_CT) >> 24;
+	if (status & DROP)  /* too many retries */
+		ap->stats.tx_aborted_errors++;
+	if (status & DEFER)
+		dev_vdbg(&ap->pdev->dev, "\"defer to traffic\"\n");
+	if (status & CARLOSS)
+		ap->stats.tx_carrier_errors++;
+}
+
+static int vmac_rx_reclaim_force(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int ct;
+
+	ct = 0;
+
+	dev_dbg(&ap->pdev->dev, "%s need to release %d rx sk_buff\n",
+	    __func__, fifo_used(&ap->rx_ring));
+
+	while (!fifo_empty(&ap->rx_ring) && ct++ < ap->rx_ring.size) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		int desc_idx;
+
+		desc_idx = ap->rx_ring.tail;
+		desc = &ap->rxbd[desc_idx];
+		fifo_inc_tail(&ap->rx_ring);
+
+		if (!ap->rx_skbuff[desc_idx]) {
+			dev_err(&ap->pdev->dev, "non-populated rx_skbuff found %d\n",
+					desc_idx);
+			continue;
+		}
+
+		skb = ap->rx_skbuff[desc_idx];
+		ap->rx_skbuff[desc_idx] = NULL;
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, skb->len,
+		    DMA_TO_DEVICE);
+
+		dev_kfree_skb(skb);
+	}
+
+	if (!fifo_empty(&ap->rx_ring)) {
+		dev_err(&ap->pdev->dev, "failed to reclaim %d rx sk_buff\n",
+				fifo_used(&ap->rx_ring));
+	}
+
+	return 0;
+}
+
+static int vmac_rx_refill(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	WARN_ON(fifo_full(&ap->rx_ring));
+
+	while (!fifo_full(&ap->rx_ring)) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		dma_addr_t p;
+		int desc_idx;
+
+		desc_idx = ap->rx_ring.head;
+		desc = &ap->rxbd[desc_idx];
+
+		/* make sure we read the actual descriptor status */
+		rmb();
+
+		if (ap->rx_skbuff[desc_idx]) {
+			/* dropped packet / buffer chaining */
+			fifo_inc_head(&ap->rx_ring);
+
+			/* return to DMA */
+			wmb();
+			desc->info = OWN_MASK | ap->rx_skb_size;
+			continue;
+		}
+
+		skb = netdev_alloc_skb(dev, ap->rx_skb_size + 2);
+		if (!skb) {
+			dev_info(&ap->pdev->dev, "failed to allocate rx_skb, skb's left %d\n",
+					fifo_used(&ap->rx_ring));
+			break;
+		}
+
+		/* IP header Alignment (14 byte Ethernet header) */
+		skb_reserve(skb, 2);
+		WARN_ON(skb->len != 0); /* nothing received yet */
+
+		ap->rx_skbuff[desc_idx] = skb;
+
+		p = dma_map_single(&ap->pdev->dev, skb->data, ap->rx_skb_size,
+				DMA_FROM_DEVICE);
+
+		desc->data = p;
+
+		wmb();
+		desc->info = OWN_MASK | ap->rx_skb_size;
+
+		fifo_inc_head(&ap->rx_ring);
+	}
+
+	/* If rx ring is still empty, set a timer to try allocating
+	 * again at a later time. */
+	if (fifo_empty(&ap->rx_ring) && netif_running(dev)) {
+		dev_warn(&ap->pdev->dev, "unable to refill rx ring\n");
+		ap->rx_timeout.expires = jiffies + HZ;
+		add_timer(&ap->rx_timeout);
+	}
+
+	return 0;
+}
+
+/*
+ * timer callback to defer refill rx queue in case we're OOM
+ */
+static void vmac_refill_rx_timer(unsigned long data)
+{
+	struct net_device *dev;
+	struct vmac_priv *ap;
+
+	dev = (struct net_device *)data;
+	ap = netdev_priv(dev);
+
+	spin_lock(&ap->rx_lock);
+	vmac_rx_refill(dev);
+	spin_unlock(&ap->rx_lock);
+}
+
+/* merge buffer chaining  */
+struct sk_buff *vmac_merge_rx_buffers(struct net_device *dev,
+		struct vmac_buffer_desc *after,
+		int pkt_len) /* data */
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct sk_buff *merge_skb, *cur_skb;
+	struct dma_fifo *rx_ring;
+	struct vmac_buffer_desc *desc;
+
+	rx_ring = &ap->rx_ring;
+	desc = &ap->rxbd[rx_ring->tail];
+
+	WARN_ON(desc == after);
+
+	/* strip FCS */
+	pkt_len -= 4;
+
+	/* IP header Alignment (14 byte Ethernet header) */
+	merge_skb = netdev_alloc_skb(dev, pkt_len + 2);
+	if (!merge_skb) {
+		dev_err(&ap->pdev->dev, "failed to allocate merged rx_skb, rx skb's left %d\n",
+				fifo_used(rx_ring));
+
+		return NULL;
+	}
+
+	skb_reserve(merge_skb, 2);
+
+	while (desc != after && pkt_len) {
+		struct vmac_buffer_desc *desc;
+		int buf_len, valid;
+
+		/* desc needs wrapping */
+		desc = &ap->rxbd[rx_ring->tail];
+		cur_skb = ap->rx_skbuff[rx_ring->tail];
+		WARN_ON(!cur_skb);
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, ap->rx_skb_size,
+				DMA_FROM_DEVICE);
+
+		/* do not copy FCS */
+		buf_len = desc->info & LEN_MASK;
+		valid = min(pkt_len, buf_len);
+		pkt_len -= valid;
+
+		memcpy(skb_put(merge_skb, valid), cur_skb->data, valid);
+
+		fifo_inc_tail(rx_ring);
+	}
+
+	/* merging_pressure++ */
+
+	if (unlikely(pkt_len != 0))
+		dev_err(&ap->pdev->dev, "buffer chaining bytes missing %d\n",
+				pkt_len);
+
+	WARN_ON(desc != after);
+
+	return merge_skb;
+}
+
+int vmac_rx_receive(struct net_device *dev, int budget)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct vmac_buffer_desc *first;
+	int processed, pkt_len, pkt_err;
+	struct dma_fifo lookahead;
+
+	processed = 0;
+
+	first = NULL;
+	pkt_err = pkt_len = 0;
+
+	/* look ahead, till packet complete */
+	lookahead = ap->rx_ring;
+
+	do {
+		struct vmac_buffer_desc *desc; /* cur_ */
+		int desc_idx; /* cur_ */
+		struct sk_buff *skb; /* pkt_ */
+
+		desc_idx = lookahead.tail;
+		desc = &ap->rxbd[desc_idx];
+
+		/* make sure we read the actual descriptor status */
+		rmb();
+
+		/* break if dma ownership belongs to hw */
+		if (desc->info & OWN_MASK) {
+			ap->mac_rxring_head = vmac_readl(ap, MAC_RXRING_HEAD);
+			break;
+		}
+
+		if (desc->info & FRST_MASK) {
+			pkt_len = 0;
+			pkt_err = 0;
+
+			/* don't free current */
+			ap->rx_ring.tail = lookahead.tail;
+			first = desc;
+		}
+
+		fifo_inc_tail(&lookahead);
+
+		/* check bd */
+
+		pkt_len += desc->info & LEN_MASK;
+		pkt_err |= (desc->info & BUFF);
+
+		if (!(desc->info & LAST_MASK))
+			continue;
+
+		/* received complete packet */
+
+		if (unlikely(pkt_err || !first)) {
+			/* recycle buffers */
+			ap->rx_ring.tail = lookahead.tail;
+			continue;
+		}
+
+		WARN_ON(!(first->info & FRST_MASK) ||
+				!(desc->info & LAST_MASK));
+		WARN_ON(pkt_err);
+
+		/* -- valid packet -- */
+
+		if (first != desc) {
+			skb = vmac_merge_rx_buffers(dev, desc, pkt_len);
+
+			if (!skb) {
+				/* kill packet */
+				ap->rx_ring.tail = lookahead.tail;
+				ap->rx_merge_error++;
+				continue;
+			}
+		} else {
+			dma_unmap_single(&ap->pdev->dev, desc->data,
+					ap->rx_skb_size, DMA_FROM_DEVICE);
+
+			skb = ap->rx_skbuff[desc_idx];
+			ap->rx_skbuff[desc_idx] = NULL;
+			/* desc->data != skb->data => desc->data DMA mapped */
+
+			/* strip FCS */
+			skb_put(skb, pkt_len - 4);
+		}
+
+		/* free buffers */
+		ap->rx_ring.tail = lookahead.tail;
+
+		WARN_ON(skb->len != pkt_len - 4);
+		processed++;
+		skb->dev = dev;
+		skb->protocol = eth_type_trans(skb, dev);
+		ap->stats.rx_packets++;
+		ap->stats.rx_bytes += skb->len;
+		dev->last_rx = jiffies;
+		netif_rx(skb);
+
+	} while (!fifo_empty(&lookahead) && (processed < budget));
+
+	dev_vdbg(&ap->pdev->dev, "processed pkt %d, remaining rx buff %d\n",
+			processed,
+			fifo_used(&ap->rx_ring));
+
+	if (processed || fifo_empty(&ap->rx_ring))
+		vmac_rx_refill(dev);
+
+	return processed;
+}
+
+static void vmac_toggle_irqmask(struct net_device *dev, int enable, int mask)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long tmp;
+
+	tmp = vmac_readl(ap, ENABLE);
+	if (enable)
+		tmp |= mask;
+	else
+		tmp &= ~mask;
+	vmac_writel(ap, tmp, ENABLE);
+}
+
+static void vmac_toggle_txint(struct net_device *dev, int enable)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+	vmac_toggle_irqmask(dev, enable, TXINT_MASK);
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static void vmac_toggle_rxint(struct net_device *dev, int enable)
+{
+	vmac_toggle_irqmask(dev, enable, RXINT_MASK);
+}
+
+static int vmac_poll(struct napi_struct *napi, int budget)
+{
+	struct vmac_priv *ap;
+	struct net_device *dev;
+	int rx_work_done;
+	unsigned long flags;
+
+	ap = container_of(napi, struct vmac_priv, napi);
+	dev = ap->dev;
+
+	/* ack interrupt */
+	vmac_writel(ap, RXINT_MASK, STAT);
+
+	spin_lock(&ap->rx_lock);
+	rx_work_done = vmac_rx_receive(dev, budget);
+	spin_unlock(&ap->rx_lock);
+
+#ifdef VERBOSE_DEBUG
+	if (printk_ratelimit()) {
+		dev_vdbg(&ap->pdev->dev, "poll budget %d receive rx_work_done %d\n",
+				budget,
+				rx_work_done);
+	}
+#endif
+
+	if (rx_work_done >= budget) {
+		/* rx queue is not yet empty/clean */
+		return rx_work_done;
+	}
+
+	/* no more packet in rx/tx queue, remove device from poll
+	 * queue */
+	spin_lock_irqsave(&ap->lock, flags);
+	napi_complete(napi);
+	vmac_toggle_rxint(dev, 1);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	return rx_work_done;
+}
+
+static int vmac_tx_reclaim(struct net_device *dev, int force);
+
+static irqreturn_t vmac_intr(int irq, void *dev_instance)
+{
+	struct net_device *dev = dev_instance;
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned int status;
+
+	spin_lock(&ap->lock);
+
+	status = vmac_readl(ap, STAT);
+	vmac_writel(ap, status, STAT);
+
+#ifdef DEBUG
+	if (unlikely(ap->shutdown))
+		dev_err(&ap->pdev->dev, "ISR during close\n");
+
+	if (unlikely(!status & (RXINT_MASK|MDIO_MASK|ERR_MASK)))
+		dev_err(&ap->pdev->dev, "No source of IRQ found\n");
+#endif
+
+	if ((status & RXINT_MASK) &&
+			(ap->mac_rxring_head !=
+			 vmac_readl(ap, MAC_RXRING_HEAD))) {
+		vmac_toggle_rxint(dev, 0);
+		napi_schedule(&ap->napi);
+	}
+
+	if (unlikely(netif_queue_stopped(dev) && (status & TXINT_MASK)))
+		vmac_tx_reclaim(dev, 0);
+
+	if (status & MDIO_MASK)
+		complete(&ap->mdio_complete);
+
+	if (unlikely(status & ERR_MASK))
+		update_error_counters(dev, status);
+
+	spin_unlock(&ap->lock);
+
+	return IRQ_HANDLED;
+}
+
+static int vmac_tx_reclaim(struct net_device *dev, int force)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int released = 0;
+
+	/* buffer chaining not used, see vmac_start_xmit */
+
+	while (!fifo_empty(&ap->tx_ring)) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		int desc_idx;
+
+		desc_idx = ap->tx_ring.tail;
+		desc = &ap->txbd[desc_idx];
+
+		/* ensure other field of the descriptor were not read
+		 * before we checked ownership */
+		rmb();
+
+		if ((desc->info & OWN_MASK) && !force)
+			break;
+
+		if (desc->info & ERR_MSK_TX) {
+			update_tx_errors(dev, desc->info);
+			/* recycle packet, let upper level deal with it */
+		}
+
+		skb = ap->tx_skbuff[desc_idx];
+		ap->tx_skbuff[desc_idx] = NULL;
+		WARN_ON(!skb);
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, skb->len,
+				DMA_TO_DEVICE);
+
+		dev_kfree_skb_any(skb);
+
+		released++;
+		fifo_inc_tail(&ap->tx_ring);
+	}
+
+	if (netif_queue_stopped(dev) && released) {
+		netif_wake_queue(dev);
+		vmac_toggle_txint(dev, 0);
+	}
+
+	if (unlikely(force && !fifo_empty(&ap->tx_ring))) {
+		dev_err(&ap->pdev->dev, "failed to reclaim %d tx sk_buff\n",
+				fifo_used(&ap->tx_ring));
+	}
+
+	return released;
+}
+
+int vmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct vmac_buffer_desc *desc;
+	unsigned int tmp;
+
+	/* running under xmit lock */
+
+	/* no scatter/gatter see features below */
+	WARN_ON(skb_shinfo(skb)->nr_frags != 0);
+	WARN_ON(skb->len > MAX_TX_BUFFER_LEN);
+
+	if (unlikely(fifo_full(&ap->tx_ring))) {
+		netif_stop_queue(dev);
+		vmac_toggle_txint(dev, 1);
+		dev_err(&ap->pdev->dev, "xmit called with no tx desc available\n");
+		return NETDEV_TX_BUSY;
+	}
+
+	if (unlikely(skb->len < ETH_ZLEN)) {
+		struct sk_buff *short_skb;
+		short_skb = netdev_alloc_skb(dev, ETH_ZLEN);
+		if (!short_skb)
+			return NETDEV_TX_LOCKED;
+
+		memset(short_skb->data, 0, ETH_ZLEN);
+		memcpy(skb_put(short_skb, ETH_ZLEN), skb->data, skb->len);
+		dev_kfree_skb(skb);
+		skb = short_skb;
+	}
+
+	/* fill descriptor */
+	ap->tx_skbuff[ap->tx_ring.head] = skb;
+
+	desc = &ap->txbd[ap->tx_ring.head];
+	desc->data = dma_map_single(&ap->pdev->dev, skb->data, skb->len,
+			DMA_TO_DEVICE);
+
+	/* dma might already be polling */
+	wmb();
+	desc->info = OWN_MASK | FRST_MASK | LAST_MASK | skb->len;
+	wmb();
+
+	/* kick tx dma */
+	tmp = vmac_readl(ap, STAT);
+	vmac_writel(ap, tmp | TXPL_MASK, STAT);
+
+	ap->stats.tx_packets++;
+	ap->stats.tx_bytes += skb->len;
+	dev->trans_start = jiffies;
+	fifo_inc_head(&ap->tx_ring);
+
+	/* vmac_tx_reclaim independent of vmac_tx_timeout */
+	if (fifo_used(&ap->tx_ring) > 8)
+		vmac_tx_reclaim(dev, 0);
+
+	/* stop queue if no more desc available */
+	if (fifo_full(&ap->tx_ring)) {
+		netif_stop_queue(dev);
+		vmac_toggle_txint(dev, 1);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+static int alloc_buffers(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int err = -ENOMEM;
+	int size;
+
+	fifo_init(&ap->rx_ring, RX_BDT_LEN);
+	fifo_init(&ap->tx_ring, TX_BDT_LEN);
+
+	/* initialize skb list */
+	memset(ap->rx_skbuff, 0, sizeof(ap->rx_skbuff));
+	memset(ap->tx_skbuff, 0, sizeof(ap->tx_skbuff));
+
+	/* allocate DMA received descriptors */
+	size = sizeof(*ap->rxbd) * ap->rx_ring.size;
+	ap->rxbd = dma_alloc_coherent(&ap->pdev->dev, size,
+			&ap->rxbd_dma,
+			GFP_KERNEL);
+	if (ap->rxbd == NULL)
+		goto err_out;
+
+	/* allocate DMA transmit descriptors */
+	size = sizeof(*ap->txbd) * ap->tx_ring.size;
+	ap->txbd = dma_alloc_coherent(&ap->pdev->dev, size,
+			&ap->txbd_dma,
+			GFP_KERNEL);
+	if (ap->txbd == NULL)
+		goto err_free_rxbd;
+
+	/* ensure 8-byte aligned */
+	WARN_ON(((int)ap->txbd & 0x7) || ((int)ap->rxbd & 0x7));
+
+	memset(ap->txbd, 0, sizeof(*ap->txbd) * ap->tx_ring.size);
+	memset(ap->rxbd, 0, sizeof(*ap->rxbd) * ap->rx_ring.size);
+
+	/* allocate rx skb */
+	err = vmac_rx_refill(dev);
+	if (err)
+		goto err_free_txbd;
+
+	return 0;
+
+err_free_txbd:
+	dma_free_coherent(&ap->pdev->dev, sizeof(*ap->txbd) * ap->tx_ring.size,
+			ap->txbd, ap->txbd_dma);
+err_free_rxbd:
+	dma_free_coherent(&ap->pdev->dev, sizeof(*ap->rxbd) * ap->rx_ring.size,
+			ap->rxbd, ap->rxbd_dma);
+err_out:
+	return err;
+}
+
+static int free_buffers(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	/* free skbuff */
+	vmac_tx_reclaim(dev, 1);
+	vmac_rx_reclaim_force(dev);
+
+	/* free DMA ring */
+	dma_free_coherent(&ap->pdev->dev, sizeof(ap->txbd) * ap->tx_ring.size,
+			ap->txbd, ap->txbd_dma);
+	dma_free_coherent(&ap->pdev->dev, sizeof(ap->rxbd) * ap->rx_ring.size,
+			ap->rxbd, ap->rxbd_dma);
+
+	return 0;
+}
+
+static int vmac_hw_init(struct net_device *dev)
+{
+	struct vmac_priv *priv = netdev_priv(dev);
+
+	/* clear IRQ mask */
+	vmac_writel(priv, 0, ENABLE);
+
+	/* clear pending IRQ */
+	vmac_writel(priv, 0xffffffff, STAT);
+
+	/* Initialize logical address filter */
+	vmac_writel(priv, 0x0, LAFL);
+	vmac_writel(priv, 0x0, LAFH);
+
+	return 0;
+}
+
+int vmac_open(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev;
+	unsigned int temp;
+	int err = 0;
+
+	if (ap == NULL)
+		return -ENODEV;
+
+	ap->shutdown = 0;
+
+	vmac_hw_init(dev);
+
+	/* mac address changed? */
+	write_mac_reg(dev, dev->dev_addr);
+
+	err = alloc_buffers(dev);
+	if (err)
+		goto err_out;
+
+	err = request_irq(dev->irq, &vmac_intr, 0, dev->name, dev);
+	if (err) {
+		dev_err(&ap->pdev->dev, "Unable to request IRQ %d (error %d)\n",
+				dev->irq, err);
+		goto err_free_buffers;
+	}
+
+	/* install DMA ring pointers */
+	vmac_writel(ap, ap->rxbd_dma, RXRINGPTR);
+	vmac_writel(ap, ap->txbd_dma, TXRINGPTR);
+
+	/* set poll rate to 1 ms */
+	vmac_writel(ap, POLLRATE_TIME, POLLRATE);
+
+	/* make sure we enable napi before rx interrupt  */
+	napi_enable(&ap->napi);
+
+	/* IRQ mask */
+	temp = RXINT_MASK | ERR_MASK | TXCH_MASK | MDIO_MASK;
+	vmac_writel(ap, temp, ENABLE);
+
+	/* Set control */
+	temp = (RX_BDT_LEN << 24) | (TX_BDT_LEN << 16) | TXRN_MASK | RXRN_MASK;
+	vmac_writel(ap, temp, CONTROL);
+
+	/* enable, after all other bits are set */
+	vmac_writel(ap, temp | EN_MASK, CONTROL);
+
+	netif_start_queue(dev);
+	netif_carrier_off(dev);
+
+	/* register the PHY board fixup, if needed */
+	err = vmac_mii_init(ap);
+	if (err)
+		goto err_free_irq;
+
+	/* schedule a link state check */
+	phy_start(ap->phy_dev);
+
+	phydev = ap->phy_dev;
+	dev_info(&ap->pdev->dev, "PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
+	       phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
+
+	return 0;
+
+err_free_irq:
+	free_irq(dev->irq, dev);
+err_free_buffers:
+	free_buffers(dev);
+err_out:
+	return err;
+}
+
+int vmac_close(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned int temp;
+
+	netif_stop_queue(dev);
+	napi_disable(&ap->napi);
+
+	/* stop running transfers */
+	temp = vmac_readl(ap, CONTROL);
+	temp &= ~(TXRN_MASK | RXRN_MASK);
+	vmac_writel(ap, temp, CONTROL);
+
+	del_timer_sync(&ap->rx_timeout);
+
+	/* disable phy */
+	phy_stop(ap->phy_dev);
+	vmac_mii_exit(dev);
+	netif_carrier_off(dev);
+
+	/* disable interrupts */
+	vmac_writel(ap, 0, ENABLE);
+	free_irq(dev->irq, dev);
+
+	/* turn off vmac */
+	vmac_writel(ap, 0, CONTROL);
+	/* vmac_reset_hw(vmac) */
+
+	ap->shutdown = 1;
+	wmb();
+
+	free_buffers(dev);
+	return 0;
+}
+
+void vmac_update_stats(struct vmac_priv *ap)
+{
+	struct net_device_stats *_stats = &ap->stats;
+	unsigned long miss, rxerr;
+	unsigned long rxfram, rxcrc, rxoflow;
+
+	/* compare with /proc/net/dev,
+	 * see net/core/dev.c:dev_seq_printf_stats */
+
+	/* rx stats */
+	rxerr = vmac_readl(ap, RXERR);
+	miss = vmac_readl(ap, MISS);
+
+	rxcrc = (rxerr & RXERR_CRC);
+	rxfram = (rxerr & RXERR_FRM) >> 8;
+	rxoflow = (rxerr & RXERR_OFLO) >> 16;
+
+	_stats->rx_length_errors = 0;
+	_stats->rx_over_errors += miss;
+	_stats->rx_crc_errors += rxcrc;
+	_stats->rx_frame_errors += rxfram;
+	_stats->rx_fifo_errors += rxoflow;
+	_stats->rx_missed_errors = 0;
+
+	/* TODO check rx_dropped/rx_errors/tx_dropped/tx_errors have not
+	 * been updated elsewhere */
+	_stats->rx_dropped = _stats->rx_over_errors +
+		_stats->rx_fifo_errors +
+		ap->rx_merge_error;
+
+	_stats->rx_errors = _stats->rx_length_errors + _stats->rx_crc_errors +
+		_stats->rx_frame_errors +
+		_stats->rx_missed_errors +
+		_stats->rx_dropped;
+
+	/* tx stats */
+	_stats->tx_dropped = 0; /* otherwise queue stopped */
+
+	_stats->tx_errors = _stats->tx_aborted_errors +
+		_stats->tx_carrier_errors +
+		_stats->tx_fifo_errors +
+		_stats->tx_heartbeat_errors +
+		_stats->tx_window_errors +
+		_stats->tx_dropped +
+		ap->tx_timeout_error;
+}
+
+struct net_device_stats *vmac_stats(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+	vmac_update_stats(ap);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	return &ap->stats;
+}
+
+void vmac_tx_timeout(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned int status;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	/* queue did not progress for timeo jiffies */
+	WARN_ON(!netif_queue_stopped(dev));
+	WARN_ON(!fifo_full(&ap->tx_ring));
+
+	/* TX IRQ lost? */
+	status = vmac_readl(ap, STAT);
+	if (status & TXINT_MASK) {
+		dev_err(&ap->pdev->dev, "lost tx interrupt, IRQ mask %x\n",
+				vmac_readl(ap, ENABLE));
+		vmac_writel(ap, TXINT_MASK, STAT);
+	}
+
+	/* TODO RX/MDIO/ERR as well? */
+
+	vmac_tx_reclaim(dev, 0);
+	if (fifo_full(&ap->tx_ring))
+		dev_err(&ap->pdev->dev, "DMA state machine not active\n");
+
+	/* We can accept TX packets again */
+	ap->tx_timeout_error++;
+	dev->trans_start = jiffies;
+	netif_wake_queue(dev);
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static void create_multicast_filter(struct net_device *dev,
+	unsigned long *bitmask)
+{
+	struct netdev_hw_addr *ha;
+	unsigned long crc;
+	char *addrs;
+
+	WARN_ON(netdev_mc_count(dev) == 0);
+	WARN_ON(dev->flags & IFF_ALLMULTI);
+
+	bitmask[0] = bitmask[1] = 0;
+
+	netdev_for_each_mc_addr(ha, dev) {
+		addrs = ha->addr;
+
+		/* skip non-multicast addresses */
+		if (!(*addrs & 1))
+			continue;
+
+		crc = ether_crc_le(ETH_ALEN, addrs);
+		set_bit(crc >> 26, bitmask);
+	}
+}
+
+static void vmac_set_multicast_list(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags, bitmask[2];
+	int promisc, reg;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	promisc = !!(dev->flags & IFF_PROMISC);
+	reg = vmac_readl(ap, ENABLE);
+	if (promisc != !!(reg & PROM_MASK)) {
+		reg ^= PROM_MASK;
+		vmac_writel(ap, reg, ENABLE);
+	}
+
+	if (dev->flags & IFF_ALLMULTI)
+		memset(bitmask, 1, sizeof(bitmask));
+	else if (netdev_mc_count(dev) == 0)
+		memset(bitmask, 0, sizeof(bitmask));
+	else
+		create_multicast_filter(dev, bitmask);
+
+	vmac_writel(ap, bitmask[0], LAFL);
+	vmac_writel(ap, bitmask[1], LAFH);
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static struct ethtool_ops vmac_ethtool_ops = {
+	.get_settings		= vmacether_get_settings,
+	.set_settings		= vmacether_set_settings,
+	.get_drvinfo		= vmacether_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+static const struct net_device_ops vmac_netdev_ops = {
+	.ndo_open		= vmac_open,
+	.ndo_stop		= vmac_close,
+	.ndo_get_stats		= vmac_stats,
+	.ndo_start_xmit		= vmac_start_xmit,
+	.ndo_do_ioctl		= vmac_ioctl,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_tx_timeout		= vmac_tx_timeout,
+	.ndo_set_multicast_list = vmac_set_multicast_list,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
+static int __devinit vmac_probe(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct vmac_priv *ap;
+	struct resource *res;
+	unsigned int mem_base, mem_size, irq;
+	int err;
+
+	dev = alloc_etherdev(sizeof(*ap));
+	if (!dev) {
+		dev_err(&pdev->dev, "etherdev alloc failed, aborting.\n");
+		return -ENOMEM;
+	}
+
+	ap = netdev_priv(dev);
+
+	err = -ENODEV;
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "no mmio resource defined\n");
+		goto err_out;
+	}
+	mem_base = res->start;
+	mem_size = resource_size(res);
+	irq = platform_get_irq(pdev, 0);
+
+	err = -EBUSY;
+	if (!request_mem_region(mem_base, mem_size, DRV_NAME)) {
+		dev_err(&pdev->dev, "no memory region available\n");
+		goto err_out;
+	}
+
+	err = -ENOMEM;
+	ap->regs = ioremap(mem_base, mem_size);
+	if (!ap->regs) {
+		dev_err(&pdev->dev, "failed to map registers, aborting.\n");
+		goto err_out_release_mem;
+	}
+
+	/* no checksum support, hence no scatter/gather */
+	dev->features |= NETIF_F_HIGHDMA;
+
+	spin_lock_init(&ap->lock);
+
+	SET_NETDEV_DEV(dev, &pdev->dev);
+	ap->dev = dev;
+	ap->pdev = pdev;
+
+	/* init rx timeout (used for oom) */
+	init_timer(&ap->rx_timeout);
+	ap->rx_timeout.function = vmac_refill_rx_timer;
+	ap->rx_timeout.data = (unsigned long)dev;
+
+	netif_napi_add(dev, &ap->napi, vmac_poll, 2);
+	dev->netdev_ops = &vmac_netdev_ops;
+	dev->ethtool_ops = &vmac_ethtool_ops;
+	dev->irq = irq;
+
+	dev->flags |= IFF_MULTICAST;
+
+	dev->base_addr = (unsigned long)ap->regs;
+	ap->mem_base = mem_base;
+
+	/* prevent buffer chaining, favor speed over space */
+	ap->rx_skb_size = ETH_FRAME_LEN + VMAC_BUFFER_PAD;
+
+	/* private struct functional */
+
+	/* mac address intialize, set vmac_open  */
+	read_mac_reg(dev, dev->dev_addr);
+
+	if (!is_valid_ether_addr(dev->dev_addr))
+		random_ether_addr(dev->dev_addr);
+
+	err = register_netdev(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
+		goto err_out_iounmap;
+	}
+
+	dev_info(&pdev->dev, "ARC VMAC at 0x%08x irq %d %pM\n", mem_base,
+	    dev->irq, dev->dev_addr);
+	platform_set_drvdata(pdev, dev);
+
+	return 0;
+
+err_out_iounmap:
+	iounmap(ap->regs);
+err_out_release_mem:
+	release_mem_region(mem_base, mem_size);
+err_out:
+	free_netdev(dev);
+	return err;
+}
+
+static int __devexit vmac_remove(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct vmac_priv *ap;
+	struct resource *res;
+
+	dev = platform_get_drvdata(pdev);
+	if (!dev) {
+		dev_err(&pdev->dev, "%s no valid dev found\n", __func__);
+		return 0;
+	}
+
+	ap = netdev_priv(dev);
+
+	/* MAC */
+	unregister_netdev(dev);
+	iounmap(ap->regs);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	release_mem_region(res->start, resource_size(res));
+
+	platform_set_drvdata(pdev, NULL);
+	free_netdev(dev);
+	return 0;
+}
+
+static struct platform_driver arcvmac_driver = {
+	.probe		= vmac_probe,
+	.remove		= __devexit_p(vmac_remove),
+	.driver		= {
+		.name		= "arcvmac",
+	},
+};
+
+static int __init vmac_init(void)
+{
+	return platform_driver_register(&arcvmac_driver);
+}
+
+static void __exit vmac_exit(void)
+{
+	platform_driver_unregister(&arcvmac_driver);
+}
+
+module_init(vmac_init);
+module_exit(vmac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ARC VMAC Ethernet driver");
+MODULE_AUTHOR("amit.bhor@celunite.com, sameer.dhavale@celunite.com, andreas.fenkart@streamunlimited.com");
diff --git a/drivers/net/arcvmac.h b/drivers/net/arcvmac.h
new file mode 100644
index 0000000..44c0587
--- /dev/null
+++ b/drivers/net/arcvmac.h
@@ -0,0 +1,268 @@
+/*
+ * linux/arch/arc/drivers/arcvmac.h
+ *
+ * Copyright (C) 2003-2006 Codito Technologies, for linux-2.4 port
+ * Copyright (C) 2006-2007 Celunite Inc, for linux-2.6 port
+ * Copyright (C) 2007-2008 Sagem Communications, Fehmi HAFSI
+ * Copyright (C) 2009 Sagem Communications, Andreas Fenkart
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors: amit.bhor@celunite.com, sameer.dhavale@celunite.com
+ */
+
+#ifndef _ARCVMAC_H
+#define _ARCVMAC_H
+
+#define DRV_NAME		"arcvmac"
+#define DRV_VERSION		"1.0"
+
+/* Buffer descriptors */
+#define TX_BDT_LEN		16    /* Number of receive BD's */
+#define RX_BDT_LEN		256   /* Number of transmit BD's */
+
+/* BD poll rate, in 1024 cycles. @100Mhz: x * 1024 cy * 10ns = 1ms */
+#define POLLRATE_TIME		200
+
+/* next power of two, bigger than ETH_FRAME_LEN + VLAN  */
+#define MAX_RX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
+#define MAX_TX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
+
+/* 14 bytes of ethernet header, 4 bytes VLAN, FCS,
+ * plus extra pad to prevent buffer chaining of
+ * maximum sized ethernet packets (1514 bytes) */
+#define	VMAC_BUFFER_PAD		(ETH_HLEN + 4 + ETH_FCS_LEN + 4)
+
+/* VMAC register definitions, offsets in the ref manual are in bytes */
+#define ID_OFFSET		(0x00/0x4)
+#define STAT_OFFSET		(0x04/0x4)
+#define ENABLE_OFFSET		(0x08/0x4)
+#define CONTROL_OFFSET		(0x0c/0x4)
+#define POLLRATE_OFFSET		(0x10/0x4)
+#define RXERR_OFFSET		(0x14/0x4)
+#define MISS_OFFSET		(0x18/0x4)
+#define TXRINGPTR_OFFSET	(0x1c/0x4)
+#define RXRINGPTR_OFFSET	(0x20/0x4)
+#define ADDRL_OFFSET		(0x24/0x4)
+#define ADDRH_OFFSET		(0x28/0x4)
+#define LAFL_OFFSET		(0x2c/0x4)
+#define LAFH_OFFSET		(0x30/0x4)
+#define MDIO_DATA_OFFSET	(0x34/0x4)
+#define MAC_TXRING_HEAD_OFFSET	(0x38/0x4)
+#define MAC_RXRING_HEAD_OFFSET	(0x3C/0x4)
+
+/* STATUS and ENABLE register bit masks */
+#define TXINT_MASK		(1<<0)	/* Transmit interrupt */
+#define RXINT_MASK		(1<<1)	/* Receive interrupt */
+#define ERR_MASK		(1<<2)	/* Error interrupt */
+#define TXCH_MASK		(1<<3)	/* Transmit chaining error interrupt */
+#define MSER_MASK		(1<<4)	/* Missed packet counter error */
+#define RXCR_MASK		(1<<8)	/* RXCRCERR counter rolled over	 */
+#define RXFR_MASK		(1<<9)	/* RXFRAMEERR counter rolled over */
+#define RXFL_MASK		(1<<10)	/* RXOFLOWERR counter rolled over */
+#define MDIO_MASK		(1<<12)	/* MDIO complete */
+#define TXPL_MASK		(1<<31)	/* TXPOLL */
+
+/* CONTROL register bitmasks */
+#define EN_MASK			(1<<0)	/* VMAC enable */
+#define TXRN_MASK		(1<<3)	/* TX enable */
+#define RXRN_MASK		(1<<4)	/* RX enable */
+#define DSBC_MASK		(1<<8)	/* Disable receive broadcast */
+#define ENFL_MASK		(1<<10)	/* Enable Full Duplex */
+#define PROM_MASK		(1<<11)	/* Promiscuous mode */
+
+/* RXERR register bitmasks */
+#define RXERR_CRC		0x000000ff
+#define RXERR_FRM		0x0000ff00
+#define RXERR_OFLO		0x00ff0000 /* fifo overflow */
+
+/* MDIO data register bit masks */
+#define MDIO_SFD		0xC0000000
+#define MDIO_OP			0x30000000
+#define MDIO_ID_MASK		0x0F800000
+#define MDIO_REG_MASK		0x007C0000
+#define MDIO_TA			0x00030000
+#define MDIO_DATA_MASK		0x0000FFFF
+
+#define MDIO_BASE		0x40020000
+#define MDIO_OP_READ		0x20000000
+#define MDIO_OP_WRITE		0x10000000
+
+/* Buffer descriptor INFO bit masks */
+#define OWN_MASK		(1<<31)	/* ownership of buffer, 0 CPU, 1 DMA */
+#define BUFF			(1<<30) /* buffer invalid, rx */
+#define UFLO			(1<<29) /* underflow, tx */
+#define LTCL			(1<<28) /* late collision, tx  */
+#define RETRY_CT		(0xf<<24)  /* tx */
+#define DROP			(1<<23) /* drop, more than 16 retries, tx */
+#define DEFER			(1<<22) /* traffic on the wire, tx */
+#define CARLOSS			(1<<21) /* carrier loss while transmission, tx, rx? */
+/* 20:19 reserved */
+#define ADCR			(1<<18) /* add crc, ignored if not disaddcrc */
+#define LAST_MASK		(1<<17)	/* Last buffer in chain */
+#define FRST_MASK		(1<<16)	/* First buffer in chain */
+/* 15:11 reserved */
+#define LEN_MASK		0x000007FF
+
+#define ERR_MSK_TX		0x3fe00000 /* UFLO | LTCL | RTRY | DROP | DEFER | CRLS */
+
+
+/* arcvmac private data structures */
+struct vmac_buffer_desc {
+	unsigned int info;
+	dma_addr_t data;
+};
+
+struct dma_fifo {
+	int head; /* head */
+	int tail; /* tail */
+	int size;
+};
+
+struct	vmac_priv {
+	struct net_device *dev;
+	struct platform_device *pdev;
+	struct net_device_stats stats;
+
+	spinlock_t lock; /* TODO revisit */
+	struct completion mdio_complete;
+
+	/* base address of register set */
+	int *regs;
+	unsigned int mem_base;
+
+	/* DMA ring buffers */
+	struct vmac_buffer_desc *rxbd;
+	dma_addr_t rxbd_dma;
+
+	struct vmac_buffer_desc *txbd;
+	dma_addr_t txbd_dma;
+
+	/* socket buffers */
+	struct sk_buff *rx_skbuff[RX_BDT_LEN];
+	struct sk_buff *tx_skbuff[TX_BDT_LEN];
+	int rx_skb_size;
+
+	/* skb / dma desc managing */
+	struct dma_fifo rx_ring;
+	struct dma_fifo tx_ring;
+
+	/* descriptor last polled/processed by the VMAC */
+	unsigned long mac_rxring_head;
+	/* used when rx skb allocation failed, so we defer rx queue
+	 * refill */
+	struct timer_list rx_timeout;
+
+	/* lock rx_timeout against rx normal operation */
+	spinlock_t rx_lock;
+
+	struct napi_struct napi;
+
+	/* rx buffer chaining */
+	int rx_merge_error;
+	int tx_timeout_error;
+
+	/* PHY stuff */
+	struct mii_bus *mii_bus;
+	struct phy_device *phy_dev;
+
+	int link;
+	int speed;
+	int duplex;
+
+	/* debug */
+	int shutdown;
+};
+
+/* DMA ring management */
+
+/* for a fifo with size n,
+ * - [0..n] fill levels are n + 1 states
+ * - there are only n different deltas (head - tail) values
+ * => not all fill levels can be represented with head, tail
+ *    pointers only
+ * we give up the n fill level, aka fifo full */
+
+/* sacrifice one elt as a sentinel */
+static inline int fifo_used(struct dma_fifo *f);
+static inline int fifo_inc_ct(int ct, int size);
+static inline void fifo_dump(struct dma_fifo *fifo);
+
+static inline int fifo_empty(struct dma_fifo *f)
+{
+	return (f->head == f->tail);
+}
+
+static inline int fifo_free(struct dma_fifo *f)
+{
+	int free;
+
+	free = f->tail - f->head;
+	if (free <= 0)
+		free += f->size;
+
+	return free;
+}
+
+static inline int fifo_used(struct dma_fifo *f)
+{
+	int used;
+
+	used = f->head - f->tail;
+	if (used < 0)
+		used += f->size;
+
+	return used;
+}
+
+static inline int fifo_full(struct dma_fifo *f)
+{
+	return (fifo_used(f) + 1) == f->size;
+}
+
+/* manipulate */
+static inline void fifo_init(struct dma_fifo *fifo, int size)
+{
+	fifo->size = size;
+	fifo->head = fifo->tail = 0; /* empty */
+}
+
+static inline void fifo_inc_head(struct dma_fifo *fifo)
+{
+	BUG_ON(fifo_full(fifo));
+	fifo->head = fifo_inc_ct(fifo->head, fifo->size);
+}
+
+static inline void fifo_inc_tail(struct dma_fifo *fifo)
+{
+	BUG_ON(fifo_empty(fifo));
+	fifo->tail = fifo_inc_ct(fifo->tail, fifo->size);
+}
+
+/* internal funcs */
+static inline void fifo_dump(struct dma_fifo *fifo)
+{
+	printk(KERN_INFO "fifo: head %d, tail %d, size %d\n", fifo->head,
+			fifo->tail,
+			fifo->size);
+}
+
+static inline int fifo_inc_ct(int ct, int size)
+{
+	return (++ct == size) ? 0 : ct;
+}
+
+#endif	  /* _ARCVMAC_H */
-- 
1.7.1


^ permalink raw reply related

* [PATCH 0/1] ARC vmac ethernet driver.
From: Andreas Fenkart @ 2010-10-15  7:54 UTC (permalink / raw)
  To: netdev

This is a driver for the MAC IP block from Arc international. It
is based on an existing driver found in Arc linux distribution,
but essentially, it is a full rewrite. Influenced heavily by
dnet/bcm63xx_enet drivers. 

This is a resubmission, changes are:

- Kconfig dependencies, HAS_DMA/CRC32/PHYLIB/MII

- removed mac_addr module parameter

- rebased linux tree: 3b72090a7317a034d1276a8fbe3b68c3cb77bd92

regards

Andreas

^ permalink raw reply

* [PATCH net-next] can-raw: add msg_flags to distinguish local traffic
From: Kurt Van Dijck @ 2010-10-15  7:37 UTC (permalink / raw)
  To: netdev, socketcan-core-0fE9KPoRgkgATYTw5x5z8w; +Cc: Oliver Hartkopp

CAN has no addressing scheme. It is currently impossible
for userspace to tell is a received CAN frame comes from
another process on the local host, or from a remote CAN
device.
This patch add support for userspace applications to distinguish
between 'own', 'local' and 'remote' CAN traffic.
Distinction is made by returning some flags in msg->msg_flags
in the call to recvmsg.
MSG_CONFIRM flag means 'own', as in 'transmission confirmation'
MSG_DONTROUTE flag means 'local', not routed.
Obviously, msgs with MSG_CONFIRM will have MSG_DONTROUTE set too.

Please note that on SocketCAN mailing list, different opinions
exist on the exact meaning of MSG_DONTROUTE. Better (=more
intuitive) alternatives are appreciated.

Signed-off-by: Kurt Van Dijck <kurt.van.dijck-AgBVmzD5pcezQB+pC5nmwQ@public.gmane.org>
---
 net/can/raw.c |   33 ++++++++++++++++++++++++++++++---
 1 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/net/can/raw.c b/net/can/raw.c
index 7d77e67..f98709e 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -90,23 +90,39 @@ struct raw_sock {
 	can_err_mask_t err_mask;
 };
 
+/*
+ * return some space to store extra msg flags in.
+ * We use 1 int beyond the 'struct sockaddr_can' in skb->cb
+ * to store those.
+ * These flags will be use in raw_recvmsg()
+ */
+static inline int *raw_flags(struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb)
+			<= (sizeof(struct sockaddr_can) + sizeof(int)));
+	/* return pointer after struct sockaddr_can */
+	return (int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
 static inline struct raw_sock *raw_sk(const struct sock *sk)
 {
 	return (struct raw_sock *)sk;
 }
 
-static void raw_rcv(struct sk_buff *skb, void *data)
+static void raw_rcv(struct sk_buff *oskb, void *data)
 {
 	struct sock *sk = (struct sock *)data;
 	struct raw_sock *ro = raw_sk(sk);
 	struct sockaddr_can *addr;
+	struct sk_buff *skb;
+	int *pflags;
 
 	/* check the received tx sock reference */
-	if (!ro->recv_own_msgs && skb->sk == sk)
+	if (!ro->recv_own_msgs && oskb->sk == sk)
 		return;
 
 	/* clone the given skb to be able to enqueue it into the rcv queue */
-	skb = skb_clone(skb, GFP_ATOMIC);
+	skb = skb_clone(oskb, GFP_ATOMIC);
 	if (!skb)
 		return;
 
@@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data)
 	addr->can_family  = AF_CAN;
 	addr->can_ifindex = skb->dev->ifindex;
 
+	/* prepare the flags for raw_recvmsg() */
+	pflags = raw_flags(skb);
+	*pflags = 0;
+	if (oskb->sk)
+		*pflags |= MSG_DONTROUTE;
+	if (oskb->sk == sk)
+		*pflags |= MSG_CONFIRM;
+
 	if (sock_queue_rcv_skb(sk, skb) < 0)
 		kfree_skb(skb);
 }
@@ -707,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock,
 		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
 	}
 
+	/* assign the flags that have been recorded in in raw_rcv() */
+	msg->msg_flags |= *(raw_flags(skb));
+
 	skb_free_datagram(sk, skb);
 
 	return size;

^ permalink raw reply related

* Re: [GIT PATCH] ioat2: fix performance regression
From: Linus Walleij @ 2010-10-15  7:15 UTC (permalink / raw)
  To: Dan Williams
  Cc: akpm@linux-foundation.org, torvalds@linux-foundation.org,
	Dave Jiang, netdev@vger.kernel.org, Maciej Sosnowski,
	Jesse Brandeburg, linux-kernel@vger.kernel.org, stable@kernel.org,
	Richard Scobie
In-Reply-To: <20101013230248.32594.35857.stgit@localhost6.localdomain6>

Dan Williams wrote:

> Commit 0793448 "DMAENGINE: generic channel status v2" changed the interface for
> how dma channel progress is retrieved.  It inadvertently exported an internal
> helper function ioat_tx_status() instead of ioat_dma_tx_status().

Acked-by: Linus Walleij <linus.walleij@stericsson.com>

Yours,
Linus Walleij

^ permalink raw reply

* Re: -j MARK in raw vs. mangle (was Re: xfrm by MARK: tcp problems when mark for in and out differ)
From: Patrick McHardy @ 2010-10-15  7:04 UTC (permalink / raw)
  To: Gerd v. Egidy; +Cc: hadi, netfilter-devel, netdev
In-Reply-To: <201010141616.58795.lists@egidy.de>

On 14.10.2010 16:16, Gerd v. Egidy wrote:
> Hi Jamal,
> 
> thanks for your help.
> 
>>> So it seems like the fl->mark is never initialized with the packet mark
>>> in the first place. What would be the correct stage in the kernel
>>> network stack to do that?
>>
>> Can you try a simple setup without xfrm/ipsec and see if this reverse
>> path works? Was there a kernel where it worked?
> 
> I just tried opening a simple tcp connection without any xfrm or other weird 
> stuff. I just had one iptables rule in place:
> 
> -t raw -A OUTPUT -d 192.168.5.200 -j MARK --set-mark 99
> 
> 192.168.5.200 is the other system I open the tcp connection from. So this 
> should mark all response packets to the client.
> 
> But the moment __xfrm_lookup is called (this is where my debug printk sits), 
> fl->mark is always 0.
> 
> By chance I changed the rule over to the mangle table:
> 
> -t mangle -A OUTPUT -d 192.168.5.200 -j MARK --set-mark 99
> 
> Now it works, the mark in the flow is 99!
> 
> So it seems this has nothing to do with xfrm, but that the MARK target has 
> different effects when used in raw than in mangle. I was using raw because I 
> had to set conntrack zones too and it was more conveniant to do both in one 
> place.
> 
> Can one of the netfilter guys comment on this? Is using MARK in raw not fully 
> supported or has known deficiencies?

No, the problem is most likely that for outgoing packets, the XFRM
lookup is done with the route lookup before the packet is even sent,
so once it hits the raw or mangle table, it is too late. mangle however
performs rerouting when the mark value changes, at which point a new
XFRM lookup is performed.


^ permalink raw reply

* [PATCH net-next 1/3] fib_hash: embed initial hash table in fn_zone
From: Eric Dumazet @ 2010-10-15  6:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <1287084159.2659.0.camel@edumazet-laptop>

While looking for false sharing problems, I noticed 
sizeof(struct fn_zone) was small (28 bytes) and possibly sharing a cache
line with an often written kernel structure.

Most of the time, fn_zone uses its initial hash table of 16 slots.

We can avoid the false sharing problem by embedding this initial hash
table in fn_zone itself, so that sizeof(fn_zone) > L1_CACHE_BYTES

We did a similar optimization in commit a6501e080c (Reduce memory needs
and speedup lookups)

Add a fz_revorder field to speedup fn_hash() a bit.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/fib_hash.c |   52 ++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 83cca68..10001aa 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,23 +54,23 @@ struct fib_node {
 	struct fib_alias        fn_embedded_alias;
 };
 
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
+
 struct fn_zone {
 	struct fn_zone		*fz_next;	/* Next not empty zone	*/
 	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
-	int			fz_nent;	/* Number of entries	*/
-
-	int			fz_divisor;	/* Hash divisor		*/
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
-#define FZ_HASHMASK(fz)		((fz)->fz_hashmask)
 
-	int			fz_order;	/* Zone order		*/
-	__be32			fz_mask;
+	u8			fz_order;	/* Zone order (0..32)	*/
+	u8			fz_revorder;	/* 32 - fz_order	*/
+	__be32			fz_mask;	/* inet_make_mask(order) */
 #define FZ_MASK(fz)		((fz)->fz_mask)
-};
 
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+	int			fz_nent;	/* Number of entries	*/
+	int			fz_divisor;	/* Hash size (mask+1)	*/
+};
 
 struct fn_hash {
 	struct fn_zone	*fn_zones[33];
@@ -79,11 +79,11 @@ struct fn_hash {
 
 static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
 {
-	u32 h = ntohl(key)>>(32 - fz->fz_order);
+	u32 h = ntohl(key) >> fz->fz_revorder;
 	h ^= (h>>20);
 	h ^= (h>>10);
 	h ^= (h>>5);
-	h &= FZ_HASHMASK(fz);
+	h &= fz->fz_hashmask;
 	return h;
 }
 
@@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	int old_divisor, new_divisor;
 	u32 new_hashmask;
 
-	old_divisor = fz->fz_divisor;
+	new_divisor = old_divisor = fz->fz_divisor;
 
 	switch (old_divisor) {
-	case 16:
-		new_divisor = 256;
+	case EMBEDDED_HASH_SIZE:
+		new_divisor *= EMBEDDED_HASH_SIZE;
 		break;
-	case 256:
-		new_divisor = 1024;
+	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+		new_divisor *= (EMBEDDED_HASH_SIZE/2);
 		break;
 	default:
 		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -184,7 +184,8 @@ static void fn_rehash_zone(struct fn_zone *fz)
 		fib_hash_genid++;
 		write_unlock_bh(&fib_hash_lock);
 
-		fz_hash_free(old_ht, old_divisor);
+		if (old_ht != fz->fz_embedded_hash)
+			fz_hash_free(old_ht, old_divisor);
 	}
 }
 
@@ -210,18 +211,11 @@ fn_new_zone(struct fn_hash *table, int z)
 	if (!fz)
 		return NULL;
 
-	if (z) {
-		fz->fz_divisor = 16;
-	} else {
-		fz->fz_divisor = 1;
-	}
-	fz->fz_hashmask = (fz->fz_divisor - 1);
-	fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
-	if (!fz->fz_hash) {
-		kfree(fz);
-		return NULL;
-	}
+	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+	fz->fz_hashmask = fz->fz_divisor - 1;
+	fz->fz_hash = fz->fz_embedded_hash;
 	fz->fz_order = z;
+	fz->fz_revorder = 32 - z;
 	fz->fz_mask = inet_make_mask(z);
 
 	/* Find the first not empty zone with more specific mask */



^ permalink raw reply related

* [PATCH net-next 3/3] fib_hash: RCU conversion  phase 2
From: Eric Dumazet @ 2010-10-15  6:56 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Get rid of fib_hash_lock rwlock.

The fn_zone hash table resize is the noticeable part of this patch.

I added a seqlock per fn_zone, so that readers can restart their lookup
in the (very rare) case a writer expanded the hash table.

Add rcu heads in fib_alias and fib_node, use call_rcu() to defer their
freeing, and use appropriate _rcu list manipulations.

Stress test (160.000.000 udp frames sent, IP route cache disabled to
mimic DDOS attack, FIB_HASH)

Before:
real	0m41.191s
user	0m13.137s
sys	8m55.241s

After:
real	0m38.091s
user	0m13.189s
sys	7m53.018s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/fib_hash.c   |  176 ++++++++++++++++++++++------------------
 net/ipv4/fib_lookup.h |    2 
 2 files changed, 101 insertions(+), 77 deletions(-)

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 04f05a9..4f1aafd 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -58,7 +58,8 @@ struct fib_node {
 
 struct fn_zone {
 	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
-	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
+	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
+	seqlock_t		fz_lock;
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
 
 	u8			fz_order;	/* Zone order (0..32)	*/
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
 	return dst & FZ_MASK(fz);
 }
 
-static DEFINE_RWLOCK(fib_hash_lock);
 static unsigned int fib_hash_genid;
 
 #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
 {
 	unsigned long size = divisor * sizeof(struct hlist_head);
 
-	if (size <= PAGE_SIZE) {
+	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
-	} else {
-		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-	}
+
+	return (struct hlist_head *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
 }
 
 /* The fib hash lock must be held when this is called. */
@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
 		struct fib_node *f;
 
 		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
-			struct hlist_head *new_head;
+			struct hlist_head __rcu *new_head;
 
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 
 			new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
-			hlist_add_head(&f->fn_hash, new_head);
+			hlist_add_head_rcu(&f->fn_hash, new_head);
 		}
 	}
 }
@@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	ht = fz_hash_alloc(new_divisor);
 
 	if (ht)	{
-		write_lock_bh(&fib_hash_lock);
+		struct fn_zone nfz;
+
+		memcpy(&nfz, fz, sizeof(nfz));
+
+		write_seqlock_bh(&fz->fz_lock);
 		old_ht = fz->fz_hash;
-		fz->fz_hash = ht;
+		nfz.fz_hash = ht;
+		nfz.fz_hashmask = new_hashmask;
+		nfz.fz_divisor = new_divisor;
+		fn_rebuild_zone(&nfz, old_ht, old_divisor);
+		fib_hash_genid++;
+		rcu_assign_pointer(fz->fz_hash, ht);
 		fz->fz_hashmask = new_hashmask;
 		fz->fz_divisor = new_divisor;
-		fn_rebuild_zone(fz, old_ht, old_divisor);
-		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
+		write_sequnlock_bh(&fz->fz_lock);
 
-		if (old_ht != fz->fz_embedded_hash)
+		if (old_ht != fz->fz_embedded_hash) {
+			synchronize_rcu();
 			fz_hash_free(old_ht, old_divisor);
+		}
 	}
 }
 
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
 {
+	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
 	kmem_cache_free(fn_hash_kmem, f);
 }
 
+static inline void fn_free_node(struct fib_node *f)
+{
+	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+	kmem_cache_free(fn_alias_kmem, fa);
+}
+
 static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
 {
 	fib_release_info(fa->fa_info);
 	if (fa == &f->fn_embedded_alias)
 		fa->fa_info = NULL;
 	else
-		kmem_cache_free(fn_alias_kmem, fa);
+		call_rcu(&fa->rcu, fn_free_alias_rcu);
 }
 
 static struct fn_zone *
@@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z)
 	if (!fz)
 		return NULL;
 
+	seqlock_init(&fz->fz_lock);
 	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
 	fz->fz_hashmask = fz->fz_divisor - 1;
 	fz->fz_hash = fz->fz_embedded_hash;
@@ -246,30 +269,34 @@ int fib_table_lookup(struct fib_table *tb,
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	for (fz = rcu_dereference(t->fn_zone_list);
 	     fz != NULL;
 	     fz = rcu_dereference(fz->fz_next)) {
-		struct hlist_head *head;
+		struct hlist_head __rcu *head;
 		struct hlist_node *node;
 		struct fib_node *f;
-		__be32 k = fz_key(flp->fl4_dst, fz);
+		__be32 k;
+		unsigned int seq;
 
-		head = &fz->fz_hash[fn_hash(k, fz)];
-		hlist_for_each_entry(f, node, head, fn_hash) {
-			if (f->fn_key != k)
-				continue;
+		do {
+			seq = read_seqbegin(&fz->fz_lock);
+			k = fz_key(flp->fl4_dst, fz);
+
+			head = &fz->fz_hash[fn_hash(k, fz)];
+			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+				if (f->fn_key != k)
+					continue;
 
-			err = fib_semantic_match(&f->fn_alias,
+				err = fib_semantic_match(&f->fn_alias,
 						 flp, res,
 						 fz->fz_order, fib_flags);
-			if (err <= 0)
-				goto out;
-		}
+				if (err <= 0)
+					goto out;
+			}
+		} while (read_seqretry(&fz->fz_lock, seq));
 	}
 	err = 1;
 out:
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 	return err;
 }
@@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
 	last_resort = NULL;
 	order = -1;
 
-	read_lock(&fib_hash_lock);
-	hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			struct fib_info *next_fi = fa->fa_info;
 
 			if (fa->fa_scope != res->scope ||
@@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb,
 		fib_result_assign(res, last_resort);
 	tb->tb_default = last_idx;
 out:
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 /* Insert node F to FZ. */
@@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
 {
 	struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
 
-	hlist_add_head(&f->fn_hash, head);
+	hlist_add_head_rcu(&f->fn_hash, head);
 }
 
 /* Return the node in FZ matching KEY. */
@@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	struct hlist_node *node;
 	struct fib_node *f;
 
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		if (f->fn_key == key)
 			return f;
 	}
@@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+	struct fib_alias *fa = &f->fn_embedded_alias;
+
+	if (fa->fa_info != NULL)
+		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+	return fa;
+}
+
 /* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
@@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		}
 
 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
-			struct fib_info *fi_drop;
 			u8 state;
 
 			fa = fa_first;
@@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 					err = 0;
 				goto out;
 			}
-			write_lock_bh(&fib_hash_lock);
-			fi_drop = fa->fa_info;
-			fa->fa_info = fi;
-			fa->fa_type = cfg->fc_type;
-			fa->fa_scope = cfg->fc_scope;
+			err = -ENOBUFS;
+			new_fa = fib_fast_alloc(f);
+			if (new_fa == NULL)
+				goto out;
+
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			new_fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
-			fa->fa_state &= ~FA_S_ACCESSED;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			fib_hash_genid++;
-			write_unlock_bh(&fib_hash_lock);
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
-			fib_release_info(fi_drop);
+			fn_free_alias(fa, f);
 			if (state & FA_S_ACCESSED)
 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-			rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
-				  &cfg->fc_nlinfo, NLM_F_REPLACE);
+			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 			return 0;
 		}
 
@@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		f = new_f;
 	}
 
-	new_fa = &f->fn_embedded_alias;
-	if (new_fa->fa_info != NULL) {
-		new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
-		if (new_fa == NULL)
-			goto out;
-	}
+	new_fa = fib_fast_alloc(f);
+	if (new_fa == NULL)
+		goto out;
+
 	new_fa->fa_info = fi;
 	new_fa->fa_tos = tos;
 	new_fa->fa_type = cfg->fc_type;
@@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	 * Insert new entry to the list.
 	 */
 
-	write_lock_bh(&fib_hash_lock);
 	if (new_f)
 		fib_insert_node(fz, new_f);
-	list_add_tail(&new_fa->fa_list,
+	list_add_tail_rcu(&new_fa->fa_list,
 		 (fa ? &fa->fa_list : &f->fn_alias));
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 
 	if (new_f)
 		fz->fz_nent++;
@@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 			  tb->tb_id, &cfg->fc_nlinfo, 0);
 
 		kill_fn = 0;
-		write_lock_bh(&fib_hash_lock);
-		list_del(&fa->fa_list);
+		list_del_rcu(&fa->fa_list);
 		if (list_empty(&f->fn_alias)) {
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 			kill_fn = 1;
 		}
 		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
 
 		if (fa->fa_state & FA_S_ACCESSED)
 			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 			struct fib_info *fi = fa->fa_info;
 
 			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-				write_lock_bh(&fib_hash_lock);
-				list_del(&fa->fa_list);
+				list_del_rcu(&fa->fa_list);
 				if (list_empty(&f->fn_alias)) {
-					hlist_del(&f->fn_hash);
+					hlist_del_rcu(&f->fn_hash);
 					kill_f = 1;
 				}
 				fib_hash_genid++;
-				write_unlock_bh(&fib_hash_lock);
 
 				fn_free_alias(fa, f);
 				found++;
@@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 
 	s_i = cb->args[4];
 	i = 0;
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			if (i < s_i)
 				goto next;
 
@@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 				cb->args[4] = i;
 				return -1;
 			}
-		next:
+next:
 			i++;
 		}
 	}
@@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 
 	s_m = cb->args[2];
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	for (fz = rcu_dereference(table->fn_zone_list);
 	     fz != NULL;
 	     fz = rcu_dereference(fz->fz_next), m++) {
@@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 			continue;
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 			cb->args[2] = m;
-			read_unlock(&fib_hash_lock);
 			rcu_read_unlock();
 			return -1;
 		}
 		memset(&cb->args[3], 0,
 		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 	cb->args[2] = m;
 	return skb->len;
@@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(fib_hash_lock)
 	__acquires(RCU)
 {
 	void *v = NULL;
 
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 	return v;
@@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void fib_seq_stop(struct seq_file *seq, void *v)
-	__releases(fib_hash_lock)
 	__releases(RCU)
 {
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 }
 
 static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 {
 	static const unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT, [8] = RTF_REJECT,
+		[7] = RTF_REJECT,
+		[8] = RTF_REJECT,
 	};
 	unsigned flags = type2flags[type];
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b9c9a9f..5072d8e 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,9 +12,7 @@ struct fib_alias {
 	u8			fa_type;
 	u8			fa_scope;
 	u8			fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
 	struct rcu_head		rcu;
-#endif
 };
 
 #define FA_S_ACCESSED	0x01



^ permalink raw reply related

* [PATCH net-next 2/3] fib_hash: RCU conversion phase 1
From: Eric Dumazet @ 2010-10-15  6:53 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

First step for RCU conversion of fib_hash :

struct fn_zone are created and never deleted.

Very classic conversion, using rcu_assign_pointer(), rcu_dereference()
and rtnl_dereference() verbs.

__rcu markers on fz_next and fn_zone_list

They are created under RTNL, we dont need fib_hash_lock anymore in
fn_new_zone().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/fib_hash.c |   57 ++++++++++++++++++++++++++++--------------
 1 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 10001aa..04f05a9 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -57,7 +57,7 @@ struct fib_node {
 #define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
 
 struct fn_zone {
-	struct fn_zone		*fz_next;	/* Next not empty zone	*/
+	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
 	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
 
@@ -73,8 +73,8 @@ struct fn_zone {
 };
 
 struct fn_hash {
-	struct fn_zone	*fn_zones[33];
-	struct fn_zone	*fn_zone_list;
+	struct fn_zone		*fn_zones[33];
+	struct fn_zone __rcu	*fn_zone_list;
 };
 
 static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
@@ -219,21 +219,21 @@ fn_new_zone(struct fn_hash *table, int z)
 	fz->fz_mask = inet_make_mask(z);
 
 	/* Find the first not empty zone with more specific mask */
-	for (i=z+1; i<=32; i++)
+	for (i = z + 1; i <= 32; i++)
 		if (table->fn_zones[i])
 			break;
-	write_lock_bh(&fib_hash_lock);
-	if (i>32) {
+	if (i > 32) {
 		/* No more specific masks, we are the first. */
-		fz->fz_next = table->fn_zone_list;
-		table->fn_zone_list = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zone_list));
+		rcu_assign_pointer(table->fn_zone_list, fz);
 	} else {
-		fz->fz_next = table->fn_zones[i]->fz_next;
-		table->fn_zones[i]->fz_next = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zones[i]->fz_next));
+		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
 	}
 	table->fn_zones[z] = fz;
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 	return fz;
 }
 
@@ -245,8 +245,11 @@ int fib_table_lookup(struct fib_table *tb,
 	struct fn_zone *fz;
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
-	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+	for (fz = rcu_dereference(t->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next)) {
 		struct hlist_head *head;
 		struct hlist_node *node;
 		struct fib_node *f;
@@ -267,6 +270,7 @@ int fib_table_lookup(struct fib_table *tb,
 	err = 1;
 out:
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -362,6 +366,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
+/* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -657,13 +662,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 	return found;
 }
 
+/* caller must hold RTNL. */
 int fib_table_flush(struct fib_table *tb)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fn_zone *fz;
 	int found = 0;
 
-	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+	for (fz = rtnl_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rtnl_dereference(fz->fz_next)) {
 		int i;
 
 		for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -741,23 +749,29 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
-	int m, s_m;
+	int m = 0, s_m;
 	struct fn_zone *fz;
 	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 
 	s_m = cb->args[2];
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
-	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
-		if (m < s_m) continue;
+	for (fz = rcu_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next), m++) {
+		if (m < s_m)
+			continue;
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 			cb->args[2] = m;
 			read_unlock(&fib_hash_lock);
+			rcu_read_unlock();
 			return -1;
 		}
 		memset(&cb->args[3], 0,
 		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	cb->args[2] = m;
 	return skb->len;
 }
@@ -820,8 +834,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
 	iter->genid	= fib_hash_genid;
 	iter->valid	= 1;
 
-	for (iter->zone = table->fn_zone_list; iter->zone;
-	     iter->zone = iter->zone->fz_next) {
+	for (iter->zone = rcu_dereference(table->fn_zone_list);
+	     iter->zone != NULL;
+	     iter->zone = rcu_dereference(iter->zone->fz_next)) {
 		int maxslot;
 
 		if (!iter->zone->fz_nent)
@@ -906,7 +921,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
 			}
 		}
 
-		iter->zone = iter->zone->fz_next;
+		iter->zone = rcu_dereference(iter->zone->fz_next);
 
 		if (!iter->zone)
 			goto out;
@@ -946,9 +961,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(fib_hash_lock)
+	__acquires(RCU)
 {
 	void *v = NULL;
 
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
 	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -963,8 +980,10 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static void fib_seq_stop(struct seq_file *seq, void *v)
 	__releases(fib_hash_lock)
+	__releases(RCU)
 {
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)



^ permalink raw reply related

* Re: tbf/htb qdisc limitations
From: Eric Dumazet @ 2010-10-15  6:44 UTC (permalink / raw)
  To: Bill Fink; +Cc: Jarek Poplawski, Rick Jones, Steven Brudenell, netdev
In-Reply-To: <20101015023749.f085006b.billfink@mindspring.com>

Le vendredi 15 octobre 2010 à 02:37 -0400, Bill Fink a écrit :
> On Thu, 14 Oct 2010, Jarek Poplawski wrote:
> 
> > On Thu, Oct 14, 2010 at 08:09:39AM +0000, Jarek Poplawski wrote:
> > > On Thu, Oct 14, 2010 at 03:13:54AM -0400, Bill Fink wrote:
> > > > TSO/GSO was disabled and was using 9000-byte jumbo frames
> > > > (and specified mtu 9000 to tc command).
> > > > 
> > > > Here was one attempt I made using tbf:
> > > > 
> > > > tc qdisc add dev eth2 root handle 1: prio
> > > > tc qdisc add dev eth2 parent 1:1 handle 10: tbf rate 8900mbit buffer 1112500 limit 10000 mtu 9000
> > > > tc filter add dev eth2 protocol ip parent 1: prio 1 u32 match ip dst 192.168.1.23 flowid 10:1
> > > > 
> > > > I tried many variations of the above, all without success.
> > > 
> > > The main problem are smaller packets. If you had (almost) only 9000b
> > > frames this probably could work. [...]
> > 
> > On the other hand, e.g. the limit above seems too low wrt mtu & rate.
> 
> Actually, I discovered my commands above work just fine on
> a 2.6.35 box:
> 
> i7test7% nuttcp -T10 -i1 192.168.1.17
>  1045.3125 MB /   1.00 sec = 8768.3573 Mbps     0 retrans
>  1045.6875 MB /   1.00 sec = 8772.0292 Mbps     0 retrans
>  1049.5625 MB /   1.00 sec = 8804.2627 Mbps     0 retrans
>  1043.1875 MB /   1.00 sec = 8750.9960 Mbps     0 retrans
>  1048.6875 MB /   1.00 sec = 8796.3246 Mbps     0 retrans
>  1033.4375 MB /   1.00 sec = 8669.3188 Mbps     0 retrans
>  1040.7500 MB /   1.00 sec = 8730.7057 Mbps     0 retrans
>  1047.0000 MB /   1.00 sec = 8783.2063 Mbps     0 retrans
>  1040.0000 MB /   1.00 sec = 8724.0564 Mbps     0 retrans
>  1037.4375 MB /   1.00 sec = 8702.5434 Mbps     0 retrans
> 
> 10431.5608 MB /  10.00 sec = 8749.7542 Mbps 25 %TX 35 %RX 0 retrans 0.11 msRTT
> 
> The problems I encountered were on a field system running
> 2.6.30.10.  I will investigate upgrading the field system
> to 2.6.35.
> 

Yes, I noticed same thing for me on net-next-2.6 

Please report :

tc -s -d qdisc



^ permalink raw reply

* Re: tbf/htb qdisc limitations
From: Bill Fink @ 2010-10-15  6:37 UTC (permalink / raw)
  To: Jarek Poplawski; +Cc: Rick Jones, Steven Brudenell, netdev
In-Reply-To: <20101014085005.GA8349@ff.dom.local>

On Thu, 14 Oct 2010, Jarek Poplawski wrote:

> On Thu, Oct 14, 2010 at 08:09:39AM +0000, Jarek Poplawski wrote:
> > On Thu, Oct 14, 2010 at 03:13:54AM -0400, Bill Fink wrote:
> > > TSO/GSO was disabled and was using 9000-byte jumbo frames
> > > (and specified mtu 9000 to tc command).
> > > 
> > > Here was one attempt I made using tbf:
> > > 
> > > tc qdisc add dev eth2 root handle 1: prio
> > > tc qdisc add dev eth2 parent 1:1 handle 10: tbf rate 8900mbit buffer 1112500 limit 10000 mtu 9000
> > > tc filter add dev eth2 protocol ip parent 1: prio 1 u32 match ip dst 192.168.1.23 flowid 10:1
> > > 
> > > I tried many variations of the above, all without success.
> > 
> > The main problem are smaller packets. If you had (almost) only 9000b
> > frames this probably could work. [...]
> 
> On the other hand, e.g. the limit above seems too low wrt mtu & rate.

Actually, I discovered my commands above work just fine on
a 2.6.35 box:

i7test7% nuttcp -T10 -i1 192.168.1.17
 1045.3125 MB /   1.00 sec = 8768.3573 Mbps     0 retrans
 1045.6875 MB /   1.00 sec = 8772.0292 Mbps     0 retrans
 1049.5625 MB /   1.00 sec = 8804.2627 Mbps     0 retrans
 1043.1875 MB /   1.00 sec = 8750.9960 Mbps     0 retrans
 1048.6875 MB /   1.00 sec = 8796.3246 Mbps     0 retrans
 1033.4375 MB /   1.00 sec = 8669.3188 Mbps     0 retrans
 1040.7500 MB /   1.00 sec = 8730.7057 Mbps     0 retrans
 1047.0000 MB /   1.00 sec = 8783.2063 Mbps     0 retrans
 1040.0000 MB /   1.00 sec = 8724.0564 Mbps     0 retrans
 1037.4375 MB /   1.00 sec = 8702.5434 Mbps     0 retrans

10431.5608 MB /  10.00 sec = 8749.7542 Mbps 25 %TX 35 %RX 0 retrans 0.11 msRTT

The problems I encountered were on a field system running
2.6.30.10.  I will investigate upgrading the field system
to 2.6.35.

					-Bill

^ permalink raw reply

* [net-next-2.6 PATCH] ixgbe: DCB: remove DCB check config
From: Jeff Kirsher @ 2010-10-15  5:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, gospo, bphilips, John Fastabend, Jeff Kirsher

From: John Fastabend <john.r.fastabend@intel.com>

Remove a DCB check config from DCB configuration we
continue to configure DCB even if it fails so don't
even bother to check.  Plus user space (lldpad) checks
this before programming the hw anyways.

Worse case is we program some values into the hw that
don't make total sense resulting in incorrect bandwidth
allocation.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 drivers/net/ixgbe/ixgbe_main.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 95dbf60..790a0da 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -3374,7 +3374,6 @@ static void ixgbe_configure_dcb(struct ixgbe_adapter *adapter)
 	if (hw->mac.type == ixgbe_mac_82598EB)
 		netif_set_gso_max_size(adapter->netdev, 32768);
 
-	ixgbe_dcb_check_config(&adapter->dcb_cfg);
 	ixgbe_dcb_calculate_tc_credits(&adapter->dcb_cfg, DCB_TX_CONFIG);
 	ixgbe_dcb_calculate_tc_credits(&adapter->dcb_cfg, DCB_RX_CONFIG);
 


^ permalink raw reply related

* ixgbe: net-next build breakage
From: Harvey Harrison @ 2010-10-15  4:33 UTC (permalink / raw)
  To: emil.s.tantilov
  Cc: jeffrey.t.kirsher, stephen.s.ko, shemminger, David Miller, netdev

commit f32f837b75233588cd4f8542214a30915ab7847b
Author: Emil Tantilov <emil.s.tantilov@intel.com>
Date:   Tue Oct 12 22:20:34 2010 +0000

    ixgbe: remove unused functions


Breaks the build:
drivers/net/ixgbe/ixgbe_main.c: In function ‘ixgbe_configure_dcb’:
drivers/net/ixgbe/ixgbe_main.c:3377: error: implicit declaration of
function ‘ixgbe_dcb_check_config’
make[2]: *** [drivers/net/ixgbe/ixgbe_main.o] Error 1
make[1]: *** [drivers/net/ixgbe] Error 2
make: *** [drivers/net/] Error 2

As ixgbe_dcb_check_config no longer exists in the tree.

Cheers,

Harvey

^ permalink raw reply

* [PATCHv2 net-2.6] r6040: Fix multicast filter some more
From: Ben Hutchings @ 2010-10-15  3:41 UTC (permalink / raw)
  To: David Miller; +Cc: Florian Fainelli, netdev
In-Reply-To: <1287112636.20865.13.camel@localhost>

This code has been broken forever, but in several different and
creative ways.

So far as I can work out, the R6040 MAC filter has 4 exact-match
entries, the first of which the driver uses for its assigned unicast
address, plus a 64-entry hash-based filter for multicast addresses
(maybe unicast as well?).

The original version of this code would write the first 4 multicast
addresses as exact-match entries from offset 1 (bug #1: there is no
entry 4 so this could write to some PHY registers).  It would fill the
remainder of the exact-match entries with the broadcast address (bug #2:
this would overwrite the last used entry).  If more than 4 multicast
addresses were configured, it would set up the hash table, write some
random crap to the MAC control register (bug #3) and finally walk off
the end of the list when filling the exact-match entries (bug #4).

All of this seems to be pointless, since it sets the promiscuous bit
when the interface is made promiscuous or if >4 multicast addresses
are enabled, and never clears it (bug #5, masking bug #2).

The recent(ish) changes to the multicast list fixed bug #4, but
completely removed the limit on iteration over the exact-match entries
(bug #6).

Bug #4 was reported as
<https://bugzilla.kernel.org/show_bug.cgi?id=15355> and more recently
as <http://bugs.debian.org/600155>.  Florian Fainelli attempted to fix
these in commit 3bcf8229a8c49769e48d3e0bd1e20d8e003f8106, but that
actually dealt with bugs #1-3, bug #4 having been fixed in mainline at
that point.

That commit fixes the most important current bug #6.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@kernel.org [2.6.35 only]
---
Commit message was slightly mangled in the previous message.

Ben.

 drivers/net/r6040.c |   22 ++++++++++++----------
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 142c381..80666f0 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -893,16 +893,18 @@ static void r6040_multicast_list(struct net_device *dev)
 	/* Multicast Address 1~4 case */
 	i = 0;
 	netdev_for_each_mc_addr(ha, dev) {
-		if (i < MCAST_MAX) {
-			adrp = (u16 *) ha->addr;
-			iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
-			iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
-			iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
-		} else {
-			iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
-		}
+		if (i >= MCAST_MAX)
+			break;
+		adrp = (u16 *) ha->addr;
+		iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
+		iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
+		iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
+		i++;
+	}
+	while (i < MCAST_MAX) {
+		iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
+		iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
+		iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
 		i++;
 	}
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-2.6] r6040: Fix multicast filter some more
From: Ben Hutchings @ 2010-10-15  3:17 UTC (permalink / raw)
  To: David Miller, Florian Fainelli; +Cc: netdev

This code has been broken forever, but in several different and
creative ways.

So far as I can work out, the R6040 MAC filter has 4 exact-match
entries, the first of which the driver uses for its assigned unicast
address, plus a 64-entry hash-based filter for multicast addresses
(maybe unicast as well?).

The original version of this code would write the first 4 multicast
addresses as exact-match entries from offset 1 (bug #1: there is no
entry 4 so this could write to some PHY registers).  It would fill the
remainder of the exact-match entries with the broadcast address (bug
addresses were configured, it would set up the hash table, write some
random crap to the MAC control register (bug #3) and finally walk off
the end of the list when filling the exact-match entries (bug #4).

All of this seems to be pointless, since it sets the promiscuous bit
when the interface is made promiscuous or if >4 multicast addresses
are enabled, and never clears it (bug #5, masking bug #2).

The recent(ish) changes to the multicast list fixed bug #4, but
completely removed the limit on iteration over the exact-match entries
(bug #6).

Bug #4 was reported as
<https://bugzilla.kernel.org/show_bug.cgi?id=15355> and more recently
as <http://bugs.debian.org/600155>.  Florian Fainelli attempted to fix
these in commit 3bcf8229a8c49769e48d3e0bd1e20d8e003f8106, but that
actually dealt with bugs #1-3, bug #4 having been fixed in mainline at
that point.

That commit fixes the most important current bug #6.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@kernel.org [2.6.35 only]
---
Compile-tested only.

Ben.

 drivers/net/r6040.c |   22 ++++++++++++----------
 1 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 142c381..80666f0 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -893,16 +893,18 @@ static void r6040_multicast_list(struct net_device *dev)
 	/* Multicast Address 1~4 case */
 	i = 0;
 	netdev_for_each_mc_addr(ha, dev) {
-		if (i < MCAST_MAX) {
-			adrp = (u16 *) ha->addr;
-			iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
-			iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
-			iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
-		} else {
-			iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
-			iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
-		}
+		if (i >= MCAST_MAX)
+			break;
+		adrp = (u16 *) ha->addr;
+		iowrite16(adrp[0], ioaddr + MID_1L + 8 * i);
+		iowrite16(adrp[1], ioaddr + MID_1M + 8 * i);
+		iowrite16(adrp[2], ioaddr + MID_1H + 8 * i);
+		i++;
+	}
+	while (i < MCAST_MAX) {
+		iowrite16(0xffff, ioaddr + MID_1L + 8 * i);
+		iowrite16(0xffff, ioaddr + MID_1M + 8 * i);
+		iowrite16(0xffff, ioaddr + MID_1H + 8 * i);
 		i++;
 	}
 }
-- 
1.7.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox