Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 1/1] ARC VMAC ethernet driver.
From: Eric Dumazet @ 2011-02-17 10:13 UTC (permalink / raw)
  To: Andreas Fenkart; +Cc: netdev
In-Reply-To: <1297935091-15504-1-git-send-email-afenkart@gmail.com>

Le jeudi 17 février 2011 à 10:31 +0100, Andreas Fenkart a écrit :
> Signed-off-by: Andreas Fenkart <afenkart@gmail.com>
> ---
>  drivers/net/Kconfig   |   10 +
>  drivers/net/Makefile  |    1 +
>  drivers/net/arcvmac.c | 1494 +++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/net/arcvmac.h |  265 +++++++++
>  4 files changed, 1770 insertions(+), 0 deletions(-)
> 



> +/* merge buffer chaining  */
> +struct sk_buff *vmac_merge_rx_buffers_unlocked(struct net_device *dev,
> +		struct vmac_buffer_desc *after,
> +		int pkt_len) /* data */
> +{
> +	struct vmac_priv *ap = netdev_priv(dev);
> +	struct sk_buff *merge_skb, *cur_skb;
> +	struct dma_fifo *rx_ring;
> +	struct vmac_buffer_desc *desc;
> +
> +	/* locking: same as vmac_rx_receive */
> +
> +	rx_ring = &ap->rx_ring;
> +	desc = &ap->rxbd[rx_ring->tail];
> +
> +	WARN_ON(desc == after);
> +
> +	/* strip FCS */
> +	pkt_len -= 4;
> +
> +	merge_skb = netdev_alloc_skb_ip_align(dev, pkt_len + NET_IP_ALIGN);


You can remove the "+ NET_IP_ALIGN", its already done in
netdev_alloc_skb_ip_align()

Also, it seems strange you want to build one big SKB (no frag), while
this NIC is able to feed multiple frags.

(Change to get a SKB to hold a 9000 bytes frame is very very low if your
memory gets fragmented)

> +	if (!merge_skb) {
> +		dev_err(&ap->pdev->dev, "failed to allocate merged rx_skb, rx skb's left %d\n",
> +				fifo_used(rx_ring));
> +
> +		return NULL;
> +	}
> +
> +	while (desc != after && pkt_len) {
> +		struct vmac_buffer_desc *desc;
> +		int buf_len, valid;
> +
> +		/* desc needs wrapping */
> +		desc = &ap->rxbd[rx_ring->tail];
> +		cur_skb = ap->rx_skbuff[rx_ring->tail];
> +		WARN_ON(!cur_skb);
> +
> +		dma_unmap_single(&ap->pdev->dev, desc->data, ap->rx_skb_size,
> +				DMA_FROM_DEVICE);
> +
> +		/* do not copy FCS */
> +		buf_len = le32_to_cpu(desc->info) & BD_LEN;
> +		valid = min(pkt_len, buf_len);
> +		pkt_len -= valid;
> +
> +		memcpy(skb_put(merge_skb, valid), cur_skb->data, valid);
> +
> +		fifo_inc_tail(rx_ring);
> +	}
> +
> +	/* merging_pressure++ */
> +
> +	if (unlikely(pkt_len != 0))
> +		dev_err(&ap->pdev->dev, "buffer chaining bytes missing %d\n",
> +				pkt_len);
> +
> +	WARN_ON(desc != after);
> +
> +	return merge_skb;
> +}
> +

> +
> +int vmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct vmac_priv *ap = netdev_priv(dev);
> +	struct vmac_buffer_desc *desc;
> +	unsigned long flags;
> +	unsigned int tmp;
> +
> +	/* running under xmit lock */
> +	/* locking: modifies tx_ring head, tx_reclaim only tail */
> +
> +	/* no scatter/gatter see features below */
> +	WARN_ON(skb_shinfo(skb)->nr_frags != 0);
> +	WARN_ON(skb->len > MAX_TX_BUFFER_LEN);
> +
> +	if (unlikely(fifo_full(&ap->tx_ring))) {
> +		netif_stop_queue(dev);
> +		vmac_toggle_txint(dev, 1);
> +		dev_err(&ap->pdev->dev, "xmit called with no tx desc available\n");
> +		return NETDEV_TX_BUSY;
> +	}
> +
> +	if (unlikely(skb->len < ETH_ZLEN)) {
> +		struct sk_buff *short_skb;
> +		short_skb = netdev_alloc_skb_ip_align(dev, ETH_ZLEN);

I guess you dont really need the _ip_align() version here

> +		if (!short_skb)
> +			return NETDEV_TX_LOCKED;
> +
> +		memset(short_skb->data, 0, ETH_ZLEN);
> +		memcpy(skb_put(short_skb, ETH_ZLEN), skb->data, skb->len);
> +		dev_kfree_skb(skb);
> +		skb = short_skb;
> +	}
> +
> +	/* fill descriptor */
> +	ap->tx_skbuff[ap->tx_ring.head] = skb;
> +	desc = &ap->txbd[ap->tx_ring.head];
> +	WARN_ON(desc->info & cpu_to_le32(BD_DMA_OWN));
> +
> +	desc->data = dma_map_single(&ap->pdev->dev, skb->data, skb->len,
> +			DMA_TO_DEVICE);
> +
> +	/* dma might already be polling */
> +	wmb();
> +	desc->info = cpu_to_le32(BD_DMA_OWN | BD_FRST | BD_LAST | skb->len);
> +	wmb();

Not sure you need this wmb();

> +
> +	/* lock device data */
> +	spin_lock_irqsave(&ap->lock, flags);
> +
> +	/* kick tx dma */
> +	tmp = vmac_readl(ap, STAT);
> +	vmac_writel(ap, tmp | TXPL_MASK, STAT);
> +
> +	dev->stats.tx_packets++;
> +	dev->stats.tx_bytes += skb->len;
> +	dev->trans_start = jiffies;

trans_start doesnt need to be set anymore in drivers.

> +	fifo_inc_head(&ap->tx_ring);
> +
> +	/* vmac_tx_reclaim outside of vmac_tx_timeout */
> +	if (fifo_used(&ap->tx_ring) > 8)
> +		vmac_tx_reclaim_unlocked(dev, 0);
> +
> +	/* unlock device data */
> +	spin_unlock_irqrestore(&ap->lock, flags);
> +
> +	/* stop queue if no more desc available */
> +	if (fifo_full(&ap->tx_ring)) {
> +		netif_stop_queue(dev);
> +		vmac_toggle_txint(dev, 1);
> +	}
> +
> +	return NETDEV_TX_OK;
> +}
> +


> +static void create_multicast_filter(struct net_device *dev,
> +	unsigned long *bitmask)
> +{
> +	unsigned long crc;
> +	char *addrs;
> +
> +	/* locking: done by net_device */
> +
> +	WARN_ON(netdev_mc_count(dev) == 0);
> +	WARN_ON(dev->flags & IFF_ALLMULTI);
> +
> +	bitmask[0] = bitmask[1] = 0;
> +
> +	{
> +		struct netdev_hw_addr *ha;
> +		netdev_for_each_mc_addr(ha, dev) {
> +			addrs = ha->addr;
> +
> +			/* skip non-multicast addresses */
> +			if (!(*addrs & 1))
> +				continue;
> +
> +			crc = ether_crc_le(ETH_ALEN, addrs);
> +			set_bit(crc >> 26, bitmask);

I am wondering if it works on 64bit arches ;)

> +		}
> +	}
> +}
> +

> +static struct ethtool_ops vmac_ethtool_ops = {

please add const qualifier

static const struct ethtool_ops vmac_ethtool_ops = {


> +	.get_settings		= vmacether_get_settings,
> +	.set_settings		= vmacether_set_settings,
> +	.get_drvinfo		= vmacether_get_drvinfo,
> +	.get_link		= ethtool_op_get_link,
> +};
> +

> +static int __devinit vmac_probe(struct platform_device *pdev)
> +{
> +	struct net_device *dev;
> +	struct vmac_priv *ap;
> +	struct resource *mem;
> +	int err;
> +
> +	/* locking: no concurrency */
> +
> +	if (dma_get_mask(&pdev->dev) > DMA_BIT_MASK(32) ||
> +			pdev->dev.coherent_dma_mask > DMA_BIT_MASK(32)) {
> +		dev_err(&pdev->dev, "arcvmac supports only 32-bit DMA addresses\n");
> +		return -ENODEV;
> +	}
> +
> +	dev = alloc_etherdev(sizeof(*ap));
> +	if (!dev) {
> +		dev_err(&pdev->dev, "etherdev alloc failed, aborting.\n");
> +		return -ENOMEM;
> +	}
> +
> +	ap = netdev_priv(dev);
> +
> +	err = -ENODEV;
> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	if (!mem) {
> +		dev_err(&pdev->dev, "no mmio resource defined\n");
> +		goto err_out;
> +	}
> +	ap->mem = mem;
> +
> +	err = platform_get_irq(pdev, 0);
> +	if (err < 0) {
> +		dev_err(&pdev->dev, "no irq found\n");
> +		goto err_out;
> +	}
> +	dev->irq = err;
> +
> +	spin_lock_init(&ap->lock);
> +
> +	SET_NETDEV_DEV(dev, &pdev->dev);
> +	ap->dev = dev;
> +	ap->pdev = pdev;
> +
> +	/* init rx timeout (used for oom) */
> +	init_timer(&ap->refill_timer);
> +	ap->refill_timer.function = vmac_refill_rx_timer;
> +	ap->refill_timer.data = (unsigned long)dev;
> +	spin_lock_init(&ap->refill_lock);
> +
> +	netif_napi_add(dev, &ap->napi, vmac_poll, 2);

2 ?

You have 16 skb in RX ring, please use 16 (or 64)

> +	dev->netdev_ops = &vmac_netdev_ops;
> +	dev->ethtool_ops = &vmac_ethtool_ops;
> +
> +	dev->flags |= IFF_MULTICAST;
> +
> +	dev->base_addr = (unsigned long)ap->regs; /* TODO */
> +
> +	/* prevent buffer chaining, favor speed over space */
> +	ap->rx_skb_size = ETH_FRAME_LEN + VMAC_BUFFER_PAD;
> +
> +	/* private struct functional */
> +
> +	/* temporarily map registers to fetch mac addr */
> +	err = get_register_map(ap);
> +	if (err)
> +		goto err_out;
> +
> +	/* mac address intialize, set vmac_open  */
> +	read_mac_reg(dev, dev->dev_addr); /* TODO */
> +
> +	if (!is_valid_ether_addr(dev->dev_addr))
> +		random_ether_addr(dev->dev_addr);
> +
> +	err = register_netdev(dev);
> +	if (err) {
> +		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
> +		goto err_out;
> +	}
> +
> +	/* release the memory region, till open is called */
> +	put_register_map(ap);
> +
> +	dev_info(&pdev->dev, "ARC VMAC at 0x%pP irq %d %pM\n", &mem->start,
> +	    dev->irq, dev->dev_addr);
> +	platform_set_drvdata(pdev, ap);
> +
> +	return 0;
> +
> +err_out:
> +	free_netdev(dev);
> +	return err;
> +}
> +

> diff --git a/drivers/net/arcvmac.h b/drivers/net/arcvmac.h
> new file mode 100644
> index 0000000..ee570a5
> --- /dev/null
> +++ b/drivers/net/arcvmac.h
> @@ -0,0 +1,265 @@
> +/*
> + * ARC VMAC Driver
> + *
> + * Copyright (C) 2003-2006 Codito Technologies, for linux-2.4 port
> + * Copyright (C) 2006-2007 Celunite Inc, for linux-2.6 port
> + * Copyright (C) 2007-2008 Sagem Communications, Fehmi HAFSI
> + * Copyright (C) 2009-2011 Sagem Communications, Andreas Fenkart
> + * All Rights Reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#ifndef _ARCVMAC_H
> +#define _ARCVMAC_H
> +
> +#define DRV_NAME		"arcvmac"
> +#define DRV_VERSION		"1.0"
> +
> +/* Buffer descriptors */
> +#define TX_BDT_LEN		16    /* Number of receive BD's */

16 is a bit small. Is it a hardware limitation or user choice ?

> +#define RX_BDT_LEN		256   /* Number of transmit BD's */

If hardware permits it, I suggest using 128 RX and 128 TX descs

> +
> +/* BD poll rate, in 1024 cycles. @100Mhz: x * 1024 cy * 10ns = 1ms */
> +#define POLLRATE_TIME		200
> +
> +/* next power of two, bigger than ETH_FRAME_LEN + VLAN  */
> +#define MAX_RX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
> +#define MAX_TX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
> +
> +/* 14 bytes of ethernet header, 4 bytes VLAN, FCS,
> + * plus extra pad to prevent buffer chaining of
> + * maximum sized ethernet packets (1514 bytes) */
> +#define	VMAC_BUFFER_PAD		(ETH_HLEN + 4 + ETH_FCS_LEN + 4)
> +
> +/* VMAC register definitions, offsets in bytes */
> +#define VMAC_ID			0x00
> +
> +/* stat/enable use same bit mask */
> +#define VMAC_STAT		0x04
> +#define VMAC_ENABLE		0x08
> +#  define TXINT_MASK		0x00000001 /* Transmit interrupt */
> +#  define RXINT_MASK		0x00000002 /* Receive interrupt */
> +#  define ERR_MASK		0x00000004 /* Error interrupt */
> +#  define TXCH_MASK		0x00000008 /* Transmit chaining error */
> +#  define MSER_MASK		0x00000010 /* Missed packet counter error */
> +#  define RXCR_MASK		0x00000100 /* RXCRCERR counter rolled over	 */
> +#  define RXFR_MASK		0x00000200 /* RXFRAMEERR counter rolled over */
> +#  define RXFL_MASK		0x00000400 /* RXOFLOWERR counter rolled over */
> +#  define MDIO_MASK		0x00001000 /* MDIO complete */
> +#  define TXPL_MASK		0x80000000 /* TXPOLL */
> +
> +#define VMAC_CONTROL		0x0c
> +#  define EN_MASK		0x00000001 /* VMAC enable */
> +#  define TXRN_MASK		0x00000008 /* TX enable */
> +#  define RXRN_MASK		0x00000010 /* RX enable */
> +#  define DSBC_MASK		0x00000100 /* Disable receive broadcast */
> +#  define ENFL_MASK		0x00000400 /* Enable Full Duplex */
> +#  define PROM_MASK		0x00000800 /* Promiscuous mode */
> +
> +#define VMAC_POLLRATE		0x10
> +
> +#define VMAC_RXERR		0x14
> +#  define RXERR_CRC		0x000000ff
> +#  define RXERR_FRM		0x0000ff00
> +#  define RXERR_OFLO		0x00ff0000 /* fifo overflow */
> +
> +#define VMAC_MISS		0x18
> +#define VMAC_TXRINGPTR		0x1c
> +#define VMAC_RXRINGPTR		0x20
> +#define VMAC_ADDRL		0x24
> +#define VMAC_ADDRH		0x28
> +#define VMAC_LAFL		0x2c
> +#define VMAC_LAFH		0x30
> +#define VMAC_MAC_TXRING_HEAD	0x38
> +#define VMAC_MAC_RXRING_HEAD	0x3C
> +
> +#define VMAC_MDIO_DATA		0x34
> +#  define MDIO_SFD		0xC0000000
> +#  define MDIO_OP		0x30000000
> +#  define MDIO_ID_MASK		0x0F800000
> +#  define MDIO_REG_MASK		0x007C0000
> +#  define MDIO_TA		0x00030000
> +#  define MDIO_DATA_MASK	0x0000FFFF
> +/* common combinations */
> +#  define MDIO_BASE		0x40020000
> +#  define MDIO_OP_READ		0x20000000
> +#  define MDIO_OP_WRITE		0x10000000
> +
> +/* Buffer descriptor INFO bit masks */
> +#define BD_DMA_OWN		0x80000000 /* buffer ownership, 0 CPU, 1 DMA */
> +#define BD_BUFF			0x40000000 /* buffer invalid, rx */
> +#define BD_UFLO			0x20000000 /* underflow, tx */
> +#define BD_LTCL			0x10000000 /* late collision, tx  */
> +#define BD_RETRY_CT		0x0f000000 /* tx */
> +#define BD_DROP			0x00800000 /* drop, more than 16 retries, tx */
> +#define BD_DEFER		0x00400000 /* traffic on the wire, tx */
> +#define BD_CARLOSS		0x00200000 /* carrier loss while transmission, tx, rx? */
> +/* 20:19 reserved */
> +#define BD_ADCR			0x00040000 /* add crc, ignored if not disaddcrc */
> +#define BD_LAST			0x00020000 /* Last buffer in chain */
> +#define BD_FRST			0x00010000 /* First buffer in chain */
> +/* 15:11 reserved */
> +#define BD_LEN			0x000007FF
> +
> +/* common combinations */
> +#define BD_TX_ERR		(BD_UFLO | BD_LTCL | BD_RETRY_CT | BD_DROP | \
> +		BD_DEFER | BD_CARLOSS)
> +
> +
> +/* arcvmac private data structures */
> +struct vmac_buffer_desc {
> +	__le32 info;
> +	__le32 data;
> +};
> +
> +struct dma_fifo {
> +	int head; /* head */
> +	int tail; /* tail */
> +	int size;
> +};
> +
> +struct	vmac_priv {
> +	struct net_device *dev;
> +	struct platform_device *pdev;
> +
> +	struct completion mdio_complete;
> +	spinlock_t lock; /* protects structure plus hw regs of device */
> +
> +	/* base address of register set */
> +	char *regs;
> +	struct resource *mem;
> +
> +	/* DMA ring buffers */
> +	struct vmac_buffer_desc *rxbd;
> +	dma_addr_t rxbd_dma;
> +
> +	struct vmac_buffer_desc *txbd;
> +	dma_addr_t txbd_dma;
> +
> +	/* socket buffers */
> +	struct sk_buff *rx_skbuff[RX_BDT_LEN];
> +	struct sk_buff *tx_skbuff[TX_BDT_LEN];
> +	int rx_skb_size;
> +
> +	/* skb / dma desc managing */
> +	struct dma_fifo rx_ring; /* valid rx buffers */
> +	struct dma_fifo tx_ring;
> +
> +	/* descriptor last polled/processed by the VMAC */
> +	unsigned long dma_rx_head;
> +
> +	/* timer to retry rx skb allocation, if failed during receive */
> +	struct timer_list refill_timer;
> +	spinlock_t refill_lock;
> +
> +	struct napi_struct napi;
> +
> +	/* rx buffer chaining */
> +	int rx_merge_error;
> +	int tx_timeout_error;
> +
> +	/* PHY stuff */
> +	struct mii_bus *mii_bus;
> +	struct phy_device *phy_dev;
> +
> +	int link;
> +	int speed;
> +	int duplex;
> +
> +	/* debug */
> +	int shutdown;
> +};
> +
> +/* DMA ring management */
> +
> +/* for a fifo with size n,
> + * - [0..n] fill levels are n + 1 states
> + * - there are only n different deltas (head - tail) values
> + * => not all fill levels can be represented with head, tail
> + *    pointers only
> + * we give up the n fill level, aka fifo full */
> +
> +/* sacrifice one elt as a sentinel */
> +static inline int fifo_used(struct dma_fifo *f);
> +static inline int fifo_inc_ct(int ct, int size);
> +static inline void fifo_dump(struct dma_fifo *fifo);
> +
> +static inline int fifo_empty(struct dma_fifo *f)
> +{
> +	return (f->head == f->tail);

return f->head == f->tail;

> +}
> +
> +static inline int fifo_free(struct dma_fifo *f)
> +{
> +	int free;
> +
> +	free = f->tail - f->head;
> +	if (free <= 0)
> +		free += f->size;
> +
> +	return free;
> +}
> +
> +static inline int fifo_used(struct dma_fifo *f)
> +{
> +	int used;
> +
> +	used = f->head - f->tail;
> +	if (used < 0)
> +		used += f->size;
> +
> +	return used;
> +}
> +
> +static inline int fifo_full(struct dma_fifo *f)
> +{
> +	return (fifo_used(f) + 1) == f->size;
> +}
> +
> +/* manipulate */
> +static inline void fifo_init(struct dma_fifo *fifo, int size)
> +{
> +	fifo->size = size;
> +	fifo->head = fifo->tail = 0; /* empty */
> +}
> +
> +static inline void fifo_inc_head(struct dma_fifo *fifo)
> +{
> +	BUG_ON(fifo_full(fifo));
> +	fifo->head = fifo_inc_ct(fifo->head, fifo->size);
> +}
> +
> +static inline void fifo_inc_tail(struct dma_fifo *fifo)
> +{
> +	BUG_ON(fifo_empty(fifo));
> +	fifo->tail = fifo_inc_ct(fifo->tail, fifo->size);
> +}
> +
> +/* internal funcs */
> +static inline void fifo_dump(struct dma_fifo *fifo)
> +{
> +	printk(KERN_INFO "fifo: head %d, tail %d, size %d\n", fifo->head,
> +			fifo->tail,
> +			fifo->size);

	pr_info() is preferred in new code
> +}
> +
> +static inline int fifo_inc_ct(int ct, int size)
> +{
> +	return (++ct == size) ? 0 : ct;
> +}
> +
> +#endif	  /* _ARCVMAC_H */



^ permalink raw reply

* Re: mac addresses of local interfaces do not obey setageing 0
From: David Lamparter @ 2011-02-17  9:56 UTC (permalink / raw)
  To: Veaceslav Falico; +Cc: Stephen Hemminger, netdev, bridge
In-Reply-To: <20110215102538.5f24d62a@nehalam>

[-- Attachment #1: Type: text/plain, Size: 1663 bytes --]

On Tue, Feb 15, 2011 at 10:25:38AM -0800, Stephen Hemminger wrote:
> On Wed, 9 Feb 2011 19:17:52 +0100
> Veaceslav Falico <vfalico@redhat.com> wrote:
> 
> > Hello,
> > 
> > I have a host and a VM inside this host bridged. I've set ageing_time and
> > forward_delay to 0 and trying to capture all the traffic that goes through that
> > bridge from my VM, but it fails to capture the traffic that has dst ether
> > address the same as the hosts address (i.e. I can't capture the traffic to the
> > host).
> > 
> > From the code, I see that br->ageing_time doesn't really work with local mac
> > addresses - has_expired() function never says that a local interface mac address
> > is expired, because it verifies if fdb->is_static is set and returns right away.
> > 
> > Is this the desired behaviour? If so, is there a way to capture packets with
> > destination to a local interface from another interface?
> > 
> > I've also done a small patch and it seems to fix the situation, but I am not
> > sure if it's the right way to do it.
> 
> No.
> Local addresses should never age.
> 
> The proper way to capture packet is to us AF_PACKET or tc actions.

If you really want to use a bridge for this kind of traffic capturing,
you can attach one side of a veth pair as third interface to the bridge,
clear off all the mac addresses (and have no higher-layer addresses on
the bridge either) and use the other side of the veth pair for your
host. Since the "far" side of the veth is not on the bridge, the bridge
will not treat its address any special.

(This is somewhat of a funny hack though. Use AF_PACKET really...)


-David


[-- Attachment #2: Type: application/pgp-signature, Size: 198 bytes --]

^ permalink raw reply

* [net-2.6 4/4] ixgbe: work around for DDP last buffer size
From: Jeff Kirsher @ 2011-02-17  9:35 UTC (permalink / raw)
  To: davem; +Cc: Amir Hanania, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1297935310-23510-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Amir Hanania <amir.hanania@intel.com>

A HW limitation was recently discovered where the last buffer in a DDP offload
cannot be a full buffer size in length. Fix the issue with a work around by
adding another buffer with size = 1.

Signed-off-by: Amir Hanania <amir.hanania@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe_fcoe.c |   51 +++++++++++++++++++++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_fcoe.h |    2 +
 2 files changed, 52 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_fcoe.c b/drivers/net/ixgbe/ixgbe_fcoe.c
index 8753980..c54a882 100644
--- a/drivers/net/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ixgbe/ixgbe_fcoe.c
@@ -159,7 +159,7 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
 	struct scatterlist *sg;
 	unsigned int i, j, dmacount;
 	unsigned int len;
-	static const unsigned int bufflen = 4096;
+	static const unsigned int bufflen = IXGBE_FCBUFF_MIN;
 	unsigned int firstoff = 0;
 	unsigned int lastsize;
 	unsigned int thisoff = 0;
@@ -254,6 +254,24 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
 	/* only the last buffer may have non-full bufflen */
 	lastsize = thisoff + thislen;
 
+	/*
+	 * lastsize can not be buffer len.
+	 * If it is then adding another buffer with lastsize = 1.
+	 */
+	if (lastsize == bufflen) {
+		if (j >= IXGBE_BUFFCNT_MAX) {
+			e_err(drv, "xid=%x:%d,%d,%d:addr=%llx "
+				"not enough user buffers. We need an extra "
+				"buffer because lastsize is bufflen.\n",
+				xid, i, j, dmacount, (u64)addr);
+			goto out_noddp_free;
+		}
+
+		ddp->udl[j] = (u64)(fcoe->extra_ddp_buffer_dma);
+		j++;
+		lastsize = 1;
+	}
+
 	fcbuff = (IXGBE_FCBUFF_4KB << IXGBE_FCBUFF_BUFFSIZE_SHIFT);
 	fcbuff |= ((j & 0xff) << IXGBE_FCBUFF_BUFFCNT_SHIFT);
 	fcbuff |= (firstoff << IXGBE_FCBUFF_OFFSET_SHIFT);
@@ -532,6 +550,24 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
 			e_err(drv, "failed to allocated FCoE DDP pool\n");
 
 		spin_lock_init(&fcoe->lock);
+
+		/* Extra buffer to be shared by all DDPs for HW work around */
+		fcoe->extra_ddp_buffer = kmalloc(IXGBE_FCBUFF_MIN, GFP_ATOMIC);
+		if (fcoe->extra_ddp_buffer == NULL) {
+			e_err(drv, "failed to allocated extra DDP buffer\n");
+			goto out_extra_ddp_buffer_alloc;
+		}
+
+		fcoe->extra_ddp_buffer_dma =
+			dma_map_single(&adapter->pdev->dev,
+				       fcoe->extra_ddp_buffer,
+				       IXGBE_FCBUFF_MIN,
+				       DMA_FROM_DEVICE);
+		if (dma_mapping_error(&adapter->pdev->dev,
+				      fcoe->extra_ddp_buffer_dma)) {
+			e_err(drv, "failed to map extra DDP buffer\n");
+			goto out_extra_ddp_buffer_dma;
+		}
 	}
 
 	/* Enable L2 eth type filter for FCoE */
@@ -581,6 +617,14 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
 		}
 	}
 #endif
+
+	return;
+
+out_extra_ddp_buffer_dma:
+	kfree(fcoe->extra_ddp_buffer);
+out_extra_ddp_buffer_alloc:
+	pci_pool_destroy(fcoe->pool);
+	fcoe->pool = NULL;
 }
 
 /**
@@ -600,6 +644,11 @@ void ixgbe_cleanup_fcoe(struct ixgbe_adapter *adapter)
 	if (fcoe->pool) {
 		for (i = 0; i < IXGBE_FCOE_DDP_MAX; i++)
 			ixgbe_fcoe_ddp_put(adapter->netdev, i);
+		dma_unmap_single(&adapter->pdev->dev,
+				 fcoe->extra_ddp_buffer_dma,
+				 IXGBE_FCBUFF_MIN,
+				 DMA_FROM_DEVICE);
+		kfree(fcoe->extra_ddp_buffer);
 		pci_pool_destroy(fcoe->pool);
 		fcoe->pool = NULL;
 	}
diff --git a/drivers/net/ixgbe/ixgbe_fcoe.h b/drivers/net/ixgbe/ixgbe_fcoe.h
index 4bc2c55..65cc8fb 100644
--- a/drivers/net/ixgbe/ixgbe_fcoe.h
+++ b/drivers/net/ixgbe/ixgbe_fcoe.h
@@ -70,6 +70,8 @@ struct ixgbe_fcoe {
 	spinlock_t lock;
 	struct pci_pool *pool;
 	struct ixgbe_fcoe_ddp ddp[IXGBE_FCOE_DDP_MAX];
+	unsigned char *extra_ddp_buffer;
+	dma_addr_t extra_ddp_buffer_dma;
 };
 
 #endif /* _IXGBE_FCOE_H */
-- 
1.7.4


^ permalink raw reply related

* [net-2.6 2/4] e1000e: flush all writebacks before unload
From: Jeff Kirsher @ 2011-02-17  9:35 UTC (permalink / raw)
  To: davem; +Cc: Jesse Brandeburg, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1297935310-23510-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

The driver was not flushing all writebacks before unloading, possibly
causing memory to be written by the hardware after the driver had
reinitialized the rings.

This adds missing functionality to flush any pending writebacks and is
called in all spots where descriptors should be completed before the driver
begins processing.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Reviewed-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/netdev.c |   31 +++++++++++++++++++++----------
 1 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 174633c..3fa110d 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -3344,6 +3344,21 @@ int e1000e_up(struct e1000_adapter *adapter)
 	return 0;
 }
 
+static void e1000e_flush_descriptors(struct e1000_adapter *adapter)
+{
+	struct e1000_hw *hw = &adapter->hw;
+
+	if (!(adapter->flags2 & FLAG2_DMA_BURST))
+		return;
+
+	/* flush pending descriptor writebacks to memory */
+	ew32(TIDV, adapter->tx_int_delay | E1000_TIDV_FPD);
+	ew32(RDTR, adapter->rx_int_delay | E1000_RDTR_FPD);
+
+	/* execute the writes immediately */
+	e1e_flush();
+}
+
 void e1000e_down(struct e1000_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
@@ -3383,6 +3398,9 @@ void e1000e_down(struct e1000_adapter *adapter)
 
 	if (!pci_channel_offline(adapter->pdev))
 		e1000e_reset(adapter);
+
+	e1000e_flush_descriptors(adapter);
+
 	e1000_clean_tx_ring(adapter);
 	e1000_clean_rx_ring(adapter);
 
@@ -4354,19 +4372,12 @@ link_up:
 	else
 		ew32(ICS, E1000_ICS_RXDMT0);
 
+	/* flush pending descriptors to memory before detecting Tx hang */
+	e1000e_flush_descriptors(adapter);
+
 	/* Force detection of hung controller every watchdog period */
 	adapter->detect_tx_hung = 1;
 
-	/* flush partial descriptors to memory before detecting Tx hang */
-	if (adapter->flags2 & FLAG2_DMA_BURST) {
-		ew32(TIDV, adapter->tx_int_delay | E1000_TIDV_FPD);
-		ew32(RDTR, adapter->rx_int_delay | E1000_RDTR_FPD);
-		/*
-		 * no need to flush the writes because the timeout code does
-		 * an er32 first thing
-		 */
-	}
-
 	/*
 	 * With 82571 controllers, LAA may be overwritten due to controller
 	 * reset from the other port. Set the appropriate LAA in RAR[0]
-- 
1.7.4


^ permalink raw reply related

* [net-2.6 3/4] ixgbe: fix panic due to uninitialised pointer
From: Jeff Kirsher @ 2011-02-17  9:35 UTC (permalink / raw)
  To: davem; +Cc: Andy Gospodarek, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1297935310-23510-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Andy Gospodarek <andy@greyhouse.net>

Systems containing an 82599EB and running a backported driver from
upstream were panicing on boot.  It turns out hw->mac.ops.setup_sfp is
only set for 82599, so one should check to be sure that pointer is set
before continuing in ixgbe_sfp_config_module_task.  I verified by
inspection that the upstream driver has the same issue and also added a
check before the call in ixgbe_sfp_link_config.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe_main.c |    6 ++++--
 1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index fbae703..30f9ccf 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -3728,7 +3728,8 @@ static void ixgbe_sfp_link_config(struct ixgbe_adapter *adapter)
 			 * We need to try and force an autonegotiation
 			 * session, then bring up link.
 			 */
-			hw->mac.ops.setup_sfp(hw);
+			if (hw->mac.ops.setup_sfp)
+				hw->mac.ops.setup_sfp(hw);
 			if (!(adapter->flags & IXGBE_FLAG_IN_SFP_LINK_TASK))
 				schedule_work(&adapter->multispeed_fiber_task);
 		} else {
@@ -5968,7 +5969,8 @@ static void ixgbe_sfp_config_module_task(struct work_struct *work)
 		unregister_netdev(adapter->netdev);
 		return;
 	}
-	hw->mac.ops.setup_sfp(hw);
+	if (hw->mac.ops.setup_sfp)
+		hw->mac.ops.setup_sfp(hw);
 
 	if (!(adapter->flags & IXGBE_FLAG_IN_SFP_LINK_TASK))
 		/* This will also work for DA Twinax connections */
-- 
1.7.4


^ permalink raw reply related

* [net-2.6 0/4][pull request] Intel Wired LAN Driver Updates
From: Jeff Kirsher @ 2011-02-17  9:35 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, bphilips

The following series contains two fixes for e1000e and ixgbe.

The following are changes since commit 6d90e8f45697c633f522269368297d7416fd8783:
  isdn: hisax: Use l2headersize() instead of dup (and buggy) func.

and are available in the git repository at:
  master.kernel.org:/pub/scm/linux/kernel/git/jkirsher/net-2.6 master

Amir Hanania (1):
  ixgbe: work around for DDP last buffer size

Andy Gospodarek (1):
  ixgbe: fix panic due to uninitialised pointer

Jesse Brandeburg (2):
  e1000e: check down flag in tasks
  e1000e: flush all writebacks before unload

 drivers/net/e1000e/netdev.c    |   52 ++++++++++++++++++++++++++++++++-------
 drivers/net/ixgbe/ixgbe_fcoe.c |   51 ++++++++++++++++++++++++++++++++++++++-
 drivers/net/ixgbe/ixgbe_fcoe.h |    2 +
 drivers/net/ixgbe/ixgbe_main.c |    6 +++-
 4 files changed, 98 insertions(+), 13 deletions(-)

-- 
1.7.4


^ permalink raw reply

* [net-2.6 1/4] e1000e: check down flag in tasks
From: Jeff Kirsher @ 2011-02-17  9:35 UTC (permalink / raw)
  To: davem; +Cc: Jesse Brandeburg, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1297935310-23510-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jesse Brandeburg <jesse.brandeburg@intel.com>

This change is part of a fix to avoid any tasks running while the driver is
exiting and deinitializing resources.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000e/netdev.c |   21 +++++++++++++++++++++
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 3065870..174633c 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -937,6 +937,9 @@ static void e1000_print_hw_hang(struct work_struct *work)
 	u16 phy_status, phy_1000t_status, phy_ext_status;
 	u16 pci_status;
 
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	e1e_rphy(hw, PHY_STATUS, &phy_status);
 	e1e_rphy(hw, PHY_1000T_STATUS, &phy_1000t_status);
 	e1e_rphy(hw, PHY_EXT_STATUS, &phy_ext_status);
@@ -1506,6 +1509,9 @@ static void e1000e_downshift_workaround(struct work_struct *work)
 	struct e1000_adapter *adapter = container_of(work,
 					struct e1000_adapter, downshift_task);
 
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	e1000e_gig_downshift_workaround_ich8lan(&adapter->hw);
 }
 
@@ -3765,6 +3771,10 @@ static void e1000e_update_phy_task(struct work_struct *work)
 {
 	struct e1000_adapter *adapter = container_of(work,
 					struct e1000_adapter, update_phy_task);
+
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	e1000_get_phy_info(&adapter->hw);
 }
 
@@ -3775,6 +3785,10 @@ static void e1000e_update_phy_task(struct work_struct *work)
 static void e1000_update_phy_info(unsigned long data)
 {
 	struct e1000_adapter *adapter = (struct e1000_adapter *) data;
+
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	schedule_work(&adapter->update_phy_task);
 }
 
@@ -4149,6 +4163,9 @@ static void e1000_watchdog_task(struct work_struct *work)
 	u32 link, tctl;
 	int tx_pending = 0;
 
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	link = e1000e_has_link(adapter);
 	if ((netif_carrier_ok(netdev)) && link) {
 		/* Cancel scheduled suspend requests. */
@@ -4887,6 +4904,10 @@ static void e1000_reset_task(struct work_struct *work)
 	struct e1000_adapter *adapter;
 	adapter = container_of(work, struct e1000_adapter, reset_task);
 
+	/* don't run the task if already down */
+	if (test_bit(__E1000_DOWN, &adapter->state))
+		return;
+
 	if (!((adapter->flags & FLAG_RX_NEEDS_RESTART) &&
 	      (adapter->flags & FLAG_RX_RESTART_NOW))) {
 		e1000e_dump(adapter);
-- 
1.7.4


^ permalink raw reply related

* [PATCH 1/1] ARC VMAC ethernet driver.
From: Andreas Fenkart @ 2011-02-17  9:31 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, Andreas Fenkart
In-Reply-To: <AANLkTi=5XOmzzO=Bov4Rse_rhw_0uPnjPNN1DN9CLSXX@mail.gmail.com>

Signed-off-by: Andreas Fenkart <afenkart@gmail.com>
---
 drivers/net/Kconfig   |   10 +
 drivers/net/Makefile  |    1 +
 drivers/net/arcvmac.c | 1494 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/arcvmac.h |  265 +++++++++
 4 files changed, 1770 insertions(+), 0 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0382332..ab239da 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -251,6 +251,16 @@ config AX88796_93CX6
 	help
 	  Select this if your platform comes with an external 93CX6 eeprom.
 
+config ARCVMAC
+	tristate "ARC VMAC ethernet support"
+	depends on HAS_DMA
+	select MII
+	select PHYLIB
+	select CRC32
+
+	help
+	  MAC present on Zoran Quatro43XX
+
 config MACE
 	tristate "MACE (Power Mac ethernet) support"
 	depends on PPC_PMAC && PPC32
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index b90738d..059e253 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -140,6 +140,7 @@ obj-$(CONFIG_ULTRA32) += smc-ultra32.o 8390.o
 obj-$(CONFIG_E2100) += e2100.o 8390.o
 obj-$(CONFIG_ES3210) += es3210.o 8390.o
 obj-$(CONFIG_LNE390) += lne390.o 8390.o
+obj-$(CONFIG_ARCVMAC) += arcvmac.o
 obj-$(CONFIG_NE3210) += ne3210.o 8390.o
 obj-$(CONFIG_SB1250_MAC) += sb1250-mac.o
 obj-$(CONFIG_B44) += b44.o
diff --git a/drivers/net/arcvmac.c b/drivers/net/arcvmac.c
new file mode 100644
index 0000000..8f4f208
--- /dev/null
+++ b/drivers/net/arcvmac.c
@@ -0,0 +1,1494 @@
+/*
+ * ARC VMAC Driver
+ *
+ * Copyright (C) 2003-2006 Codito Technologies, for linux-2.4 port
+ * Copyright (C) 2006-2007 Celunite Inc, for linux-2.6 port
+ * Copyright (C) 2007-2008 Sagem Communications, Fehmi Hafsi
+ * Copyright (C) 2009-2011 Sagem Communications, Andreas Fenkart
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * external PHY support based on dnet.c
+ * ring management based on bcm63xx_enet.c
+ */
+
+#include <linux/clk.h>
+#include <linux/crc32.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/etherdevice.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include "arcvmac.h"
+
+/* Register access macros */
+#define vmac_writel(port, value, reg)	\
+	writel(cpu_to_le32(value), (port)->regs + VMAC_##reg)
+#define vmac_readl(port, reg)	le32_to_cpu(readl((port)->regs + VMAC_##reg))
+
+static int get_register_map(struct vmac_priv *ap);
+static int put_register_map(struct vmac_priv *ap);
+
+static unsigned char *read_mac_reg(struct net_device *dev,
+		unsigned char hwaddr[ETH_ALEN])
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned mac_lo, mac_hi;
+
+	WARN_ON(!hwaddr);
+	mac_lo = vmac_readl(ap, ADDRL);
+	mac_hi = vmac_readl(ap, ADDRH);
+
+	hwaddr[0] = (mac_lo >> 0) & 0xff;
+	hwaddr[1] = (mac_lo >> 8) & 0xff;
+	hwaddr[2] = (mac_lo >> 16) & 0xff;
+	hwaddr[3] = (mac_lo >> 24) & 0xff;
+	hwaddr[4] = (mac_hi >> 0) & 0xff;
+	hwaddr[5] = (mac_hi >> 8) & 0xff;
+	return hwaddr;
+}
+
+static void write_mac_reg(struct net_device *dev, unsigned char* hwaddr)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned mac_lo, mac_hi;
+
+	mac_lo = hwaddr[3] << 24 | hwaddr[2] << 16 | hwaddr[1] << 8 |
+		hwaddr[0];
+	mac_hi = hwaddr[5] << 8 | hwaddr[4];
+
+	vmac_writel(ap, mac_lo, ADDRL);
+	vmac_writel(ap, mac_hi, ADDRH);
+}
+
+static void vmac_mdio_xmit(struct vmac_priv *ap, unsigned val)
+{
+	init_completion(&ap->mdio_complete);
+	vmac_writel(ap, val, MDIO_DATA);
+	wait_for_completion(&ap->mdio_complete);
+}
+
+static int vmac_mdio_read(struct mii_bus *bus, int phy_id, int phy_reg)
+{
+	struct vmac_priv *vmac = bus->priv;
+	unsigned int val;
+
+	/* only 5 bits allowed for phy-addr and reg_offset */
+	WARN_ON(phy_id & ~0x1f || phy_reg & ~0x1f);
+
+	val = MDIO_BASE | MDIO_OP_READ;
+	val |= phy_id << 23 | phy_reg << 18;
+	vmac_mdio_xmit(vmac, val);
+
+	val = vmac_readl(vmac, MDIO_DATA);
+	return val & MDIO_DATA_MASK;
+}
+
+static int vmac_mdio_write(struct mii_bus *bus, int phy_id, int phy_reg,
+			 u16 value)
+{
+	struct vmac_priv *vmac = bus->priv;
+	unsigned int val;
+
+	/* only 5 bits allowed for phy-addr and reg_offset */
+	WARN_ON(phy_id & ~0x1f || phy_reg & ~0x1f);
+
+	val = MDIO_BASE | MDIO_OP_WRITE;
+	val |= phy_id << 23 | phy_reg << 18;
+	val |= (value & MDIO_DATA_MASK);
+	vmac_mdio_xmit(vmac, val);
+
+	return 0;
+}
+
+static void vmac_handle_link_change(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+	unsigned long flags;
+	int report_change = 0;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	if (phydev->duplex != ap->duplex) {
+		unsigned tmp;
+
+		tmp = vmac_readl(ap, ENABLE);
+
+		if (phydev->duplex)
+			tmp |= ENFL_MASK;
+		else
+			tmp &= ~ENFL_MASK;
+
+		vmac_writel(ap, tmp, ENABLE);
+
+		ap->duplex = phydev->duplex;
+		report_change = 1;
+	}
+
+	if (phydev->speed != ap->speed) {
+		ap->speed = phydev->speed;
+		report_change = 1;
+	}
+
+	if (phydev->link != ap->link) {
+		ap->link = phydev->link;
+		report_change = 1;
+	}
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	if (report_change)
+		phy_print_status(ap->phy_dev);
+}
+
+static int __devinit vmac_mii_probe(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = NULL;
+	struct clk *vmac_clk;
+	unsigned long clock_rate;
+	int phy_addr, err;
+
+	/* find the first phy */
+	for (phy_addr = 0; phy_addr < PHY_MAX_ADDR; phy_addr++) {
+		if (ap->mii_bus->phy_map[phy_addr]) {
+			phydev = ap->mii_bus->phy_map[phy_addr];
+			break;
+		}
+	}
+
+	if (!phydev) {
+		dev_err(&ap->pdev->dev, "no PHY found\n");
+		return -ENODEV;
+	}
+
+	/* FIXME: add pin_irq, if avail */
+
+	phydev = phy_connect(dev, dev_name(&phydev->dev),
+			&vmac_handle_link_change, 0,
+			PHY_INTERFACE_MODE_MII);
+
+	if (IS_ERR(phydev)) {
+		err = PTR_ERR(phydev);
+		dev_err(&ap->pdev->dev, "could not attach to PHY %d\n", err);
+		goto err_out;
+	}
+
+	phydev->supported &= PHY_BASIC_FEATURES;
+	phydev->supported |= SUPPORTED_Asym_Pause | SUPPORTED_Pause;
+
+	vmac_clk = clk_get(&ap->pdev->dev, "arcvmac");
+	if (IS_ERR(vmac_clk)) {
+		err = PTR_ERR(vmac_clk);
+		goto err_disconnect;
+	}
+
+	clock_rate = clk_get_rate(vmac_clk);
+	clk_put(vmac_clk);
+
+	dev_dbg(&ap->pdev->dev, "vmac_clk: %lu Hz\n", clock_rate);
+
+	if (clock_rate < 25000000)
+		phydev->supported &= ~(SUPPORTED_100baseT_Half |
+				SUPPORTED_100baseT_Full);
+
+	phydev->advertising = phydev->supported;
+
+	ap->link = 0;
+	ap->speed = 0;
+	ap->duplex = -1;
+	ap->phy_dev = phydev;
+
+	return 0;
+
+err_disconnect:
+	phy_disconnect(phydev);
+err_out:
+	return err;
+}
+
+static int __devinit vmac_mii_init(struct vmac_priv *ap)
+{
+	unsigned long flags;
+	int err, i;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	ap->mii_bus = mdiobus_alloc();
+	if (ap->mii_bus == NULL)
+		return -ENOMEM;
+
+	ap->mii_bus->name = "vmac_mii_bus";
+	ap->mii_bus->read = &vmac_mdio_read;
+	ap->mii_bus->write = &vmac_mdio_write;
+
+	snprintf(ap->mii_bus->id, MII_BUS_ID_SIZE, "%x", 0);
+
+	ap->mii_bus->priv = ap;
+
+	err = -ENOMEM;
+	ap->mii_bus->irq = kmalloc(sizeof(int) * PHY_MAX_ADDR, GFP_KERNEL);
+	if (!ap->mii_bus->irq)
+		goto err_out;
+
+	for (i = 0; i < PHY_MAX_ADDR; i++)
+		ap->mii_bus->irq[i] = PHY_POLL;
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	/* locking: mdio concurrency */
+
+	err = mdiobus_register(ap->mii_bus);
+	if (err)
+		goto err_out_free_mdio_irq;
+
+	err = vmac_mii_probe(ap->dev);
+	if (err)
+		goto err_out_unregister_bus;
+
+	return 0;
+
+err_out_unregister_bus:
+	mdiobus_unregister(ap->mii_bus);
+err_out_free_mdio_irq:
+	kfree(ap->mii_bus->irq);
+err_out:
+	mdiobus_free(ap->mii_bus);
+	return err;
+}
+
+static void vmac_mii_exit_unlocked(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	if (ap->phy_dev)
+		phy_disconnect(ap->phy_dev);
+
+	mdiobus_unregister(ap->mii_bus);
+	kfree(ap->mii_bus->irq);
+	mdiobus_free(ap->mii_bus);
+}
+
+static int vmacether_get_settings(struct net_device *dev,
+		struct ethtool_cmd *cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_gset(phydev, cmd);
+}
+
+static int vmacether_set_settings(struct net_device *dev,
+		struct ethtool_cmd *cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_ethtool_sset(phydev, cmd);
+}
+
+static int vmac_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev = ap->phy_dev;
+
+	if (!netif_running(dev))
+		return -EINVAL;
+
+	if (!phydev)
+		return -ENODEV;
+
+	return phy_mii_ioctl(phydev, rq, cmd);
+}
+
+static void vmacether_get_drvinfo(struct net_device *dev,
+		struct ethtool_drvinfo *info)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+	snprintf(info->bus_info, sizeof(info->bus_info),
+			"platform 0x%pP", &ap->mem->start);
+}
+
+static int update_error_counters_unlocked(struct net_device *dev, int status)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	dev_dbg(&ap->pdev->dev, "rx error counter overrun. status = 0x%x\n",
+			status);
+
+	/* programming error */
+	WARN_ON(status & TXCH_MASK);
+	WARN_ON(!(status & (MSER_MASK | RXCR_MASK | RXFR_MASK | RXFL_MASK)));
+
+	if (status & MSER_MASK)
+		dev->stats.rx_over_errors += 256; /* ran out of BD */
+	if (status & RXCR_MASK)
+		dev->stats.rx_crc_errors += 256;
+	if (status & RXFR_MASK)
+		dev->stats.rx_frame_errors += 256;
+	if (status & RXFL_MASK)
+		dev->stats.rx_fifo_errors += 256;
+
+	return 0;
+}
+
+static void update_tx_errors_unlocked(struct net_device *dev, int status)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	if (status & BD_UFLO)
+		dev->stats.tx_fifo_errors++;
+
+	if (ap->duplex)
+		return;
+
+	/* half duplex flags */
+	if (status & BD_LTCL)
+		dev->stats.tx_window_errors++;
+	if (status & BD_RETRY_CT)
+		dev->stats.collisions += (status & BD_RETRY_CT) >> 24;
+	if (status & BD_DROP)  /* too many retries */
+		dev->stats.tx_aborted_errors++;
+	if (status & BD_DEFER)
+		dev_vdbg(&ap->pdev->dev, "\"defer to traffic\"\n");
+	if (status & BD_CARLOSS)
+		dev->stats.tx_carrier_errors++;
+}
+
+static int vmac_rx_reclaim_force_unlocked(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int ct;
+
+	/* locking: no conurrency, runs only during shutdown */
+	WARN_ON(!ap->shutdown);
+
+	dev_dbg(&ap->pdev->dev, "need to release %d rx sk_buff\n",
+			fifo_used(&ap->rx_ring));
+
+	ct = 0;
+	while (!fifo_empty(&ap->rx_ring) && ct++ < ap->rx_ring.size) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		int desc_idx;
+
+		desc_idx = ap->rx_ring.tail;
+		desc = &ap->rxbd[desc_idx];
+		fifo_inc_tail(&ap->rx_ring);
+
+		if (!ap->rx_skbuff[desc_idx]) {
+			dev_err(&ap->pdev->dev, "non-populated rx_skbuff found %d\n",
+					desc_idx);
+			continue;
+		}
+
+		skb = ap->rx_skbuff[desc_idx];
+		ap->rx_skbuff[desc_idx] = NULL;
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, skb->len,
+		    DMA_TO_DEVICE);
+
+		dev_kfree_skb(skb);
+	}
+
+	if (!fifo_empty(&ap->rx_ring)) {
+		dev_err(&ap->pdev->dev, "failed to reclaim %d rx sk_buff\n",
+				fifo_used(&ap->rx_ring));
+	}
+
+	return 0;
+}
+
+/* Function refills empty buffer descriptors and passes ownership to DMA */
+static int vmac_rx_refill_unlocked(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	/* locking: protect from refill_timer */
+	/* locking: fct owns area outside rx_ring, head exclusive tail,
+	 *	modifies head */
+
+	spin_lock(&ap->refill_lock);
+
+	WARN_ON(fifo_full(&ap->rx_ring));
+
+	while (!fifo_full(&ap->rx_ring)) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		dma_addr_t p;
+		int desc_idx;
+
+		desc_idx = ap->rx_ring.head;
+		desc = &ap->rxbd[desc_idx];
+
+		/* make sure we read the actual descriptor status */
+		rmb();
+
+		if (ap->rx_skbuff[desc_idx]) {
+			/* dropped packet / buffer chaining */
+			fifo_inc_head(&ap->rx_ring);
+
+			/* return to DMA */
+			wmb();
+			desc->info = cpu_to_le32(BD_DMA_OWN | ap->rx_skb_size);
+			continue;
+		}
+
+		skb = netdev_alloc_skb_ip_align(dev, ap->rx_skb_size);
+		if (!skb) {
+			dev_info(&ap->pdev->dev, "failed to allocate rx_skb, skb's left %d\n",
+					fifo_used(&ap->rx_ring));
+			break;
+		}
+
+		ap->rx_skbuff[desc_idx] = skb;
+
+		p = dma_map_single(&ap->pdev->dev, skb->data, ap->rx_skb_size,
+				DMA_FROM_DEVICE);
+
+		desc->data = p;
+
+		wmb();
+		desc->info = cpu_to_le32(BD_DMA_OWN | ap->rx_skb_size);
+
+		fifo_inc_head(&ap->rx_ring);
+	}
+
+	spin_unlock(&ap->refill_lock);
+
+	/* If rx ring is still empty, set a timer to try allocating
+	 * again at a later time. */
+	if (fifo_empty(&ap->rx_ring) && netif_running(dev)) {
+		dev_warn(&ap->pdev->dev, "unable to refill rx ring\n");
+		ap->refill_timer.expires = jiffies + HZ;
+		add_timer(&ap->refill_timer);
+	}
+
+	return 0;
+}
+
+/*
+ * timer callback to defer refill rx queue in case we're OOM
+ */
+static void vmac_refill_rx_timer(unsigned long data)
+{
+	vmac_rx_refill_unlocked((struct net_device *)data);
+}
+
+/* merge buffer chaining  */
+struct sk_buff *vmac_merge_rx_buffers_unlocked(struct net_device *dev,
+		struct vmac_buffer_desc *after,
+		int pkt_len) /* data */
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct sk_buff *merge_skb, *cur_skb;
+	struct dma_fifo *rx_ring;
+	struct vmac_buffer_desc *desc;
+
+	/* locking: same as vmac_rx_receive */
+
+	rx_ring = &ap->rx_ring;
+	desc = &ap->rxbd[rx_ring->tail];
+
+	WARN_ON(desc == after);
+
+	/* strip FCS */
+	pkt_len -= 4;
+
+	merge_skb = netdev_alloc_skb_ip_align(dev, pkt_len + NET_IP_ALIGN);
+	if (!merge_skb) {
+		dev_err(&ap->pdev->dev, "failed to allocate merged rx_skb, rx skb's left %d\n",
+				fifo_used(rx_ring));
+
+		return NULL;
+	}
+
+	while (desc != after && pkt_len) {
+		struct vmac_buffer_desc *desc;
+		int buf_len, valid;
+
+		/* desc needs wrapping */
+		desc = &ap->rxbd[rx_ring->tail];
+		cur_skb = ap->rx_skbuff[rx_ring->tail];
+		WARN_ON(!cur_skb);
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, ap->rx_skb_size,
+				DMA_FROM_DEVICE);
+
+		/* do not copy FCS */
+		buf_len = le32_to_cpu(desc->info) & BD_LEN;
+		valid = min(pkt_len, buf_len);
+		pkt_len -= valid;
+
+		memcpy(skb_put(merge_skb, valid), cur_skb->data, valid);
+
+		fifo_inc_tail(rx_ring);
+	}
+
+	/* merging_pressure++ */
+
+	if (unlikely(pkt_len != 0))
+		dev_err(&ap->pdev->dev, "buffer chaining bytes missing %d\n",
+				pkt_len);
+
+	WARN_ON(desc != after);
+
+	return merge_skb;
+}
+
+int vmac_rx_receive(struct net_device *dev, int budget)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct vmac_buffer_desc *first;
+	int processed, pkt_len, pkt_err;
+	struct dma_fifo lookahead;
+
+	/* true concurrency -> DMA engine running in parallel */
+	/* locking: fct owns rx_ring tail to current DMA read position, alias
+	 * 'received packets'. rx_refill owns area outside rx_ring, doesn't
+	 * modify tail */
+
+	processed = 0;
+
+	first = NULL;
+	pkt_err = pkt_len = 0;
+
+	/* look ahead, till packet complete */
+	lookahead = ap->rx_ring;
+
+	do {
+		struct vmac_buffer_desc *desc; /* cur_ */
+		int desc_idx; /* cur_ */
+		struct sk_buff *skb; /* pkt_ */
+
+		desc_idx = lookahead.tail;
+		desc = &ap->rxbd[desc_idx];
+
+		/* make sure we read the actual descriptor status */
+		rmb();
+
+		/* break if dma ownership belongs to hw */
+		if (desc->info & cpu_to_le32(BD_DMA_OWN)) {
+			/* safe the dma position */
+			ap->dma_rx_head = vmac_readl(ap, MAC_RXRING_HEAD);
+			break;
+		}
+
+		if (desc->info & cpu_to_le32(BD_FRST)) {
+			pkt_len = 0;
+			pkt_err = 0;
+
+			/* don't free current */
+			ap->rx_ring.tail = lookahead.tail;
+			first = desc;
+		}
+
+		fifo_inc_tail(&lookahead);
+
+		/* check bd */
+
+		pkt_len += desc->info & cpu_to_le32(BD_LEN);
+		pkt_err |= desc->info & cpu_to_le32(BD_BUFF);
+
+		if (!(desc->info & cpu_to_le32(BD_LAST)))
+			continue;
+
+		/* received complete packet */
+
+		if (unlikely(pkt_err || !first)) {
+			/* recycle buffers */
+			ap->rx_ring.tail = lookahead.tail;
+			continue;
+		}
+
+#ifdef DEBUG
+		WARN_ON(!(first->info & cpu_to_le32(BD_FRST)) ||
+				!(desc->info & cpu_to_le32(BD_LAST)));
+		WARN_ON(pkt_err);
+#endif
+
+		/* -- valid packet -- */
+
+		if (first != desc) {
+			skb = vmac_merge_rx_buffers_unlocked(dev, desc,
+					pkt_len);
+
+			if (!skb) {
+				/* kill packet */
+				ap->rx_ring.tail = lookahead.tail;
+				ap->rx_merge_error++;
+				continue;
+			}
+		} else {
+			dma_unmap_single(&ap->pdev->dev, desc->data,
+					ap->rx_skb_size, DMA_FROM_DEVICE);
+
+			skb = ap->rx_skbuff[desc_idx];
+			ap->rx_skbuff[desc_idx] = NULL;
+			/* desc->data != skb->data => desc->data is DMA
+			 * mapped */
+
+			/* strip FCS */
+			skb_put(skb, pkt_len - ETH_FCS_LEN);
+		}
+
+		/* free buffers */
+		ap->rx_ring.tail = lookahead.tail;
+
+#ifdef DEBUG
+		WARN_ON(skb->len != pkt_len - ETH_FCS_LEN);
+#endif
+		processed++;
+		skb->protocol = eth_type_trans(skb, dev);
+		dev->stats.rx_packets++;
+		dev->stats.rx_bytes += skb->len;
+		netif_receive_skb(skb);
+
+	} while (!fifo_empty(&lookahead) && (processed < budget));
+
+	dev_vdbg(&ap->pdev->dev, "processed pkt %d, remaining rx buff %d\n",
+			processed,
+			fifo_used(&ap->rx_ring));
+
+	if (processed || fifo_empty(&ap->rx_ring))
+		vmac_rx_refill_unlocked(dev);
+
+	return processed;
+}
+
+static void vmac_toggle_irqmask_unlocked(struct net_device *dev, int enable,
+		int mask)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long tmp;
+
+	tmp = vmac_readl(ap, ENABLE);
+	if (enable)
+		tmp |= mask;
+	else
+		tmp &= ~mask;
+	vmac_writel(ap, tmp, ENABLE);
+}
+
+static void vmac_toggle_txint(struct net_device *dev, int enable)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+	vmac_toggle_irqmask_unlocked(dev, enable, TXINT_MASK);
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static void vmac_toggle_rxint_unlocked(struct net_device *dev, int enable)
+{
+	vmac_toggle_irqmask_unlocked(dev, enable, RXINT_MASK);
+}
+
+static int vmac_poll(struct napi_struct *napi, int budget)
+{
+	struct vmac_priv *ap;
+	struct net_device *dev;
+	int rx_work_done;
+	unsigned long flags;
+
+	ap = container_of(napi, struct vmac_priv, napi);
+	dev = ap->dev;
+
+	/* ack interrupt */
+	spin_lock_irqsave(&ap->lock, flags);
+	vmac_writel(ap, RXINT_MASK, STAT);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	rx_work_done = vmac_rx_receive(dev, budget);
+
+	if (0 && printk_ratelimit()) {
+		dev_dbg(&ap->pdev->dev, "poll budget %d receive rx_work_done %d\n",
+				budget,
+				rx_work_done);
+	}
+
+	if (rx_work_done >= budget) {
+		/* rx queue is not yet empty/clean */
+		return rx_work_done;
+	}
+
+	/* no more packet in rx/tx queue, remove device from poll
+	 * queue */
+	spin_lock_irqsave(&ap->lock, flags);
+	napi_complete(napi);
+	vmac_toggle_rxint_unlocked(dev, 1);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	return rx_work_done;
+}
+
+static int vmac_tx_reclaim_unlocked(struct net_device *dev, int force);
+
+static irqreturn_t vmac_intr(int irq, void *dev_instance)
+{
+	struct net_device *dev = dev_instance;
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned int status;
+
+	spin_lock(&ap->lock);
+
+	status = vmac_readl(ap, STAT);
+	vmac_writel(ap, status, STAT);
+
+#ifdef DEBUG
+	if (unlikely(ap->shutdown))
+		dev_err(&ap->pdev->dev, "ISR during close\n");
+
+	if (unlikely(!status & (RXINT_MASK|MDIO_MASK|ERR_MASK)))
+		dev_err(&ap->pdev->dev, "No source of IRQ found\n");
+#endif
+
+	if ((status & RXINT_MASK) &&
+			(ap->dma_rx_head !=
+			 vmac_readl(ap, MAC_RXRING_HEAD))) {
+		vmac_toggle_rxint_unlocked(dev, 0);
+		napi_schedule(&ap->napi);
+	}
+
+	if (unlikely(netif_queue_stopped(dev) && (status & TXINT_MASK)))
+		vmac_tx_reclaim_unlocked(dev, 0);
+
+	if (status & MDIO_MASK)
+		complete(&ap->mdio_complete);
+
+	if (unlikely(status & ERR_MASK))
+		update_error_counters_unlocked(dev, status);
+
+	spin_unlock(&ap->lock);
+
+	return IRQ_HANDLED;
+}
+
+static int vmac_tx_reclaim_unlocked(struct net_device *dev, int force)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int released = 0;
+
+	/* locking: modifies tx_ring tail, head only during shutdown */
+	/* locking: call with ap->lock held */
+	WARN_ON(force && !ap->shutdown);
+
+	/* buffer chaining not used, see vmac_start_xmit */
+
+	while (!fifo_empty(&ap->tx_ring)) {
+		struct vmac_buffer_desc *desc;
+		struct sk_buff *skb;
+		int desc_idx;
+
+		desc_idx = ap->tx_ring.tail;
+		desc = &ap->txbd[desc_idx];
+
+		/* ensure other field of the descriptor were not read
+		 * before we checked ownership */
+		rmb();
+
+		if ((desc->info & cpu_to_le32(BD_DMA_OWN)) && !force)
+			break;
+
+		if (desc->info & cpu_to_le32(BD_TX_ERR)) {
+			update_tx_errors_unlocked(dev,
+					le32_to_cpu(desc->info));
+			/* recycle packet, let upper level deal with it */
+		}
+
+		skb = ap->tx_skbuff[desc_idx];
+		ap->tx_skbuff[desc_idx] = NULL;
+		WARN_ON(!skb);
+
+		dma_unmap_single(&ap->pdev->dev, desc->data, skb->len,
+				DMA_TO_DEVICE);
+
+		dev_kfree_skb_any(skb);
+
+		released++;
+		fifo_inc_tail(&ap->tx_ring);
+	}
+
+	if (netif_queue_stopped(dev) && released) {
+		netif_wake_queue(dev);
+		vmac_toggle_txint(dev, 0);
+	}
+
+	if (unlikely(force && !fifo_empty(&ap->tx_ring))) {
+		dev_err(&ap->pdev->dev, "failed to reclaim %d tx sk_buff\n",
+				fifo_used(&ap->tx_ring));
+	}
+
+	return released;
+}
+
+int vmac_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct vmac_buffer_desc *desc;
+	unsigned long flags;
+	unsigned int tmp;
+
+	/* running under xmit lock */
+	/* locking: modifies tx_ring head, tx_reclaim only tail */
+
+	/* no scatter/gatter see features below */
+	WARN_ON(skb_shinfo(skb)->nr_frags != 0);
+	WARN_ON(skb->len > MAX_TX_BUFFER_LEN);
+
+	if (unlikely(fifo_full(&ap->tx_ring))) {
+		netif_stop_queue(dev);
+		vmac_toggle_txint(dev, 1);
+		dev_err(&ap->pdev->dev, "xmit called with no tx desc available\n");
+		return NETDEV_TX_BUSY;
+	}
+
+	if (unlikely(skb->len < ETH_ZLEN)) {
+		struct sk_buff *short_skb;
+		short_skb = netdev_alloc_skb_ip_align(dev, ETH_ZLEN);
+		if (!short_skb)
+			return NETDEV_TX_LOCKED;
+
+		memset(short_skb->data, 0, ETH_ZLEN);
+		memcpy(skb_put(short_skb, ETH_ZLEN), skb->data, skb->len);
+		dev_kfree_skb(skb);
+		skb = short_skb;
+	}
+
+	/* fill descriptor */
+	ap->tx_skbuff[ap->tx_ring.head] = skb;
+	desc = &ap->txbd[ap->tx_ring.head];
+	WARN_ON(desc->info & cpu_to_le32(BD_DMA_OWN));
+
+	desc->data = dma_map_single(&ap->pdev->dev, skb->data, skb->len,
+			DMA_TO_DEVICE);
+
+	/* dma might already be polling */
+	wmb();
+	desc->info = cpu_to_le32(BD_DMA_OWN | BD_FRST | BD_LAST | skb->len);
+	wmb();
+
+	/* lock device data */
+	spin_lock_irqsave(&ap->lock, flags);
+
+	/* kick tx dma */
+	tmp = vmac_readl(ap, STAT);
+	vmac_writel(ap, tmp | TXPL_MASK, STAT);
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+	dev->trans_start = jiffies;
+	fifo_inc_head(&ap->tx_ring);
+
+	/* vmac_tx_reclaim outside of vmac_tx_timeout */
+	if (fifo_used(&ap->tx_ring) > 8)
+		vmac_tx_reclaim_unlocked(dev, 0);
+
+	/* unlock device data */
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	/* stop queue if no more desc available */
+	if (fifo_full(&ap->tx_ring)) {
+		netif_stop_queue(dev);
+		vmac_toggle_txint(dev, 1);
+	}
+
+	return NETDEV_TX_OK;
+}
+
+static int alloc_buffers_unlocked(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	int err = -ENOMEM;
+	int size;
+
+	fifo_init(&ap->rx_ring, RX_BDT_LEN);
+	fifo_init(&ap->tx_ring, TX_BDT_LEN);
+
+	/* initialize skb list */
+	memset(ap->rx_skbuff, 0, sizeof(ap->rx_skbuff));
+	memset(ap->tx_skbuff, 0, sizeof(ap->tx_skbuff));
+
+	/* allocate DMA received descriptors */
+	size = sizeof(*ap->rxbd) * ap->rx_ring.size;
+	ap->rxbd = dma_alloc_coherent(&ap->pdev->dev, size,
+			&ap->rxbd_dma,
+			GFP_KERNEL);
+	if (ap->rxbd == NULL)
+		goto err_out;
+
+	/* allocate DMA transmit descriptors */
+	size = sizeof(*ap->txbd) * ap->tx_ring.size;
+	ap->txbd = dma_alloc_coherent(&ap->pdev->dev, size,
+			&ap->txbd_dma,
+			GFP_KERNEL);
+	if (ap->txbd == NULL)
+		goto err_free_rxbd;
+
+	/* ensure 8-byte aligned */
+	WARN_ON(((uintptr_t)ap->txbd & 0x7) || ((uintptr_t)ap->rxbd & 0x7));
+
+	memset(ap->txbd, 0, sizeof(*ap->txbd) * ap->tx_ring.size);
+	memset(ap->rxbd, 0, sizeof(*ap->rxbd) * ap->rx_ring.size);
+
+	/* allocate rx skb */
+	err = vmac_rx_refill_unlocked(dev);
+	if (err)
+		goto err_free_txbd;
+
+	return 0;
+
+err_free_txbd:
+	dma_free_coherent(&ap->pdev->dev, sizeof(*ap->txbd) * ap->tx_ring.size,
+			ap->txbd, ap->txbd_dma);
+err_free_rxbd:
+	dma_free_coherent(&ap->pdev->dev, sizeof(*ap->rxbd) * ap->rx_ring.size,
+			ap->rxbd, ap->rxbd_dma);
+err_out:
+	return err;
+}
+
+static int free_buffers_unlocked(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+
+	/* free skbuff */
+	vmac_tx_reclaim_unlocked(dev, 1);
+	vmac_rx_reclaim_force_unlocked(dev);
+
+	/* free DMA ring */
+	dma_free_coherent(&ap->pdev->dev, sizeof(ap->txbd) * ap->tx_ring.size,
+			ap->txbd, ap->txbd_dma);
+	dma_free_coherent(&ap->pdev->dev, sizeof(ap->rxbd) * ap->rx_ring.size,
+			ap->rxbd, ap->rxbd_dma);
+
+	return 0;
+}
+
+static int vmac_hw_init(struct net_device *dev)
+{
+	struct vmac_priv *priv = netdev_priv(dev);
+
+	/* clear IRQ mask */
+	vmac_writel(priv, 0, ENABLE);
+
+	/* clear pending IRQ */
+	vmac_writel(priv, 0xffffffff, STAT);
+
+	/* Initialize logical address filter */
+	vmac_writel(priv, 0x0, LAFL);
+	vmac_writel(priv, 0x0, LAFH);
+
+	return 0;
+}
+
+int vmac_open(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	struct phy_device *phydev;
+	unsigned long flags;
+	unsigned int temp, ctrl;
+	int err = 0;
+
+	/* locking: no concurrency yet */
+
+	if (ap == NULL)
+		return -ENODEV;
+
+	spin_lock_irqsave(&ap->lock, flags);
+	ap->shutdown = 0;
+
+	err = get_register_map(ap);
+	if (err)
+		return err;
+
+	vmac_hw_init(dev);
+
+	/* mac address changed? */
+	write_mac_reg(dev, dev->dev_addr);
+
+	err = alloc_buffers_unlocked(dev);
+	if (err)
+		return err;
+
+	/* install DMA ring pointers */
+	vmac_writel(ap, ap->rxbd_dma, RXRINGPTR);
+	vmac_writel(ap, ap->txbd_dma, TXRINGPTR);
+
+	/* set poll rate to 1 ms */
+	vmac_writel(ap, POLLRATE_TIME, POLLRATE);
+
+	/* Set control */
+	ctrl = (RX_BDT_LEN << 24) | (TX_BDT_LEN << 16) | TXRN_MASK | RXRN_MASK;
+	vmac_writel(ap, ctrl, CONTROL);
+
+	/* make sure we enable napi before rx interrupt  */
+	napi_enable(&ap->napi);
+
+	err = request_irq(dev->irq, &vmac_intr, 0, dev->name, dev);
+	if (err) {
+		dev_err(&ap->pdev->dev, "Unable to request IRQ %d (error %d)\n",
+				dev->irq, err);
+		goto err_free_buffers;
+	}
+
+	/* IRQ mask */
+	temp = RXINT_MASK | ERR_MASK | TXCH_MASK | MDIO_MASK;
+	vmac_writel(ap, temp, ENABLE);
+
+	/* enable, after all other bits are set */
+	vmac_writel(ap, ctrl | EN_MASK, CONTROL);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	/* locking: concurrency */
+
+	netif_start_queue(dev);
+	netif_carrier_off(dev);
+
+	/* register the PHY board fixup, if needed */
+	err = vmac_mii_init(ap);
+	if (err)
+		goto err_free_irq;
+
+	/* schedule a link state check */
+	phy_start(ap->phy_dev);
+
+	phydev = ap->phy_dev;
+	dev_info(&ap->pdev->dev, "PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
+	       phydev->drv->name, dev_name(&phydev->dev), phydev->irq);
+
+	return 0;
+
+err_free_irq:
+	free_irq(dev->irq, dev);
+err_free_buffers:
+	napi_disable(&ap->napi);
+	free_buffers_unlocked(dev);
+	return err;
+}
+
+int vmac_close(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags;
+	unsigned int temp;
+
+	/* locking: protect everything, DMA / IRQ / timer */
+	spin_lock_irqsave(&ap->lock, flags);
+
+	/* complete running transfer, then stop */
+	temp = vmac_readl(ap, CONTROL);
+	temp &= ~(TXRN_MASK | RXRN_MASK);
+	vmac_writel(ap, temp, CONTROL);
+
+	/* reenable IRQ, process pending */
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(msecs_to_jiffies(20));
+
+	/* shut it down now */
+	spin_lock_irqsave(&ap->lock, flags);
+	ap->shutdown = 1;
+
+	netif_stop_queue(dev);
+	napi_disable(&ap->napi);
+
+	/* disable phy */
+	phy_stop(ap->phy_dev);
+	vmac_mii_exit_unlocked(dev);
+	netif_carrier_off(dev);
+
+	/* disable interrupts */
+	vmac_writel(ap, 0, ENABLE);
+	free_irq(dev->irq, dev);
+
+	/* turn off vmac */
+	vmac_writel(ap, 0, CONTROL);
+	/* vmac_reset_hw(vmac) */
+
+	/* locking: concurrency off */
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	del_timer_sync(&ap->refill_timer);
+	free_buffers_unlocked(dev);
+
+	put_register_map(ap);
+
+	return 0;
+}
+
+void update_vmac_stats_unlocked(struct net_device *dev)
+{
+	struct net_device_stats *_stats = &dev->stats;
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long miss, rxerr;
+	unsigned long rxfram, rxcrc, rxoflow;
+
+	/* compare with /proc/net/dev,
+	 * see net/core/dev.c:dev_seq_printf_stats */
+
+	/* rx stats */
+	rxerr = vmac_readl(ap, RXERR);
+	miss = vmac_readl(ap, MISS);
+
+	rxcrc = (rxerr & RXERR_CRC);
+	rxfram = (rxerr & RXERR_FRM) >> 8;
+	rxoflow = (rxerr & RXERR_OFLO) >> 16;
+
+	_stats->rx_length_errors = 0;
+	_stats->rx_over_errors += miss;
+	_stats->rx_crc_errors += rxcrc;
+	_stats->rx_frame_errors += rxfram;
+	_stats->rx_fifo_errors += rxoflow;
+	_stats->rx_missed_errors = 0;
+
+	/* TODO check rx_dropped/rx_errors/tx_dropped/tx_errors have not
+	 * been updated elsewhere */
+	_stats->rx_dropped = _stats->rx_over_errors +
+		_stats->rx_fifo_errors +
+		ap->rx_merge_error;
+
+	_stats->rx_errors = _stats->rx_length_errors + _stats->rx_crc_errors +
+		_stats->rx_frame_errors +
+		_stats->rx_missed_errors +
+		_stats->rx_dropped;
+
+	/* tx stats */
+	_stats->tx_dropped = 0; /* otherwise queue stopped */
+
+	_stats->tx_errors = _stats->tx_aborted_errors +
+		_stats->tx_carrier_errors +
+		_stats->tx_fifo_errors +
+		_stats->tx_heartbeat_errors +
+		_stats->tx_window_errors +
+		_stats->tx_dropped +
+		ap->tx_timeout_error;
+}
+
+struct net_device_stats *vmac_stats(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+	update_vmac_stats_unlocked(dev);
+	spin_unlock_irqrestore(&ap->lock, flags);
+
+	return &dev->stats;
+}
+
+void vmac_tx_timeout(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned int status;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	/* queue did not progress for timeo jiffies */
+	WARN_ON(!netif_queue_stopped(dev));
+	WARN_ON(!fifo_full(&ap->tx_ring));
+
+	/* TX IRQ lost? */
+	status = vmac_readl(ap, STAT);
+	if (status & TXINT_MASK) {
+		dev_err(&ap->pdev->dev, "lost tx interrupt, IRQ mask %x\n",
+				vmac_readl(ap, ENABLE));
+		vmac_writel(ap, TXINT_MASK, STAT);
+	}
+
+	/* TODO RX/MDIO/ERR as well? */
+
+	vmac_tx_reclaim_unlocked(dev, 0);
+	if (fifo_full(&ap->tx_ring))
+		dev_err(&ap->pdev->dev, "DMA state machine not active\n");
+
+	/* We can accept TX packets again */
+	ap->tx_timeout_error++;
+	dev->trans_start = jiffies;
+	netif_wake_queue(dev);
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static void create_multicast_filter(struct net_device *dev,
+	unsigned long *bitmask)
+{
+	unsigned long crc;
+	char *addrs;
+
+	/* locking: done by net_device */
+
+	WARN_ON(netdev_mc_count(dev) == 0);
+	WARN_ON(dev->flags & IFF_ALLMULTI);
+
+	bitmask[0] = bitmask[1] = 0;
+
+	{
+		struct netdev_hw_addr *ha;
+		netdev_for_each_mc_addr(ha, dev) {
+			addrs = ha->addr;
+
+			/* skip non-multicast addresses */
+			if (!(*addrs & 1))
+				continue;
+
+			crc = ether_crc_le(ETH_ALEN, addrs);
+			set_bit(crc >> 26, bitmask);
+		}
+	}
+}
+
+static void vmac_set_multicast_list(struct net_device *dev)
+{
+	struct vmac_priv *ap = netdev_priv(dev);
+	unsigned long flags, bitmask[2];
+	int promisc, reg;
+
+	spin_lock_irqsave(&ap->lock, flags);
+
+	promisc = !!(dev->flags & IFF_PROMISC);
+	reg = vmac_readl(ap, ENABLE);
+	if (promisc != !!(reg & PROM_MASK)) {
+		reg ^= PROM_MASK;
+		vmac_writel(ap, reg, ENABLE);
+	}
+
+	if (dev->flags & IFF_ALLMULTI)
+		memset(bitmask, 1, sizeof(bitmask));
+	else if (netdev_mc_count(dev) == 0)
+		memset(bitmask, 0, sizeof(bitmask));
+	else
+		create_multicast_filter(dev, bitmask);
+
+	vmac_writel(ap, bitmask[0], LAFL);
+	vmac_writel(ap, bitmask[1], LAFH);
+
+	spin_unlock_irqrestore(&ap->lock, flags);
+}
+
+static struct ethtool_ops vmac_ethtool_ops = {
+	.get_settings		= vmacether_get_settings,
+	.set_settings		= vmacether_set_settings,
+	.get_drvinfo		= vmacether_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+static const struct net_device_ops vmac_netdev_ops = {
+	.ndo_open		= vmac_open,
+	.ndo_stop		= vmac_close,
+	.ndo_get_stats		= vmac_stats,
+	.ndo_start_xmit		= vmac_start_xmit,
+	.ndo_do_ioctl		= vmac_ioctl,
+	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_tx_timeout		= vmac_tx_timeout,
+	.ndo_set_multicast_list = vmac_set_multicast_list,
+	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_change_mtu		= eth_change_mtu,
+};
+
+static int get_register_map(struct vmac_priv *ap)
+{
+	int err;
+
+	err = -EBUSY;
+	if (!request_mem_region(ap->mem->start, resource_size(ap->mem),
+				DRV_NAME)) {
+		dev_err(&ap->pdev->dev, "no memory region available\n");
+		return err;
+	}
+
+	err = -ENOMEM;
+	ap->regs = ioremap(ap->mem->start, resource_size(ap->mem));
+	if (!ap->regs) {
+		dev_err(&ap->pdev->dev, "failed to map registers, aborting.\n");
+		goto err_out_release_mem;
+	}
+
+	return 0;
+
+err_out_release_mem:
+	release_mem_region(ap->mem->start, resource_size(ap->mem));
+	return err;
+}
+
+static int put_register_map(struct vmac_priv *ap)
+{
+	iounmap(ap->regs);
+	release_mem_region(ap->mem->start, resource_size(ap->mem));
+	return 0;
+}
+
+static int __devinit vmac_probe(struct platform_device *pdev)
+{
+	struct net_device *dev;
+	struct vmac_priv *ap;
+	struct resource *mem;
+	int err;
+
+	/* locking: no concurrency */
+
+	if (dma_get_mask(&pdev->dev) > DMA_BIT_MASK(32) ||
+			pdev->dev.coherent_dma_mask > DMA_BIT_MASK(32)) {
+		dev_err(&pdev->dev, "arcvmac supports only 32-bit DMA addresses\n");
+		return -ENODEV;
+	}
+
+	dev = alloc_etherdev(sizeof(*ap));
+	if (!dev) {
+		dev_err(&pdev->dev, "etherdev alloc failed, aborting.\n");
+		return -ENOMEM;
+	}
+
+	ap = netdev_priv(dev);
+
+	err = -ENODEV;
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!mem) {
+		dev_err(&pdev->dev, "no mmio resource defined\n");
+		goto err_out;
+	}
+	ap->mem = mem;
+
+	err = platform_get_irq(pdev, 0);
+	if (err < 0) {
+		dev_err(&pdev->dev, "no irq found\n");
+		goto err_out;
+	}
+	dev->irq = err;
+
+	spin_lock_init(&ap->lock);
+
+	SET_NETDEV_DEV(dev, &pdev->dev);
+	ap->dev = dev;
+	ap->pdev = pdev;
+
+	/* init rx timeout (used for oom) */
+	init_timer(&ap->refill_timer);
+	ap->refill_timer.function = vmac_refill_rx_timer;
+	ap->refill_timer.data = (unsigned long)dev;
+	spin_lock_init(&ap->refill_lock);
+
+	netif_napi_add(dev, &ap->napi, vmac_poll, 2);
+	dev->netdev_ops = &vmac_netdev_ops;
+	dev->ethtool_ops = &vmac_ethtool_ops;
+
+	dev->flags |= IFF_MULTICAST;
+
+	dev->base_addr = (unsigned long)ap->regs; /* TODO */
+
+	/* prevent buffer chaining, favor speed over space */
+	ap->rx_skb_size = ETH_FRAME_LEN + VMAC_BUFFER_PAD;
+
+	/* private struct functional */
+
+	/* temporarily map registers to fetch mac addr */
+	err = get_register_map(ap);
+	if (err)
+		goto err_out;
+
+	/* mac address intialize, set vmac_open  */
+	read_mac_reg(dev, dev->dev_addr); /* TODO */
+
+	if (!is_valid_ether_addr(dev->dev_addr))
+		random_ether_addr(dev->dev_addr);
+
+	err = register_netdev(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
+		goto err_out;
+	}
+
+	/* release the memory region, till open is called */
+	put_register_map(ap);
+
+	dev_info(&pdev->dev, "ARC VMAC at 0x%pP irq %d %pM\n", &mem->start,
+	    dev->irq, dev->dev_addr);
+	platform_set_drvdata(pdev, ap);
+
+	return 0;
+
+err_out:
+	free_netdev(dev);
+	return err;
+}
+
+static int __devexit vmac_remove(struct platform_device *pdev)
+{
+	struct vmac_priv *ap;
+
+	/* locking: no concurrency */
+
+	ap = platform_get_drvdata(pdev);
+	if (!ap) {
+		dev_err(&pdev->dev, "vmac_remove no valid dev found\n");
+		return 0;
+	}
+
+	/* MAC */
+	unregister_netdev(ap->dev);
+	netif_napi_del(&ap->napi);
+
+	platform_set_drvdata(pdev, NULL);
+	free_netdev(ap->dev);
+	return 0;
+}
+
+static struct platform_driver arcvmac_driver = {
+	.probe		= vmac_probe,
+	.remove		= __devexit_p(vmac_remove),
+	.driver		= {
+		.name		= "arcvmac",
+	},
+};
+
+static int __init vmac_init(void)
+{
+	return platform_driver_register(&arcvmac_driver);
+}
+
+static void __exit vmac_exit(void)
+{
+	platform_driver_unregister(&arcvmac_driver);
+}
+
+module_init(vmac_init);
+module_exit(vmac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ARC VMAC Ethernet driver");
+MODULE_AUTHOR("afenkart@gmail.com");
diff --git a/drivers/net/arcvmac.h b/drivers/net/arcvmac.h
new file mode 100644
index 0000000..ee570a5
--- /dev/null
+++ b/drivers/net/arcvmac.h
@@ -0,0 +1,265 @@
+/*
+ * ARC VMAC Driver
+ *
+ * Copyright (C) 2003-2006 Codito Technologies, for linux-2.4 port
+ * Copyright (C) 2006-2007 Celunite Inc, for linux-2.6 port
+ * Copyright (C) 2007-2008 Sagem Communications, Fehmi HAFSI
+ * Copyright (C) 2009-2011 Sagem Communications, Andreas Fenkart
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef _ARCVMAC_H
+#define _ARCVMAC_H
+
+#define DRV_NAME		"arcvmac"
+#define DRV_VERSION		"1.0"
+
+/* Buffer descriptors */
+#define TX_BDT_LEN		16    /* Number of receive BD's */
+#define RX_BDT_LEN		256   /* Number of transmit BD's */
+
+/* BD poll rate, in 1024 cycles. @100Mhz: x * 1024 cy * 10ns = 1ms */
+#define POLLRATE_TIME		200
+
+/* next power of two, bigger than ETH_FRAME_LEN + VLAN  */
+#define MAX_RX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
+#define MAX_TX_BUFFER_LEN	0x800	/* 2^11 = 2048 = 0x800 */
+
+/* 14 bytes of ethernet header, 4 bytes VLAN, FCS,
+ * plus extra pad to prevent buffer chaining of
+ * maximum sized ethernet packets (1514 bytes) */
+#define	VMAC_BUFFER_PAD		(ETH_HLEN + 4 + ETH_FCS_LEN + 4)
+
+/* VMAC register definitions, offsets in bytes */
+#define VMAC_ID			0x00
+
+/* stat/enable use same bit mask */
+#define VMAC_STAT		0x04
+#define VMAC_ENABLE		0x08
+#  define TXINT_MASK		0x00000001 /* Transmit interrupt */
+#  define RXINT_MASK		0x00000002 /* Receive interrupt */
+#  define ERR_MASK		0x00000004 /* Error interrupt */
+#  define TXCH_MASK		0x00000008 /* Transmit chaining error */
+#  define MSER_MASK		0x00000010 /* Missed packet counter error */
+#  define RXCR_MASK		0x00000100 /* RXCRCERR counter rolled over	 */
+#  define RXFR_MASK		0x00000200 /* RXFRAMEERR counter rolled over */
+#  define RXFL_MASK		0x00000400 /* RXOFLOWERR counter rolled over */
+#  define MDIO_MASK		0x00001000 /* MDIO complete */
+#  define TXPL_MASK		0x80000000 /* TXPOLL */
+
+#define VMAC_CONTROL		0x0c
+#  define EN_MASK		0x00000001 /* VMAC enable */
+#  define TXRN_MASK		0x00000008 /* TX enable */
+#  define RXRN_MASK		0x00000010 /* RX enable */
+#  define DSBC_MASK		0x00000100 /* Disable receive broadcast */
+#  define ENFL_MASK		0x00000400 /* Enable Full Duplex */
+#  define PROM_MASK		0x00000800 /* Promiscuous mode */
+
+#define VMAC_POLLRATE		0x10
+
+#define VMAC_RXERR		0x14
+#  define RXERR_CRC		0x000000ff
+#  define RXERR_FRM		0x0000ff00
+#  define RXERR_OFLO		0x00ff0000 /* fifo overflow */
+
+#define VMAC_MISS		0x18
+#define VMAC_TXRINGPTR		0x1c
+#define VMAC_RXRINGPTR		0x20
+#define VMAC_ADDRL		0x24
+#define VMAC_ADDRH		0x28
+#define VMAC_LAFL		0x2c
+#define VMAC_LAFH		0x30
+#define VMAC_MAC_TXRING_HEAD	0x38
+#define VMAC_MAC_RXRING_HEAD	0x3C
+
+#define VMAC_MDIO_DATA		0x34
+#  define MDIO_SFD		0xC0000000
+#  define MDIO_OP		0x30000000
+#  define MDIO_ID_MASK		0x0F800000
+#  define MDIO_REG_MASK		0x007C0000
+#  define MDIO_TA		0x00030000
+#  define MDIO_DATA_MASK	0x0000FFFF
+/* common combinations */
+#  define MDIO_BASE		0x40020000
+#  define MDIO_OP_READ		0x20000000
+#  define MDIO_OP_WRITE		0x10000000
+
+/* Buffer descriptor INFO bit masks */
+#define BD_DMA_OWN		0x80000000 /* buffer ownership, 0 CPU, 1 DMA */
+#define BD_BUFF			0x40000000 /* buffer invalid, rx */
+#define BD_UFLO			0x20000000 /* underflow, tx */
+#define BD_LTCL			0x10000000 /* late collision, tx  */
+#define BD_RETRY_CT		0x0f000000 /* tx */
+#define BD_DROP			0x00800000 /* drop, more than 16 retries, tx */
+#define BD_DEFER		0x00400000 /* traffic on the wire, tx */
+#define BD_CARLOSS		0x00200000 /* carrier loss while transmission, tx, rx? */
+/* 20:19 reserved */
+#define BD_ADCR			0x00040000 /* add crc, ignored if not disaddcrc */
+#define BD_LAST			0x00020000 /* Last buffer in chain */
+#define BD_FRST			0x00010000 /* First buffer in chain */
+/* 15:11 reserved */
+#define BD_LEN			0x000007FF
+
+/* common combinations */
+#define BD_TX_ERR		(BD_UFLO | BD_LTCL | BD_RETRY_CT | BD_DROP | \
+		BD_DEFER | BD_CARLOSS)
+
+
+/* arcvmac private data structures */
+struct vmac_buffer_desc {
+	__le32 info;
+	__le32 data;
+};
+
+struct dma_fifo {
+	int head; /* head */
+	int tail; /* tail */
+	int size;
+};
+
+struct	vmac_priv {
+	struct net_device *dev;
+	struct platform_device *pdev;
+
+	struct completion mdio_complete;
+	spinlock_t lock; /* protects structure plus hw regs of device */
+
+	/* base address of register set */
+	char *regs;
+	struct resource *mem;
+
+	/* DMA ring buffers */
+	struct vmac_buffer_desc *rxbd;
+	dma_addr_t rxbd_dma;
+
+	struct vmac_buffer_desc *txbd;
+	dma_addr_t txbd_dma;
+
+	/* socket buffers */
+	struct sk_buff *rx_skbuff[RX_BDT_LEN];
+	struct sk_buff *tx_skbuff[TX_BDT_LEN];
+	int rx_skb_size;
+
+	/* skb / dma desc managing */
+	struct dma_fifo rx_ring; /* valid rx buffers */
+	struct dma_fifo tx_ring;
+
+	/* descriptor last polled/processed by the VMAC */
+	unsigned long dma_rx_head;
+
+	/* timer to retry rx skb allocation, if failed during receive */
+	struct timer_list refill_timer;
+	spinlock_t refill_lock;
+
+	struct napi_struct napi;
+
+	/* rx buffer chaining */
+	int rx_merge_error;
+	int tx_timeout_error;
+
+	/* PHY stuff */
+	struct mii_bus *mii_bus;
+	struct phy_device *phy_dev;
+
+	int link;
+	int speed;
+	int duplex;
+
+	/* debug */
+	int shutdown;
+};
+
+/* DMA ring management */
+
+/* for a fifo with size n,
+ * - [0..n] fill levels are n + 1 states
+ * - there are only n different deltas (head - tail) values
+ * => not all fill levels can be represented with head, tail
+ *    pointers only
+ * we give up the n fill level, aka fifo full */
+
+/* sacrifice one elt as a sentinel */
+static inline int fifo_used(struct dma_fifo *f);
+static inline int fifo_inc_ct(int ct, int size);
+static inline void fifo_dump(struct dma_fifo *fifo);
+
+static inline int fifo_empty(struct dma_fifo *f)
+{
+	return (f->head == f->tail);
+}
+
+static inline int fifo_free(struct dma_fifo *f)
+{
+	int free;
+
+	free = f->tail - f->head;
+	if (free <= 0)
+		free += f->size;
+
+	return free;
+}
+
+static inline int fifo_used(struct dma_fifo *f)
+{
+	int used;
+
+	used = f->head - f->tail;
+	if (used < 0)
+		used += f->size;
+
+	return used;
+}
+
+static inline int fifo_full(struct dma_fifo *f)
+{
+	return (fifo_used(f) + 1) == f->size;
+}
+
+/* manipulate */
+static inline void fifo_init(struct dma_fifo *fifo, int size)
+{
+	fifo->size = size;
+	fifo->head = fifo->tail = 0; /* empty */
+}
+
+static inline void fifo_inc_head(struct dma_fifo *fifo)
+{
+	BUG_ON(fifo_full(fifo));
+	fifo->head = fifo_inc_ct(fifo->head, fifo->size);
+}
+
+static inline void fifo_inc_tail(struct dma_fifo *fifo)
+{
+	BUG_ON(fifo_empty(fifo));
+	fifo->tail = fifo_inc_ct(fifo->tail, fifo->size);
+}
+
+/* internal funcs */
+static inline void fifo_dump(struct dma_fifo *fifo)
+{
+	printk(KERN_INFO "fifo: head %d, tail %d, size %d\n", fifo->head,
+			fifo->tail,
+			fifo->size);
+}
+
+static inline int fifo_inc_ct(int ct, int size)
+{
+	return (++ct == size) ? 0 : ct;
+}
+
+#endif	  /* _ARCVMAC_H */
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH 1/1] ARC VMAC ethernet driver.
From: Andreas Fenkart @ 2011-02-17  9:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev
In-Reply-To: <1297764149.3201.4.camel@edumazet-laptop>

fixed, pls find new version in reply to this mail.

2011/2/15 Eric Dumazet <eric.dumazet@gmail.com>:
> Le mardi 15 février 2011 à 10:31 +0100, Andreas Fenkart a écrit :
>> Signed-off-by: Andreas Fenkart <afenkart@gmail.com>
>
>
>> +             processed++;
>> +             skb->dev = dev;
>        eth_type_trans() already sets "skb->dev = dev;"
>
>> +             skb->protocol = eth_type_trans(skb, dev);
>> +             ap->stats.rx_packets++;
> Hmm, why dont you use dev->stats internal structure ? No need to
> maintain a shadow in ap->stats.
>
>> +             ap->stats.rx_bytes += skb->len;
>> +             dev->last_rx = jiffies;
>        /* last_rx = jiffies; not needed anymore  */
>
>
>> +             netif_rx(skb);
>
> A NAPI driver should use netif_receive_skb(), not netif_rx()
>
>
>
>

^ permalink raw reply

* Re: [PATCH v2] bnx2x: Support for managing RX indirection table
From: Eric Dumazet @ 2011-02-17  9:09 UTC (permalink / raw)
  To: Tom Herbert; +Cc: davem, eilong, netdev
In-Reply-To: <alpine.DEB.2.00.1102161210190.31937@pokey.mtv.corp.google.com>

Le mercredi 16 février 2011 à 12:27 -0800, Tom Herbert a écrit :
> Support fetching and retrieving RX indirection table via ethtool.
> 
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  drivers/net/bnx2x/bnx2x.h         |    2 +
>  drivers/net/bnx2x/bnx2x_ethtool.c |   56 +++++++++++++++++++++++++++++++++++++
>  drivers/net/bnx2x/bnx2x_main.c    |   22 +++++++++++---
>  3 files changed, 75 insertions(+), 5 deletions(-)

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>


As this is a bit unusual stuff (ethtool -X), I guess some hints can be
useful to testers and admins :

I tested this patch on my dev machine

Broadcom Corporation NetXtreme II BCM57711E 10Gigabit PCIe
(Hewlett-Packard Company NC532i Dual Port 10GbE Multifunction BL-C
Adapter)


# ethtool -X eth1 weight 0 1 0 2 0 4 0 8 0 16 0 32
# ethtool -x eth1 
RX flow hash indirection table for eth1 with 16 RX ring(s):
    0:      1     1     3     3     3     3     5     5
    8:      5     5     5     5     5     5     7     7
   16:      7     7     7     7     7     7     7     7
   24:      7     7     7     7     7     7     9     9
   32:      9     9     9     9     9     9     9     9
   40:      9     9     9     9     9     9     9     9
   48:      9     9     9     9     9     9     9     9
   56:      9     9     9     9     9     9    11    11
   64:     11    11    11    11    11    11    11    11
   72:     11    11    11    11    11    11    11    11
   80:     11    11    11    11    11    11    11    11
   88:     11    11    11    11    11    11    11    11
   96:     11    11    11    11    11    11    11    11
  104:     11    11    11    11    11    11    11    11
  112:     11    11    11    11    11    11    11    11
  120:     11    11    11    11    11    11    11    11


After some (distributed) trafic on eth1 I get :

[root@svivoipvnx021 ~]# ethtool -S eth1|grep rx_ucast_packets
     [0]: rx_ucast_packets: 62
     [1]: rx_ucast_packets: 15870
     [2]: rx_ucast_packets: 9
     [3]: rx_ucast_packets: 31507
     [4]: rx_ucast_packets: 0
     [5]: rx_ucast_packets: 63106
     [6]: rx_ucast_packets: 0
     [7]: rx_ucast_packets: 122051
     [8]: rx_ucast_packets: 1
     [9]: rx_ucast_packets: 248071
     [10]: rx_ucast_packets: 0
     [11]: rx_ucast_packets: 519864
     [12]: rx_ucast_packets: 0
     [13]: rx_ucast_packets: 0
     [14]: rx_ucast_packets: 0
     [15]: rx_ucast_packets: 0
     rx_ucast_packets: 1000541



^ permalink raw reply

* Re: [PATCH] drivers/net: Call netif_carrier_off at the end of the probe
From: Ivan Vecera @ 2011-02-17  8:24 UTC (permalink / raw)
  To: Francois Romieu, davem; +Cc: netdev, aabdulla, Ben Hutchings
In-Reply-To: <483985247.35721.1297792863958.JavaMail.root@zmail07.collab.prod.int.phx2.redhat.com>

On Tue, 2011-02-15 at 13:01 -0500, Ivan Vecera wrote:
> ----- Original Message -----
> > On Tue, 2011-02-15 at 16:22 +0100, Francois Romieu wrote:
> > > Stated this way it sounds like a core dev layer issue.
> > 
> > ...
> > > I am not completely sure after reading some history. Namely:
> > > - (37e8273cd30592d3a82bcb70cbb1bdc4eaeb6b71 ?)
> > > - c276e098d3ee33059b4a1c747354226cec58487c
> > > - 22604c866889c4b2e12b73cbf1683bda1b72a313
> > > - b47300168e770b60ab96c8924854c3b0eb4260eb
> > >
> > > I am confused.
> > 
> > Drivers that can report carrier state should do so initially some time
> > between registering a device and bringing it up (either in the bus
> > probe
> > function or the ndo_open function). It generally seems to be safe to
> > assume that the link is down initially, and then to rely on
> > notifications from the hardware. However, that does depend on the
> > behaviour of the hardware.
> > 
> Yes,that's true... forcedeth and r8169 are the drivers that detect link
> state when device is opened and call netif_carrier_on(off) appropriately.
> 
> Ivan
Dave, Francois, is it acceptable?

Ivan


^ permalink raw reply

* Re: [RFC, PATCH 2/4] net: sh_eth: remove the SH_TSU_ADDR
From: Yoshihiro Shimoda @ 2011-02-17  6:51 UTC (permalink / raw)
  To: Nobuhiro Iwamatsu; +Cc: netdev, SH-Linux
In-Reply-To: <AANLkTimJM0yEOU6RgoAs+Njv5b7ixVRnPubnUg=F72GX@mail.gmail.com>

2011/02/17 9:24, Nobuhiro Iwamatsu wrote:
>> @@ -1559,7 +1565,9 @@ out:
>>  static int sh_eth_drv_remove(struct platform_device *pdev)
>>  {
>>        struct net_device *ndev = platform_get_drvdata(pdev);
>> +       struct sh_eth_private *mdp = netdev_priv(ndev);
>>
>> +       iounmap(mdp->tsu_addr);
>>        sh_mdio_release(ndev);
>>        unregister_netdev(ndev);
>>        pm_runtime_disable(&pdev->dev);
> 
> You forget to fix for ARSTR.
Thank you for your point! I will fix it.

Best regards,
Yoshihiro Shimoda

^ permalink raw reply

* Re: [RFC, PATCH 1/4] net: sh_eth: modify the definitions of register
From: Yoshihiro Shimoda @ 2011-02-17  6:51 UTC (permalink / raw)
  To: Nobuhiro Iwamatsu; +Cc: netdev, SH-Linux
In-Reply-To: <AANLkTikxYtG620D4z09k15v7=r5CZ7xCcDWenbJmM_sP@mail.gmail.com>

2011/02/17 9:56, Nobuhiro Iwamatsu wrote:
> 2011/2/17 Nobuhiro Iwamatsu <iwamatsu@nigauri.org>:
>>> +static const u16 *sh_eth_get_register_offset(int register_type)
>>> +{
>>> +       const u16 *reg_offset = NULL;
>>> +
>>> +       switch (register_type) {
>>> +       case SH_ETH_REG_GIGABIT:
>>> +               reg_offset = sh_eth_offset_gigabit;
>>> +               break;
>>> +       case SH_ETH_REG_FAST_SH4:
>>> +               reg_offset = sh_eth_offset_fast_sh4;
>>> +               break;
>>> +       case SH_ETH_REG_FAST_SH3_SH2:
>>> +               reg_offset = sh_eth_offset_fast_sh3_sh2;
>>> +               break;
>>> +       case SH_ETH_REG_DEFAULT:
>>> +#if defined(CONFIG_CPU_SUBTYPE_SH7763)
>>> +               reg_offset = sh_eth_offset_gigabit;
>>> +#elif defined(CONFIG_CPU_SH4)
>>> +               reg_offset = sh_eth_offset_fast_sh4;
>>> +#else
>>> +               reg_offset = sh_eth_offset_fast_sh3_sh2;
>>> +#endif
>>> +               break;
>>> +       default:
>>> +               printk(KERN_ERR "Unknown register type (%d)\n", register_type);
>>> +               break;
>>> +       }
>>> +
>>> +       return reg_offset;
>>> +}
>>> +
>>
>> Is the handling of SH_ETH_REG_DEFAULT necessary?
Thank you for your review!
No, it isn't necessary. I will remove it and I will add NULL checking of mdp->reg_offset.

Best regards,
Yoshihiro Shimoda

^ permalink raw reply

* Re: Fwd: IGMP and rwlock: Dead ocurred again on TILEPro
From: Cypher Wu @ 2011-02-17  6:42 UTC (permalink / raw)
  To: Américo Wang; +Cc: linux-kernel, Chris Metcalf, Eric Dumazet, netdev
In-Reply-To: <20110217054237.GB2653@cr0.nay.redhat.com>

On Thu, Feb 17, 2011 at 1:42 PM, Américo Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, Feb 17, 2011 at 01:04:14PM +0800, Cypher Wu wrote:
>>>
>>> Have you turned CONFIG_LOCKDEP on?
>>>
>>> I think Eric already converted that rwlock into RCU lock, thus
>>> this problem should disappear. Could you try a new kernel?
>>>
>>> Thanks.
>>>
>>
>>I haven't turned CONFIG_LOCKDEP on for test since I didn't get too
>>much information when we tried to figured out the former deadlock.
>>
>>IGMP used read_lock() instead of read_lock_bh() since usually
>>read_lock() can be called recursively, and today I've read the
>>implementation of MIPS, it's should also works fine in that situation.
>>The implementation of TILEPro cause problem since after it use TNS set
>>the lock-val to 1 and hold the original value and before it re-set
>>lock-val a new value, it a race condition window.
>>
>
> I see no reason why you can't call read_lock_bh() recursively,
> read_lock_bh() is roughly equalent to local_bh_disable() + read_lock(),
> both can be recursive.
>
> But I may miss something here. :-/
>

Of course read_lock_bh() can be called recursively, but read_lock() is
already enough for IGMP, the only reason for that deadlock is because
using TNS instruction set the value to 1 cause another race condition.

-- 
Cyberman Wu

^ permalink raw reply

* Re: IGMP and rwlock: Dead ocurred again on TILEPro
From: Eric Dumazet @ 2011-02-17  6:39 UTC (permalink / raw)
  To: David Miller; +Cc: xiyou.wangcong, cypher.w, linux-kernel, cmetcalf, netdev
In-Reply-To: <20110216.214625.189707123.davem@davemloft.net>

Le mercredi 16 février 2011 à 21:46 -0800, David Miller a écrit :
> From: Américo Wang <xiyou.wangcong@gmail.com>
> Date: Thu, 17 Feb 2011 13:42:37 +0800
> 
> > On Thu, Feb 17, 2011 at 01:04:14PM +0800, Cypher Wu wrote:
> >>>
> >>> Have you turned CONFIG_LOCKDEP on?
> >>>
> >>> I think Eric already converted that rwlock into RCU lock, thus
> >>> this problem should disappear. Could you try a new kernel?
> >>>
> >>> Thanks.
> >>>
> >>
> >>I haven't turned CONFIG_LOCKDEP on for test since I didn't get too
> >>much information when we tried to figured out the former deadlock.
> >>
> >>IGMP used read_lock() instead of read_lock_bh() since usually
> >>read_lock() can be called recursively, and today I've read the
> >>implementation of MIPS, it's should also works fine in that situation.
> >>The implementation of TILEPro cause problem since after it use TNS set
> >>the lock-val to 1 and hold the original value and before it re-set
> >>lock-val a new value, it a race condition window.
> >>
> > 
> > I see no reason why you can't call read_lock_bh() recursively,
> > read_lock_bh() is roughly equalent to local_bh_disable() + read_lock(),
> > both can be recursive.
> > 
> > But I may miss something here. :-/
> 
> IGMP is doing this so that taking the read lock does not stop packet
> processing.
> 
> TILEPro's rwlock implementation is simply buggy and needs to be fixed.

Yep. Finding all recursive readlocks in kernel and convert them to
another locking model is probably more expensive than fixing TILEPro
rwlock implementation.

^ permalink raw reply

* Re: state of rtcache removal...
From: David Miller @ 2011-02-17  6:39 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1297923915.2645.24.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 17 Feb 2011 07:25:15 +0100

> Thanks David for this work in progress.

It was my pleasure :-)

> If I remember my works in last October/November, I also know fib_hash
> was a bit faster than fib_trie (around 20%)...

Right, if table is small hash can be faster.

I just wrote a hack that puts fib_lookup() results into the the
flow cache, and hooked it only into the one fib_lookup() call
that happens in ip_route_output_slow().

This pointed out another costly thing we do when resolving output
routes.  If the flow key's source is not INADDR_ANY, we validate the
source address is our's by doing a fib table lookup in the local
table, with the address in the flow key's destination field.

So even on output we were doing 3 fib lookups :-/

Therefore, the flow cache hack gets rid of 2 of those 3.

^ permalink raw reply

* Re: state of rtcache removal...
From: Eric Dumazet @ 2011-02-17  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110216.160838.39164069.davem@davemloft.net>

Le mercredi 16 février 2011 à 16:08 -0800, David Miller a écrit :
> So I've been testing out the routing cache removal patch to see
> what the impact is on performance.
> 
> I'm using a UDP flood to a single IP address over a dummy interface
> with hard coded ARP entries, so that pretty much just the main IP
> output and routing paths are being exercised.
> 
> The UDP flood tool I cooked up based upon a description sent to me by
> Eric Dumazet of a similar utility he uses for testing.  I've included
> the code to this tool at the end of this email, as well as the dummy
> interface setup script.   Basically, you go:
> 
> bash# ./udpflood_setup.sh
> bash# time ./udpflood -l 10000 10.2.2.11
> 
> The IP output path is about twice as slow with the routing cache
> removed entirely.  Here are the numbers I have:
> 
> net-next-2.6, rt_cache on:
> 
> davem@maramba:~$ time udpflood -l 10000000 10.2.2.11
> real		 1m47.012s
> user		 0m8.670s
> sys		 1m38.370s
> 
> net-next-2.6, rt_cache turned off via sysctl:
> 
> davem@maramba:~$ time udpflood -l 10000000 10.2.2.11
> real		 3m12.662s
> user		 0m9.490s
> sys		 3m3.220s
> 
> net-next-2.6 + "BONUS" rt_cache deletion patch:
> 
> maramba:/home/davem# time ./bin/udpflood -l 10000000 10.2.2.11
> real		     3m9.921s
> user		     0m9.520s
> sys		     3m0.440s
> 
> I then worked on some simplifications of the code in net/ipv4/route.c
> that remains after the cache removal.  I'll post those patches after
> I've chewed on them some more, but they knock a couple seconds back off
> of the benchmark:
> 
> The profile output is what you'd expect, with fib_table_lookup() topping
> the charts taking ~%10 of the time.
> 
> What might not be initially apparent is that each output route lookup
> results in two calls to fib_table_lookup() and thus two trie lookups.
> Why?  Because we have two routing tables (3 with IP_MULTIPLE_TABLES
> enabled) that get searched, first the LOCAL then the MAIN table (then
> with mutliple-tables enabled, the DEFAULT).  And most external
> outgoing routes sit in the MAIN table.
> 
> We do this so we can store all the interface address network,
> broadcast, loopback network, et al. routes in the LOCAL table, then all
> globally visible routes in the MAIN table.
> 
> Anyways, the long and short of this is that route lookups take two
> trie lookups instead of just one.  On input there are even more, for
> source address validation done by fib_validate_source().  That can be
> up to 4 more fib_table_lookup() invocations.
> 
> Add in another level of complexity if you have a series of FIB rules
> installed.
> 
> So, to me, this means that spending time micro-optiming fib_trie is
> not going to help much.  Getting rid of that multiplier somehow, on
> the other hand, might.
> 
> I plan to play with some ideas, such as sticking fib_alias entries into
> the flow cache and consulting/populating the flow cache on fib_lookup()
> calls.
> 
> -------------------- udpflood.c --------------------
> /* An adaptation of Eric Dumazet's udpflood tool.  */
> 
> #include <stdio.h>
> #include <stddef.h>
> #include <malloc.h>
> #include <string.h>
> #include <errno.h>
> 
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <netinet/in.h>
> #include <arpa/inet.h>
> 
> #define _GNU_SOURCE
> #include <getopt.h>
> 
> static int usage(void)
> {
> 	printf("usage: udpflood [ -l count ] [ -m message_size ] IP_ADDRESS\n");
> 	return -1;
> }
> 
> static int send_packets(in_addr_t addr, int port, int count, int msg_sz)
> {
> 	char *msg = malloc(msg_sz);
> 	struct sockaddr_in saddr;
> 	int fd, i, err;
> 
> 	if (!msg)
> 		return -ENOMEM;
> 
> 	memset(msg, 0, msg_sz);
> 
> 	memset(&saddr, 0, sizeof(saddr));
> 	saddr.sin_family = AF_INET;
> 	saddr.sin_port = port;
> 	saddr.sin_addr.s_addr = addr;
> 
> 	fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
> 	if (fd < 0) {
> 		perror("socket");
> 		err = fd;
> 		goto out_nofd;
> 	}
> 	err = connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
> 	if (err < 0) {
> 		perror("connect");
> 		close(fd);
> 		goto out;
> 	}
> 	for (i = 0; i < count; i++) {
> 		err = sendto(fd, msg, msg_sz, 0,
> 			     (struct sockaddr *) &saddr, sizeof(saddr));
> 		if (err < 0) {
> 			perror("sendto");
> 			goto out;
> 		}
> 	}
> 
> 	err = 0;
> out:
> 	close(fd);
> out_nofd:
> 	free(msg);
> 	return err;
> }
> 
> int main(int argc, char **argv, char **envp)
> {
> 	int port, msg_sz, count, ret;
> 	in_addr_t addr;
> 
> 	port = 6000;
> 	msg_sz = 32;
> 	count = 10000000;
> 
> 	while ((ret = getopt(argc, argv, "l:s:p:")) >= 0) {
> 		switch (ret) {
> 		case 'l':
> 			sscanf(optarg, "%d", &count);
> 			break;
> 		case 's':
> 			sscanf(optarg, "%d", &msg_sz);
> 			break;
> 		case 'p':
> 			sscanf(optarg, "%d", &port);
> 			break;
> 		case '?':
> 			return usage();
> 		}
> 	}
> 
> 	if (!argv[optind])
> 		return usage();
> 
> 	addr = inet_addr(argv[optind]);
> 	if (addr == INADDR_NONE)
> 		return usage();
> 
> 	return send_packets(addr, port, count, msg_sz);
> }
> 
> -------------------- udpflood_setup.sh --------------------
> #!/bin/sh
> modprobe dummy
> ifconfig dummy0 10.2.2.254 netmask 255.255.255.0 up
> 
> for f in $(seq 11 26)
> do
>  arp -H ether -i dummy0 -s 10.2.2.$f 00:00:0c:07:ac:$f
> done
> --

Thanks David for this work in progress.

If I remember my works in last October/November, I also know fib_hash
was a bit faster than fib_trie (around 20%)...





^ permalink raw reply

* Re: IGMP and rwlock: Dead ocurred again on TILEPro
From: David Miller @ 2011-02-17  5:46 UTC (permalink / raw)
  To: xiyou.wangcong; +Cc: cypher.w, linux-kernel, cmetcalf, eric.dumazet, netdev
In-Reply-To: <20110217054237.GB2653@cr0.nay.redhat.com>

From: Américo Wang <xiyou.wangcong@gmail.com>
Date: Thu, 17 Feb 2011 13:42:37 +0800

> On Thu, Feb 17, 2011 at 01:04:14PM +0800, Cypher Wu wrote:
>>>
>>> Have you turned CONFIG_LOCKDEP on?
>>>
>>> I think Eric already converted that rwlock into RCU lock, thus
>>> this problem should disappear. Could you try a new kernel?
>>>
>>> Thanks.
>>>
>>
>>I haven't turned CONFIG_LOCKDEP on for test since I didn't get too
>>much information when we tried to figured out the former deadlock.
>>
>>IGMP used read_lock() instead of read_lock_bh() since usually
>>read_lock() can be called recursively, and today I've read the
>>implementation of MIPS, it's should also works fine in that situation.
>>The implementation of TILEPro cause problem since after it use TNS set
>>the lock-val to 1 and hold the original value and before it re-set
>>lock-val a new value, it a race condition window.
>>
> 
> I see no reason why you can't call read_lock_bh() recursively,
> read_lock_bh() is roughly equalent to local_bh_disable() + read_lock(),
> both can be recursive.
> 
> But I may miss something here. :-/

IGMP is doing this so that taking the read lock does not stop packet
processing.

TILEPro's rwlock implementation is simply buggy and needs to be fixed.

^ permalink raw reply

* Re: Fwd: IGMP and rwlock: Dead ocurred again on TILEPro
From: Américo Wang @ 2011-02-17  5:42 UTC (permalink / raw)
  To: Cypher Wu
  Cc: Américo Wang, linux-kernel, Chris Metcalf, Eric Dumazet,
	netdev
In-Reply-To: <AANLkTikkro-_9pv5uJ4+SqYh9Fr480_96Wh1Pv6Pvtez@mail.gmail.com>

On Thu, Feb 17, 2011 at 01:04:14PM +0800, Cypher Wu wrote:
>>
>> Have you turned CONFIG_LOCKDEP on?
>>
>> I think Eric already converted that rwlock into RCU lock, thus
>> this problem should disappear. Could you try a new kernel?
>>
>> Thanks.
>>
>
>I haven't turned CONFIG_LOCKDEP on for test since I didn't get too
>much information when we tried to figured out the former deadlock.
>
>IGMP used read_lock() instead of read_lock_bh() since usually
>read_lock() can be called recursively, and today I've read the
>implementation of MIPS, it's should also works fine in that situation.
>The implementation of TILEPro cause problem since after it use TNS set
>the lock-val to 1 and hold the original value and before it re-set
>lock-val a new value, it a race condition window.
>

I see no reason why you can't call read_lock_bh() recursively,
read_lock_bh() is roughly equalent to local_bh_disable() + read_lock(),
both can be recursive.

But I may miss something here. :-/

^ permalink raw reply

* Re: Fwd: IGMP and rwlock: Dead ocurred again on TILEPro
From: Cypher Wu @ 2011-02-17  5:04 UTC (permalink / raw)
  To: Américo Wang; +Cc: linux-kernel, Chris Metcalf, Eric Dumazet, netdev
In-Reply-To: <20110217044917.GA2653@cr0.nay.redhat.com>

On Thu, Feb 17, 2011 at 12:49 PM, Américo Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, Feb 17, 2011 at 11:39:22AM +0800, Cypher Wu wrote:
>>---------- Forwarded message ----------
>>From: Cypher Wu <cypher.w@gmail.com>
>>Date: Wed, Feb 16, 2011 at 5:58 PM
>>Subject: GMP and rwlock: Dead ocurred again on TILEPro
>>To: linux-kernel@vger.kernel.org
>>
>>
>>The rwlock and spinlock of TILEPro platform use TNS instruction to
>>test the value of lock, but if interrupt is not masked, read_lock()
>>have another chance to deadlock while read_lock() called in bh of
>>interrupt.
>>
>
>
> In this case, you should call read_lock_bh() instead of read_lock().
>
>
>> frame 0: 0xfd3bfbe0 dump_stack+0x0/0x20 (sp 0xe4b5f9d8)
>> frame 1: 0xfd3c0b50 __raw_read_lock_slow.cold+0x50/0x90 (sp 0xe4b5f9d8)
>> frame 2: 0xfd184a58 igmpv3_send_cr+0x60/0x440 (sp 0xe4b5f9f0)
>> frame 3: 0xfd3bd928 igmp_ifc_timer_expire+0x30/0x90 (sp 0xe4b5fa20)
>> frame 4: 0xfd047698 run_timer_softirq+0x258/0x3c8 (sp 0xe4b5fa30)
>> frame 5: 0xfd0563f8 __do_softirq+0x138/0x220 (sp 0xe4b5fa70)
>> frame 6: 0xfd097d48 do_softirq+0x88/0x110 (sp 0xe4b5fa98)
>> frame 7: 0xfd1871f8 irq_exit+0xf8/0x120 (sp 0xe4b5faa8)
>> frame 8: 0xfd1afda0 do_timer_interrupt+0xa0/0xf8 (sp 0xe4b5fab0)
>> frame 9: 0xfd187b98 handle_interrupt+0x2d8/0x2e0 (sp 0xe4b5fac0)
>> <interrupt 25 while in kernel mode>
>> frame 10: 0xfd0241c8 _read_lock+0x8/0x40 (sp 0xe4b5fc38)
>> frame 11: 0xfd1bb008 ip_mc_del_src+0xc8/0x378 (sp 0xe4b5fc40)
>> frame 12: 0xfd2681e8 ip_mc_leave_group+0xf8/0x1e0 (sp 0xe4b5fc70)
>> frame 13: 0xfd0a4d70 do_ip_setsockopt+0xe48/0x1560 (sp 0xe4b5fc90)
>> frame 14: 0xfd2b4168 sys_setsockopt+0x150/0x170 (sp 0xe4b5fe98)
>> frame 15: 0xfd14e550 handle_syscall+0x2d0/0x320 (sp 0xe4b5fec0)
>> <syscall while in user mode>
>> frame 16: 0x3342a0 (sp 0xbfddfb00)
>> frame 17: 0x16130 (sp 0xbfddfb08)
>> frame 18: 0x16640 (sp 0xbfddfb38)
>> frame 19: 0x16ee8 (sp 0xbfddfc58)
>> frame 20: 0x345a08 (sp 0xbfddfc90)
>> frame 21: 0x10218 (sp 0xbfddfe48)
>>Stack dump complete
>>
>>I don't know the clear definition of rwlock & spinlock in Linux, but
>>the implementation of other platforms
>>like x86, PowerPC, ARM don't have that issue. The use of TNS cause a
>>race condition between system
>>call and interrupt.
>>
>
> Have you turned CONFIG_LOCKDEP on?
>
> I think Eric already converted that rwlock into RCU lock, thus
> this problem should disappear. Could you try a new kernel?
>
> Thanks.
>

I haven't turned CONFIG_LOCKDEP on for test since I didn't get too
much information when we tried to figured out the former deadlock.

IGMP used read_lock() instead of read_lock_bh() since usually
read_lock() can be called recursively, and today I've read the
implementation of MIPS, it's should also works fine in that situation.
The implementation of TILEPro cause problem since after it use TNS set
the lock-val to 1 and hold the original value and before it re-set
lock-val a new value, it a race condition window.

It's not practical to upgrade the kernel.

Thanks.

-- 
Cyberman Wu

^ permalink raw reply

* BUG? racy code at net/atm/br2684.c
From: 홍신 shin hong @ 2011-02-17  4:49 UTC (permalink / raw)
  To: netdev

Hi. I am reporting a suspected race bug at br2684_push()
in /net/atm/br2684.c.

Please examine the report and let me know your opinion.
I reported the same issue several month ago and got feedback
however, I found that the code is still not changed.

In br2684_push() accesses &brdev->brvcss without devs_lock holding at line 334.
But it seems that all accesses to &brdev->brvcss are synchronized by devs_lock.

So that br2684_push() may result race condition in concurrent execution
with other functions that manipulate &brdev->brvcss.

It seems that it is better to guard list_empty(&brdev->brvccs) by devs_lock.
Or check list_empty(&brdev->brvccs) once again after devs_lock holding.

Thank you.

Sincerely
Shin Hong

^ permalink raw reply

* Re: Fwd: IGMP and rwlock: Dead ocurred again on TILEPro
From: Américo Wang @ 2011-02-17  4:49 UTC (permalink / raw)
  To: Cypher Wu
  Cc: linux-kernel, Chris Metcalf, Américo Wang, Eric Dumazet,
	netdev
In-Reply-To: <AANLkTimj+_8cBz0gcEyF1889xK2HdF4SQDSK8DiY-4Gs@mail.gmail.com>

On Thu, Feb 17, 2011 at 11:39:22AM +0800, Cypher Wu wrote:
>---------- Forwarded message ----------
>From: Cypher Wu <cypher.w@gmail.com>
>Date: Wed, Feb 16, 2011 at 5:58 PM
>Subject: GMP and rwlock: Dead ocurred again on TILEPro
>To: linux-kernel@vger.kernel.org
>
>
>The rwlock and spinlock of TILEPro platform use TNS instruction to
>test the value of lock, but if interrupt is not masked, read_lock()
>have another chance to deadlock while read_lock() called in bh of
>interrupt.
>


In this case, you should call read_lock_bh() instead of read_lock().


> frame 0: 0xfd3bfbe0 dump_stack+0x0/0x20 (sp 0xe4b5f9d8)
> frame 1: 0xfd3c0b50 __raw_read_lock_slow.cold+0x50/0x90 (sp 0xe4b5f9d8)
> frame 2: 0xfd184a58 igmpv3_send_cr+0x60/0x440 (sp 0xe4b5f9f0)
> frame 3: 0xfd3bd928 igmp_ifc_timer_expire+0x30/0x90 (sp 0xe4b5fa20)
> frame 4: 0xfd047698 run_timer_softirq+0x258/0x3c8 (sp 0xe4b5fa30)
> frame 5: 0xfd0563f8 __do_softirq+0x138/0x220 (sp 0xe4b5fa70)
> frame 6: 0xfd097d48 do_softirq+0x88/0x110 (sp 0xe4b5fa98)
> frame 7: 0xfd1871f8 irq_exit+0xf8/0x120 (sp 0xe4b5faa8)
> frame 8: 0xfd1afda0 do_timer_interrupt+0xa0/0xf8 (sp 0xe4b5fab0)
> frame 9: 0xfd187b98 handle_interrupt+0x2d8/0x2e0 (sp 0xe4b5fac0)
> <interrupt 25 while in kernel mode>
> frame 10: 0xfd0241c8 _read_lock+0x8/0x40 (sp 0xe4b5fc38)
> frame 11: 0xfd1bb008 ip_mc_del_src+0xc8/0x378 (sp 0xe4b5fc40)
> frame 12: 0xfd2681e8 ip_mc_leave_group+0xf8/0x1e0 (sp 0xe4b5fc70)
> frame 13: 0xfd0a4d70 do_ip_setsockopt+0xe48/0x1560 (sp 0xe4b5fc90)
> frame 14: 0xfd2b4168 sys_setsockopt+0x150/0x170 (sp 0xe4b5fe98)
> frame 15: 0xfd14e550 handle_syscall+0x2d0/0x320 (sp 0xe4b5fec0)
> <syscall while in user mode>
> frame 16: 0x3342a0 (sp 0xbfddfb00)
> frame 17: 0x16130 (sp 0xbfddfb08)
> frame 18: 0x16640 (sp 0xbfddfb38)
> frame 19: 0x16ee8 (sp 0xbfddfc58)
> frame 20: 0x345a08 (sp 0xbfddfc90)
> frame 21: 0x10218 (sp 0xbfddfe48)
>Stack dump complete
>
>I don't know the clear definition of rwlock & spinlock in Linux, but
>the implementation of other platforms
>like x86, PowerPC, ARM don't have that issue. The use of TNS cause a
>race condition between system
>call and interrupt.
>

Have you turned CONFIG_LOCKDEP on?

I think Eric already converted that rwlock into RCU lock, thus
this problem should disappear. Could you try a new kernel?

Thanks.

^ permalink raw reply

* Fwd: IGMP and rwlock: Dead ocurred again on TILEPro
From: Cypher Wu @ 2011-02-17  3:39 UTC (permalink / raw)
  To: linux-kernel; +Cc: Chris Metcalf, Américo Wang, Eric Dumazet, netdev

---------- Forwarded message ----------
From: Cypher Wu <cypher.w@gmail.com>
Date: Wed, Feb 16, 2011 at 5:58 PM
Subject: GMP and rwlock: Dead ocurred again on TILEPro
To: linux-kernel@vger.kernel.org


The rwlock and spinlock of TILEPro platform use TNS instruction to
test the value of lock, but if interrupt is not masked, read_lock()
have another chance to deadlock while read_lock() called in bh of
interrupt.

The original code:
void __raw_read_lock_slow(raw_
rwlock_t *rwlock, u32 val)
{
   u32 iterations = 0;
   do {
       if (!(val & 1))
           rwlock->lock = val;
       delay_backoff(iterations++);
       val = __insn_tns((int *)&rwlock->lock);
   } while ((val << RD_COUNT_WIDTH) != 0);
   rwlock->lock = val + (1 << RD_COUNT_SHIFT);
}

I've modified it to get some information:
void __raw_read_lock_slow(raw_rwlock_t *rwlock, u32 val)
{
   u32 iterations = 0;
   do {
       if (!(val & 1))
       {
           rwlock->lock = val;
           iterations = 0;
       }
       delay_backoff(iterations++);
       if (iterations > 0x1000000)
       {
           dump_stack();
           iterations = 0;
       }

       val = __insn_tns((int *)&rwlock->lock);
   } while ((val << RD_COUNT_WIDTH) != 0);
   rwlock->lock = val + (1 << RD_COUNT_SHIFT);
}

And this is the stack info:

Starting stack dump of tid 837, pid 837 (ff0) on cpu 55 at cycle 10180633928773
 frame 0: 0xfd3bfbe0 dump_stack+0x0/0x20 (sp 0xe4b5f9d8)
 frame 1: 0xfd3c0b50 __raw_read_lock_slow.cold+0x50/0x90 (sp 0xe4b5f9d8)
 frame 2: 0xfd184a58 igmpv3_send_cr+0x60/0x440 (sp 0xe4b5f9f0)
 frame 3: 0xfd3bd928 igmp_ifc_timer_expire+0x30/0x90 (sp 0xe4b5fa20)
 frame 4: 0xfd047698 run_timer_softirq+0x258/0x3c8 (sp 0xe4b5fa30)
 frame 5: 0xfd0563f8 __do_softirq+0x138/0x220 (sp 0xe4b5fa70)
 frame 6: 0xfd097d48 do_softirq+0x88/0x110 (sp 0xe4b5fa98)
 frame 7: 0xfd1871f8 irq_exit+0xf8/0x120 (sp 0xe4b5faa8)
 frame 8: 0xfd1afda0 do_timer_interrupt+0xa0/0xf8 (sp 0xe4b5fab0)
 frame 9: 0xfd187b98 handle_interrupt+0x2d8/0x2e0 (sp 0xe4b5fac0)
 <interrupt 25 while in kernel mode>
 frame 10: 0xfd0241c8 _read_lock+0x8/0x40 (sp 0xe4b5fc38)
 frame 11: 0xfd1bb008 ip_mc_del_src+0xc8/0x378 (sp 0xe4b5fc40)
 frame 12: 0xfd2681e8 ip_mc_leave_group+0xf8/0x1e0 (sp 0xe4b5fc70)
 frame 13: 0xfd0a4d70 do_ip_setsockopt+0xe48/0x1560 (sp 0xe4b5fc90)
 frame 14: 0xfd2b4168 sys_setsockopt+0x150/0x170 (sp 0xe4b5fe98)
 frame 15: 0xfd14e550 handle_syscall+0x2d0/0x320 (sp 0xe4b5fec0)
 <syscall while in user mode>
 frame 16: 0x3342a0 (sp 0xbfddfb00)
 frame 17: 0x16130 (sp 0xbfddfb08)
 frame 18: 0x16640 (sp 0xbfddfb38)
 frame 19: 0x16ee8 (sp 0xbfddfc58)
 frame 20: 0x345a08 (sp 0xbfddfc90)
 frame 21: 0x10218 (sp 0xbfddfe48)
Stack dump complete

I don't know the clear definition of rwlock & spinlock in Linux, but
the implementation of other platforms
like x86, PowerPC, ARM don't have that issue. The use of TNS cause a
race condition between system
call and interrupt.

Through the call tree of packet sending, there are also some other
rwlock will be tried, say
read_lock(&fib_hash_lock) in fn_hash_lookup() which is called in
ip_route_output_slow(). I've seen deadlock
on fib_hash_lock, but haven't reproduced with that debug information yet.

Maybe IGMP is not the only one, TCP timer will retransmit data and
will also call read_lock(&fib_hash_lock).

--
Cyberman Wu



-- 
Cyberman Wu

^ permalink raw reply

* Re: state of rtcache removal...
From: David Miller @ 2011-02-17  3:27 UTC (permalink / raw)
  To: therbert; +Cc: netdev
In-Reply-To: <AANLkTi=W=1ZiP4JOJwo=FccNA9aK=1bpRn=zkepXXMzi@mail.gmail.com>

From: Tom Herbert <therbert@google.com>
Date: Wed, 16 Feb 2011 18:59:35 -0800

> On Wed, Feb 16, 2011 at 4:08 PM, David Miller <davem@davemloft.net> wrote:
>>
>> So I've been testing out the routing cache removal patch to see
>> what the impact is on performance.
>>
> Interesting results.
> 
> I assume that this test is purposely using sento on a connected socket
> to force sendmsg to go through the route lookup :-), so this is
> showing what the benefits of rtcache are is when cache hit rate is
> 100%.  For comparison, it might interesting to see what the
> performance is when rate is < 100%.  For instance, we often see hit
> rates < 20% on front end servers.  This could be done flooding to
> random addresses in 10/8 or even 0/0...  I'm hoping that without the
> rtcache performance actually improves in that case!

We know that the performance will be higher in the "closer to %0"
situation, ie.  for DoS workloads.  Because all of the route cache
management overhead goes away.

Anyways I'm working on some ideas to make the high hit rate case
perform amicably again.

^ permalink raw reply

* Re: state of rtcache removal...
From: Tom Herbert @ 2011-02-17  2:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110216.160838.39164069.davem@davemloft.net>

On Wed, Feb 16, 2011 at 4:08 PM, David Miller <davem@davemloft.net> wrote:
>
> So I've been testing out the routing cache removal patch to see
> what the impact is on performance.
>
Interesting results.

I assume that this test is purposely using sento on a connected socket
to force sendmsg to go through the route lookup :-), so this is
showing what the benefits of rtcache are is when cache hit rate is
100%.  For comparison, it might interesting to see what the
performance is when rate is < 100%.  For instance, we often see hit
rates < 20% on front end servers.  This could be done flooding to
random addresses in 10/8 or even 0/0...  I'm hoping that without the
rtcache performance actually improves in that case!

Tom

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox