Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH v2] ipv6: fix handling of blackhole and prohibit routes
From: Nicolas Dichtel @ 2012-09-05 11:34 UTC (permalink / raw)
  To: davem; +Cc: netdev, Nicolas Dichtel
In-Reply-To: <20120904.155836.51164588359279575.davem@davemloft.net>

When a blackhole or a prohibit route was added, it was handled like a classic
route. Moreover, it was only possible to add this kind of route by specifying
an interface.

Bug already reported here:
  http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=498498

Before the patch:
  $ ip route add blackhole 2001::1/128
  RTNETLINK answers: No such device
  $ ip route add blackhole 2001::1/128 dev eth0
  $ ip -6 route | grep 2001
  2001::1 dev eth0  metric 1024

After:
  $ ip route add blackhole 2001::1/128
  $ ip -6 route | grep 2001
  blackhole 2001::1 dev lo  metric 1024  error -22

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/route.c      | 32 ++++++++++++++++++++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 0fedbd8..cd64cf3 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -37,6 +37,7 @@ struct fib6_config {
 	int		fc_ifindex;
 	u32		fc_flags;
 	u32		fc_protocol;
+	u32		fc_type;	/* only 8 bits are used */
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8e80fd2..5642fb5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1463,8 +1463,18 @@ int ip6_route_add(struct fib6_config *cfg)
 		}
 		rt->dst.output = ip6_pkt_discard_out;
 		rt->dst.input = ip6_pkt_discard;
-		rt->dst.error = -ENETUNREACH;
 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
+		switch (cfg->fc_type) {
+		case RTM_BLACKHOLE:
+			rt->dst.error = -EINVAL;
+			break;
+		case RTM_PROHIBIT:
+			rt->dst.error = -EACCES;
+			break;
+		default:
+			rt->dst.error = -ENETUNREACH;
+			break;
+		}
 		goto install_route;
 	}
 
@@ -2261,8 +2271,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	cfg->fc_src_len = rtm->rtm_src_len;
 	cfg->fc_flags = RTF_UP;
 	cfg->fc_protocol = rtm->rtm_protocol;
+	cfg->type = rtm->rtm_type;
 
-	if (rtm->rtm_type == RTN_UNREACHABLE)
+	if (rtm->rtm_type == RTN_UNREACHABLE ||
+	    rtm->rtm_type == RTN_BLACKHOLE ||
+	    rtm->rtm_type == RTN_PROHIBIT)
 		cfg->fc_flags |= RTF_REJECT;
 
 	if (rtm->rtm_type == RTN_LOCAL)
@@ -2391,8 +2404,19 @@ static int rt6_fill_node(struct net *net,
 	rtm->rtm_table = table;
 	if (nla_put_u32(skb, RTA_TABLE, table))
 		goto nla_put_failure;
-	if (rt->rt6i_flags & RTF_REJECT)
-		rtm->rtm_type = RTN_UNREACHABLE;
+	if (rt->rt6i_flags & RTF_REJECT) {
+		switch (rt->dst.error) {
+		case -EINVAL:
+			rtm->rtm_type = RTN_BLACKHOLE;
+			break;
+		case -EACCES:
+			rtm->rtm_type = RTN_PROHIBIT;
+			break;
+		default:
+			rtm->rtm_type = RTN_UNREACHABLE;
+			break;
+		}
+	}
 	else if (rt->rt6i_flags & RTF_LOCAL)
 		rtm->rtm_type = RTN_LOCAL;
 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
-- 
1.7.12

^ permalink raw reply related

* [PATCH 10/10] net/macb: Offset first RX buffer by two bytes
From: Nicolas Ferre @ 2012-09-05  9:04 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Make the ethernet frame payload word-aligned, possibly making the
memcpy into the skb a bit faster. This will be even more important
after we eliminate the copy altogether.

Also eliminate the redundant RX_OFFSET constant -- it has the same
definition and purpose as NET_IP_ALIGN.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: adapt to newer kernel]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index f31c0a7..f7716b6 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -33,9 +33,6 @@
 #define RX_RING_SIZE		512
 #define RX_RING_BYTES		(sizeof(struct macb_dma_desc) * RX_RING_SIZE)
 
-/* Make the IP header word-aligned (the ethernet header is 14 bytes) */
-#define RX_OFFSET		2
-
 #define TX_RING_SIZE		128
 #define TX_RING_BYTES		(sizeof(struct macb_dma_desc) * TX_RING_SIZE)
 
@@ -466,7 +463,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 {
 	unsigned int len;
 	unsigned int frag;
-	unsigned int offset = 0;
+	unsigned int offset;
 	struct sk_buff *skb;
 	struct macb_dma_desc *desc;
 
@@ -477,7 +474,16 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 		macb_rx_ring_wrap(first_frag),
 		macb_rx_ring_wrap(last_frag), len);
 
-	skb = netdev_alloc_skb(bp->dev, len + RX_OFFSET);
+	/*
+	 * The ethernet header starts NET_IP_ALIGN bytes into the
+	 * first buffer. Since the header is 14 bytes, this makes the
+	 * payload word-aligned.
+	 *
+	 * Instead of calling skb_reserve(NET_IP_ALIGN), we just copy
+	 * the two padding bytes into the skb so that we avoid hitting
+	 * the slowpath in memcpy(), and pull them off afterwards.
+	 */
+	skb = netdev_alloc_skb(bp->dev, len + NET_IP_ALIGN);
 	if (!skb) {
 		bp->stats.rx_dropped++;
 		for (frag = first_frag; ; frag++) {
@@ -493,7 +499,8 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 		return 1;
 	}
 
-	skb_reserve(skb, RX_OFFSET);
+	offset = 0;
+	len += NET_IP_ALIGN;
 	skb_checksum_none_assert(skb);
 	skb_put(skb, len);
 
@@ -517,10 +524,11 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 	/* Make descriptor updates visible to hardware */
 	wmb();
 
+	__skb_pull(skb, NET_IP_ALIGN);
 	skb->protocol = eth_type_trans(skb, bp->dev);
 
 	bp->stats.rx_packets++;
-	bp->stats.rx_bytes += len;
+	bp->stats.rx_bytes += skb->len;
 	netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
 		   skb->len, skb->csum);
 	netif_receive_skb(skb);
@@ -985,6 +993,7 @@ static void macb_init_hw(struct macb *bp)
 	__macb_set_hwaddr(bp);
 
 	config = macb_mdc_clk_div(bp);
+	config |= MACB_BF(RBOF, NET_IP_ALIGN);	/* Make eth data aligned */
 	config |= MACB_BIT(PAE);		/* PAuse Enable */
 	config |= MACB_BIT(DRFCS);		/* Discard Rx FCS */
 	config |= MACB_BIT(BIG);		/* Receive oversized frames */
-- 
1.7.10

^ permalink raw reply related

* Aw: Re: Question regarding kernel panic in net/ipv4/tcp_output.c
From: "Sascha Mühlbach" @ 2012-09-05  9:01 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1346761388.13121.21.camel@edumazet-glaptop>

Hi,
 
this is the stack trace I could save by making a screenshot of the remote console:
 
[<ffffffff8101193b>] ? invalid_op+0x1b/0x20
[<ffffffff81289e6a>] ? tcp_retransmit_skb+0x66/0x5aa
[<ffffffff8128a546>] ? tcp_xmit_retransmit_queue+0x198/0x223
[<ffffffff81286113>] ? tcp_ack+0x1744/0x1952
[<ffffffff81286871>] ? tcp_validate_incoming+0x1ba/0x2be
[<ffffffff81286f5e>] ? tcp_rcv_established+0x5e9/0x6d9
[<ffffffff8128e00f>] ? tcp_v4_do_rcv+0x1bb/0x376
[<ffffffff8128e639>] ? tcp_v4_rcv+0x46f/0x6f8
[<ffffffff81273afa>] ? ip_local_deliver_finish+0x0/0x1e9
[<ffffffff81273afa>] ? ip_local_deliver_finish+0x0/0x1e9
[<ffffffff81273c40>] ? ip_local_deliver_finish+0x146/0x1e9
[<ffffffff8127378f>] ? ip_rcv_finish+0x373/0x38d
[<ffffffffa004d95c>] ? bnx2_poll_work+0x954/0xa7e [bnx2]
[<ffffffffa004d95c>] ? bnx2_poll_work+0x954/0xa7e [bnx2]
[<ffffffff8105aeb6>] ? __mod_timer+0x141/0x153
[<ffffffff810964a1>] ? handle_edge_irq+0xdd/0x101
[<ffffffffa004daae>] ? bnx2_poll_msix+0x28/0xa6 [bnx2]
[<ffffffff8125039f>] ? net_rx_action+0xae/0x1c9
[<ffffffff81053d6f>] ? __do_softirq+0xdd/0x1a6
[<ffffffff81011cac>] ? call_softirq+0x1c/0x30
[<ffffffff8101322b>] ? do_softirq+0x3f/0x7c
[<ffffffff81053bdf>] ? irq_exit+0x36/0x76
[<ffffffff81012922>] ? do_IRQ+0xa0/0xb6
[<ffffffff810114d3>] ? ret_from_intr+0x0/0x11
<E0I> [<ffffffffa0135509>] ? acpi_idle_enter_bm+0x27d/0x2af [processor]
[<ffffffffa0135509>] ? acpi_idle_enter_bm+0x27d/0x2af [processor]
[<ffffffffa0135502>] ? acpi_idle_enter_bm+0x276/0x2af [processor]
[<ffffffff8123a2c6>] ? cpuidle_idle_call+0x94/0xee
[<ffffffff8100fe97>] ? cpu_idle+0xa2/0xda
[<ffffffff8151c140>] ? early_idt_handler+0x0/0x71
[<ffffffff8151ccdd>] ? start_kernel+0x3dc/0x3e8
[<ffffffff8151c3b7>] ? x86_64_start_kernel+0xf9/0x106

Kind Regards,
Sascha

>Gesendet: Dienstag, 04. September 2012 um 14:23 Uhr
>Von: "Eric Dumazet" <eric.dumazet@gmail.com>
>An: smka2012@email.de
>Cc: netdev@vger.kernel.org
>Betreff: Re: Question regarding kernel panic in net/ipv4/tcp_output.c
>On Tue, 2012-09-04 at 13:55 +0200, smka2012@email.de wrote:
>> Hi,
>>
>> I recently had a severe issue with multiple servers that stopped
>> working at the same time. By using the stacktrace, I could locate the
>> following line in the tcp_retransmit_skb function of the tcp_output.c
>> kernel source that led to a kernel panic on all servers. I am using
>> Debian 6.0 with the kernel version 2.6.32-45.
>>
>> (gdb) list *(tcp_retransmit_skb+0x66)
>> 0x1a93 is in tcp_retransmit_skb (net/ipv4/tcp_output.c:1905).
>> 1900 if (atomic_read(&sk->sk_wmem_alloc) >
>> 1901 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
>> sk->sk_sndbuf))
>> 1902 return -EAGAIN;
>> 1903
>> 1904 if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
>>
>> --->
>> 1905 if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
>> 1906 BUG();
>> <---
>>
>> 1907 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
>> 1908 return -ENOMEM;
>> 1909 }
>>
>> Now I am interested in getting to know what can lead to a situation
>> that the condition in line 1905 evaluates true and why the kernel goes
>> into the BUG() function in that case and does not only return an
>> error. All servers reached this line of code. They all were connected
>> to a switch that broke the same time. However, I cannot say if the
>> switch broke before the servers and eventually affected the servers or
>> if the switch was also itself affected by some external event.
>>
>> Thank you very much for your help.
>>
>> Kind Regards,
>> Sascha
>> --
>
>You can see this BUG() as an early notification of a hard to
>debug/diagnose bug.
>
>We shouldn't take this path at all.
>
>If we do, we have an earlier bug that we should fix anyway, because
>the machine is going to crash.
>
>It would be nice if you sent the stack trace you had.

^ permalink raw reply

* [PATCH 09/10] net/macb: ethtool interface: add register dump feature
From: Nicolas Ferre @ 2012-09-05  9:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

Add macb_get_regs() ethtool function and its helper function:
macb_get_regs_len().

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   40 +++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/cadence/macb.h |    3 +++
 2 files changed, 43 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index c7c39f1..f31c0a7 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -1321,10 +1321,50 @@ static void macb_get_drvinfo(struct net_device *dev,
 	strcpy(info->bus_info, dev_name(&bp->pdev->dev));
 }
 
+static int macb_get_regs_len(struct net_device *netdev)
+{
+	return MACB_GREGS_LEN * sizeof(u32);
+}
+
+static void macb_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+			  void *p)
+{
+	struct macb *bp = netdev_priv(dev);
+	unsigned int tail, head;
+	u32 *regs_buff = p;
+
+        memset(p, 0, MACB_GREGS_LEN * sizeof(u32));
+	regs->version = MACB_BFEXT(IDNUM, macb_readl(bp, MID));
+
+	tail = macb_tx_ring_wrap(bp->tx_tail);
+	head = macb_tx_ring_wrap(bp->tx_head);
+
+	regs_buff[0]  = macb_readl(bp, NCR);
+	regs_buff[1]  = macb_or_gem_readl(bp, NCFGR);
+	regs_buff[2]  = macb_readl(bp, NSR);
+	regs_buff[3]  = macb_readl(bp, TSR);
+	regs_buff[4]  = macb_readl(bp, RBQP);
+	regs_buff[5]  = macb_readl(bp, TBQP);
+	regs_buff[6]  = macb_readl(bp, RSR);
+	regs_buff[7]  = macb_readl(bp, IMR);
+
+	regs_buff[8]  = tail;
+	regs_buff[9]  = head;
+	regs_buff[10] = macb_tx_dma(bp, tail);
+	regs_buff[11] = macb_tx_dma(bp, head);
+
+	if (macb_is_gem(bp)) {
+		regs_buff[12] = gem_readl(bp, USRIO);
+		regs_buff[13] = gem_readl(bp, DMACFG);
+	}
+}
+
 static const struct ethtool_ops macb_ethtool_ops = {
 	.get_settings		= macb_get_settings,
 	.set_settings		= macb_set_settings,
 	.get_drvinfo		= macb_get_drvinfo,
+	.get_regs_len		= macb_get_regs_len,
+	.get_regs		= macb_get_regs,
 	.get_link		= ethtool_op_get_link,
 	.get_ts_info		= ethtool_op_get_ts_info,
 };
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 8a4ee2f..d509e88 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -10,6 +10,9 @@
 #ifndef _MACB_H
 #define _MACB_H
 
+
+#define MACB_GREGS_LEN 32
+
 /* MACB register offsets */
 #define MACB_NCR				0x0000
 #define MACB_NCFGR				0x0004
-- 
1.7.10

^ permalink raw reply related

* [PATCH 08/10] net/macb: macb_get_drvinfo: add GEM/MACB suffix to differentiate revision
From: Nicolas Ferre @ 2012-09-05  9:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

Add an indication of which revision of the hardware we are running on to the
info->driver string.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |    4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index bd331fd..c7c39f1 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -1313,6 +1313,10 @@ static void macb_get_drvinfo(struct net_device *dev,
 	struct macb *bp = netdev_priv(dev);
 
 	strcpy(info->driver, bp->pdev->dev.driver->name);
+	if (macb_is_gem(bp))
+		strcat(info->driver, " GEM");
+	else
+		strcat(info->driver, " MACB");
 	strcpy(info->version, "$Revision: 1.14 $");
 	strcpy(info->bus_info, dev_name(&bp->pdev->dev));
 }
-- 
1.7.10

^ permalink raw reply related

* [PATCH 07/10] net/macb: tx status is more than 8 bits now
From: Nicolas Ferre @ 2012-09-05  9:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

On some revisions of GEM, the TSR status register holds more information.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index af71151..bd331fd 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -390,7 +390,7 @@ static void macb_tx_interrupt(struct macb *bp)
 	status = macb_readl(bp, TSR);
 	macb_writel(bp, TSR, status);
 
-	netdev_vdbg(bp->dev, "macb_tx_interrupt status = %02lx\n",
+	netdev_vdbg(bp->dev, "macb_tx_interrupt status = 0x%03lx\n",
 		(unsigned long)status);
 
 	head = bp->tx_head;
-- 
1.7.10

^ permalink raw reply related

* [PATCH 06/10] net/macb: better manage tx errors
From: Nicolas Ferre @ 2012-09-05  9:00 UTC (permalink / raw)
  To: netdev
  Cc: patrice.vilchez, nicolas.ferre, linux-kernel, havard, jamie,
	plagnioj, davem, linux-arm-kernel
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

Handle all TX errors, not only underruns.
Reinitialize the TX ring after skipping all remaining frames, and
restart the controller when everything has been cleaned up properly.

Original idea from a patch by Havard Skinnemoen.

Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |  124 ++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 53 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 3d3a077..af71151 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -44,6 +44,10 @@
 
 #define MACB_RX_INT_FLAGS	(MACB_BIT(RCOMP) | MACB_BIT(RXUBR)	\
 				 | MACB_BIT(ISR_ROVR))
+#define MACB_TX_INT_FLAGS	(MACB_BIT(ISR_TUND)			\
+					| MACB_BIT(ISR_RLE)		\
+					| MACB_BIT(TXERR)		\
+					| MACB_BIT(TCOMP))
 
 /* Ring buffer accessors */
 static unsigned int macb_tx_ring_wrap(unsigned int index)
@@ -338,66 +342,56 @@ static void macb_update_stats(struct macb *bp)
 		*p += __raw_readl(reg);
 }
 
-static void macb_tx(struct macb *bp)
+static void macb_handle_tx_error(struct macb *bp, unsigned int err_tail, u32 ctrl)
 {
-	unsigned int tail;
-	unsigned int head;
-	u32 status;
-
-	status = macb_readl(bp, TSR);
-	macb_writel(bp, TSR, status);
+	struct macb_tx_skb	*tx_skb;
+	struct sk_buff		*skb;
+	unsigned int		head = bp->tx_head;
 
-	netdev_vdbg(bp->dev, "macb_tx status = %02lx\n", (unsigned long)status);
+	netdev_dbg(bp->dev, "TX error: ctrl 0x%08x, head %u, error tail %u\n",
+		   ctrl, head, err_tail);
 
-	if (status & (MACB_BIT(UND) | MACB_BIT(TSR_RLE))) {
-		int i;
-		netdev_err(bp->dev, "TX %s, resetting buffers\n",
-			   status & MACB_BIT(UND) ?
-			   "underrun" : "retry limit exceeded");
-
-		/* Transfer ongoing, disable transmitter, to avoid confusion */
-		if (status & MACB_BIT(TGO))
-			macb_writel(bp, NCR, macb_readl(bp, NCR) & ~MACB_BIT(TE));
-
-		head = bp->tx_head;
-
-		/*Mark all the buffer as used to avoid sending a lost buffer*/
-		for (i = 0; i < TX_RING_SIZE; i++)
-			bp->tx_ring[i].ctrl = MACB_BIT(TX_USED);
-
-		/* Add wrap bit */
-		bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
+	/*
+	 * "Buffers exhausted mid-frame" errors may only happen if the
+	 * driver is buggy, so complain loudly about those. Statistics
+	 * are updated by hardware.
+	 */
+	if (ctrl & MACB_BIT(TX_BUF_EXHAUSTED))
+		netdev_err(bp->dev, "BUG: TX buffers exhausted mid-frame\n");
 
-		/* free transmit buffer in upper layer*/
-		for (tail = bp->tx_tail; tail != head; tail++) {
-			struct macb_tx_skb	*tx_skb;
-			struct sk_buff		*skb;
+	/*
+	 * Drop the frames that caused the error plus all remaining in queue.
+	 * Free transmit buffers in upper layer.
+	 */
+	for (; err_tail != head; err_tail++) {
+		struct macb_dma_desc	*desc;
 
-			rmb();
+		tx_skb = macb_tx_skb(bp, err_tail);
+		skb = tx_skb->skb;
+		dma_unmap_single(&bp->pdev->dev, tx_skb->mapping, skb->len,
+				 DMA_TO_DEVICE);
+		dev_kfree_skb_irq(skb);
+		tx_skb->skb = NULL;
 
-			tx_skb = macb_tx_skb(bp, tail);
-			skb = tx_skb->skb;
+		desc = macb_tx_desc(bp, err_tail);
+		desc->ctrl |= MACB_BIT(TX_USED);
+	}
 
-			dma_unmap_single(&bp->pdev->dev, tx_skb->mapping,
-						skb->len, DMA_TO_DEVICE);
-			tx_skb->skb = NULL;
-			dev_kfree_skb_irq(skb);
-		}
+	/* Make descriptor updates visible to hardware */
+	wmb();
+}
 
-		bp->tx_head = bp->tx_tail = 0;
+static void macb_tx_interrupt(struct macb *bp)
+{
+	unsigned int tail;
+	unsigned int head;
+	u32 status;
 
-		/* Enable the transmitter again */
-		if (status & MACB_BIT(TGO))
-			macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TE));
-	}
+	status = macb_readl(bp, TSR);
+	macb_writel(bp, TSR, status);
 
-	if (!(status & MACB_BIT(COMP)))
-		/*
-		 * This may happen when a buffer becomes complete
-		 * between reading the ISR and scanning the
-		 * descriptors.  Nothing to worry about.
-		 */
-		return;
+	netdev_vdbg(bp->dev, "macb_tx_interrupt status = %02lx\n",
+		(unsigned long)status);
 
 	head = bp->tx_head;
 	for (tail = bp->tx_tail; tail != head; tail++) {
@@ -413,6 +407,31 @@ static void macb_tx(struct macb *bp)
 
 		ctrl = desc->ctrl;
 
+		if (unlikely(ctrl & (MACB_BIT(TX_ERROR)
+					| MACB_BIT(TX_UNDERRUN)
+					| MACB_BIT(TX_BUF_EXHAUSTED)))) {
+			/*
+			 * In case of transfer ongoing, disable transmitter.
+			 * Should already be the case due to hardware,
+			 * but make sure to avoid confusion.
+			 */
+			if (status & MACB_BIT(TGO))
+				macb_writel(bp, NCR, macb_readl(bp, NCR) & ~MACB_BIT(TE));
+
+			/*
+			 * An error should always stop the queue from advancing.
+			 * reset entries in the ring and exit from the loop.
+			 */
+			macb_handle_tx_error(bp, tail, ctrl);
+			bp->tx_head = bp->tx_tail = head = tail = 0;
+
+			/* Enable the transmitter again, start TX will be done elsewhere */
+			if (status & MACB_BIT(TGO))
+				macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TE));
+
+			break;
+		}
+
 		if (!(ctrl & MACB_BIT(TX_USED)))
 			break;
 
@@ -644,9 +663,8 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			}
 		}
 
-		if (status & (MACB_BIT(TCOMP) | MACB_BIT(ISR_TUND) |
-			    MACB_BIT(ISR_RLE)))
-			macb_tx(bp);
+		if (status & MACB_TX_INT_FLAGS)
+			macb_tx_interrupt(bp);
 
 		/*
 		 * Link change detection isn't possible with RMII, so we'll
-- 
1.7.10

^ permalink raw reply related

* [PATCH 05/10] net/macb: clean up ring buffer logic
From: Nicolas Ferre @ 2012-09-05  9:00 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Instead of masking head and tail every time we increment them, just let them
wrap through UINT_MAX and mask them when subscripting. Add simple accessor
functions to do the subscripting properly to minimize the chances of messing
this up.

This makes the code slightly smaller, and hopefully faster as well.  Also,
doing the ring buffer management this way will simplify things a lot when
making the ring sizes configurable in the future.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: split patch in topics, adapt to newer kernel]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |  170 ++++++++++++++++++++++-------------
 drivers/net/ethernet/cadence/macb.h |   22 +++--
 2 files changed, 123 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index f4b8adf..3d3a077 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -31,24 +31,13 @@
 
 #define RX_BUFFER_SIZE		128
 #define RX_RING_SIZE		512
-#define RX_RING_BYTES		(sizeof(struct dma_desc) * RX_RING_SIZE)
+#define RX_RING_BYTES		(sizeof(struct macb_dma_desc) * RX_RING_SIZE)
 
 /* Make the IP header word-aligned (the ethernet header is 14 bytes) */
 #define RX_OFFSET		2
 
 #define TX_RING_SIZE		128
-#define DEF_TX_RING_PENDING	(TX_RING_SIZE - 1)
-#define TX_RING_BYTES		(sizeof(struct dma_desc) * TX_RING_SIZE)
-
-#define TX_RING_GAP(bp)						\
-	(TX_RING_SIZE - (bp)->tx_pending)
-#define TX_BUFFS_AVAIL(bp)					\
-	(((bp)->tx_tail <= (bp)->tx_head) ?			\
-	 (bp)->tx_tail + (bp)->tx_pending - (bp)->tx_head :	\
-	 (bp)->tx_tail - (bp)->tx_head - TX_RING_GAP(bp))
-#define NEXT_TX(n)		(((n) + 1) & (TX_RING_SIZE - 1))
-
-#define NEXT_RX(n)		(((n) + 1) & (RX_RING_SIZE - 1))
+#define TX_RING_BYTES		(sizeof(struct macb_dma_desc) * TX_RING_SIZE)
 
 /* minimum number of free TX descriptors before waking up TX process */
 #define MACB_TX_WAKEUP_THRESH	(TX_RING_SIZE / 4)
@@ -56,6 +45,51 @@
 #define MACB_RX_INT_FLAGS	(MACB_BIT(RCOMP) | MACB_BIT(RXUBR)	\
 				 | MACB_BIT(ISR_ROVR))
 
+/* Ring buffer accessors */
+static unsigned int macb_tx_ring_wrap(unsigned int index)
+{
+	return index & (TX_RING_SIZE - 1);
+}
+
+static unsigned int macb_tx_ring_avail(struct macb *bp)
+{
+	return TX_RING_SIZE - (bp->tx_head - bp->tx_tail);
+}
+
+static struct macb_dma_desc *macb_tx_desc(struct macb *bp, unsigned int index)
+{
+	return &bp->tx_ring[macb_tx_ring_wrap(index)];
+}
+
+static struct macb_tx_skb *macb_tx_skb(struct macb *bp, unsigned int index)
+{
+	return &bp->tx_skb[macb_tx_ring_wrap(index)];
+}
+
+static dma_addr_t macb_tx_dma(struct macb *bp, unsigned int index)
+{
+	dma_addr_t offset;
+
+	offset = macb_tx_ring_wrap(index) * sizeof(struct macb_dma_desc);
+
+	return bp->tx_ring_dma + offset;
+}
+
+static unsigned int macb_rx_ring_wrap(unsigned int index)
+{
+	return index & (RX_RING_SIZE - 1);
+}
+
+static struct macb_dma_desc *macb_rx_desc(struct macb *bp, unsigned int index)
+{
+	return &bp->rx_ring[macb_rx_ring_wrap(index)];
+}
+
+static void *macb_rx_buffer(struct macb *bp, unsigned int index)
+{
+	return bp->rx_buffers + RX_BUFFER_SIZE * macb_rx_ring_wrap(index);
+}
+
 static void __macb_set_hwaddr(struct macb *bp)
 {
 	u32 bottom;
@@ -335,17 +369,18 @@ static void macb_tx(struct macb *bp)
 		bp->tx_ring[TX_RING_SIZE - 1].ctrl |= MACB_BIT(TX_WRAP);
 
 		/* free transmit buffer in upper layer*/
-		for (tail = bp->tx_tail; tail != head; tail = NEXT_TX(tail)) {
-			struct ring_info *rp = &bp->tx_skb[tail];
-			struct sk_buff *skb = rp->skb;
-
-			BUG_ON(skb == NULL);
+		for (tail = bp->tx_tail; tail != head; tail++) {
+			struct macb_tx_skb	*tx_skb;
+			struct sk_buff		*skb;
 
 			rmb();
 
-			dma_unmap_single(&bp->pdev->dev, rp->mapping, skb->len,
-							 DMA_TO_DEVICE);
-			rp->skb = NULL;
+			tx_skb = macb_tx_skb(bp, tail);
+			skb = tx_skb->skb;
+
+			dma_unmap_single(&bp->pdev->dev, tx_skb->mapping,
+						skb->len, DMA_TO_DEVICE);
+			tx_skb->skb = NULL;
 			dev_kfree_skb_irq(skb);
 		}
 
@@ -365,28 +400,32 @@ static void macb_tx(struct macb *bp)
 		return;
 
 	head = bp->tx_head;
-	for (tail = bp->tx_tail; tail != head; tail = NEXT_TX(tail)) {
-		struct ring_info *rp = &bp->tx_skb[tail];
-		struct sk_buff *skb = rp->skb;
-		u32 bufstat;
+	for (tail = bp->tx_tail; tail != head; tail++) {
+		struct macb_tx_skb	*tx_skb;
+		struct sk_buff		*skb;
+		struct macb_dma_desc	*desc;
+		u32			ctrl;
 
-		BUG_ON(skb == NULL);
+		desc = macb_tx_desc(bp, tail);
 
 		/* Make hw descriptor updates visible to CPU */
 		rmb();
 
-		bufstat = bp->tx_ring[tail].ctrl;
+		ctrl = desc->ctrl;
 
-		if (!(bufstat & MACB_BIT(TX_USED)))
+		if (!(ctrl & MACB_BIT(TX_USED)))
 			break;
 
+		tx_skb = macb_tx_skb(bp, tail);
+		skb = tx_skb->skb;
+
 		netdev_vdbg(bp->dev, "skb %u (data %p) TX complete\n",
-			   tail, skb->data);
-		dma_unmap_single(&bp->pdev->dev, rp->mapping, skb->len,
+			macb_tx_ring_wrap(tail), skb->data);
+		dma_unmap_single(&bp->pdev->dev, tx_skb->mapping, skb->len,
 				 DMA_TO_DEVICE);
 		bp->stats.tx_packets++;
 		bp->stats.tx_bytes += skb->len;
-		rp->skb = NULL;
+		tx_skb->skb = NULL;
 		dev_kfree_skb_irq(skb);
 	}
 
@@ -398,8 +437,8 @@ static void macb_tx(struct macb *bp)
 		macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 
 	bp->tx_tail = tail;
-	if (netif_queue_stopped(bp->dev) &&
-	    TX_BUFFS_AVAIL(bp) > MACB_TX_WAKEUP_THRESH)
+	if (netif_queue_stopped(bp->dev)
+			&& macb_tx_ring_avail(bp) > MACB_TX_WAKEUP_THRESH)
 		netif_wake_queue(bp->dev);
 }
 
@@ -410,17 +449,21 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 	unsigned int frag;
 	unsigned int offset = 0;
 	struct sk_buff *skb;
+	struct macb_dma_desc *desc;
 
-	len = MACB_BFEXT(RX_FRMLEN, bp->rx_ring[last_frag].ctrl);
+	desc = macb_rx_desc(bp, last_frag);
+	len = MACB_BFEXT(RX_FRMLEN, desc->ctrl);
 
 	netdev_vdbg(bp->dev, "macb_rx_frame frags %u - %u (len %u)\n",
-		   first_frag, last_frag, len);
+		macb_rx_ring_wrap(first_frag),
+		macb_rx_ring_wrap(last_frag), len);
 
 	skb = netdev_alloc_skb(bp->dev, len + RX_OFFSET);
 	if (!skb) {
 		bp->stats.rx_dropped++;
-		for (frag = first_frag; ; frag = NEXT_RX(frag)) {
-			bp->rx_ring[frag].addr &= ~MACB_BIT(RX_USED);
+		for (frag = first_frag; ; frag++) {
+			desc = macb_rx_desc(bp, frag);
+			desc->addr &= ~MACB_BIT(RX_USED);
 			if (frag == last_frag)
 				break;
 		}
@@ -435,7 +478,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 	skb_checksum_none_assert(skb);
 	skb_put(skb, len);
 
-	for (frag = first_frag; ; frag = NEXT_RX(frag)) {
+	for (frag = first_frag; ; frag++) {
 		unsigned int frag_len = RX_BUFFER_SIZE;
 
 		if (offset + frag_len > len) {
@@ -443,11 +486,10 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 			frag_len = len - offset;
 		}
 		skb_copy_to_linear_data_offset(skb, offset,
-					       (bp->rx_buffers +
-					        (RX_BUFFER_SIZE * frag)),
-					       frag_len);
+				macb_rx_buffer(bp, frag), frag_len);
 		offset += RX_BUFFER_SIZE;
-		bp->rx_ring[frag].addr &= ~MACB_BIT(RX_USED);
+		desc = macb_rx_desc(bp, frag);
+		desc->addr &= ~MACB_BIT(RX_USED);
 
 		if (frag == last_frag)
 			break;
@@ -473,8 +515,10 @@ static void discard_partial_frame(struct macb *bp, unsigned int begin,
 {
 	unsigned int frag;
 
-	for (frag = begin; frag != end; frag = NEXT_RX(frag))
-		bp->rx_ring[frag].addr &= ~MACB_BIT(RX_USED);
+	for (frag = begin; frag != end; frag++) {
+		struct macb_dma_desc *desc = macb_rx_desc(bp, frag);
+		desc->addr &= ~MACB_BIT(RX_USED);
+	}
 
 	/* Make descriptor updates visible to hardware */
 	wmb();
@@ -489,17 +533,18 @@ static void discard_partial_frame(struct macb *bp, unsigned int begin,
 static int macb_rx(struct macb *bp, int budget)
 {
 	int received = 0;
-	unsigned int tail = bp->rx_tail;
+	unsigned int tail;
 	int first_frag = -1;
 
-	for (; budget > 0; tail = NEXT_RX(tail)) {
+	for (tail = bp->rx_tail; budget > 0; tail++) {
+		struct macb_dma_desc *desc = macb_rx_desc(bp, tail);
 		u32 addr, ctrl;
 
 		/* Make hw descriptor updates visible to CPU */
 		rmb();
 
-		addr = bp->rx_ring[tail].addr;
-		ctrl = bp->rx_ring[tail].ctrl;
+		addr = desc->addr;
+		ctrl = desc->ctrl;
 
 		if (!(addr & MACB_BIT(RX_USED)))
 			break;
@@ -653,6 +698,8 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct macb *bp = netdev_priv(dev);
 	dma_addr_t mapping;
 	unsigned int len, entry;
+	struct macb_dma_desc *desc;
+	struct macb_tx_skb *tx_skb;
 	u32 ctrl;
 	unsigned long flags;
 
@@ -669,7 +716,7 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&bp->lock, flags);
 
 	/* This is a hard error, log it. */
-	if (TX_BUFFS_AVAIL(bp) < 1) {
+	if (macb_tx_ring_avail(bp) < 1) {
 		netif_stop_queue(dev);
 		spin_unlock_irqrestore(&bp->lock, flags);
 		netdev_err(bp->dev, "BUG! Tx Ring full when queue awake!\n");
@@ -678,12 +725,15 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 	}
 
-	entry = bp->tx_head;
+	entry = macb_tx_ring_wrap(bp->tx_head);
+	bp->tx_head++;
 	netdev_vdbg(bp->dev, "Allocated ring entry %u\n", entry);
 	mapping = dma_map_single(&bp->pdev->dev, skb->data,
 				 len, DMA_TO_DEVICE);
-	bp->tx_skb[entry].skb = skb;
-	bp->tx_skb[entry].mapping = mapping;
+
+	tx_skb = &bp->tx_skb[entry];
+	tx_skb->skb = skb;
+	tx_skb->mapping = mapping;
 	netdev_vdbg(bp->dev, "Mapped skb data %p to DMA addr %08lx\n",
 		   skb->data, (unsigned long)mapping);
 
@@ -692,15 +742,13 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (entry == (TX_RING_SIZE - 1))
 		ctrl |= MACB_BIT(TX_WRAP);
 
-	bp->tx_ring[entry].addr = mapping;
-	bp->tx_ring[entry].ctrl = ctrl;
+	desc = &bp->tx_ring[entry];
+	desc->addr = mapping;
+	desc->ctrl = ctrl;
 
 	/* Make newly initialized descriptor visible to hardware */
 	wmb();
 
-	entry = NEXT_TX(entry);
-	bp->tx_head = entry;
-
 	skb_tx_timestamp(skb);
 
 	/*
@@ -713,10 +761,10 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * re-enable interrupts, and the interrupt handler will make
 	 * sure the controler is started.
 	 */
-	if (NEXT_TX(bp->tx_tail) == bp->tx_head)
+	if (bp->tx_tail == bp->tx_head - 1)
 		macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 
-	if (TX_BUFFS_AVAIL(bp) < 1)
+	if (macb_tx_ring_avail(bp) < 1)
 		netif_stop_queue(dev);
 
 	spin_unlock_irqrestore(&bp->lock, flags);
@@ -752,7 +800,7 @@ static int macb_alloc_consistent(struct macb *bp)
 {
 	int size;
 
-	size = TX_RING_SIZE * sizeof(struct ring_info);
+	size = TX_RING_SIZE * sizeof(struct macb_tx_skb);
 	bp->tx_skb = kmalloc(size, GFP_KERNEL);
 	if (!bp->tx_skb)
 		goto out_err;
@@ -1437,8 +1485,6 @@ static int __init macb_probe(struct platform_device *pdev)
 		macb_or_gem_writel(bp, USRIO, MACB_BIT(MII));
 #endif
 
-	bp->tx_pending = DEF_TX_RING_PENDING;
-
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Cannot register net device, aborting.\n");
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index f69ceef..8a4ee2f 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -356,7 +356,12 @@
 		__v; \
 	})
 
-struct dma_desc {
+/**
+ * struct macb_dma_desc - Hardware DMA descriptor
+ * @addr: DMA address of data buffer
+ * @ctrl: Control and status bits
+ */
+struct macb_dma_desc {
 	u32	addr;
 	u32	ctrl;
 };
@@ -421,7 +426,12 @@ struct dma_desc {
 #define MACB_TX_USED_OFFSET			31
 #define MACB_TX_USED_SIZE			1
 
-struct ring_info {
+/**
+ * struct macb_tx_skb - data about an skb which is being transmitted
+ * @skb: skb currently being transmitted
+ * @mapping: DMA address of the skb's data buffer
+ */
+struct macb_tx_skb {
 	struct sk_buff		*skb;
 	dma_addr_t		mapping;
 };
@@ -506,12 +516,12 @@ struct macb {
 	void __iomem		*regs;
 
 	unsigned int		rx_tail;
-	struct dma_desc		*rx_ring;
+	struct macb_dma_desc	*rx_ring;
 	void			*rx_buffers;
 
 	unsigned int		tx_head, tx_tail;
-	struct dma_desc		*tx_ring;
-	struct ring_info	*tx_skb;
+	struct macb_dma_desc	*tx_ring;
+	struct macb_tx_skb	*tx_skb;
 
 	spinlock_t		lock;
 	struct platform_device	*pdev;
@@ -529,8 +539,6 @@ struct macb {
 	dma_addr_t		tx_ring_dma;
 	dma_addr_t		rx_buffers_dma;
 
-	unsigned int		rx_pending, tx_pending;
-
 	struct mii_bus		*mii_bus;
 	struct phy_device	*phy_dev;
 	unsigned int 		link;
-- 
1.7.10

^ permalink raw reply related

* Re: Commit "ipconfig wait for carrier" makes boot hang for 2 mins if no carrier
From: Joakim Tjernlund @ 2012-09-05  8:47 UTC (permalink / raw)
  To: Micha Nelissen; +Cc: netdev
In-Reply-To: <50470DAC.8030009@neli.hopto.org>



Micha Nelissen <micha@neli.hopto.org> wrote on 2012/09/05 10:30:36:

> From: Micha Nelissen <micha@neli.hopto.org>
> To: Joakim Tjernlund <joakim.tjernlund@transmode.se>,
> Cc: netdev@vger.kernel.org
> Date: 2012/09/05 10:30
> Subject: Re: Commit "ipconfig wait for carrier" makes boot hang for 2 mins if no   carrier
>
> Op 2012-09-05 9:04, Joakim Tjernlund schreef:
> >> Because that's where my root filesystem is? The IP autoconfiguration
> >> code exists for this purpose.
> >
> > This is not the only purpose.
>
> Documentation/filesystems/nfs/nfsroot.txt seems to suggest so (this is
> where the ip= parameter is documented), but it works independently indeed.
>
> So explain your reasons?
If you read that doc you find:

ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>

  This parameter tells the kernel how to configure IP addresses of devices
  and also how to set up the IP routing table. It was originally called
  `nfsaddrs', but now the boot-time IP configuration works independently of
  NFS, so it was renamed to `ip' and the old name remained as an alias for
  compatibility reasons.

>
> >>> The answer is probably the same, it is much easier to
> >>> manage our IP config in one place for our embedded system.
> >>
> >> You retrieve the kernel via TFTP or so when booting?
> >
> > Yes, but mostly not. This really doesn't matter
>
> Seems to me that if you boot standalone there is no reason to let the IP
> address be configured by the kernel? Retrieve the IP address in user
> space from your bootloader environment or whatever. And if you boot from
> ethernet (or some other networking interface), then you have a carrier,
> and there is no 2 minute delay (maybe less even than before with this
> patch!).

Everything is possible, but we chose to use the already built-in functionality, as
did you.
You could have added an initram FS and done your NFS mount there so this
argument goes nowhere.

>
> >>> The wait should be conditional on NFS root or not so that non NFS roots
> >>> can skip this stage altogether.
> >
> > Your patch broke other use cases so my patch would just revert or change the tmo
> > to 2 secs or so.
> > Or you could clean up your stuff so it works for all and not just for you.
>
> It didn't break anything, it does work for you also, you just need to
> wait somewhat longer. Or make sure there is a carrier. The intent of the
> original 1 second delay was to let the link come up!

Sure it did: our system (and not only ours, I bet) cannot accept a 2 minute delay in
booting up the system for no reason.

Please adjust the 2 min wait to nfsroot= only and keep the old way for ip=

 Jocke

^ permalink raw reply

* Re: [PATCH] decnet: fix shutdown parameter checking
From: Steven Whitehouse @ 2012-09-05  8:37 UTC (permalink / raw)
  To: David Miller; +Cc: xi.wang, netdev, linux-kernel
In-Reply-To: <20120831.155719.2228399422573401952.davem@davemloft.net>

Hi,

On Fri, 2012-08-31 at 15:57 -0400, David Miller wrote:
> From: Steven Whitehouse <swhiteho@redhat.com>
> Date: Mon, 27 Aug 2012 10:16:41 +0100
> 
> > On Sun, 2012-08-26 at 22:37 -0400, Xi Wang wrote:
> >> The allowed value of "how" is SHUT_RD/SHUT_WR/SHUT_RDWR (0/1/2),
> >> rather than SHUTDOWN_MASK (3).
> >> 
> >> Signed-off-by: Xi Wang <xi.wang@gmail.com>
> > Acked-by: Steven Whitehouse <swhiteho@redhat.com>
> 
> Applied to net-next.
> 
> > Although it could be argued that we should also continue to accept the
> > value 3 just in case there is any userland software out there which
> > sends that value,
> 
> True, but this is a rather standard BSD socket interface with a very
> specific small set of legitimate input parameters.  Allowing
> deviation, even for compatability for specific protocols, is largely
> unwise.

Yes, I'd agree on the whole, and certainly if this was a recent
addition. However since this code has been around for somewhere close to
16 years now, I'd say that means that either (a) nobody calls shutdown
for DECnet or (b) existing users are buggy too.

We do have a precedent for this kind of compatibility, such as the AX.25
use of SOCK_SEQPACKET.

However, I'm not overly worried and we'll soon know if it will cause any
problems or not,

Steve.

^ permalink raw reply

* Re: [Bug 47021] New: kernel panic with l2tpv3 & mtu > 1500
From: Eric Dumazet @ 2012-09-05  8:34 UTC (permalink / raw)
  To: a1; +Cc: Stephen Hemminger, netdev
In-Reply-To: <50470324.5040805@atlas.cz>

On Wed, 2012-09-05 at 09:45 +0200, a1 wrote:
> Thanks, it definitely helped, no more panics now...

It seems the MTU of the device is wrong, and a lot of packets are fragmented...

It's currently 1488, but it really should be less than that (accounting
for the IP+UDP header)

I wonder if anybody ever used this code ?

^ permalink raw reply

* Re: Commit "ipconfig wait for carrier" makes boot hang for 2 mins if no   carrier
From: Micha Nelissen @ 2012-09-05  8:30 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: netdev
In-Reply-To: <OFFA3AC1B1.08FC3AAD-ONC1257A70.00266EDC-C1257A70.0026E8BC@transmode.se>

Op 2012-09-05 9:04, Joakim Tjernlund schreef:
>> Because that's where my root filesystem is? The IP autoconfiguration
>> code exists for this purpose.
>
> This is not the only purpose.

Documentation/filesystems/nfs/nfsroot.txt seems to suggest so (this is 
where the ip= parameter is documented), but it works independently indeed.

So explain your reasons?

>>> The answer is probably the same, it is much easier to
>>> manage our IP config in one place for our embedded system.
>>
>> You retrieve the kernel via TFTP or so when booting?
>
> Yes, but mostly not. This really doesn't matter

Seems to me that if you boot standalone there is no reason to let the IP 
address be configured by the kernel? Retrieve the IP address in user 
space from your bootloader environment or whatever. And if you boot from 
ethernet (or some other networking interface), then you have a carrier, 
and there is no 2 minute delay (maybe less even than before with this 
patch!).

>>> The wait should be conditional on NFS root or not so that non NFS roots
>>> can skip this stage altogether.
>
> Your patch broke other use cases so my patch would just revert or change the tmo
> to 2 secs or so.
> Or you could clean up your stuff so it works for all and not just for you.

It didn't break anything, it does work for you also, you just need to 
wait somewhat longer. Or make sure there is a carrier. The intent of the 
original 1 second delay was to let the link come up!

Micha

^ permalink raw reply

* [PATCH 04/10] net/macb: Fix a race in macb_start_xmit()
From: Nicolas Ferre @ 2012-09-05  8:19 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Fix a race in macb_start_xmit() where we unconditionally set the TSTART bit.
If an underrun just happened (we do this with interrupts disabled, so it might
not have been handled yet), the controller starts transmitting from the first
entry in the ring, which is usually wrong.
Restart the controller after error handling.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: split patch in topics]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 2228dfc..f4b8adf 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -390,6 +390,13 @@ static void macb_tx(struct macb *bp)
 		dev_kfree_skb_irq(skb);
 	}
 
+	/*
+	 * Someone may have submitted a new frame while this interrupt
+	 * was pending, or we may just have handled an error.
+	 */
+	if (head != tail && !(status & MACB_BIT(TGO)))
+		macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+
 	bp->tx_tail = tail;
 	if (netif_queue_stopped(bp->dev) &&
 	    TX_BUFFS_AVAIL(bp) > MACB_TX_WAKEUP_THRESH)
@@ -696,7 +703,18 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 
-	macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
+	/*
+	 * Only start the controller if the queue was empty; otherwise
+	 * we may race against the hardware resetting the ring pointer
+	 * due to a transmit error.
+	 *
+	 * If the controller is idle but the queue isn't empty, there
+	 * must be a pending interrupt that will trigger as soon as we
+	 * re-enable interrupts, and the interrupt handler will make
+	 * sure the controler is started.
+	 */
+	if (NEXT_TX(bp->tx_tail) == bp->tx_head)
+		macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(TSTART));
 
 	if (TX_BUFFS_AVAIL(bp) < 1)
 		netif_stop_queue(dev);
-- 
1.7.10

^ permalink raw reply related

* [PATCH 03/10] net/macb: change debugging messages
From: Nicolas Ferre @ 2012-09-05  8:19 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Convert some noisy netdev_dbg() statements to netdev_vdbg(). Defining
DEBUG will no longer fill up the logs; VERBOSE_DEBUG still does.
Add one more verbose debug for ISR status.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: split patch in topics, add ISR status]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 26ca01e..2228dfc 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -313,7 +313,7 @@ static void macb_tx(struct macb *bp)
 	status = macb_readl(bp, TSR);
 	macb_writel(bp, TSR, status);
 
-	netdev_dbg(bp->dev, "macb_tx status = %02lx\n", (unsigned long)status);
+	netdev_vdbg(bp->dev, "macb_tx status = %02lx\n", (unsigned long)status);
 
 	if (status & (MACB_BIT(UND) | MACB_BIT(TSR_RLE))) {
 		int i;
@@ -380,7 +380,7 @@ static void macb_tx(struct macb *bp)
 		if (!(bufstat & MACB_BIT(TX_USED)))
 			break;
 
-		netdev_dbg(bp->dev, "skb %u (data %p) TX complete\n",
+		netdev_vdbg(bp->dev, "skb %u (data %p) TX complete\n",
 			   tail, skb->data);
 		dma_unmap_single(&bp->pdev->dev, rp->mapping, skb->len,
 				 DMA_TO_DEVICE);
@@ -406,7 +406,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 
 	len = MACB_BFEXT(RX_FRMLEN, bp->rx_ring[last_frag].ctrl);
 
-	netdev_dbg(bp->dev, "macb_rx_frame frags %u - %u (len %u)\n",
+	netdev_vdbg(bp->dev, "macb_rx_frame frags %u - %u (len %u)\n",
 		   first_frag, last_frag, len);
 
 	skb = netdev_alloc_skb(bp->dev, len + RX_OFFSET);
@@ -453,7 +453,7 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 
 	bp->stats.rx_packets++;
 	bp->stats.rx_bytes += len;
-	netdev_dbg(bp->dev, "received skb of length %u, csum: %08x\n",
+	netdev_vdbg(bp->dev, "received skb of length %u, csum: %08x\n",
 		   skb->len, skb->csum);
 	netif_receive_skb(skb);
 
@@ -535,7 +535,7 @@ static int macb_poll(struct napi_struct *napi, int budget)
 
 	work_done = 0;
 
-	netdev_dbg(bp->dev, "poll: status = %08lx, budget = %d\n",
+	netdev_vdbg(bp->dev, "poll: status = %08lx, budget = %d\n",
 		   (unsigned long)status, budget);
 
 	work_done = macb_rx(bp, budget);
@@ -574,6 +574,8 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			break;
 		}
 
+		netdev_vdbg(bp->dev, "isr = 0x%08lx\n", (unsigned long)status);
+
 		if (status & MACB_RX_INT_FLAGS) {
 			/*
 			 * There's no point taking any more interrupts
@@ -585,7 +587,7 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 			macb_writel(bp, IDR, MACB_RX_INT_FLAGS);
 
 			if (napi_schedule_prep(&bp->napi)) {
-				netdev_dbg(bp->dev, "scheduling RX softirq\n");
+				netdev_vdbg(bp->dev, "scheduling RX softirq\n");
 				__napi_schedule(&bp->napi);
 			}
 		}
@@ -647,8 +649,8 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	u32 ctrl;
 	unsigned long flags;
 
-#ifdef DEBUG
-	netdev_dbg(bp->dev,
+#if defined(DEBUG) && defined(VERBOSE_DEBUG)
+	netdev_vdbg(bp->dev,
 		   "start_xmit: len %u head %p data %p tail %p end %p\n",
 		   skb->len, skb->head, skb->data,
 		   skb_tail_pointer(skb), skb_end_pointer(skb));
@@ -670,12 +672,12 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	entry = bp->tx_head;
-	netdev_dbg(bp->dev, "Allocated ring entry %u\n", entry);
+	netdev_vdbg(bp->dev, "Allocated ring entry %u\n", entry);
 	mapping = dma_map_single(&bp->pdev->dev, skb->data,
 				 len, DMA_TO_DEVICE);
 	bp->tx_skb[entry].skb = skb;
 	bp->tx_skb[entry].mapping = mapping;
-	netdev_dbg(bp->dev, "Mapped skb data %p to DMA addr %08lx\n",
+	netdev_vdbg(bp->dev, "Mapped skb data %p to DMA addr %08lx\n",
 		   skb->data, (unsigned long)mapping);
 
 	ctrl = MACB_BF(TX_FRMLEN, len);
-- 
1.7.10

^ permalink raw reply related

* [PATCH 02/10] net/macb: memory barriers cleanup
From: Nicolas Ferre @ 2012-09-05  8:19 UTC (permalink / raw)
  To: netdev
  Cc: patrice.vilchez, nicolas.ferre, linux-kernel, havard, jamie,
	plagnioj, davem, linux-arm-kernel
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Havard Skinnemoen <havard@skinnemoen.net>

Remove a couple of unneeded barriers and document the remaining ones.

Signed-off-by: Havard Skinnemoen <havard@skinnemoen.net>
[nicolas.ferre@atmel.com: split patch in topics]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 9a10f69..26ca01e 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -372,7 +372,9 @@ static void macb_tx(struct macb *bp)
 
 		BUG_ON(skb == NULL);
 
+		/* Make hw descriptor updates visible to CPU */
 		rmb();
+
 		bufstat = bp->tx_ring[tail].ctrl;
 
 		if (!(bufstat & MACB_BIT(TX_USED)))
@@ -415,7 +417,10 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 			if (frag == last_frag)
 				break;
 		}
+
+		/* Make descriptor updates visible to hardware */
 		wmb();
+
 		return 1;
 	}
 
@@ -436,12 +441,14 @@ static int macb_rx_frame(struct macb *bp, unsigned int first_frag,
 					       frag_len);
 		offset += RX_BUFFER_SIZE;
 		bp->rx_ring[frag].addr &= ~MACB_BIT(RX_USED);
-		wmb();
 
 		if (frag == last_frag)
 			break;
 	}
 
+	/* Make descriptor updates visible to hardware */
+	wmb();
+
 	skb->protocol = eth_type_trans(skb, bp->dev);
 
 	bp->stats.rx_packets++;
@@ -461,6 +468,8 @@ static void discard_partial_frame(struct macb *bp, unsigned int begin,
 
 	for (frag = begin; frag != end; frag = NEXT_RX(frag))
 		bp->rx_ring[frag].addr &= ~MACB_BIT(RX_USED);
+
+	/* Make descriptor updates visible to hardware */
 	wmb();
 
 	/*
@@ -479,7 +488,9 @@ static int macb_rx(struct macb *bp, int budget)
 	for (; budget > 0; tail = NEXT_RX(tail)) {
 		u32 addr, ctrl;
 
+		/* Make hw descriptor updates visible to CPU */
 		rmb();
+
 		addr = bp->rx_ring[tail].addr;
 		ctrl = bp->rx_ring[tail].ctrl;
 
@@ -674,6 +685,8 @@ static int macb_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	bp->tx_ring[entry].addr = mapping;
 	bp->tx_ring[entry].ctrl = ctrl;
+
+	/* Make newly initialized descriptor visible to hardware */
 	wmb();
 
 	entry = NEXT_TX(entry);
@@ -782,9 +795,6 @@ static void macb_init_rings(struct macb *bp)
 
 static void macb_reset_hw(struct macb *bp)
 {
-	/* Make sure we have the write buffer for ourselves */
-	wmb();
-
 	/*
 	 * Disable RX and TX (XXX: Should we halt the transmission
 	 * more gracefully?)
-- 
1.7.10

^ permalink raw reply related

* [PATCH 01/10] net/macb: Add support for Gigabit Ethernet mode
From: Nicolas Ferre @ 2012-09-05  8:19 UTC (permalink / raw)
  To: netdev
  Cc: linux-arm-kernel, davem, havard, nicolas.ferre, plagnioj, jamie,
	linux-kernel, patrice.vilchez
In-Reply-To: <cover.1346775479.git.nicolas.ferre@atmel.com>

From: Patrice Vilchez <patrice.vilchez@atmel.com>

Add Gigabit Ethernet mode to GEM cadence IP and enable RGMII connection.

Signed-off-by: Patrice Vilchez <patrice.vilchez@atmel.com>
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
---
 drivers/net/ethernet/cadence/macb.c |   15 ++++++++++++---
 drivers/net/ethernet/cadence/macb.h |    4 ++++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 033064b..9a10f69 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -152,13 +152,17 @@ static void macb_handle_link_change(struct net_device *dev)
 
 			reg = macb_readl(bp, NCFGR);
 			reg &= ~(MACB_BIT(SPD) | MACB_BIT(FD));
+			if (macb_is_gem(bp))
+				reg &= ~GEM_BIT(GBE);
 
 			if (phydev->duplex)
 				reg |= MACB_BIT(FD);
 			if (phydev->speed == SPEED_100)
 				reg |= MACB_BIT(SPD);
+			if (phydev->speed == SPEED_1000)
+				reg |= GEM_BIT(GBE);
 
-			macb_writel(bp, NCFGR, reg);
+			macb_or_gem_writel(bp, NCFGR, reg);
 
 			bp->speed = phydev->speed;
 			bp->duplex = phydev->duplex;
@@ -216,7 +220,10 @@ static int macb_mii_probe(struct net_device *dev)
 	}
 
 	/* mask with MAC supported features */
-	phydev->supported &= PHY_BASIC_FEATURES;
+	if (macb_is_gem(bp))
+		phydev->supported &= PHY_GBIT_FEATURES;
+	else
+		phydev->supported &= PHY_BASIC_FEATURES;
 
 	phydev->advertising = phydev->supported;
 
@@ -1384,7 +1391,9 @@ static int __init macb_probe(struct platform_device *pdev)
 		bp->phy_interface = err;
 	}
 
-	if (bp->phy_interface == PHY_INTERFACE_MODE_RMII)
+	if (bp->phy_interface == PHY_INTERFACE_MODE_RGMII)
+		macb_or_gem_writel(bp, USRIO, GEM_BIT(RGMII));
+	else if (bp->phy_interface == PHY_INTERFACE_MODE_RMII)
 #if defined(CONFIG_ARCH_AT91)
 		macb_or_gem_writel(bp, USRIO, (MACB_BIT(RMII) |
 					       MACB_BIT(CLKEN)));
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index 335e288..f69ceef 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -145,6 +145,8 @@
 #define MACB_IRXFCS_SIZE			1
 
 /* GEM specific NCFGR bitfields. */
+#define GEM_GBE_OFFSET				10
+#define GEM_GBE_SIZE				1
 #define GEM_CLK_OFFSET				18
 #define GEM_CLK_SIZE				3
 #define GEM_DBW_OFFSET				21
@@ -246,6 +248,8 @@
 /* Bitfields in USRIO (AT91) */
 #define MACB_RMII_OFFSET			0
 #define MACB_RMII_SIZE				1
+#define GEM_RGMII_OFFSET			0	/* GEM gigabit mode */
+#define GEM_RGMII_SIZE				1
 #define MACB_CLKEN_OFFSET			1
 #define MACB_CLKEN_SIZE				1
 
-- 
1.7.10

^ permalink raw reply related

* [PATCH 00/10] net/macb: driver enhancement concerning GEM support, ring logic and cleanup
From: Nicolas Ferre @ 2012-09-05  8:19 UTC (permalink / raw)
  To: netdev
  Cc: patrice.vilchez, nicolas.ferre, linux-kernel, havard, jamie,
	plagnioj, davem, linux-arm-kernel

This is enhancement work that began several years ago. I am trying to catch up with
some performance improvements that were implemented back then by Havard.
The ring index logic and the TX error path modification are the biggest changes
but some cleanup/debugging have been added along the way.
The GEM revision will benefit from the Gigabit support.

The series has been tested on several Atmel AT91 SoC with the two MACB/GEM
flavors.

Havard Skinnemoen (5):
  net/macb: memory barriers cleanup
  net/macb: change debugging messages
  net/macb: Fix a race in macb_start_xmit()
  net/macb: clean up ring buffer logic
  net/macb: Offset first RX buffer by two bytes

Nicolas Ferre (4):
  net/macb: better manage tx errors
  net/macb: tx status is more than 8 bits now
  net/macb: macb_get_drvinfo: add GEM/MACB suffix to differentiate
    revision
  net/macb: ethtool interface: add register dump feature

Patrice Vilchez (1):
  net/macb: Add support for Gigabit Ethernet mode

 drivers/net/ethernet/cadence/macb.c |  408 ++++++++++++++++++++++++-----------
 drivers/net/ethernet/cadence/macb.h |   29 ++-
 2 files changed, 304 insertions(+), 133 deletions(-)

-- 
1.7.10

^ permalink raw reply

* Re: [PATCH v7 1/1] ieee802154: MRF24J40 driver
From: Alexander Smirnov @ 2012-09-05  8:05 UTC (permalink / raw)
  To: David Miller
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	alan-yzvJWuRpmD1zbRFIqnYvSA
In-Reply-To: <20120904.144630.82568755842479462.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

Dear colleagues,

2012/9/4 David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>:
> From: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
> Date: Sun,  2 Sep 2012 21:44:13 -0400
>
>> Driver for the Microchip MRF24J40 802.15.4 WPAN module.

I was on vacation, so I had no opportunity to review the code.
Alan, thank you for the contribution!

>>
>> Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
>
> Applied to net-next, thanks.

Thanks David.

Alex

------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/

^ permalink raw reply

* Re: [Bug 47021] New: kernel panic with l2tpv3 & mtu > 1500
From: a1 @ 2012-09-05  7:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Eric Dumazet, netdev
In-Reply-To: <20120904125458.7d97ec38@nehalam.linuxnetplumber.net>

Thanks, it definitely helped, no more panics now...

jn

>>>
>>
>> Seems following patch is needed, not sure if it helps
>>
>> [PATCH] l2tp: fix a typo in l2tp_eth_dev_recv()
>>
>> While investigating l2tp bug, I hit a bug in eth_type_trans(),
>> because not enough bytes were pulled in skb head.
>>
>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> ---
>>  net/l2tp/l2tp_eth.c |    2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
>> index f9ee74d..3bfb34a 100644
>> --- a/net/l2tp/l2tp_eth.c
>> +++ b/net/l2tp/l2tp_eth.c
>> @@ -153,7 +153,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
>>  		print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
>>  	}
>>  
>> -	if (!pskb_may_pull(skb, sizeof(ETH_HLEN)))
>> +	if (!pskb_may_pull(skb, ETH_HLEN))
>>  		goto error;
> 
> I guess nobody ever looked inside this code. That seems like an obvious bug.
> 

^ permalink raw reply

* Re: [PATCH 1/4] net: mvneta: driver for Marvell Armada 370/XP network unit
From: Thomas Petazzoni @ 2012-09-05  7:32 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Lior Amsalem, Ike Pan, Nadav Haklai, Ian Molton,
	Lennert Buytenhek, David Marlin, Rami Rosen, Yehuda Yitschak,
	Jani Monoses, Tawfik Bayouk, Dan Frazier, Eran Ben-Avi, Li Li,
	Leif Lindholm, Sebastian Hesselbarth, Jason Cooper, Arnd Bergmann,
	Jon Masters, Ben Dooks, Gregory Clement, linux-arm-kernel,
	Chris Van Hoof, Nicolas Pitre, netdev
In-Reply-To: <20120904183125.GB14683@lunn.ch>

Hello Andrew,

Le Tue, 4 Sep 2012 20:31:25 +0200,
Andrew Lunn <andrew@lunn.ch> a écrit :

> I've used Marvell switch chipsets, which have a phy polling unit,
> PPU. This sounds very similar. You can do a lot with the PPU, but when
> you want to configure subsets of auto-negotiation rates/duplex modes,
> or fixed speeds/duplex modes, the PPU could not do it. You had to
> disable the PPU and configure the PHY directly.
> 
> I see you have some of the ethtools API calls implemented, but not the
> ones needed for auto-neg and rates/duplex mode configurations. Does
> the neta PPU support this, or will you need to export the MDIO bus for
> these sorts of configuration options?

There's nothing named 'PPU' in the datasheet for Armada XP/370, but the
Ethernet controller registers indeed allow to change the
auto-negotiation, duplex and speed without talking to the PHY.

In addition to the GMAC_STATUS register (offset 0x2C10) that the driver
currently uses to find out the state of the link (up/down, speed,
duplex), there is a "Port Auto-Negotiation Configuration
Register" (offset 0x2C0C), which allows to set:

 * Manual duplex or auto duplex detection
 * If manual duplex, choose full/half
 * Manual or automatic detection of flow control
 * If manual flow control, decide the flow control value
 * Manual or automatic detection of speed
 * If manual speed, set the speed value

And a few other things. So indeed, we can configure the PHY parameters
without having to talk to the PHY directly.

Best regards,

Thomas
-- 
Thomas Petazzoni, Free Electrons
Kernel, drivers, real-time and embedded Linux
development, consulting, training and support.
http://free-electrons.com

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply

* Re: [PATCH] usbnet: drop unneeded check for NULL
From: Oliver Neukum @ 2012-09-05  6:24 UTC (permalink / raw)
  To: Richard Cochran; +Cc: David Miller, netdev
In-Reply-To: <20120905044712.GA2284@netboy.at.omicron.at>

On Wednesday 05 September 2012 06:47:12 Richard Cochran wrote:
> and so I think the problem that the test addresses is still present,
> or am I missing something?

No,

you are right. Thank you.

Dave, for now, please don't apply this patch. In the long run, this crap
in cdc-ncm needs to go. I am starting rewriting this driver right now.

	Regards
		Oliver

^ permalink raw reply

* [PATCH] ath6kl: use list_move_tail instead of list_del/list_add_tail
From: Wei Yongjun @ 2012-09-05  7:07 UTC (permalink / raw)
  To: kvalo, linville; +Cc: yongjun_wei, linux-wireless, netdev

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Using list_move_tail() instead of list_del() + list_add_tail().

spatch with a semantic match was used to find this problem.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 drivers/net/wireless/ath/ath6kl/htc_pipe.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/htc_pipe.c b/drivers/net/wireless/ath/ath6kl/htc_pipe.c
index f9626c7..ba6bd49 100644
--- a/drivers/net/wireless/ath/ath6kl/htc_pipe.c
+++ b/drivers/net/wireless/ath/ath6kl/htc_pipe.c
@@ -374,9 +374,8 @@ static enum htc_send_queue_result htc_try_send(struct htc_target *target,
 				packet = list_first_entry(txq,
 							  struct htc_packet,
 							  list);
-				list_del(&packet->list);
-				/* insert into local queue */
-				list_add_tail(&packet->list, &send_queue);
+				/* move to local queue */
+				list_move_tail(&packet->list, &send_queue);
 			}
 
 			/*
@@ -399,11 +398,10 @@ static enum htc_send_queue_result htc_try_send(struct htc_target *target,
 					 * for cleanup */
 				} else {
 					/* callback wants to keep this packet,
-					 * remove from caller's queue */
-					list_del(&packet->list);
-					/* put it in the send queue */
-					list_add_tail(&packet->list,
-						      &send_queue);
+					 * move from caller's queue to the send
+					 * queue */
+					list_move_tail(&packet->list,
+						       &send_queue);
 				}
 
 			}

^ permalink raw reply related

* [PATCH] bnx2x: use list_move_tail instead of list_del/list_add_tail
From: Wei Yongjun @ 2012-09-05  7:06 UTC (permalink / raw)
  To: eilong; +Cc: yongjun_wei, netdev

From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>

Using list_move_tail() instead of list_del() + list_add_tail().

spatch with a semantic match was used to find this problem.
(http://coccinelle.lip6.fr/)

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index 62f754b..5a5fbf5 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -229,8 +229,7 @@ static inline int bnx2x_exe_queue_step(struct bnx2x *bp,
 			 */
 			list_add_tail(&spacer.link, &o->pending_comp);
 			mb();
-			list_del(&elem->link);
-			list_add_tail(&elem->link, &o->pending_comp);
+			list_move_tail(&elem->link, &o->pending_comp);
 			list_del(&spacer.link);
 		} else
 			break;

^ permalink raw reply related

* Re: Commit "ipconfig wait for carrier" makes boot hang for 2 mins if no carrier
From: Joakim Tjernlund @ 2012-09-05  7:04 UTC (permalink / raw)
  To: Micha Nelissen; +Cc: netdev
In-Reply-To: <50465791.40805@neli.hopto.org>

Micha Nelissen <micha@neli.hopto.org> wrote on 2012/09/04 21:33:37:
>
> Joakim Tjernlund wrote:
> >>  Why not set the IP address then in your rootfs yourself?  Micha
> >
> > I could ask you the same question, why do you need to have nfs in kernel?
>
> Because that's where my root filesystem is? The IP autoconfiguration
> code exists for this purpose.

This is not the only purpose.

>
> > The answer is probably the same, it is much easier to
> > manage our IP config in one place for our embedded system.
>
> You retrieve the kernel via TFTP or so when booting?

Yes, but mostly not. This really doesn't matter

>
> > I don't understand why you need 2 minutes timeout for carrier either?
>
> Just a safe value.
>
> > The wait should be conditional on NFS root or not so that non NFS roots
> > can skip this stage altogether.
>
> Feel free to submit a patch :-)

Your patch broke other use cases, so my patch would just revert it or change the
timeout to 2 secs or so.
Or you could clean up your stuff so it works for all and not just for you.

 Jocke

^ permalink raw reply

* [V2 PATCH 2/9] csiostor: Chelsio FCoE offload driver submission (sources part 2).
From: Naresh Kumar Inna @ 2012-09-05 12:33 UTC (permalink / raw)
  To: JBottomley, linux-scsi, dm, leedom; +Cc: netdev, naresh, chethan
In-Reply-To: <1346848442-4573-1-git-send-email-naresh@chelsio.com>

This patch contains code for driver initialization, driver resource
allocation and the Work Request module functionality. Driver initialization
includes module entry/exit points, registration with PCI, FC transport and
SCSI mid layer subsystems. The Work Request module provides services for
allocation of DMA queues, posting Work Requests on them and processing
completions.

Signed-off-by: Naresh Kumar Inna <naresh@chelsio.com>
---
V2: Removed module parameters.

 drivers/scsi/csiostor/csio_init.c | 1276 +++++++++++++++++++++++++++++
 drivers/scsi/csiostor/csio_wr.c   | 1632 +++++++++++++++++++++++++++++++++++++
 2 files changed, 2908 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/csiostor/csio_init.c
 create mode 100644 drivers/scsi/csiostor/csio_wr.c

diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c
new file mode 100644
index 0000000..383f0dc
--- /dev/null
+++ b/drivers/scsi/csiostor/csio_init.c
@@ -0,0 +1,1276 @@
+/*
+ * This file is part of the Chelsio FCoE driver for Linux.
+ *
+ * Copyright (c) 2008-2012 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/kdebug.h>
+#include <linux/version.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/string.h>
+#include <linux/export.h>
+
+#include "csio_init.h"
+#include "csio_defs.h"
+
+#define CSIO_MIN_MEMPOOL_SZ	64
+
+static struct dentry *csio_debugfs_root;
+
+static struct scsi_transport_template *csio_fcoe_transport;
+static struct scsi_transport_template *csio_fcoe_transport_vport;
+
+/*
+ * debugfs support
+ */
+/* debugfs open hook: stash the inode's private cookie on the open file. */
+static int
+csio_mem_open(struct inode *inode, struct file *file)
+{
+	void *cookie = inode->i_private;
+
+	file->private_data = cookie;
+	return 0;
+}
+
+/*
+ * csio_mem_read - debugfs read hook for the adapter memory files.
+ *
+ * file->private_data carries the csio_hw pointer with a memory selector in
+ * its two low bits (it is built as hw + idx in csio_add_debugfs_mem()).
+ * Memory is fetched through a 64-byte bounce buffer and the part of each
+ * buffer that falls inside the requested window is copied to user space.
+ *
+ * Returns the number of bytes copied, 0 when *ppos is at or beyond the file
+ * size, -EINVAL for a negative offset, -EFAULT if the user copy fails, or
+ * the error reported by the hardware read helper.
+ */
+static ssize_t
+csio_mem_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	void *cookie = file->private_data;
+	unsigned int mem = (uintptr_t)cookie & 3;
+	struct csio_hw *hw = cookie - mem;
+	loff_t fsize = file->f_path.dentry->d_inode->i_size;
+	loff_t cur = *ppos;
+
+	if (cur < 0)
+		return -EINVAL;
+	if (cur >= fsize)
+		return 0;
+	/* Clamp the request to what is left of the file. */
+	if (count > fsize - cur)
+		count = fsize - cur;
+
+	while (count > 0) {
+		__be32 chunk[16];
+		size_t nbytes;
+		int rc, skip;
+
+		if (mem == MEM_MC)
+			rc = csio_hw_mc_read(hw, cur, chunk, NULL);
+		else
+			rc = csio_hw_edc_read(hw, mem, cur, chunk, NULL);
+		if (rc)
+			return rc;
+
+		/* Offset of 'cur' inside the bounce buffer. */
+		skip = cur % sizeof(chunk);
+		nbytes = min(count, sizeof(chunk) - skip);
+		if (copy_to_user(buf, (u8 *)chunk + skip, nbytes))
+			return -EFAULT;
+
+		buf += nbytes;
+		cur += nbytes;
+		count -= nbytes;
+	}
+
+	count = cur - *ppos;
+	*ppos = cur;
+	return count;
+}
+
+/* File operations for the per-adapter memory dump files in debugfs. */
+static const struct file_operations csio_mem_debugfs_fops = {
+	.owner	= THIS_MODULE,
+	.llseek	= default_llseek,
+	.read	= csio_mem_read,
+	.open	= csio_mem_open,
+};
+
+/*
+ * csio_add_debugfs_mem - Create a debugfs file exposing one adapter memory.
+ * @hw: HW module.
+ * @name: Name of the debugfs file.
+ * @idx: Memory selector. It is added to the hw pointer to form the file's
+ *       private data and csio_mem_read() recovers it from the two low bits
+ *       of that pointer, so it has to be in the range 0..3.
+ * @size_mb: Size of the memory in MB; it becomes the size of the file.
+ */
+static void __devinit
+csio_add_debugfs_mem(struct csio_hw *hw, const char *name,
+		     unsigned int idx, unsigned int size_mb)
+{
+	struct dentry *de;
+
+	de = debugfs_create_file(name, S_IRUSR, hw->debugfs_root,
+				 (void *)hw + idx, &csio_mem_debugfs_fops);
+	/*
+	 * Widen before shifting: "size_mb << 20" is evaluated as a 32-bit
+	 * unsigned value and wraps once the memory is 4096MB or larger.
+	 */
+	if (!IS_ERR_OR_NULL(de) && de->d_inode)
+		de->d_inode->i_size = (loff_t)size_mb << 20;
+}
+
+/*
+ * csio_setup_debugfs - Populate the adapter's debugfs directory.
+ * @hw: HW module.
+ *
+ * Adds one memory file for every memory that the MA_TARGET_MEM_ENABLE
+ * register reports as enabled. Returns 0 on success, or -1 if the adapter
+ * has no usable debugfs directory.
+ */
+static int __devinit
+csio_setup_debugfs(struct csio_hw *hw)
+{
+	int mem_en;
+
+	if (IS_ERR_OR_NULL(hw->debugfs_root))
+		return -1;
+
+	mem_en = csio_rd_reg32(hw, MA_TARGET_MEM_ENABLE);
+
+	if (mem_en & EDRAM0_ENABLE)
+		csio_add_debugfs_mem(hw, "edc0", MEM_EDC0, 5);
+
+	if (mem_en & EDRAM1_ENABLE)
+		csio_add_debugfs_mem(hw, "edc1", MEM_EDC1, 5);
+
+	if (mem_en & EXT_MEM_ENABLE) {
+		/* External memory size comes from its BAR register. */
+		unsigned int mc_mb =
+		      EXT_MEM_SIZE_GET(csio_rd_reg32(hw, MA_EXT_MEMORY_BAR));
+
+		csio_add_debugfs_mem(hw, "mc", MEM_MC, mc_mb);
+	}
+
+	return 0;
+}
+
+/*
+ * csio_dfs_create - Create the debugfs directory and files for the given HW.
+ * @hw: HW module.
+ *
+ * The directory is named after the PCI device and placed under the
+ * driver-wide debugfs root. Nothing is done when that root is not set.
+ * Always returns 0.
+ */
+static int
+csio_dfs_create(struct csio_hw *hw)
+{
+	if (!csio_debugfs_root)
+		return 0;
+
+	hw->debugfs_root = debugfs_create_dir(pci_name(hw->pdev),
+					      csio_debugfs_root);
+	csio_setup_debugfs(hw);
+
+	return 0;
+}
+
+/*
+ * csio_dfs_destroy - Remove the debugfs directory of the given HW.
+ * @hw: HW module.
+ *
+ * Removes hw->debugfs_root and everything below it, if it is set.
+ * Always returns 0.
+ */
+static int
+csio_dfs_destroy(struct csio_hw *hw)
+{
+	if (!hw->debugfs_root)
+		return 0;
+
+	debugfs_remove_recursive(hw->debugfs_root);
+	return 0;
+}
+
+/*
+ * csio_dfs_init - Debug filesystem initialization.
+ *
+ * This function is called during driver load to create the driver's
+ * top-level debugfs directory. Failure is not fatal: a warning is logged,
+ * the root is left NULL so that later "if (csio_debugfs_root)" checks skip
+ * all debugfs work, and 0 is returned.
+ */
+static int
+csio_dfs_init(void)
+{
+	csio_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	/* debugfs may report failure as NULL or as an ERR_PTR() value. */
+	if (IS_ERR_OR_NULL(csio_debugfs_root)) {
+		csio_debugfs_root = NULL;
+		pr_warn("Could not create debugfs entry, continuing\n");
+	}
+
+	return 0;
+}
+
+/*
+ * csio_dfs_exit - Removes the driver's top-level debugfs directory.
+ *
+ * Called in the unload path (csio_exit()) and on the error path of
+ * csio_init(). Only debugfs is touched here; no procfs entry is involved.
+ */
+static void
+csio_dfs_exit(void)
+{
+	debugfs_remove(csio_debugfs_root);
+}
+
+/*
+ * csio_pci_init - PCI initialization.
+ * @pdev: PCI device.
+ * @bars: Output: bitmask of the memory BARs that were requested.
+ *
+ * Initializes the PCI function by enabling MMIO, requesting the memory
+ * BARs, setting bus mastership and setting the DMA masks (64-bit if
+ * possible, 32-bit otherwise).
+ *
+ * Returns 0 on success or the negative errno of the step that failed; on
+ * failure everything acquired so far has been released again.
+ */
+static int
+csio_pci_init(struct pci_dev *pdev, int *bars)
+{
+	int rv;
+
+	*bars = pci_select_bars(pdev, IORESOURCE_MEM);
+
+	rv = pci_enable_device_mem(pdev);
+	if (rv)
+		goto err;
+
+	rv = pci_request_selected_regions(pdev, *bars, KBUILD_MODNAME);
+	if (rv)
+		goto err_disable_device;
+
+	pci_set_master(pdev);
+	pci_try_set_mwi(pdev);
+
+	/*
+	 * The coherent mask can fail independently of the streaming mask,
+	 * so its result is checked as well instead of being dropped.
+	 */
+	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+		rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	} else {
+		rv = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (!rv)
+			rv = pci_set_consistent_dma_mask(pdev,
+							 DMA_BIT_MASK(32));
+	}
+	if (rv) {
+		dev_err(&pdev->dev, "No suitable DMA available.\n");
+		goto err_release_regions;
+	}
+
+	return 0;
+
+err_release_regions:
+	pci_release_selected_regions(pdev, *bars);
+err_disable_device:
+	pci_disable_device(pdev);
+err:
+	return rv;
+}
+
+/*
+ * csio_pci_exit - PCI uninitialization.
+ * @pdev: PCI device.
+ * @bars: Pointer to the bitmask of BARs to be released.
+ *
+ * Releases the selected regions and then disables the device.
+ */
+static void
+csio_pci_exit(struct pci_dev *pdev, int *bars)
+{
+	pci_release_selected_regions(pdev, *bars);
+	pci_disable_device(pdev);
+}
+
+/*
+ * csio_hw_init_workers - Initialize the HW module's deferred work.
+ * @hw: HW module.
+ *
+ * Binds hw->evtq_work to csio_evtq_worker; nothing is scheduled here.
+ */
+static void
+csio_hw_init_workers(struct csio_hw *hw)
+{
+	INIT_WORK(&hw->evtq_work, csio_evtq_worker);
+}
+
+static void
+csio_hw_exit_workers(struct csio_hw *hw)
+{
+	cancel_work_sync(&hw->evtq_work);	/* cancel/wait for evtq work */
+	flush_scheduled_work();			/* then drain the shared wq */
+}
+
+/*
+ * csio_create_queues - Create the already-allocated queues in firmware.
+ * @hw: HW module.
+ *
+ * Issues the create requests for the forward-interrupt IQ (non-MSIX modes
+ * only), the FW event IQ, the mgmt EQ and one IQ/EQ pair per SCSI queue
+ * set. Does nothing if CSIO_HWF_Q_FW_ALLOCED is already set.
+ *
+ * Returns 0 on success. A failure of either of the first two IQs returns
+ * the helper's error code directly; any later failure tears the queues
+ * down with csio_wr_destroy_queues() and returns -EINVAL.
+ */
+static int
+csio_create_queues(struct csio_hw *hw)
+{
+	struct csio_mgmtm *mgmtm = csio_hw_to_mgmtm(hw);
+	struct csio_scsi_cpu_info *info;
+	int port, cpu;
+	int rv;
+
+	if (hw->flags & CSIO_HWF_Q_FW_ALLOCED)
+		return 0;
+
+	/* Forward-interrupt IQ is only needed when MSI-X is not in use. */
+	if (hw->intr_mode != CSIO_IM_MSIX) {
+		rv = csio_wr_iq_create(hw, NULL, hw->intr_iq_idx,
+					0, hw->pport[0].portid, false, NULL);
+		if (rv) {
+			csio_err(hw, " Forward Interrupt IQ failed!: %d\n", rv);
+			return rv;
+		}
+	}
+
+	/* FW event queue */
+	rv = csio_wr_iq_create(hw, NULL, hw->fwevt_iq_idx,
+			       csio_get_fwevt_intr_idx(hw),
+			       hw->pport[0].portid, true, NULL);
+	if (rv) {
+		csio_err(hw, "FW event IQ config failed!: %d\n", rv);
+		return rv;
+	}
+
+	/* Mgmt queue */
+	rv = csio_wr_eq_create(hw, NULL, mgmtm->eq_idx,
+			mgmtm->iq_idx, hw->pport[0].portid, NULL);
+	if (rv) {
+		csio_err(hw, "Mgmt EQ create failed!: %d\n", rv);
+		goto destroy;
+	}
+
+	/* SCSI queues: one IQ/EQ pair per (port, cpu) queue set */
+	for (port = 0; port < hw->num_pports; port++) {
+		info = &hw->scsi_cpu_info[port];
+
+		for (cpu = 0; cpu < info->max_cpus; cpu++) {
+			struct csio_scsi_qset *sqset = &hw->sqset[port][cpu];
+
+			rv = csio_wr_iq_create(hw, NULL, sqset->iq_idx,
+					       sqset->intr_idx, port, false,
+					       NULL);
+			if (rv) {
+				csio_err(hw,
+				   "SCSI module IQ config failed [%d][%d]:%d\n",
+				   port, cpu, rv);
+				goto destroy;
+			}
+
+			rv = csio_wr_eq_create(hw, NULL, sqset->eq_idx,
+					       sqset->iq_idx, port, NULL);
+			if (rv) {
+				csio_err(hw,
+				   "SCSI module EQ config failed [%d][%d]:%d\n",
+				   port, cpu, rv);
+				goto destroy;
+			}
+		}
+	}
+
+	hw->flags |= CSIO_HWF_Q_FW_ALLOCED;
+	return 0;
+
+destroy:
+	csio_wr_destroy_queues(hw, true);
+	return -EINVAL;
+}
+
+/*
+ * csio_config_queues - Configure the DMA queues.
+ * @hw: HW module.
+ *
+ * Allocates memory for the queues and registers them with FW. If the
+ * queue memory was already allocated (CSIO_HWF_Q_MEM_ALLOCED), only the
+ * FW registration step (csio_create_queues()) is performed.
+ *
+ * Returns 0 on success and -EINVAL on any failure of the allocation path;
+ * the early-return path passes through the result of csio_create_queues().
+ *
+ * NOTE(review): the intr_disable error path only disables interrupts;
+ * whether queue memory allocated so far is released elsewhere cannot be
+ * seen from this function - confirm.
+ */
+int
+csio_config_queues(struct csio_hw *hw)
+{
+	int i, j, idx, k = 0;
+	int rv;
+	struct csio_scsi_qset *sqset;
+	struct csio_mgmtm *mgmtm = csio_hw_to_mgmtm(hw);
+	struct csio_scsi_qset *orig;
+	struct csio_scsi_cpu_info *info;
+
+	if (hw->flags & CSIO_HWF_Q_MEM_ALLOCED)
+		return csio_create_queues(hw);
+
+	/* Calculate number of SCSI queues for MSIX we would like */
+	hw->num_scsi_msix_cpus = num_online_cpus();
+	hw->num_sqsets = num_online_cpus() * hw->num_pports;
+
+	if (hw->num_sqsets > CSIO_MAX_SCSI_QSETS) {
+		hw->num_sqsets = CSIO_MAX_SCSI_QSETS;
+		hw->num_scsi_msix_cpus = CSIO_MAX_SCSI_CPU;
+	}
+
+	/* Initialize max_cpus, may get reduced during msix allocations */
+	for (i = 0; i < hw->num_pports; i++)
+		hw->scsi_cpu_info[i].max_cpus = hw->num_scsi_msix_cpus;
+
+	csio_dbg(hw, "nsqsets:%d scpus:%d\n",
+		    hw->num_sqsets, hw->num_scsi_msix_cpus);
+
+	csio_intr_enable(hw);
+
+	if (hw->intr_mode != CSIO_IM_MSIX) {
+
+		/* Allocate Forward interrupt iq (non-MSIX modes only). */
+		hw->intr_iq_idx = csio_wr_alloc_q(hw, CSIO_INTR_IQSIZE,
+						CSIO_INTR_WRSIZE, CSIO_INGRESS,
+						(void *)hw, 0, 0, NULL);
+		if (hw->intr_iq_idx == -1) {
+			csio_err(hw,
+				 "Forward interrupt queue creation failed\n");
+			goto intr_disable;
+		}
+	}
+
+	/* Allocate the FW evt queue */
+	hw->fwevt_iq_idx = csio_wr_alloc_q(hw, CSIO_FWEVT_IQSIZE,
+					   CSIO_FWEVT_WRSIZE,
+					   CSIO_INGRESS, (void *)hw,
+					   CSIO_FWEVT_FLBUFS, 0,
+					   csio_fwevt_intx_handler);
+	if (hw->fwevt_iq_idx == -1) {
+		csio_err(hw, "FW evt queue creation failed\n");
+		goto intr_disable;
+	}
+
+	/* Allocate the mgmt queue */
+	mgmtm->eq_idx = csio_wr_alloc_q(hw, CSIO_MGMT_EQSIZE,
+				      CSIO_MGMT_EQ_WRSIZE,
+				      CSIO_EGRESS, (void *)hw, 0, 0, NULL);
+	if (mgmtm->eq_idx == -1) {
+		csio_err(hw, "Failed to alloc egress queue for mgmt module\n");
+		goto intr_disable;
+	}
+
+	/* Use FW IQ for MGMT req completion */
+	mgmtm->iq_idx = hw->fwevt_iq_idx;
+
+	/*
+	 * Allocate SCSI queues: one EQ and one IQ per (port, cpu) slot.
+	 * Slots at or beyond the port's max_cpus do not get queues of their
+	 * own; they reuse the indices of slot (j % max_cpus).
+	 *
+	 * NOTE(review): both failure messages below print idx, which is
+	 * always -1 at that point; i and j would be more informative.
+	 */
+	for (i = 0; i < hw->num_pports; i++) {
+		info = &hw->scsi_cpu_info[i];
+
+		for (j = 0; j < hw->num_scsi_msix_cpus; j++) {
+			sqset = &hw->sqset[i][j];
+
+			if (j >= info->max_cpus) {
+				k = j % info->max_cpus;
+				orig = &hw->sqset[i][k];
+				sqset->eq_idx = orig->eq_idx;
+				sqset->iq_idx = orig->iq_idx;
+				continue;
+			}
+
+			idx = csio_wr_alloc_q(hw, csio_scsi_eqsize, 0,
+					      CSIO_EGRESS, (void *)hw, 0, 0,
+					      NULL);
+			if (idx == -1) {
+				csio_err(hw, "EQ creation failed for idx:%d\n",
+					    idx);
+				goto intr_disable;
+			}
+
+			sqset->eq_idx = idx;
+
+			idx = csio_wr_alloc_q(hw, CSIO_SCSI_IQSIZE,
+					     CSIO_SCSI_IQ_WRSZ, CSIO_INGRESS,
+					     (void *)hw, 0, 0,
+					     csio_scsi_intx_handler);
+			if (idx == -1) {
+				csio_err(hw, "IQ creation failed for idx:%d\n",
+					    idx);
+				goto intr_disable;
+			}
+			sqset->iq_idx = idx;
+		} /* for all CPUs */
+	} /* For all ports */
+
+	hw->flags |= CSIO_HWF_Q_MEM_ALLOCED;
+
+	rv = csio_create_queues(hw);
+	if (rv != 0)
+		goto intr_disable;
+
+	/*
+	 * Now request IRQs for the vectors. In the event of a failure,
+	 * cleanup is handled internally by this function, which is why this
+	 * path returns directly instead of going through intr_disable.
+	 */
+	rv = csio_request_irqs(hw);
+	if (rv != 0)
+		return -EINVAL;
+
+	return 0;
+
+intr_disable:
+	csio_intr_disable(hw, false);
+
+	return -EINVAL;
+}
+
+/*
+ * csio_resource_alloc - Create the memory pools used by the HW module.
+ * @hw: HW module.
+ *
+ * Sets the work request module's queue count and creates the mailbox
+ * mempool, the rnode mempool and the SCSI PCI pool.
+ *
+ * Returns 0 on success or -ENOMEM, in which case every pool created so
+ * far has been destroyed and its pointer reset to NULL.
+ */
+static int
+csio_resource_alloc(struct csio_hw *hw)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+
+	wrm->num_q = ((CSIO_MAX_SCSI_QSETS * 2) + CSIO_HW_NIQ +
+		       CSIO_HW_NEQ + CSIO_HW_NFLQ + CSIO_HW_NINTXQ);
+
+	hw->mb_mempool = mempool_create_kmalloc_pool(CSIO_MIN_MEMPOOL_SZ,
+						  sizeof(struct csio_mb));
+	if (hw->mb_mempool == NULL)
+		return -ENOMEM;
+
+	hw->rnode_mempool = mempool_create_kmalloc_pool(CSIO_MIN_MEMPOOL_SZ,
+						     sizeof(struct csio_rnode));
+	if (hw->rnode_mempool == NULL)
+		goto free_mb_pool;
+
+	hw->scsi_pci_pool = pci_pool_create("csio_scsi_pci_pool", hw->pdev,
+					    CSIO_SCSI_RSP_LEN, 8, 0);
+	if (hw->scsi_pci_pool == NULL)
+		goto free_rnode_pool;
+
+	return 0;
+
+free_rnode_pool:
+	mempool_destroy(hw->rnode_mempool);
+	hw->rnode_mempool = NULL;
+free_mb_pool:
+	mempool_destroy(hw->mb_mempool);
+	hw->mb_mempool = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * csio_resource_free - Destroy the pools created by csio_resource_alloc()
+ * and reset the pool pointers to NULL.
+ */
+static void
+csio_resource_free(struct csio_hw *hw)
+{
+	pci_pool_destroy(hw->scsi_pci_pool);
+	mempool_destroy(hw->rnode_mempool);
+	mempool_destroy(hw->mb_mempool);
+
+	hw->scsi_pci_pool = NULL;
+	hw->rnode_mempool = NULL;
+	hw->mb_mempool = NULL;
+}
+
+/*
+ * csio_hw_alloc - Allocate and initialize the HW module.
+ * @pdev: PCI device.
+ *
+ * Allocates HW structure, DMA, memory resources, maps BARS to
+ * host memory and initializes HW module.
+ *
+ * Returns the new HW module, or NULL on failure with everything that was
+ * set up so far released again.
+ */
+static struct csio_hw * __devinit
+csio_hw_alloc(struct pci_dev *pdev)
+{
+	struct csio_hw *hw;
+
+	hw = kzalloc(sizeof(struct csio_hw), GFP_KERNEL);
+	if (!hw)
+		goto err;
+
+	hw->pdev = pdev;
+	/*
+	 * strlcpy() always NUL-terminates within the given size, whereas
+	 * strncpy() leaves the string unterminated when the source fills
+	 * all 32 bytes.
+	 */
+	strlcpy(hw->drv_version, CSIO_DRV_VERSION, 32);
+
+	/* memory pool/DMA pool allocation */
+	if (csio_resource_alloc(hw))
+		goto err_free_hw;
+
+	/* Get the start address of registers from BAR 0 */
+	hw->regstart = ioremap_nocache(pci_resource_start(pdev, 0),
+				       pci_resource_len(pdev, 0));
+	if (!hw->regstart) {
+		csio_err(hw, "Could not map BAR 0, regstart = %p\n",
+			 hw->regstart);
+		goto err_resource_free;
+	}
+
+	csio_hw_init_workers(hw);
+
+	if (csio_hw_init(hw))
+		goto err_unmap_bar;
+
+	/* debugfs is best effort; its result is deliberately not checked. */
+	csio_dfs_create(hw);
+
+	csio_dbg(hw, "hw:%p\n", hw);
+
+	return hw;
+
+err_unmap_bar:
+	csio_hw_exit_workers(hw);
+	iounmap(hw->regstart);
+err_resource_free:
+	csio_resource_free(hw);
+err_free_hw:
+	kfree(hw);
+err:
+	return NULL;
+}
+
+/*
+ * csio_hw_free - Uninitialize and free the HW module.
+ * @hw: The HW module
+ *
+ * Removes the debugfs entries, disables interrupts, uninits the HW module,
+ * unmaps the registers, frees resources and finally frees hw.
+ */
+static void
+csio_hw_free(struct csio_hw *hw)
+{
+	/*
+	 * Remove the debugfs files first: their read hook goes through the
+	 * adapter (csio_hw_mc_read()/csio_hw_edc_read()), so they must not
+	 * be reachable once the HW is shut down and the registers unmapped.
+	 */
+	csio_dfs_destroy(hw);
+	csio_intr_disable(hw, true);
+	csio_hw_exit_workers(hw);
+	csio_hw_exit(hw);
+	iounmap(hw->regstart);
+	csio_resource_free(hw);
+	kfree(hw);
+}
+
+/**
+ * csio_shost_init - Create and initialize the lnode module.
+ * @hw:		The HW module.
+ * @dev:	The device associated with this invocation.
+ * @probe:	Called from probe context or not? (not used by this function)
+ * @pln:	Parent lnode if any.
+ *
+ * Allocates the lnode as the private area of a new Scsi_Host, fills in the
+ * host limits, initializes the lnode module and registers the host with
+ * the SCSI ML via scsi_add_host(). The physical-port or the vport flavour
+ * of the host template and transport template is picked depending on
+ * whether @dev is the PCI device of @hw. This function is shared between
+ * physical and virtual node ports.
+ *
+ * Returns the new lnode, or NULL on failure.
+ */
+struct csio_lnode *
+csio_shost_init(struct csio_hw *hw, struct device *dev,
+		  bool probe, struct csio_lnode *pln)
+{
+	/*
+	 * hw->pdev is the physical port's PCI dev structure,
+	 * which will be different from the NPIV dev structure.
+	 */
+	bool phys = (dev == &hw->pdev->dev);
+	struct Scsi_Host *shost;
+	struct csio_lnode *ln;
+
+	csio_fcoe_shost_template.cmd_per_lun = csio_lun_qdepth;
+	csio_fcoe_shost_vport_template.cmd_per_lun = csio_lun_qdepth;
+
+	shost = scsi_host_alloc(phys ? &csio_fcoe_shost_template :
+				       &csio_fcoe_shost_vport_template,
+				sizeof(struct csio_lnode));
+	if (!shost)
+		return NULL;
+
+	ln = shost_priv(shost);
+	memset(ln, 0, sizeof(struct csio_lnode));
+
+	/* Link common lnode to this lnode */
+	ln->dev_num = (shost->host_no << 16);
+
+	shost->can_queue = CSIO_MAX_QUEUE;
+	shost->this_id = -1;
+	shost->unique_id = shost->host_no;
+	shost->max_cmd_len = 16; /* Max CDB length supported */
+	shost->max_id = min_t(uint32_t, csio_fcoe_rnodes,
+			      hw->fres_info.max_ssns);
+	shost->max_lun = CSIO_MAX_LUN;
+	shost->transportt = phys ? csio_fcoe_transport :
+				   csio_fcoe_transport_vport;
+
+	/* The first lnode created becomes the root lnode */
+	if (!hw->rln)
+		hw->rln = ln;
+
+	/* Other initialization here: Common, Transport specific */
+	if (csio_lnode_init(ln, hw, pln))
+		goto put_shost;
+
+	if (scsi_add_host(shost, dev))
+		goto exit_lnode;
+
+	return ln;
+
+exit_lnode:
+	csio_lnode_exit(ln);
+put_shost:
+	scsi_host_put(shost);
+	return NULL;
+}
+
+/**
+ * csio_shost_exit - De-instantiate the shost.
+ * @ln:		The lnode module corresponding to the shost.
+ *
+ * Removes the host from the FC transport and the SCSI ML, flushes the HW
+ * event queue, uninitializes the lnode and drops the shost reference.
+ */
+void
+csio_shost_exit(struct csio_lnode *ln)
+{
+	struct Scsi_Host *shost = csio_ln_to_shost(ln);
+	struct csio_hw *hw = csio_lnode_to_hw(ln);
+
+	/* Inform transport */
+	fc_remove_host(shost);
+
+	/* Inform SCSI ML */
+	scsi_remove_host(shost);
+
+	/* Flush all the events, so that any rnode removal events
+	 * already queued are all handled, before we remove the lnode.
+	 * The flush is done with hw->lock held.
+	 */
+	spin_lock_irq(&hw->lock);
+	csio_evtq_flush(hw);
+	spin_unlock_irq(&hw->lock);
+
+	csio_lnode_exit(ln);
+	scsi_host_put(shost);
+}
+
+/*
+ * csio_lnode_alloc - Create an lnode on the physical PCI device of @hw.
+ *
+ * Thin wrapper around csio_shost_init() with probe == false and no parent
+ * lnode. Returns the new lnode or NULL.
+ */
+struct csio_lnode *
+csio_lnode_alloc(struct csio_hw *hw)
+{
+	struct device *dev = &hw->pdev->dev;
+
+	return csio_shost_init(hw, dev, false, NULL);
+}
+
+/*
+ * csio_lnodes_block_request - Block SCSI requests on every lnode of the HW.
+ * @hw: HW module.
+ *
+ * Snapshots all sibling lnodes and their children into a temporary array
+ * under hw->lock, then calls scsi_block_requests() on each one's Scsi_Host
+ * with the lock dropped. If the array cannot be allocated, an error is
+ * logged and nothing is blocked.
+ */
+void
+csio_lnodes_block_request(struct csio_hw *hw)
+{
+	struct Scsi_Host  *shost;
+	struct csio_lnode *sln;
+	struct csio_lnode *ln;
+	struct list_head *cur_ln, *cur_cln;
+	struct csio_lnode **lnode_list;
+	int cur_cnt = 0, ii;
+
+	/* kcalloc() fails cleanly if count * size would overflow. */
+	lnode_list = kcalloc(hw->num_lns, sizeof(*lnode_list), GFP_KERNEL);
+	if (!lnode_list) {
+		csio_err(hw, "Failed to allocate lnodes_list\n");
+		return;
+	}
+
+	spin_lock_irq(&hw->lock);
+	/*
+	 * Traverse sibling lnodes. The casts assume the list linkage is the
+	 * first member of struct csio_lnode - TODO confirm.
+	 */
+	list_for_each(cur_ln, &hw->sln_head) {
+		sln = (struct csio_lnode *) cur_ln;
+		lnode_list[cur_cnt++] = sln;
+
+		/* Traverse children lnodes */
+		list_for_each(cur_cln, &sln->cln_head)
+			lnode_list[cur_cnt++] = (struct csio_lnode *) cur_cln;
+	}
+	spin_unlock_irq(&hw->lock);
+
+	for (ii = 0; ii < cur_cnt; ii++) {
+		csio_dbg(hw, "Blocking IOs on lnode: %p\n", lnode_list[ii]);
+		ln = lnode_list[ii];
+		shost = csio_ln_to_shost(ln);
+		scsi_block_requests(shost);
+	}
+	kfree(lnode_list);
+}
+
+/*
+ * csio_lnodes_unblock_request - Unblock SCSI requests on every lnode of
+ * the HW.
+ * @hw: HW module.
+ *
+ * Snapshots all sibling lnodes and their children into a temporary array
+ * under hw->lock, then calls scsi_unblock_requests() on each one's
+ * Scsi_Host with the lock dropped. If the array cannot be allocated, an
+ * error is logged and nothing is unblocked.
+ */
+void
+csio_lnodes_unblock_request(struct csio_hw *hw)
+{
+	struct csio_lnode *ln;
+	struct Scsi_Host  *shost;
+	struct csio_lnode *sln;
+	struct list_head *cur_ln, *cur_cln;
+	struct csio_lnode **lnode_list;
+	int cur_cnt = 0, ii;
+
+	/* kcalloc() fails cleanly if count * size would overflow. */
+	lnode_list = kcalloc(hw->num_lns, sizeof(*lnode_list), GFP_KERNEL);
+	if (!lnode_list) {
+		csio_err(hw, "Failed to allocate lnodes_list\n");
+		return;
+	}
+
+	spin_lock_irq(&hw->lock);
+	/*
+	 * Traverse sibling lnodes. The casts assume the list linkage is the
+	 * first member of struct csio_lnode - TODO confirm.
+	 */
+	list_for_each(cur_ln, &hw->sln_head) {
+		sln = (struct csio_lnode *) cur_ln;
+		lnode_list[cur_cnt++] = sln;
+
+		/* Traverse children lnodes */
+		list_for_each(cur_cln, &sln->cln_head)
+			lnode_list[cur_cnt++] = (struct csio_lnode *) cur_cln;
+	}
+	spin_unlock_irq(&hw->lock);
+
+	for (ii = 0; ii < cur_cnt; ii++) {
+		csio_dbg(hw, "unblocking IOs on lnode: %p\n", lnode_list[ii]);
+		ln = lnode_list[ii];
+		shost = csio_ln_to_shost(ln);
+		scsi_unblock_requests(shost);
+	}
+	kfree(lnode_list);
+}
+
+/*
+ * csio_lnodes_block_by_port - Block SCSI requests on the lnodes of one port.
+ * @hw: HW module.
+ * @portid: Port whose lnodes are to be blocked.
+ *
+ * Snapshots the sibling lnodes whose portid matches @portid, together with
+ * their children, into a temporary array under hw->lock, then calls
+ * scsi_block_requests() on each one's Scsi_Host with the lock dropped. If
+ * the array cannot be allocated, an error is logged and nothing is blocked.
+ */
+void
+csio_lnodes_block_by_port(struct csio_hw *hw, uint8_t portid)
+{
+	struct csio_lnode *ln;
+	struct Scsi_Host  *shost;
+	struct csio_lnode *sln;
+	struct list_head *cur_ln, *cur_cln;
+	struct csio_lnode **lnode_list;
+	int cur_cnt = 0, ii;
+
+	/* kcalloc() fails cleanly if count * size would overflow. */
+	lnode_list = kcalloc(hw->num_lns, sizeof(*lnode_list), GFP_KERNEL);
+	if (!lnode_list) {
+		csio_err(hw, "Failed to allocate lnodes_list\n");
+		return;
+	}
+
+	spin_lock_irq(&hw->lock);
+	/*
+	 * Traverse sibling lnodes. The casts assume the list linkage is the
+	 * first member of struct csio_lnode - TODO confirm.
+	 */
+	list_for_each(cur_ln, &hw->sln_head) {
+		sln = (struct csio_lnode *) cur_ln;
+		if (sln->portid != portid)
+			continue;
+
+		lnode_list[cur_cnt++] = sln;
+
+		/* Traverse children lnodes */
+		list_for_each(cur_cln, &sln->cln_head)
+			lnode_list[cur_cnt++] = (struct csio_lnode *) cur_cln;
+	}
+	spin_unlock_irq(&hw->lock);
+
+	for (ii = 0; ii < cur_cnt; ii++) {
+		csio_dbg(hw, "Blocking IOs on lnode: %p\n", lnode_list[ii]);
+		ln = lnode_list[ii];
+		shost = csio_ln_to_shost(ln);
+		scsi_block_requests(shost);
+	}
+	kfree(lnode_list);
+}
+
+/*
+ * csio_lnodes_unblock_by_port - Unblock SCSI requests on the lnodes of one
+ * port.
+ * @hw: HW module.
+ * @portid: Port whose lnodes are to be unblocked.
+ *
+ * Snapshots the sibling lnodes whose portid matches @portid, together with
+ * their children, into a temporary array under hw->lock, then calls
+ * scsi_unblock_requests() on each one's Scsi_Host with the lock dropped.
+ * If the array cannot be allocated, an error is logged and nothing is
+ * unblocked.
+ */
+void
+csio_lnodes_unblock_by_port(struct csio_hw *hw, uint8_t portid)
+{
+	struct csio_lnode *ln;
+	struct Scsi_Host  *shost;
+	struct csio_lnode *sln;
+	struct list_head *cur_ln, *cur_cln;
+	struct csio_lnode **lnode_list;
+	int cur_cnt = 0, ii;
+
+	/* kcalloc() fails cleanly if count * size would overflow. */
+	lnode_list = kcalloc(hw->num_lns, sizeof(*lnode_list), GFP_KERNEL);
+	if (!lnode_list) {
+		csio_err(hw, "Failed to allocate lnodes_list\n");
+		return;
+	}
+
+	spin_lock_irq(&hw->lock);
+	/*
+	 * Traverse sibling lnodes. The casts assume the list linkage is the
+	 * first member of struct csio_lnode - TODO confirm.
+	 */
+	list_for_each(cur_ln, &hw->sln_head) {
+		sln = (struct csio_lnode *) cur_ln;
+		if (sln->portid != portid)
+			continue;
+		lnode_list[cur_cnt++] = sln;
+
+		/* Traverse children lnodes */
+		list_for_each(cur_cln, &sln->cln_head)
+			lnode_list[cur_cnt++] = (struct csio_lnode *) cur_cln;
+	}
+	spin_unlock_irq(&hw->lock);
+
+	for (ii = 0; ii < cur_cnt; ii++) {
+		csio_dbg(hw, "unblocking IOs on lnode: %p\n", lnode_list[ii]);
+		ln = lnode_list[ii];
+		shost = csio_ln_to_shost(ln);
+		scsi_unblock_requests(shost);
+	}
+	kfree(lnode_list);
+}
+
+/*
+ * csio_lnodes_exit - Tear down the lnodes of the given HW.
+ * @hw: HW module.
+ * @npiv: If true, only the child (NPIV) lnodes are deleted.
+ *
+ * Child lnodes are collected under hw->lock and then terminated through
+ * fc_vport_terminate(). Unless @npiv is set, the sibling (physical) lnodes
+ * are collected the same way afterwards and removed with
+ * csio_shost_exit(). If the scratch array cannot be allocated, an error is
+ * logged and nothing is deleted.
+ */
+void
+csio_lnodes_exit(struct csio_hw *hw, bool npiv)
+{
+	struct csio_lnode **lnodes;
+	struct csio_lnode *parent;
+	struct list_head *pos, *cpos;
+	int nr = 0, n;
+
+	lnodes = kzalloc((sizeof(struct csio_lnode *) * hw->num_lns),
+			 GFP_KERNEL);
+	if (!lnodes) {
+		csio_err(hw, "lnodes_exit: Failed to allocate lnodes_list.\n");
+		return;
+	}
+
+	/* Pass 1: gather all child lnodes (NPIV ports) under the lock. */
+	spin_lock_irq(&hw->lock);
+	list_for_each(pos, &hw->sln_head) {
+		parent = (struct csio_lnode *) pos;
+
+		list_for_each(cpos, &parent->cln_head)
+			lnodes[nr++] = (struct csio_lnode *) cpos;
+	}
+	spin_unlock_irq(&hw->lock);
+
+	/* Delete NPIV lnodes */
+	for (n = 0; n < nr; n++) {
+		csio_dbg(hw, "Deleting child lnode: %p\n", lnodes[n]);
+		fc_vport_terminate(lnodes[n]->fc_vport);
+	}
+
+	if (!npiv) {
+		/* Pass 2: gather the physical (sibling) lnodes. */
+		nr = 0;
+		spin_lock_irq(&hw->lock);
+		list_for_each(pos, &hw->sln_head)
+			lnodes[nr++] = (struct csio_lnode *) pos;
+		spin_unlock_irq(&hw->lock);
+
+		/* Delete physical lnodes */
+		for (n = 0; n < nr; n++) {
+			csio_dbg(hw, "Deleting parent lnode: %p\n", lnodes[n]);
+			csio_shost_exit(lnodes[n]);
+		}
+	}
+
+	kfree(lnodes);
+}
+
+/*
+ * csio_lnode_init_post: Set lnode attributes after starting HW.
+ * @ln: lnode.
+ *
+ * Calls csio_fchost_attr_init() on the lnode and then starts a scan of the
+ * lnode's Scsi_Host with scsi_scan_host().
+ */
+static void
+csio_lnode_init_post(struct csio_lnode *ln)
+{
+	struct Scsi_Host  *shost = csio_ln_to_shost(ln);
+
+	csio_fchost_attr_init(ln);
+
+	scsi_scan_host(shost);
+}
+
+/*
+ * csio_probe_one - Instantiate the driver on this PCI function.
+ * @pdev: PCI device
+ * @id: Device ID
+ *
+ * This is the .probe() callback of the driver. This function:
+ * - Initializes the PCI function by enabling MMIO, setting bus
+ *   mastership and setting DMA mask.
+ * - Allocates HW structure, DMA, memory resources, maps BARS to
+ *   host memory and initializes HW module.
+ * - Starts the chip via csio_hw_start(). If that fails, the probe still
+ *   returns 0 ("debug mode") with the HW module allocated but no lnodes.
+ * - For every physical port, allocates an lnode structure via
+ *   csio_shost_init() (which registers the shost with the SCSI ML),
+ *   starts the lnode and initiates a scan of the host via scsi_scan_host.
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): fwrev_str is filled with an unbounded sprintf() and ends in
+ * a newline; its size is not visible here - confirm it is large enough.
+ */
+static int __devinit
+csio_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	int rv;
+	int bars;
+	int i;
+	struct csio_hw *hw;
+	struct csio_lnode *ln;
+
+	rv = csio_pci_init(pdev, &bars);
+	if (rv)
+		goto err;
+
+	hw = csio_hw_alloc(pdev);
+	if (!hw) {
+		rv = -ENODEV;
+		goto err_pci_exit;
+	}
+
+	pci_set_drvdata(pdev, hw);
+
+	if (csio_hw_start(hw) != 0) {
+		dev_err(&pdev->dev,
+			"Failed to start FW, continuing in debug mode.\n");
+		return 0;	/* deliberate: keep the device bound */
+	}
+
+	sprintf(hw->fwrev_str, "%u.%u.%u.%u\n",
+		    FW_HDR_FW_VER_MAJOR_GET(hw->fwrev),
+		    FW_HDR_FW_VER_MINOR_GET(hw->fwrev),
+		    FW_HDR_FW_VER_MICRO_GET(hw->fwrev),
+		    FW_HDR_FW_VER_BUILD_GET(hw->fwrev));
+
+	for (i = 0; i < hw->num_pports; i++) {
+		ln = csio_shost_init(hw, &pdev->dev, true, NULL);
+		if (!ln) {
+			rv = -ENODEV;
+			break;
+		}
+		/* Initialize portid */
+		ln->portid = hw->pport[i].portid;
+
+		spin_lock_irq(&hw->lock);
+		if (csio_lnode_start(ln) != 0)
+			rv = -ENODEV;
+		spin_unlock_irq(&hw->lock);
+
+		if (rv)
+			break;
+
+		csio_lnode_init_post(ln);
+	}
+
+	if (rv)
+		goto err_lnode_exit;
+
+	return 0;
+
+err_lnode_exit:
+	csio_lnodes_block_request(hw);
+	spin_lock_irq(&hw->lock);
+	csio_hw_stop(hw);
+	spin_unlock_irq(&hw->lock);
+	csio_lnodes_unblock_request(hw);
+	pci_set_drvdata(hw->pdev, NULL);
+	csio_lnodes_exit(hw, 0);
+	csio_hw_free(hw);
+err_pci_exit:
+	csio_pci_exit(pdev, &bars);
+err:
+	dev_err(&pdev->dev, "probe of device failed: %d\n", rv);
+	return rv;
+}
+
+/*
+ * csio_remove_one - Remove one instance of the driver at this PCI function.
+ * @pdev: PCI device
+ *
+ * Used during hotplug operation. Stops the HW with requests blocked on
+ * all lnodes, removes all lnodes, frees the HW module and releases the
+ * PCI resources.
+ */
+static void __devexit
+csio_remove_one(struct pci_dev *pdev)
+{
+	struct csio_hw *hw = pci_get_drvdata(pdev);
+	int bars = pci_select_bars(pdev, IORESOURCE_MEM);
+
+	csio_lnodes_block_request(hw);
+	spin_lock_irq(&hw->lock);
+
+	/* Stops lnode, Rnode s/m
+	 * Quiesce IOs.
+	 * All sessions with remote ports are unregistered.
+	 * csio_hw_stop() is called with hw->lock held.
+	 */
+	csio_hw_stop(hw);
+	spin_unlock_irq(&hw->lock);
+	csio_lnodes_unblock_request(hw);
+
+	csio_lnodes_exit(hw, 0);
+	csio_hw_free(hw);
+	pci_set_drvdata(pdev, NULL);
+	csio_pci_exit(pdev, &bars);
+}
+
+/*
+ * csio_pci_error_detected - PCI error was detected
+ * @pdev: PCI device
+ * @state: Channel state reported with the error.
+ *
+ * Posts the error event to the HW state machine with requests blocked on
+ * all lnodes, then removes all lnodes, disables interrupts and disables
+ * the PCI device.
+ *
+ * Returns PCI_ERS_RESULT_DISCONNECT if @state is
+ * pci_channel_io_perm_failure, PCI_ERS_RESULT_NEED_RESET otherwise.
+ */
+static pci_ers_result_t
+csio_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct csio_hw *hw = pci_get_drvdata(pdev);
+
+	csio_lnodes_block_request(hw);
+	spin_lock_irq(&hw->lock);
+
+	/* Post PCI error detected evt to HW s/m
+	 * HW s/m handles this evt by quiescing IOs, unregisters rports
+	 * and finally takes the device to offline.
+	 */
+	csio_post_event(&hw->sm, CSIO_HWE_PCIERR_DETECTED);
+	spin_unlock_irq(&hw->lock);
+	csio_lnodes_unblock_request(hw);
+	csio_lnodes_exit(hw, 0);
+	csio_intr_disable(hw, true);
+	pci_disable_device(pdev);
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+/*
+ * csio_pci_slot_reset - PCI slot has been reset.
+ * @pdev: PCI device
+ *
+ * Re-enables the device, restores its PCI state and posts the slot-reset
+ * event to the HW state machine.
+ *
+ * Returns PCI_ERS_RESULT_RECOVERED if the HW is ready afterwards, or
+ * PCI_ERS_RESULT_DISCONNECT if the device cannot be re-enabled or the HW
+ * does not become ready.
+ */
+static pci_ers_result_t
+csio_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct csio_hw *hw = pci_get_drvdata(pdev);
+	bool ready;
+
+	if (pci_enable_device(pdev)) {
+		dev_err(&pdev->dev, "cannot re-enable device in slot reset\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+	pci_cleanup_aer_uncorrect_error_status(pdev);
+
+	/* Bring HW s/m to ready state.
+	 * but don't resume IOs.
+	 */
+	spin_lock_irq(&hw->lock);
+	csio_post_event(&hw->sm, CSIO_HWE_PCIERR_SLOT_RESET);
+	ready = csio_is_hw_ready(hw);
+	spin_unlock_irq(&hw->lock);
+
+	if (!ready) {
+		dev_err(&pdev->dev, "Can't initialize HW when in slot reset\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+/*
+ * csio_pci_resume - Resume normal operations
+ * @pdev: PCI device
+ *
+ * Re-creates and starts one lnode per physical port, the same way
+ * csio_probe_one() does. On failure the HW is stopped, all lnodes are
+ * removed and the HW module is freed.
+ *
+ * NOTE(review): the failure path frees hw but, unlike the probe error
+ * path, does not clear the PCI drvdata, so a later csio_remove_one()
+ * would appear to operate on freed memory - confirm.
+ */
+static void
+csio_pci_resume(struct pci_dev *pdev)
+{
+	struct csio_hw *hw = pci_get_drvdata(pdev);
+	struct csio_lnode *ln;
+	int rv = 0;
+	int i;
+
+	/* Bring the LINK UP and Resume IO */
+
+	for (i = 0; i < hw->num_pports; i++) {
+		ln = csio_shost_init(hw, &pdev->dev, true, NULL);
+		if (!ln) {
+			rv = -ENODEV;
+			break;
+		}
+		/* Initialize portid */
+		ln->portid = hw->pport[i].portid;
+
+		spin_lock_irq(&hw->lock);
+		if (csio_lnode_start(ln) != 0)
+			rv = -ENODEV;
+		spin_unlock_irq(&hw->lock);
+
+		if (rv)
+			break;
+
+		csio_lnode_init_post(ln);
+	}
+
+	if (rv)
+		goto err_resume_exit;
+
+	return;
+
+err_resume_exit:
+	csio_lnodes_block_request(hw);
+	spin_lock_irq(&hw->lock);
+	csio_hw_stop(hw);
+	spin_unlock_irq(&hw->lock);
+	csio_lnodes_unblock_request(hw);
+	csio_lnodes_exit(hw, 0);
+	csio_hw_free(hw);
+	dev_err(&pdev->dev, "resume of device failed: %d\n", rv);
+}
+
+static struct pci_error_handlers csio_err_handler = {
+	.error_detected = csio_pci_error_detected,
+	.slot_reset	= csio_pci_slot_reset,
+	.resume		= csio_pci_resume,
+};
+
+/* PCI device IDs claimed by this driver; the zeroed entry ends the table. */
+static DEFINE_PCI_DEVICE_TABLE(csio_pci_tbl) = {
+	CSIO_DEVICE(CSIO_DEVID_T440DBG_FCOE, 0),	/* T440DBG FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T420CR_FCOE, 0),		/* T420CR FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T422CR_FCOE, 0),		/* T422CR FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T440CR_FCOE, 0),		/* T440CR FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T420BCH_FCOE, 0),	/* T420BCH FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T440BCH_FCOE, 0),	/* T440BCH FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T440CH_FCOE, 0),		/* T440CH FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T420SO_FCOE, 0),		/* T420SO FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T420CX_FCOE, 0),		/* T420CX FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T420BT_FCOE, 0),		/* T420BT FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T404BT_FCOE, 0),		/* T404BT FCOE */
+	CSIO_DEVICE(CSIO_DEVID_B420_FCOE, 0),		/* B420 FCOE */
+	CSIO_DEVICE(CSIO_DEVID_B404_FCOE, 0),		/* B404 FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T480CR_FCOE, 0),		/* T480 CR FCOE */
+	CSIO_DEVICE(CSIO_DEVID_T440LPCR_FCOE, 0),	/* T440 LP-CR FCOE */
+	CSIO_DEVICE(CSIO_DEVID_PE10K, 0),		/* PE10K FCOE */
+	CSIO_DEVICE(CSIO_DEVID_PE10K_PF1, 0),		/* PE10K FCOE on PF1 */
+	{ 0, }
+};
+
+
+static struct pci_driver csio_pci_driver = {
+	.name		= KBUILD_MODNAME,
+	.driver		= {
+		.owner	= THIS_MODULE,
+	},
+	.id_table	= csio_pci_tbl,
+	.probe		= csio_probe_one,
+	.remove		= csio_remove_one,
+	.err_handler	= &csio_err_handler,
+};
+
+/*
+ * csio_init - Chelsio storage driver initialization function.
+ *
+ * Sets up debugfs, attaches the physical-port and vport FC transport
+ * templates and registers the PCI driver.
+ *
+ * Returns 0 on success, -ENOMEM if a transport cannot be attached, or the
+ * error from pci_register_driver(). On failure everything set up so far is
+ * undone.
+ */
+static int __init
+csio_init(void)
+{
+	int rv;
+
+	pr_info("%s %s\n", CSIO_DRV_DESC, CSIO_DRV_VERSION);
+
+	csio_dfs_init();
+
+	csio_fcoe_transport = fc_attach_transport(&csio_fc_transport_funcs);
+	if (!csio_fcoe_transport) {
+		rv = -ENOMEM;
+		goto undo_dfs;
+	}
+
+	csio_fcoe_transport_vport =
+			fc_attach_transport(&csio_fc_transport_vport_funcs);
+	if (!csio_fcoe_transport_vport) {
+		rv = -ENOMEM;
+		goto undo_transport;
+	}
+
+	rv = pci_register_driver(&csio_pci_driver);
+	if (rv)
+		goto undo_transport_vport;
+
+	return 0;
+
+undo_transport_vport:
+	fc_release_transport(csio_fcoe_transport_vport);
+undo_transport:
+	fc_release_transport(csio_fcoe_transport);
+undo_dfs:
+	csio_dfs_exit();
+	return rv;
+}
+
+/*
+ * csio_exit - Chelsio storage driver uninitialization.
+ *
+ * Function that gets called in the unload path. Unregisters the PCI
+ * driver, removes the driver's debugfs root and releases both FC
+ * transport templates attached in csio_init().
+ */
+static void __exit
+csio_exit(void)
+{
+	pci_unregister_driver(&csio_pci_driver);
+	csio_dfs_exit();
+	fc_release_transport(csio_fcoe_transport_vport);
+	fc_release_transport(csio_fcoe_transport);
+}
+
+module_init(csio_init);
+module_exit(csio_exit);
+
+/* Module metadata */
+MODULE_LICENSE(CSIO_DRV_LICENSE);
+MODULE_AUTHOR(CSIO_DRV_AUTHOR);
+MODULE_DESCRIPTION(CSIO_DRV_DESC);
+MODULE_VERSION(CSIO_DRV_VERSION);
+MODULE_FIRMWARE(CSIO_FW_FNAME);
+MODULE_DEVICE_TABLE(pci, csio_pci_tbl);
diff --git a/drivers/scsi/csiostor/csio_wr.c b/drivers/scsi/csiostor/csio_wr.c
new file mode 100644
index 0000000..329c6df
--- /dev/null
+++ b/drivers/scsi/csiostor/csio_wr.c
@@ -0,0 +1,1632 @@
+/*
+ * This file is part of the Chelsio FCoE driver for Linux.
+ *
+ * Copyright (c) 2008-2012 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <linux/slab.h>
+#include <asm/page.h>
+#include <linux/cache.h>
+
+#include "csio_hw.h"
+#include "csio_wr.h"
+#include "csio_mb.h"
+#include "csio_defs.h"
+
+int csio_intr_coalesce_cnt;		/* value:SGE_INGRESS_RX_THRESHOLD[0] */
+static int csio_sge_thresh_reg;		/* sent to FW as iqintcntthresh */
+
+int csio_intr_coalesce_time = 10;	/* value:SGE_TIMER_VALUE_1 */
+static int csio_sge_timer_reg = 1;	/* passed to TIMERREG() in GTS writes */
+
+#define CSIO_SET_FLBUF_SIZE(hw_, idx_, val_)				\
+	csio_wr_reg32((hw_), (val_), SGE_FL_BUFFER_SIZE##idx_)
+
+static void
+csio_get_flbuf_size(struct csio_hw *hw, struct csio_sge *sge, uint32_t reg)
+{
+	size_t off = reg * sizeof(uint32_t);	/* 32-bit register stride */
+	sge->sge_fl_buf_size[reg] = csio_rd_reg32(hw, SGE_FL_BUFFER_SIZE0 + off);
+}
+
+/* FL buffer size: low 4 bits of buf->paddr index sge->sge_fl_buf_size[] */
+static inline uint32_t
+csio_wr_fl_bufsz(struct csio_sge *sge, struct csio_dma_buf *buf)
+{
+	return sge->sge_fl_buf_size[buf->paddr & 0xF];
+}
+
+static inline uint32_t csio_wr_qstat_pgsz(struct csio_hw *hw)
+{
+	if (hw->wrm.sge.sge_control & EGRSTATUSPAGESIZE(1))
+		return 128;	/* egress status page size, flag set */
+	return 64;		/* egress status page size, flag clear */
+}
+
+/* Post pending freelist buffers to the SGE doorbell, 8 pointers at a time */
+static inline void
+csio_wr_ring_fldb(struct csio_hw *hw, struct csio_q *flq)
+{
+	uint32_t db_val;
+
+	/* One doorbell unit is 8 freelist pointers (CSIO_QCREDIT_SZ bytes,
+	 * 8 bytes per pointer); with fewer pending there is nothing to post.
+	 */
+	if (flq->inc_idx < 8)
+		return;
+
+	db_val = DBPRIO(1) | QID(flq->un.fl.flid) | PIDX(flq->inc_idx / 8);
+	csio_wr_reg32(hw, db_val, MYPF_REG(SGE_PF_KDOORBELL));
+	flq->inc_idx &= 7;	/* carry the remainder to the next ring */
+}
+
+/* Enable SGE interrupts for ingress queue @iqid via a zero-cidx GTS write */
+static void
+csio_wr_sge_intr_enable(struct csio_hw *hw, uint16_t iqid)
+{
+	uint32_t gts = TIMERREG(X_TIMERREG_RESTART_COUNTER);
+
+	gts |= CIDXINC(0) | INGRESSQID(iqid);
+	csio_wr_reg32(hw, gts, MYPF_REG(SGE_PF_GTS));
+}
+
+/*
+ * csio_wr_fill_fl - Populate the FL buffers of a FL queue.
+ * @hw: HW module.
+ * @flq: Freelist queue.
+ *
+ * Allocates one DMA buffer per queue credit, sized by
+ * sge_fl_buf_size[flq->un.fl.sreg], and writes each address into the
+ * queue. Returns 0, or -ENOMEM on the first failed allocation.
+ */
+static int
+csio_wr_fill_fl(struct csio_hw *hw, struct csio_q *flq)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+	__be64 *d = (__be64 *)(flq->vstart);
+	struct csio_dma_buf *buf = &flq->un.fl.bufs[0];
+	uint64_t paddr;
+	int sreg = flq->un.fl.sreg;
+	int total = flq->credits;
+	int filled;
+
+	for (filled = 0; filled < total; filled++, buf++) {
+		buf->len = sge->sge_fl_buf_size[sreg];
+		buf->vaddr = pci_alloc_consistent(hw->pdev, buf->len,
+						  &buf->paddr);
+		if (!buf->vaddr) {
+			/* Report how many were filled, not how many remain */
+			csio_err(hw, "Could only fill %d buffers!\n", filled);
+			return -ENOMEM;
+		}
+		/* Low 4 bits of the entry carry the size-register index */
+		paddr = buf->paddr | (sreg & 0xF);
+		*d++ = cpu_to_be64(paddr);
+	}
+
+	return 0;
+}
+
+/*
+ * csio_wr_update_fl - Advance a freelist's producer state by @n entries.
+ * @hw: HW module.
+ * @flq: Freelist queue.
+ * @n: Count added to both inc_idx and pidx.
+ * pidx wraps by subtracting flq->credits; bumps the n_flq_refill counter.
+ */
+static inline void
+csio_wr_update_fl(struct csio_hw *hw, struct csio_q *flq, uint16_t n)
+{
+
+	flq->inc_idx += n;
+	flq->pidx += n;
+	if (unlikely(flq->pidx >= flq->credits))
+		flq->pidx -= (uint16_t)flq->credits;
+
+	CSIO_INC_STATS(flq, n_flq_refill);
+}
+
+/*
+ * csio_wr_alloc_q - Allocate a WR queue and initialize it.
+ * @hw: HW module
+ * @qsize: Size of the queue in bytes
+ * @wrsize: Size of WR in this queue, if fixed.
+ * @type: Type of queue (Ingress/Egress/Freelist)
+ * @owner: Module that owns this queue.
+ * @nflb: Number of freelist buffers for FL.
+ * @sreg: What is the FL buffer size register?
+ * @iq_intx_handler: Ingress queue handler in INTx mode.
+ *
+ * This function allocates and sets up a queue for the caller
+ * of size qsize, aligned at the required boundary, provided a free
+ * entry is available in the queue array. The entry is initialized with
+ * the allocated queue, marked as being used (owner), and the queue's
+ * index into the q_arr array is returned; -1 is returned on failure.
+ * If user has indicated a freelist (by specifying nflb > 0), create
+ * another queue (with its own index into q_arr) for the freelist, allocate
+ * its DMA buffer metadata (vaddr, len etc) and save the freelist idx in
+ * the ingress queue's un.iq.flq_idx to associate the two.
+ * NOTE(review): failures after the queue memory is allocated return -1
+ * without freeing it here; confirm the teardown path reclaims it.
+ */
+int
+csio_wr_alloc_q(struct csio_hw *hw, uint32_t qsize, uint32_t wrsize,
+		uint16_t type, void *owner, uint32_t nflb, int sreg,
+		iq_handler_t iq_intx_handler)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_q	*q, *flq;
+	int		free_idx = wrm->free_qidx;
+	int		ret_idx = free_idx;
+	uint32_t	qsz;
+	int flq_idx;
+
+	if (free_idx >= wrm->num_q) {
+		csio_err(hw, "No more free queues.\n");
+		return -1;
+	}
+
+	switch (type) {
+	case CSIO_EGRESS:
+		qsz = ALIGN(qsize, CSIO_QCREDIT_SZ) + csio_wr_qstat_pgsz(hw);
+		break;
+	case CSIO_INGRESS:
+		switch (wrsize) {
+		case 16:
+		case 32:
+		case 64:
+		case 128:
+			break;
+		default:
+			csio_err(hw, "Invalid Ingress queue WR size:%d\n",
+				    wrsize);
+			return -1;
+		}
+
+		/*
+		 * Number of elements must be a multiple of 16
+		 * So this includes status page size
+		 */
+		qsz = ALIGN(qsize/wrsize, 16) * wrsize;
+
+		break;
+	case CSIO_FREELIST:
+		qsz = ALIGN(qsize/wrsize, 8) * wrsize + csio_wr_qstat_pgsz(hw);
+		break;
+	default:
+		csio_err(hw, "Invalid queue type: 0x%x\n", type);
+		return -1;
+	}
+
+	q = wrm->q_arr[free_idx];
+
+	q->vstart = pci_alloc_consistent(hw->pdev, qsz, &q->pstart);
+	if (!q->vstart) {
+		csio_err(hw,
+			 "Failed to allocate DMA memory for "
+			 "queue at id: %d size: %d\n", free_idx, qsize);
+		return -1;
+	}
+
+	/*
+	 * We need to zero out the contents, importantly for ingress,
+	 * since we start with a generation bit of 1 for ingress.
+	 */
+	memset(q->vstart, 0, qsz);
+
+	q->type		= type;
+	q->owner	= owner;
+	q->pidx		= q->cidx = q->inc_idx = 0;
+	q->size		= qsz;
+	q->wr_sz	= wrsize;	/* If using fixed size WRs */
+
+	wrm->free_qidx++;
+
+	if (type == CSIO_INGRESS) {
+		/* Since queue area is set to zero */
+		q->un.iq.genbit	= 1;
+
+		/*
+		 * Ingress queue status page size is always the size of
+		 * the ingress queue entry.
+		 */
+		q->credits	= (qsz - q->wr_sz) / q->wr_sz;
+		q->vwrap	= (void *)((uintptr_t)(q->vstart) + qsz
+							- q->wr_sz);
+
+		/* Allocate memory for FL if requested */
+		if (nflb > 0) {
+			flq_idx = csio_wr_alloc_q(hw, nflb * sizeof(__be64),
+						  sizeof(__be64), CSIO_FREELIST,
+						  owner, 0, sreg, NULL);
+			if (flq_idx == -1) {
+				csio_err(hw,
+					 "Failed to allocate FL queue"
+					 " for IQ idx:%d\n", free_idx);
+				return -1;
+			}
+
+			/* Associate the new FL with the Ingress queue */
+			q->un.iq.flq_idx = flq_idx;
+
+			flq = wrm->q_arr[q->un.iq.flq_idx];
+			flq->un.fl.bufs = kcalloc(flq->credits,
+						  sizeof(struct csio_dma_buf),
+						  GFP_KERNEL);
+			if (!flq->un.fl.bufs) {
+				csio_err(hw,
+					 "Failed to allocate FL queue bufs"
+					 " for IQ idx:%d\n", free_idx);
+				return -1;
+			}
+
+			flq->un.fl.packen = 0;
+			flq->un.fl.offset = 0;
+			flq->un.fl.sreg = sreg;
+
+			/* Fill up the free list buffers */
+			if (csio_wr_fill_fl(hw, flq))
+				return -1;
+
+			/*
+			 * Make sure in a FLQ, at least 1 credit (8 FL buffers)
+			 * remains unpopulated, otherwise HW thinks
+			 * FLQ is empty.
+			 */
+			flq->pidx = flq->inc_idx = flq->credits - 8;
+		} else {
+			q->un.iq.flq_idx = -1;
+		}
+
+		/* Associate the IQ INTx handler. */
+		q->un.iq.iq_intx_handler = iq_intx_handler;
+
+		csio_q_iqid(hw, ret_idx) = CSIO_MAX_QID;
+
+	} else if (type == CSIO_EGRESS) {
+		q->credits = (qsz - csio_wr_qstat_pgsz(hw)) / CSIO_QCREDIT_SZ;
+		q->vwrap   = (void *)((uintptr_t)(q->vstart) + qsz
+						- csio_wr_qstat_pgsz(hw));
+		csio_q_eqid(hw, ret_idx) = CSIO_MAX_QID;
+	} else { /* Freelist */
+		q->credits = (qsz - csio_wr_qstat_pgsz(hw)) / sizeof(__be64);
+		q->vwrap   = (void *)((uintptr_t)(q->vstart) + qsz
+						- csio_wr_qstat_pgsz(hw));
+		csio_q_flid(hw, ret_idx) = CSIO_MAX_QID;
+	}
+
+	return ret_idx;
+}
+
+/*
+ * csio_wr_iq_create_rsp - Response handler for IQ creation.
+ * @hw: The HW module.
+ * @mbp: Mailbox; always returned to hw->mb_mempool before returning.
+ * @iq_idx: Ingress queue that got created.
+ *
+ * Handle FW_IQ_CMD mailbox completion. Save off the assigned IQ/FL ids.
+ * Returns 0, or -EINVAL on a FW error or an out-of-range relative IQ id.
+ */
+static int
+csio_wr_iq_create_rsp(struct csio_hw *hw, struct csio_mb *mbp, int iq_idx)
+{
+	struct csio_iq_params iqp;
+	enum fw_retval retval;
+	uint32_t iq_id;
+	int flq_idx;
+
+	memset(&iqp, 0, sizeof(struct csio_iq_params));
+
+	csio_mb_iq_alloc_write_rsp(hw, mbp, &retval, &iqp);
+
+	if (retval != FW_SUCCESS) {
+		csio_err(hw, "IQ cmd returned 0x%x!\n", retval);
+		mempool_free(mbp, hw->mb_mempool);
+		return -EINVAL;
+	}
+
+	csio_q_iqid(hw, iq_idx)		= iqp.iqid;
+	csio_q_physiqid(hw, iq_idx)	= iqp.physiqid;
+	csio_q_pidx(hw, iq_idx)		= csio_q_cidx(hw, iq_idx) = 0;
+	csio_q_inc_idx(hw, iq_idx)	= 0;
+
+	/* Actual iq-id, relative to the FW's first IQ id. */
+	iq_id = iqp.iqid - hw->wrm.fw_iq_start;
+
+	/* Set the iq-id to iq map table. */
+	if (iq_id >= CSIO_MAX_IQ) {
+		csio_err(hw,
+			 "Exceeding MAX_IQ(%d) supported!"
+			 " iqid:%d rel_iqid:%d FW iq_start:%d\n",
+			 CSIO_MAX_IQ, iqp.iqid, iq_id, hw->wrm.fw_iq_start);
+		mempool_free(mbp, hw->mb_mempool);
+		return -EINVAL;
+	}
+	csio_q_set_intr_map(hw, iq_idx, iq_id);
+
+	/*
+	 * During FW_IQ_CMD, FW sets interrupt_sent bit to 1 in the SGE
+	 * ingress context of this queue. This will block interrupts to
+	 * this queue until the next GTS write. Therefore, we do a
+	 * 0-cidx increment GTS write for this queue just to clear the
+	 * interrupt_sent bit. This will re-enable interrupts to this
+	 * queue.
+	 */
+	csio_wr_sge_intr_enable(hw, iqp.physiqid);
+
+	flq_idx = csio_q_iq_flq_idx(hw, iq_idx);
+	if (flq_idx != -1) {
+		struct csio_q *flq = hw->wrm.q_arr[flq_idx];
+
+		csio_q_flid(hw, flq_idx) = iqp.fl0id;
+		csio_q_cidx(hw, flq_idx) = 0;
+		csio_q_pidx(hw, flq_idx)    = csio_q_credits(hw, flq_idx) - 8;
+		csio_q_inc_idx(hw, flq_idx) = csio_q_credits(hw, flq_idx) - 8;
+
+		/* Now update SGE about the buffers allocated during init */
+		csio_wr_ring_fldb(hw, flq);
+	}
+
+	mempool_free(mbp, hw->mb_mempool);
+
+	return 0;
+}
+
+/*
+ * csio_wr_iq_create - Configure an Ingress queue with FW.
+ * @hw: The HW module.
+ * @priv: Private data object.
+ * @iq_idx: Ingress queue index in the WR module.
+ * @vec: MSIX vector.
+ * @portid: PCIE Channel to be associated with this queue.
+ * @async: Is this a FW asynchronous message handling queue?
+ * @cbfn: Completion callback.
+ *
+ * Issues a FW_IQ_CMD mailbox with alloc/write bits set; with @cbfn NULL the
+ * response is handled before returning. Returns 0, -ENOMEM or -EINVAL.
+ */
+int
+csio_wr_iq_create(struct csio_hw *hw, void *priv, int iq_idx,
+		  uint32_t vec, uint8_t portid, bool async,
+		  void (*cbfn) (struct csio_hw *, struct csio_mb *))
+{
+	struct csio_mb  *mbp;
+	struct csio_iq_params iqp;
+	int flq_idx;
+
+	memset(&iqp, 0, sizeof(struct csio_iq_params));
+	csio_q_portid(hw, iq_idx) = portid;
+
+	mbp = mempool_alloc(hw->mb_mempool, GFP_ATOMIC);
+	if (!mbp) {
+		csio_err(hw, "IQ command out of memory!\n");
+		return -ENOMEM;
+	}
+
+	switch (hw->intr_mode) {
+	case CSIO_IM_INTX:
+	case CSIO_IM_MSI:
+		/* For interrupt forwarding queue only */
+		if (hw->intr_iq_idx == iq_idx)
+			iqp.iqandst	= X_INTERRUPTDESTINATION_PCIE;
+		else
+			iqp.iqandst	= X_INTERRUPTDESTINATION_IQ;
+		iqp.iqandstindex	=
+			csio_q_physiqid(hw, hw->intr_iq_idx);
+		break;
+	case CSIO_IM_MSIX:
+		iqp.iqandst		= X_INTERRUPTDESTINATION_PCIE;
+		iqp.iqandstindex	= (uint16_t)vec;
+		break;
+	case CSIO_IM_NONE:
+		mempool_free(mbp, hw->mb_mempool);
+		return -EINVAL;
+	}
+
+	/* Pass in the ingress queue cmd parameters */
+	iqp.pfn			= hw->pfn;
+	iqp.vfn			= 0;
+	iqp.iq_start		= 1;
+	iqp.viid		= 0;
+	iqp.type		= FW_IQ_TYPE_FL_INT_CAP;
+	iqp.iqasynch		= async;
+	if (csio_intr_coalesce_cnt)
+		iqp.iqanus	= X_UPDATESCHEDULING_COUNTER_OPTTIMER;
+	else
+		iqp.iqanus	= X_UPDATESCHEDULING_TIMER;
+	iqp.iqanud		= X_UPDATEDELIVERY_INTERRUPT;
+	iqp.iqpciech		= portid;
+	iqp.iqintcntthresh	= (uint8_t)csio_sge_thresh_reg;
+	/* Any other WR size leaves iqesize at 0 from the memset above */
+	switch (csio_q_wr_sz(hw, iq_idx)) {
+	case 16:
+		iqp.iqesize = 0; break;
+	case 32:
+		iqp.iqesize = 1; break;
+	case 64:
+		iqp.iqesize = 2; break;
+	case 128:
+		iqp.iqesize = 3; break;
+	}
+
+	iqp.iqsize		= csio_q_size(hw, iq_idx) /
+						csio_q_wr_sz(hw, iq_idx);
+	iqp.iqaddr		= csio_q_pstart(hw, iq_idx);
+
+	flq_idx = csio_q_iq_flq_idx(hw, iq_idx);
+	if (flq_idx != -1) {
+		struct csio_q *flq = hw->wrm.q_arr[flq_idx];
+
+		iqp.fl0paden	= 1;
+		iqp.fl0packen	= flq->un.fl.packen ? 1 : 0;
+		iqp.fl0fbmin	= X_FETCHBURSTMIN_64B;
+		iqp.fl0fbmax	= X_FETCHBURSTMAX_512B;
+		iqp.fl0size	= csio_q_size(hw, flq_idx) / CSIO_QCREDIT_SZ;
+		iqp.fl0addr	= csio_q_pstart(hw, flq_idx);
+	}
+
+	csio_mb_iq_alloc_write(hw, mbp, priv, CSIO_MB_DEFAULT_TMO, &iqp, cbfn);
+
+	if (csio_mb_issue(hw, mbp)) {
+		csio_err(hw, "Issue of IQ cmd failed!\n");
+		mempool_free(mbp, hw->mb_mempool);
+		return -EINVAL;
+	}
+	/* With a completion callback, return now; else handle the response */
+	if (cbfn != NULL)
+		return 0;
+
+	return csio_wr_iq_create_rsp(hw, mbp, iq_idx);
+}
+
+/*
+ * csio_wr_eq_cfg_rsp - Completion handling for an EQ create command.
+ * @hw: The HW module.
+ * @mbp: Mailbox holding the FW_EQ_OFLD_CMD response.
+ * @eq_idx: Egress queue the command was issued for.
+ *
+ * Records the EQ ids assigned by FW and resets the queue indices.
+ */
+static int
+csio_wr_eq_cfg_rsp(struct csio_hw *hw, struct csio_mb *mbp, int eq_idx)
+{
+	struct csio_eq_params eqp;
+	enum fw_retval retval;
+	int rv = 0;
+
+	memset(&eqp, 0, sizeof(eqp));
+	csio_mb_eq_ofld_alloc_write_rsp(hw, mbp, &retval, &eqp);
+
+	if (retval == FW_SUCCESS) {
+		csio_q_eqid(hw, eq_idx) = (uint16_t)eqp.eqid;
+		csio_q_physeqid(hw, eq_idx) = (uint16_t)eqp.physeqid;
+		csio_q_cidx(hw, eq_idx) = 0;
+		csio_q_pidx(hw, eq_idx) = 0;
+		csio_q_inc_idx(hw, eq_idx) = 0;
+	} else {
+		csio_err(hw, "EQ OFLD cmd returned 0x%x!\n", retval);
+		rv = -EINVAL;
+	}
+
+	/* The mailbox goes back to the pool on both outcomes */
+	mempool_free(mbp, hw->mb_mempool);
+	return rv;
+}
+
+/*
+ * csio_wr_eq_create - Configure an Egress queue with FW.
+ * @hw: HW module.
+ * @priv: Private data.
+ * @eq_idx: Egress queue index in the WR module.
+ * @iq_idx: Associated ingress queue index.
+ * @portid: Stored in the command's pciechn field.
+ * @cbfn: Completion callback.
+ * Issues a FW_EQ_OFLD_CMD (alloc + write) mailbox for an offload egress
+ * queue. Returns 0, -ENOMEM or -EINVAL; handled inline if @cbfn is NULL.
+ */
+int
+csio_wr_eq_create(struct csio_hw *hw, void *priv, int eq_idx,
+		  int iq_idx, uint8_t portid,
+		  void (*cbfn) (struct csio_hw *, struct csio_mb *))
+{
+	struct csio_mb  *mbp;
+	struct csio_eq_params eqp;
+
+	memset(&eqp, 0, sizeof(struct csio_eq_params));
+
+	mbp = mempool_alloc(hw->mb_mempool, GFP_ATOMIC);
+	if (!mbp) {
+		csio_err(hw, "EQ command out of memory!\n");
+		return -ENOMEM;
+	}
+
+	eqp.pfn			= hw->pfn;
+	eqp.vfn			= 0;
+	eqp.eqstart		= 1;
+	eqp.hostfcmode		= X_HOSTFCMODE_STATUS_PAGE;
+	eqp.iqid		= csio_q_iqid(hw, iq_idx);
+	eqp.fbmin		= X_FETCHBURSTMIN_64B;
+	eqp.fbmax		= X_FETCHBURSTMAX_512B;
+	eqp.cidxfthresh		= 0;
+	eqp.pciechn		= portid;
+	eqp.eqsize		= csio_q_size(hw, eq_idx) / CSIO_QCREDIT_SZ;
+	eqp.eqaddr		= csio_q_pstart(hw, eq_idx);
+
+	csio_mb_eq_ofld_alloc_write(hw, mbp, priv, CSIO_MB_DEFAULT_TMO,
+				    &eqp, cbfn);
+
+	if (csio_mb_issue(hw, mbp)) {
+		csio_err(hw, "Issue of EQ OFLD cmd failed!\n");
+		mempool_free(mbp, hw->mb_mempool);
+		return -EINVAL;
+	}
+
+	if (cbfn != NULL)
+		return 0;
+
+	return csio_wr_eq_cfg_rsp(hw, mbp, eq_idx);
+}
+
+/*
+ * csio_wr_iq_destroy_rsp - Completion handling for an IQ free command.
+ * @hw: The HW module.
+ * @mbp: Mailbox holding the FW_IQ_CMD (free) response; released here.
+ * @iq_idx: Ingress queue that was freed.
+ *
+ * Returns 0 if FW reported success, -EINVAL otherwise.
+ */
+static int
+csio_wr_iq_destroy_rsp(struct csio_hw *hw, struct csio_mb *mbp, int iq_idx)
+{
+	/* Latch the FW status before the mailbox goes back to the pool */
+	enum fw_retval fw_status = csio_mb_fw_retval(mbp);
+
+	mempool_free(mbp, hw->mb_mempool);
+
+	if (fw_status == FW_SUCCESS)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * csio_wr_iq_destroy - Free an ingress queue.
+ * @hw: The HW module.
+ * @priv: Private data object.
+ * @iq_idx: Ingress queue index to destroy
+ * @cbfn: Completion callback.
+ *
+ * This API frees an ingress queue by issuing the FW_IQ_CMD with the free
+ * bit set. Returns 0, -ENOMEM, or the csio_mb_issue()/response error.
+ */
+static int
+csio_wr_iq_destroy(struct csio_hw *hw, void *priv, int iq_idx,
+		   void (*cbfn)(struct csio_hw *, struct csio_mb *))
+{
+	int rv = 0;
+	struct csio_mb  *mbp;
+	struct csio_iq_params iqp;
+	int flq_idx;
+
+	memset(&iqp, 0, sizeof(struct csio_iq_params));
+
+	mbp = mempool_alloc(hw->mb_mempool, GFP_ATOMIC);
+	if (!mbp)
+		return -ENOMEM;
+
+	iqp.pfn		= hw->pfn;
+	iqp.vfn		= 0;
+	iqp.iqid	= csio_q_iqid(hw, iq_idx);
+	iqp.type	= FW_IQ_TYPE_FL_INT_CAP;
+
+	flq_idx = csio_q_iq_flq_idx(hw, iq_idx);
+	if (flq_idx != -1)
+		iqp.fl0id = csio_q_flid(hw, flq_idx);
+	else
+		iqp.fl0id = 0xFFFF;
+	/* fl1id is always 0xFFFF; fl0id only when no freelist is attached */
+	iqp.fl1id = 0xFFFF;
+
+	csio_mb_iq_free(hw, mbp, priv, CSIO_MB_DEFAULT_TMO, &iqp, cbfn);
+
+	rv = csio_mb_issue(hw, mbp);
+	if (rv != 0) {
+		mempool_free(mbp, hw->mb_mempool);
+		return rv;
+	}
+
+	if (cbfn != NULL)
+		return 0;
+
+	return csio_wr_iq_destroy_rsp(hw, mbp, iq_idx);
+}
+
+/*
+ * csio_wr_eq_destroy_rsp - Completion handling for an OFLD EQ free command.
+ * @hw: The HW module.
+ * @mbp: Mailbox holding the FW_EQ_OFLD_CMD (free) response; released here.
+ * @eq_idx: Egress queue that was freed.
+ *
+ * Returns 0 if FW reported success, -EINVAL otherwise.
+ */
+static int
+csio_wr_eq_destroy_rsp(struct csio_hw *hw, struct csio_mb *mbp, int eq_idx)
+{
+	/* Latch the FW status before the mailbox goes back to the pool */
+	enum fw_retval fw_status = csio_mb_fw_retval(mbp);
+
+	mempool_free(mbp, hw->mb_mempool);
+
+	if (fw_status == FW_SUCCESS)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * csio_wr_eq_destroy - Free an Egress queue.
+ * @hw: The HW module.
+ * @priv: Private data object.
+ * @eq_idx: Egress queue index to destroy
+ * @cbfn: Completion callback.
+ *
+ * This API frees an Egress queue by issuing the FW_EQ_OFLD_CMD with the free
+ * bit set. Returns 0, -ENOMEM, or the csio_mb_issue()/response error.
+ */
+static int
+csio_wr_eq_destroy(struct csio_hw *hw, void *priv, int eq_idx,
+		   void (*cbfn) (struct csio_hw *, struct csio_mb *))
+{
+	int rv = 0;
+	struct csio_mb  *mbp;
+	struct csio_eq_params eqp;
+
+	memset(&eqp, 0, sizeof(struct csio_eq_params));
+
+	mbp = mempool_alloc(hw->mb_mempool, GFP_ATOMIC);
+	if (!mbp)
+		return -ENOMEM;
+
+	eqp.pfn		= hw->pfn;
+	eqp.vfn		= 0;
+	eqp.eqid	= csio_q_eqid(hw, eq_idx);
+
+	csio_mb_eq_ofld_free(hw, mbp, priv, CSIO_MB_DEFAULT_TMO, &eqp, cbfn);
+
+	rv = csio_mb_issue(hw, mbp);
+	if (rv != 0) {
+		mempool_free(mbp, hw->mb_mempool);
+		return rv;
+	}
+
+	if (cbfn != NULL)
+		return 0;
+
+	return csio_wr_eq_destroy_rsp(hw, mbp, eq_idx);
+}
+
+/*
+ * csio_wr_cleanup_eq_stpg - Cleanup Egress queue status page
+ * @hw: HW module
+ * @qidx: Egress queue index
+ *
+ * Zeroes the csio_qstatus_page located at the queue's vwrap address.
+ */
+static void
+csio_wr_cleanup_eq_stpg(struct csio_hw *hw, int qidx)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_q *eq = wrm->q_arr[qidx];
+
+	memset(eq->vwrap, 0, sizeof(struct csio_qstatus_page));
+}
+
+/*
+ * csio_wr_cleanup_iq_ftr - Cleanup Footer entries in IQ
+ * @hw: HW module
+ * @qidx: Ingress queue index
+ *
+ * Zeroes the footer of every entry in the given ingress queue and
+ * resets the driver's copy of the generation bit to 1.
+ */
+static void
+csio_wr_cleanup_iq_ftr(struct csio_hw *hw, int qidx)
+{
+	struct csio_q *iq = csio_hw_to_wrm(hw)->q_arr[qidx];
+	const size_t ftr_sz = sizeof(struct csio_iqwr_footer);
+	uintptr_t ftr_addr;
+	uint32_t n;
+
+	/*
+	 * The footers are about to be zeroed, generation bit included,
+	 * so the driver's expected generation restarts at 1.
+	 */
+	iq->un.iq.genbit = 1;
+
+	for (n = 0; n < iq->credits; n++) {
+		/* The footer occupies the last ftr_sz bytes of the n-th
+		 * fixed-size WR.
+		 */
+		ftr_addr = (uintptr_t)iq->vstart + (n * iq->wr_sz) +
+			   (iq->wr_sz - ftr_sz);
+		memset((void *)ftr_addr, 0, ftr_sz);
+	}
+}
+
+/*
+ * csio_wr_destroy_queues - Mark every allocated EQ/IQ (and its FL) invalid.
+ * @cmd: also issue FW free commands (cleared on -EBUSY/-ETIMEDOUT). Returns 0.
+ */
+int
+csio_wr_destroy_queues(struct csio_hw *hw, bool cmd)
+{
+	int i, flq_idx;
+	struct csio_q *q;
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	int rv;
+
+	for (i = 0; i < wrm->free_qidx; i++) {
+		q = wrm->q_arr[i];
+		switch (q->type) {
+		case CSIO_EGRESS:
+			if (csio_q_eqid(hw, i) != CSIO_MAX_QID) {
+				csio_wr_cleanup_eq_stpg(hw, i);
+				if (!cmd) {
+					csio_q_eqid(hw, i) = CSIO_MAX_QID;
+					continue;
+				}
+				rv = csio_wr_eq_destroy(hw, NULL, i, NULL);
+				if ((rv == -EBUSY) || (rv == -ETIMEDOUT))
+					cmd = false;
+				csio_q_eqid(hw, i) = CSIO_MAX_QID;
+			}
+		/* NOTE(review): no break, falls into CSIO_INGRESS - confirm */
+		case CSIO_INGRESS:
+			if (csio_q_iqid(hw, i) != CSIO_MAX_QID) {
+				csio_wr_cleanup_iq_ftr(hw, i);
+				if (!cmd) {
+					csio_q_iqid(hw, i) = CSIO_MAX_QID;
+					flq_idx = csio_q_iq_flq_idx(hw, i);
+					if (flq_idx != -1)
+						csio_q_flid(hw, flq_idx) =
+								CSIO_MAX_QID;
+					continue;
+				}
+				rv = csio_wr_iq_destroy(hw, NULL, i, NULL);
+				if ((rv == -EBUSY) || (rv == -ETIMEDOUT))
+					cmd = false;
+				csio_q_iqid(hw, i) = CSIO_MAX_QID;
+				flq_idx = csio_q_iq_flq_idx(hw, i);
+				if (flq_idx != -1)
+					csio_q_flid(hw, flq_idx) = CSIO_MAX_QID;
+			}
+		default:
+			break;
+		}
+	}
+
+	hw->flags &= ~CSIO_HWF_Q_FW_ALLOCED;
+
+	return 0;
+}
+
+/*
+ * csio_wr_get - Get requested size of WR entry/entries from queue.
+ * @hw: HW module.
+ * @qidx: Index of queue.
+ * @size: Cumulative size of Work request(s); rounded up to CSIO_QCREDIT_SZ.
+ * @wrp: Work request pair.
+ *
+ * If requested credits are available, return the start address of the
+ * work request in the work request pair, advance pidx and record the
+ * credits taken in inc_idx. Returns 0, or -EBUSY if the queue is full.
+ *
+ * NOTE about WR pair:
+ * ==================
+ * A WR can start towards the end of a queue, and then continue at the
+ * beginning, since the queue is considered to be circular. This will
+ * require a pair of address/size to be passed back to the caller -
+ * hence Work request pair format.
+ */
+int
+csio_wr_get(struct csio_hw *hw, int qidx, uint32_t size,
+	    struct csio_wr_pair *wrp)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_q *q = wrm->q_arr[qidx];
+	void *cwr = (void *)((uintptr_t)(q->vstart) +
+						(q->pidx * CSIO_QCREDIT_SZ));
+	struct csio_qstatus_page *stp = (struct csio_qstatus_page *)q->vwrap;
+	uint16_t cidx = q->cidx = ntohs(stp->cidx);
+	uint16_t pidx = q->pidx;
+	uint32_t req_sz	= ALIGN(size, CSIO_QCREDIT_SZ);
+	int req_credits	= req_sz / CSIO_QCREDIT_SZ;
+	int credits;
+
+	CSIO_DB_ASSERT(q->owner != NULL);
+	CSIO_DB_ASSERT((qidx >= 0) && (qidx < wrm->free_qidx));
+	CSIO_DB_ASSERT(cidx <= q->credits);
+
+	/* Calculate credits */
+	if (pidx > cidx) {
+		credits = q->credits - (pidx - cidx) - 1;
+	} else if (cidx > pidx) {
+		credits = cidx - pidx - 1;
+	} else {
+		/* cidx == pidx, empty queue */
+		credits = q->credits;
+		CSIO_INC_STATS(q, n_qempty);
+	}
+
+	/*
+	 * Check if we have enough credits.
+	 * credits == 0 is treated as a full queue.
+	 */
+	if (!credits || (req_credits > credits)) {
+		CSIO_INC_STATS(q, n_qfull);
+		return -EBUSY;
+	}
+
+	/*
+	 * If we are here, we have enough credits to satisfy the
+	 * request. Check if we are near the end of q, and if WR spills over.
+	 * If it does, use the first addr/size to cover the queue until
+	 * the end. Fit the remainder portion of the request at the top
+	 * of queue and return it in the second addr/len. Set pidx
+	 * accordingly.
+	 */
+	if (unlikely(((uintptr_t)cwr + req_sz) > (uintptr_t)(q->vwrap))) {
+		wrp->addr1 = cwr;
+		wrp->size1 = (uint32_t)((uintptr_t)q->vwrap - (uintptr_t)cwr);
+		wrp->addr2 = q->vstart;
+		wrp->size2 = req_sz - wrp->size1;
+		q->pidx	= (uint16_t)(ALIGN(wrp->size2, CSIO_QCREDIT_SZ) /
+							CSIO_QCREDIT_SZ);
+		CSIO_INC_STATS(q, n_qwrap);
+		CSIO_INC_STATS(q, n_eq_wr_split);
+	} else {
+		wrp->addr1 = cwr;
+		wrp->size1 = req_sz;
+		wrp->addr2 = NULL;
+		wrp->size2 = 0;
+		q->pidx	+= (uint16_t)req_credits;
+
+		/* We are at the end of queue, roll back pidx to top of queue */
+		if (unlikely(q->pidx == q->credits)) {
+			q->pidx = 0;
+			CSIO_INC_STATS(q, n_qwrap);
+		}
+	}
+
+	q->inc_idx = (uint16_t)req_credits;
+
+	CSIO_INC_STATS(q, n_tot_reqs);
+
+	return 0;
+}
+
+/*
+ * csio_wr_copy_to_wrp - Copies given data into WR.
+ * @data_buf: Data buffer
+ * @wrp: Work request pair.
+ * @wr_off: Byte offset into the work request at which to start writing.
+ * @data_len: Data length.
+ *
+ * Copies into addr1 from @wr_off, spilling any remainder into addr2. An
+ * offset beyond size1 lands in addr2 rather than past the end of addr1.
+ * Returns: none
+ */
+void
+csio_wr_copy_to_wrp(void *data_buf, struct csio_wr_pair *wrp,
+		   uint32_t wr_off, uint32_t data_len)
+{
+	uint32_t avail = (wr_off < wrp->size1) ? wrp->size1 - wr_off : 0;
+	uint32_t off2 = (wr_off > wrp->size1) ? wr_off - wrp->size1 : 0;
+	uint32_t nbytes = min(avail, data_len);
+
+	if (nbytes)
+		memcpy((uint8_t *) wrp->addr1 + wr_off, data_buf, nbytes);
+	data_len -= nbytes;
+
+	/* Whatever did not fit in addr1 goes to the wrapped segment addr2 */
+	if (data_len) {
+		CSIO_DB_ASSERT(wrp->addr2 != NULL);
+		CSIO_DB_ASSERT(off2 + data_len <= wrp->size2);
+		memcpy((uint8_t *) wrp->addr2 + off2,
+		       (uint8_t *) data_buf + nbytes, data_len);
+	}
+}
+
+/*
+ * csio_wr_issue - Notify chip of Work request.
+ * @hw: HW module.
+ * @qidx: Index of queue.
+ * @prio: 0: Low priority, 1: High priority
+ *
+ * Rings the SGE Doorbell with the queue's pending producer-index
+ * increment (inc_idx) and then resets inc_idx to 0.
+ * Always returns 0.
+ */
+int
+csio_wr_issue(struct csio_hw *hw, int qidx, bool prio)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_q *q = wrm->q_arr[qidx];
+
+	CSIO_DB_ASSERT((qidx >= 0) && (qidx < wrm->free_qidx));
+	/* Write barrier ahead of the doorbell register write */
+	wmb();
+	/* Ring SGE Doorbell writing q->inc_idx into it */
+	csio_wr_reg32(hw, DBPRIO(prio) | QID(q->un.eq.physeqid) |
+		      PIDX(q->inc_idx), MYPF_REG(SGE_PF_KDOORBELL));
+	q->inc_idx = 0;
+
+	return 0;
+}
+
+/* Entries between cidx and pidx, allowing for wrap; 0 when they are equal */
+static inline uint32_t csio_wr_avail_qcredits(struct csio_q *q)
+{
+	if (q->pidx == q->cidx)
+		return 0;
+	if (q->pidx > q->cidx)
+		return q->pidx - q->cidx;
+
+	return q->credits - (q->cidx - q->pidx);	/* pidx is behind cidx */
+}
+
+/*
+ * csio_wr_inval_flq_buf - Invalidate a free list buffer entry.
+ * @hw: HW module.
+ * @flq: The freelist queue.
+ *
+ * Retires the entry at the freelist's current cidx from the driver's
+ * point of view by stepping cidx forward (wrapping at flq->credits).
+ * The DMA memory behind the entry is not freed here.
+ *
+ */
+static inline void
+csio_wr_inval_flq_buf(struct csio_hw *hw, struct csio_q *flq)
+{
+	if (++flq->cidx != flq->credits)
+		return;
+
+	/* Stepped past the last entry: wrap to the start and count it */
+	flq->cidx = 0;
+	CSIO_INC_STATS(flq, n_qwrap);
+}
+
+/*
+ * csio_wr_process_fl - Process a freelist completion.
+ * @hw: HW module.
+ * @q: The ingress queue attached to the Freelist.
+ * @wr: The freelist completion WR in the ingress queue.
+ * @len_to_qid: The lower 32-bits of the first flit of the RSP footer
+ * @iq_handler: Caller's handler for this completion.
+ * @priv: Private pointer of caller
+ * Collects the FL buffers holding the payload and passes them to @iq_handler.
+ */
+static inline void
+csio_wr_process_fl(struct csio_hw *hw, struct csio_q *q,
+		   void *wr, uint32_t len_to_qid,
+		   void (*iq_handler)(struct csio_hw *, void *,
+				      uint32_t, struct csio_fl_dma_buf *,
+				      void *),
+		   void *priv)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+	struct csio_fl_dma_buf flb;
+	struct csio_dma_buf *buf, *fbuf;
+	uint32_t bufsz, len, lastlen = 0;
+	int n;
+	struct csio_q *flq = hw->wrm.q_arr[q->un.iq.flq_idx];
+
+	CSIO_DB_ASSERT(flq != NULL);
+
+	len = len_to_qid;
+	/* A new-buffer flag retires a partly consumed buffer before use */
+	if (len & IQWRF_NEWBUF) {
+		if (flq->un.fl.offset > 0) {
+			csio_wr_inval_flq_buf(hw, flq);
+			flq->un.fl.offset = 0;
+		}
+		len = IQWRF_LEN_GET(len);
+	}
+
+	CSIO_DB_ASSERT(len != 0);
+
+	flb.totlen = len;
+	/* NOTE(review): flb.flbufs[] is filled with no bound on n - confirm */
+	/* Consume all freelist buffers used for len bytes */
+	for (n = 0, fbuf = flb.flbufs; ; n++, fbuf++) {
+		buf = &flq->un.fl.bufs[flq->cidx];
+		bufsz = csio_wr_fl_bufsz(sge, buf);
+
+		fbuf->paddr	= buf->paddr;
+		fbuf->vaddr	= buf->vaddr;
+
+		flb.offset	= flq->un.fl.offset;
+		lastlen		= min(bufsz, len);
+		fbuf->len	= lastlen;
+
+		len -= lastlen;
+		if (!len)
+			break;
+		csio_wr_inval_flq_buf(hw, flq);
+	}
+
+	flb.defer_free = flq->un.fl.packen ? 0 : 1;
+
+	iq_handler(hw, wr, q->wr_sz - sizeof(struct csio_iqwr_footer),
+		   &flb, priv);
+	/* Packed mode keeps the last buffer and advances the offset in it */
+	if (flq->un.fl.packen)
+		flq->un.fl.offset += ALIGN(lastlen, sge->csio_fl_align);
+	else
+		csio_wr_inval_flq_buf(hw, flq);
+
+}
+
+/*
+ * csio_is_new_iqwr - Is this a new Ingress queue entry ?
+ * @q: Ingress queue.
+ * @ftr: Ingress queue WR SGE footer.
+ *
+ * The entry is new if our generation bit matches the corresponding
+ * bit in the footer of the current WR (type_gen >> IQWRF_GEN_SHIFT).
+ */
+static inline bool
+csio_is_new_iqwr(struct csio_q *q, struct csio_iqwr_footer *ftr)
+{
+	return (q->un.iq.genbit == (ftr->u.type_gen >> IQWRF_GEN_SHIFT));
+}
+
+/*
+ * csio_wr_process_iq - Process elements in Ingress queue.
+ * @hw:  HW pointer
+ * @qidx: Index of queue
+ * @iq_handler: Handler for this queue
+ * @priv: Caller's private pointer
+ *
+ * This routine walks through every entry of the ingress queue, calling
+ * the provided iq_handler with the entry, until the generation bit
+ * flips.
+ */
+int
+csio_wr_process_iq(struct csio_hw *hw, struct csio_q *q,
+		   void (*iq_handler)(struct csio_hw *, void *,
+				      uint32_t, struct csio_fl_dma_buf *,
+				      void *),
+		   void *priv)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	void *wr = (void *)((uintptr_t)q->vstart + (q->cidx * q->wr_sz));
+	struct csio_iqwr_footer *ftr;
+	uint32_t wr_type, fw_qid, qid;
+	struct csio_q *q_completed;
+	struct csio_q *flq = csio_iq_has_fl(q) ?
+					wrm->q_arr[q->un.iq.flq_idx] : NULL;
+	int rv = 0;
+
+	/* Get the footer */
+	ftr = (struct csio_iqwr_footer *)((uintptr_t)wr +
+					  (q->wr_sz - sizeof(*ftr)));
+
+	/*
+	 * When q wrapped around last time, driver should have inverted
+	 * ic.genbit as well.
+	 */
+	while (csio_is_new_iqwr(q, ftr)) {
+
+		CSIO_DB_ASSERT(((uintptr_t)wr + q->wr_sz) <=
+						(uintptr_t)q->vwrap);
+		rmb();
+		wr_type = IQWRF_TYPE_GET(ftr->u.type_gen);
+
+		switch (wr_type) {
+		case X_RSPD_TYPE_CPL:
+			/* Subtract footer from WR len */
+			iq_handler(hw, wr, q->wr_sz - sizeof(*ftr), NULL, priv);
+			break;
+		case X_RSPD_TYPE_FLBUF:
+			csio_wr_process_fl(hw, q, wr,
+					   ntohl(ftr->pldbuflen_qid),
+					   iq_handler, priv);
+			break;
+		case X_RSPD_TYPE_INTR:
+			fw_qid = ntohl(ftr->pldbuflen_qid);
+			qid = fw_qid - wrm->fw_iq_start;
+			q_completed = hw->wrm.intr_map[qid];
+
+			if (unlikely(qid ==
+					csio_q_physiqid(hw, hw->intr_iq_idx))) {
+				/*
+				 * We are already in the Forward Interrupt
+				 * Interrupt Queue Service! Do-not service
+				 * again!
+				 *
+				 */
+			} else {
+				CSIO_DB_ASSERT(q_completed);
+				CSIO_DB_ASSERT(
+					q_completed->un.iq.iq_intx_handler);
+
+				/* Call the queue handler. */
+				q_completed->un.iq.iq_intx_handler(hw, NULL,
+						0, NULL, (void *)q_completed);
+			}
+			break;
+		default:
+			csio_warn(hw, "Unknown resp type 0x%x received\n",
+				 wr_type);
+			CSIO_INC_STATS(q, n_rsp_unknown);
+			break;
+		}
+
+		/*
+		 * Ingress *always* has fixed size WR entries. Therefore,
+		 * there should always be complete WRs towards the end of
+		 * queue.
+		 */
+		if (((uintptr_t)wr + q->wr_sz) == (uintptr_t)q->vwrap) {
+
+			/* Roll over to start of queue */
+			q->cidx = 0;
+			wr	= q->vstart;
+
+			/* Toggle genbit */
+			q->un.iq.genbit ^= 0x1;
+
+			CSIO_INC_STATS(q, n_qwrap);
+		} else {
+			q->cidx++;
+			wr	= (void *)((uintptr_t)(q->vstart) +
+					   (q->cidx * q->wr_sz));
+		}
+
+		ftr = (struct csio_iqwr_footer *)((uintptr_t)wr +
+						  (q->wr_sz - sizeof(*ftr)));
+		q->inc_idx++;
+
+	} /* while (q->un.iq.genbit == hdr->genbit) */
+
+	/*
+	 * We need to re-arm SGE interrupts in case we got a stray interrupt,
+	 * especially in msix mode. With INTx, this may be a common occurence.
+	 */
+	if (unlikely(!q->inc_idx)) {
+		CSIO_INC_STATS(q, n_stray_comp);
+		rv = -EINVAL;
+		goto restart;
+	}
+
+	/* Replenish free list buffers if pending falls below low water mark */
+	if (flq) {
+		uint32_t avail  = csio_wr_avail_qcredits(flq);
+		if (avail <= 16) {
+			/* Make sure in FLQ, atleast 1 credit (8 FL buffers)
+			 * remains unpopulated otherwise HW thinks
+			 * FLQ is empty.
+			 */
+			csio_wr_update_fl(hw, flq, (flq->credits - 8) - avail);
+			csio_wr_ring_fldb(hw, flq);
+		}
+	}
+
+restart:
+	/* Now inform SGE about our incremental index value */
+	csio_wr_reg32(hw, CIDXINC(q->inc_idx)		|
+			  INGRESSQID(q->un.iq.physiqid)	|
+			  TIMERREG(csio_sge_timer_reg),
+			  MYPF_REG(SGE_PF_GTS));
+	q->stats.n_tot_rsps += q->inc_idx;
+
+	q->inc_idx = 0;
+
+	return rv;
+}
+
+int
+csio_wr_process_iq_idx(struct csio_hw *hw, int qidx,
+		   void (*iq_handler)(struct csio_hw *, void *,
+				      uint32_t, struct csio_fl_dma_buf *,
+				      void *),
+		   void *priv)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+
+	/* Resolve qidx to its queue pointer and hand off to the worker. */
+	return csio_wr_process_iq(hw, wrm->q_arr[qidx], iq_handler, priv);
+}
+
+static int
+csio_closest_timer(struct csio_sge *s, int time)
+{
+	int idx, dist, best = 0, best_dist = INT_MAX;
+
+	/* Return the first index whose timer value is nearest to @time. */
+	for (idx = 0; idx < ARRAY_SIZE(s->timer_val); idx++) {
+		dist = time - s->timer_val[idx];
+		dist = dist < 0 ? -dist : dist;
+		if (dist >= best_dist)
+			continue;
+		best_dist = dist;
+		best = idx;
+	}
+	return best;
+}
+
+static int
+csio_closest_thresh(struct csio_sge *s, int cnt)
+{
+	int idx, dist, best = 0, best_dist = INT_MAX;
+
+	/* Return the first index whose counter value is nearest to @cnt. */
+	for (idx = 0; idx < ARRAY_SIZE(s->counter_val); idx++) {
+		dist = cnt - s->counter_val[idx];
+		dist = dist < 0 ? -dist : dist;
+		if (dist >= best_dist)
+			continue;
+		best_dist = dist;
+		best = idx;
+	}
+	return best;
+}
+
+/* Program SGE registers that depend on host page and cache line size. */
+static void
+csio_wr_fixup_host_params(struct csio_hw *hw)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+	uint32_t cache_line = L1_CACHE_BYTES;
+	uint32_t stat_len = cache_line > 64 ? 128 : 64;
+	uint32_t hps = PAGE_SHIFT - 10, ingpad;
+
+	csio_wr_reg32(hw, HOSTPAGESIZEPF0(hps) | HOSTPAGESIZEPF1(hps) |
+		      HOSTPAGESIZEPF2(hps) | HOSTPAGESIZEPF3(hps) |
+		      HOSTPAGESIZEPF4(hps) | HOSTPAGESIZEPF5(hps) |
+		      HOSTPAGESIZEPF6(hps) | HOSTPAGESIZEPF7(hps),
+		      SGE_HOST_PAGE_SIZE);
+
+	/* FL alignment is the cache line size, but never less than 32 */
+	sge->csio_fl_align = cache_line >= 32 ? cache_line : 32;
+	ingpad = ilog2(sge->csio_fl_align) - 5;
+	csio_set_reg_field(hw, SGE_CONTROL,
+			   INGPADBOUNDARY_MASK | EGRSTATUSPAGESIZE(1),
+			   INGPADBOUNDARY(ingpad) |
+			   EGRSTATUSPAGESIZE(stat_len != 64));
+
+	/* FL buffer size #0 is the page size, i.e. already aligned */
+	csio_wr_reg32(hw, PAGE_SIZE, SGE_FL_BUFFER_SIZE0);
+	csio_wr_reg32(hw,
+		      (csio_rd_reg32(hw, SGE_FL_BUFFER_SIZE2) +
+		      sge->csio_fl_align - 1) & ~(sge->csio_fl_align - 1),
+		      SGE_FL_BUFFER_SIZE2);
+	csio_wr_reg32(hw,
+		      (csio_rd_reg32(hw, SGE_FL_BUFFER_SIZE3) +
+		      sge->csio_fl_align - 1) & ~(sge->csio_fl_align - 1),
+		      SGE_FL_BUFFER_SIZE3);
+
+	csio_wr_reg32(hw, HPZ0(PAGE_SHIFT - 12), ULP_RX_TDDP_PSZ);
+
+	/* default value of rx_dma_offset of the NIC driver */
+	csio_set_reg_field(hw, SGE_CONTROL, PKTSHIFT_MASK,
+			   PKTSHIFT(CSIO_SGE_RX_DMA_OFFSET));
+}
+
+static void
+csio_init_intr_coalesce_parms(struct csio_hw *hw)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+
+	csio_sge_thresh_reg = csio_closest_thresh(sge, csio_intr_coalesce_cnt);
+	if (!csio_intr_coalesce_cnt) {
+		csio_sge_timer_reg = csio_closest_timer(sge,
+						csio_intr_coalesce_time);
+		return;
+	}
+	csio_sge_thresh_reg = 0; /* NOTE(review): discards lookup above? */
+	csio_sge_timer_reg = X_TIMERREG_RESTART_COUNTER;
+}
+
+/*
+ * csio_wr_get_sge - Get SGE register values.
+ * @hw: HW module.
+ * Caches SGE_CONTROL, FL alignment, timer and counter values in wrm->sge.
+ * Used by non-master functions and by master-functions relying on config file.
+ */
+static void
+csio_wr_get_sge(struct csio_hw *hw)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+	uint32_t ingpad;
+	int i;
+	u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5;
+	u32 ingress_rx_threshold;
+
+	sge->sge_control = csio_rd_reg32(hw, SGE_CONTROL);
+
+	ingpad = INGPADBOUNDARY_GET(sge->sge_control);
+
+	switch (ingpad) {
+	case X_INGPCIEBOUNDARY_32B:
+		sge->csio_fl_align = 32; break;
+	case X_INGPCIEBOUNDARY_64B:
+		sge->csio_fl_align = 64; break;
+	case X_INGPCIEBOUNDARY_128B:
+		sge->csio_fl_align = 128; break;
+	case X_INGPCIEBOUNDARY_256B:
+		sge->csio_fl_align = 256; break;
+	case X_INGPCIEBOUNDARY_512B:
+		sge->csio_fl_align = 512; break;
+	case X_INGPCIEBOUNDARY_1024B:
+		sge->csio_fl_align = 1024; break;
+	case X_INGPCIEBOUNDARY_2048B:
+		sge->csio_fl_align = 2048; break;
+	case X_INGPCIEBOUNDARY_4096B:
+		sge->csio_fl_align = 4096; break;
+	}
+
+	for (i = 0; i < CSIO_SGE_FL_SIZE_REGS; i++)
+		csio_get_flbuf_size(hw, sge, i);
+
+	timer_value_0_and_1 = csio_rd_reg32(hw, SGE_TIMER_VALUE_0_AND_1);
+	timer_value_2_and_3 = csio_rd_reg32(hw, SGE_TIMER_VALUE_2_AND_3);
+	timer_value_4_and_5 = csio_rd_reg32(hw, SGE_TIMER_VALUE_4_AND_5);
+
+	sge->timer_val[0] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE0_GET(timer_value_0_and_1));
+	sge->timer_val[1] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE1_GET(timer_value_0_and_1));
+	sge->timer_val[2] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE2_GET(timer_value_2_and_3));
+	sge->timer_val[3] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE3_GET(timer_value_2_and_3));
+	sge->timer_val[4] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE4_GET(timer_value_4_and_5));
+	sge->timer_val[5] = (uint16_t)csio_core_ticks_to_us(hw,
+					TIMERVALUE5_GET(timer_value_4_and_5));
+
+	ingress_rx_threshold = csio_rd_reg32(hw, SGE_INGRESS_RX_THRESHOLD);
+	sge->counter_val[0] = THRESHOLD_0_GET(ingress_rx_threshold);
+	sge->counter_val[1] = THRESHOLD_1_GET(ingress_rx_threshold);
+	sge->counter_val[2] = THRESHOLD_2_GET(ingress_rx_threshold);
+	sge->counter_val[3] = THRESHOLD_3_GET(ingress_rx_threshold);
+
+	csio_init_intr_coalesce_parms(hw);
+}
+
+/*
+ * csio_wr_set_sge - Initialize SGE registers
+ * @hw: HW module.
+ *
+ * Used by Master function to initialize SGE registers in the absence
+ * of a config file.
+ */
+static void
+csio_wr_set_sge(struct csio_hw *hw)
+{
+	struct csio_wrm *wrm = csio_hw_to_wrm(hw);
+	struct csio_sge *sge = &wrm->sge;
+	int i;
+
+	/*
+	 * Set up our basic SGE mode to deliver CPL messages to our Ingress
+	 * Queue and Packet Data to the Free List.
+	 */
+	csio_set_reg_field(hw, SGE_CONTROL, RXPKTCPLMODE, RXPKTCPLMODE);
+
+	sge->sge_control = csio_rd_reg32(hw, SGE_CONTROL);
+
+	/* sge->csio_fl_align is set up by csio_wr_fixup_host_params(). */
+
+	/*
+	 * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows
+	 * and generate an interrupt when this occurs so we can recover.
+	 */
+	csio_set_reg_field(hw, SGE_DBFIFO_STATUS,
+			   HP_INT_THRESH(HP_INT_THRESH_MASK) |
+			   LP_INT_THRESH(LP_INT_THRESH_MASK),
+			   HP_INT_THRESH(CSIO_SGE_DBFIFO_INT_THRESH) |
+			   LP_INT_THRESH(CSIO_SGE_DBFIFO_INT_THRESH));
+	csio_set_reg_field(hw, SGE_DOORBELL_CONTROL, ENABLE_DROP,
+			   ENABLE_DROP);
+
+	/* SGE_FL_BUFFER_SIZE0 is set up by csio_wr_fixup_host_params(). */
+
+	CSIO_SET_FLBUF_SIZE(hw, 1, CSIO_SGE_FLBUF_SIZE1);
+	CSIO_SET_FLBUF_SIZE(hw, 2, CSIO_SGE_FLBUF_SIZE2);
+	CSIO_SET_FLBUF_SIZE(hw, 3, CSIO_SGE_FLBUF_SIZE3);
+	CSIO_SET_FLBUF_SIZE(hw, 4, CSIO_SGE_FLBUF_SIZE4);
+	CSIO_SET_FLBUF_SIZE(hw, 5, CSIO_SGE_FLBUF_SIZE5);
+	CSIO_SET_FLBUF_SIZE(hw, 6, CSIO_SGE_FLBUF_SIZE6);
+	CSIO_SET_FLBUF_SIZE(hw, 7, CSIO_SGE_FLBUF_SIZE7);
+	CSIO_SET_FLBUF_SIZE(hw, 8, CSIO_SGE_FLBUF_SIZE8);
+
+	for (i = 0; i < CSIO_SGE_FL_SIZE_REGS; i++)
+		csio_get_flbuf_size(hw, sge, i);
+
+	/* Interrupt coalescing: cache timer/counter values, then program them */
+	sge->timer_val[0] = CSIO_SGE_TIMER_VAL_0;
+	sge->timer_val[1] = CSIO_SGE_TIMER_VAL_1;
+	sge->timer_val[2] = CSIO_SGE_TIMER_VAL_2;
+	sge->timer_val[3] = CSIO_SGE_TIMER_VAL_3;
+	sge->timer_val[4] = CSIO_SGE_TIMER_VAL_4;
+	sge->timer_val[5] = CSIO_SGE_TIMER_VAL_5;
+
+	sge->counter_val[0] = CSIO_SGE_INT_CNT_VAL_0;
+	sge->counter_val[1] = CSIO_SGE_INT_CNT_VAL_1;
+	sge->counter_val[2] = CSIO_SGE_INT_CNT_VAL_2;
+	sge->counter_val[3] = CSIO_SGE_INT_CNT_VAL_3;
+
+	csio_wr_reg32(hw, THRESHOLD_0(sge->counter_val[0]) |
+		      THRESHOLD_1(sge->counter_val[1]) |
+		      THRESHOLD_2(sge->counter_val[2]) |
+		      THRESHOLD_3(sge->counter_val[3]),
+		      SGE_INGRESS_RX_THRESHOLD);
+
+	csio_wr_reg32(hw,
+		   TIMERVALUE0(csio_us_to_core_ticks(hw, sge->timer_val[0])) |
+		   TIMERVALUE1(csio_us_to_core_ticks(hw, sge->timer_val[1])),
+		   SGE_TIMER_VALUE_0_AND_1);
+
+	csio_wr_reg32(hw,
+		   TIMERVALUE2(csio_us_to_core_ticks(hw, sge->timer_val[2])) |
+		   TIMERVALUE3(csio_us_to_core_ticks(hw, sge->timer_val[3])),
+		   SGE_TIMER_VALUE_2_AND_3);
+
+	csio_wr_reg32(hw,
+		   TIMERVALUE4(csio_us_to_core_ticks(hw, sge->timer_val[4])) |
+		   TIMERVALUE5(csio_us_to_core_ticks(hw, sge->timer_val[5])),
+		   SGE_TIMER_VALUE_4_AND_5);
+
+	csio_init_intr_coalesce_parms(hw);
+}
+
+void
+csio_wr_sge_init(struct csio_hw *hw)
+{
+	/*
+	 * Before calling this function, mastership of the card and whether
+	 * to use the config file must already have been decided, i.e.
+	 * CSIO_HWF_MASTER and CSIO_HWF_USING_SOFT_PARAMS are set/unset.
+	 *
+	 * If we aren't the master, we are only allowed to read and work off
+	 * of the already initialized SGE values.
+	 */
+	if (!csio_is_hw_master(hw)) {
+		csio_wr_get_sge(hw);
+		return;
+	}
+
+	/*
+	 * As master we always fix up the host specific registers. With a
+	 * config file we then read the rest of the SGE configuration;
+	 * without one we initialize the SGE entirely.
+	 */
+	csio_wr_fixup_host_params(hw);
+
+	if (hw->flags & CSIO_HWF_USING_SOFT_PARAMS)
+		csio_wr_get_sge(hw);
+	else
+		csio_wr_set_sge(hw);
+}
+
+/*
+ * csio_wrm_init - Initialize Work request module.
+ * @wrm: WR module
+ * @hw: HW pointer
+ *
+ * Allocates q_arr and one zeroed queue per entry. Returns 0 on success,
+ * -EINVAL if num_q is 0, or -ENOMEM on allocation failure.
+ */
+int
+csio_wrm_init(struct csio_wrm *wrm, struct csio_hw *hw)
+{
+	int i;
+
+	if (!wrm->num_q) {
+		csio_err(hw, "Num queues is not set\n");
+		return -EINVAL;
+	}
+
+	wrm->q_arr = kcalloc(wrm->num_q, sizeof(*wrm->q_arr), GFP_KERNEL);
+	if (!wrm->q_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < wrm->num_q; i++) {
+		wrm->q_arr[i] = kzalloc(sizeof(*wrm->q_arr[i]), GFP_KERNEL);
+		if (!wrm->q_arr[i]) {
+			/* Unwind the queues allocated so far */
+			while (--i >= 0)
+				kfree(wrm->q_arr[i]);
+			goto err_free_arr;
+		}
+	}
+	wrm->free_qidx	= 0;
+
+	return 0;
+
+err_free_arr:
+	kfree(wrm->q_arr);
+	return -ENOMEM;
+}
+
+/*
+ * csio_wrm_exit - Uninitialize Work request module.
+ * @wrm: WR module
+ * @hw: HW module
+ *
+ * Uninitialize WR module. Free q_arr and pointers in it.
+ * We have the additional job of freeing the DMA memory associated
+ * with the queues. Only queues below free_qidx have DMA memory freed.
+ */
+void
+csio_wrm_exit(struct csio_wrm *wrm, struct csio_hw *hw)
+{
+	int i;
+	uint32_t j;
+	struct csio_q *q;
+	struct csio_dma_buf *buf;
+
+	for (i = 0; i < wrm->num_q; i++) {
+		q = wrm->q_arr[i];
+
+		if (wrm->free_qidx && (i < wrm->free_qidx)) {
+			/* Without bufs, still free the queue memory and q */
+			if (q->type == CSIO_FREELIST && q->un.fl.bufs) {
+				for (j = 0; j < q->credits; j++) {
+					buf = &q->un.fl.bufs[j];
+					if (!buf->vaddr)
+						continue;
+					pci_free_consistent(hw->pdev, buf->len,
+							    buf->vaddr,
+							    buf->paddr);
+				}
+				kfree(q->un.fl.bufs);
+			}
+			pci_free_consistent(hw->pdev, q->size,
+					    q->vstart, q->pstart);
+		}
+		kfree(q);
+	}
+
+	hw->flags &= ~CSIO_HWF_Q_MEM_ALLOCED;
+
+	kfree(wrm->q_arr);
+}
-- 
1.7.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox