Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 2/3] cxgb4i: main driver files
From: Rakesh Ranjan @ 2010-04-08 12:14 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	open-iscsi-/JYPxA39Uh5TLH3MbocFFw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, kxie-ut6Up61K2wZBDgjK7y7TUQ,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	James.Bottomley-JuX6DAaQMKPCXq6kfMZ53/egYHeGw8Jk,
	michaelc-hcNo3dDEHLuVc3sceRu5cw, Rakesh Ranjan
In-Reply-To: <1270728855-20951-2-git-send-email-rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

From: Rakesh Ranjan <rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>


Signed-off-by: Rakesh Ranjan <rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/scsi/cxgb4i/cxgb4i.h         |  201 ++++
 drivers/scsi/cxgb4i/cxgb4i_ddp.c     |  696 +++++++++++++
 drivers/scsi/cxgb4i/cxgb4i_ddp.h     |  238 +++++
 drivers/scsi/cxgb4i/cxgb4i_offload.c | 1853 ++++++++++++++++++++++++++++++++++
 drivers/scsi/cxgb4i/cxgb4i_offload.h |  171 ++++
 drivers/scsi/cxgb4i/cxgb4i_snic.c    |  253 +++++
 6 files changed, 3412 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i.h
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i_ddp.c
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i_ddp.h
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i_offload.c
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i_offload.h
 create mode 100644 drivers/scsi/cxgb4i/cxgb4i_snic.c

diff --git a/drivers/scsi/cxgb4i/cxgb4i.h b/drivers/scsi/cxgb4i/cxgb4i.h
new file mode 100644
index 0000000..8007284
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i.h
@@ -0,0 +1,201 @@
+/*
+ * cxgb4i.h: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#ifndef	__CXGB4I_H__
+#define	__CXGB4I_H__
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <scsi/libiscsi_tcp.h>
+
+#include "t4fw_api.h"
+#include "t4_msg.h"
+#include "l2t.h"
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+
+#include "cxgb4i_ddp.h"
+#include "cxgb4i_offload.h"
+
+#define	cxgb4i_log_error(fmt...)	printk(KERN_ERR "cxgb4i: ERR! " fmt)
+#define cxgb4i_log_warn(fmt...)	printk(KERN_WARNING "cxgb4i: WARN! " fmt)
+#define cxgb4i_log_info(fmt...)	printk(KERN_INFO "cxgb4i: " fmt)
+#define cxgb4i_debug_log(fmt, args...) \
+	printk(KERN_INFO "cxgb4i: %s - " fmt, __func__ , ## args)
+
+
+#ifdef	__DEBUG_CXGB4I__
+#define	cxgb4i_log_debug	cxgb4i_debug_log
+#else
+#define cxgb4i_log_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB4I_TAG__
+#define cxgb4i_tag_debug        cxgb4i_log_debug
+#else
+#define cxgb4i_tag_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB4I_API__
+#define cxgb4i_api_debug        cxgb4i_log_debug
+#else
+#define cxgb4i_api_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB4I_CONN__
+#define cxgb4i_conn_debug         cxgb4i_log_debug
+#else
+#define cxgb4i_conn_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB4I_TX__
+#define cxgb4i_tx_debug           cxgb4i_log_debug
+#else
+#define cxgb4i_tx_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB4I_RX__
+#define cxgb4i_rx_debug           cxgb4i_log_debug
+#else
+#define cxgb4i_rx_debug(fmt...)
+#endif
+
+#define	CXGB4I_SCSI_HOST_QDEPTH	1024
+#define	CXGB4I_MAX_TARGET	CXGB4I_MAX_CONN
+#define	CXGB4I_MAX_LUN		512
+#define	ISCSI_PDU_NONPAYLOAD_MAX \
+	(sizeof(struct iscsi_hdr) + ISCSI_MAX_AHS_SIZE + \
+	 (2 * ISCSI_DIGEST_SIZE))
+
+struct cxgb4i_snic;
+struct cxgb4i_host;
+struct cxgb4i_endpoint;
+typedef int (*cxgb4i_cplhandler_func)(struct cxgb4i_snic *, struct sk_buff *);
+
+
+struct cxgb4i_hba {
+	struct cxgb4i_snic *snic;
+	struct net_device *ndev;
+	struct Scsi_Host *shost;
+	struct port_info *pinfo;
+	__be32 ipv4addr;
+};
+
+struct cxgb4i_ports_map {
+	spinlock_t lock;
+	unsigned int next;
+	struct cxgb4i_sock *port_csk[0];
+};
+
+struct cxgb4i_snic {
+	struct list_head list_head;
+	spinlock_t lock;
+
+	struct cxgb4_lld_info lldi;
+
+	struct cxgb4i_hba *hba[MAX_NPORTS];
+	unsigned char hba_cnt;
+
+	unsigned int flags;
+	unsigned int tx_max_size;
+	unsigned int rx_max_size;
+
+	struct cxgb4i_tag_format tag_format;
+	struct cxgb4i_ddp_info *ddp;
+
+	struct cxgb4i_ports_map *pmap;
+	cxgb4i_cplhandler_func *funcs;
+};
+
+struct cxgb4i_conn {
+	struct list_head list_head;
+	struct cxgb4i_endpoint *cep;
+	struct iscsi_conn *iconn;
+	struct cxgb4i_hba *chba;
+	u32 task_idx_bits;
+};
+
+struct cxgb4i_endpoint {
+	struct cxgb4i_conn *cconn;
+	struct cxgb4i_hba *chba;
+	struct cxgb4i_sock *csk;
+};
+
+#define MAX_PDU_FRAGS	((ULP2_MAX_PDU_PAYLOAD + 512 - 1) / 512)
+struct cxgb4i_task_data {
+	unsigned short nr_frags;
+	skb_frag_t frags[MAX_PDU_FRAGS];
+	struct sk_buff *skb;
+	unsigned int offset;
+	unsigned int count;
+	unsigned int sgoffset;
+};
+
+int cxgb4i_ofld_init(struct cxgb4i_snic *);
+void cxgb4i_ofld_cleanup(struct cxgb4i_snic *);
+struct cxgb4i_snic *cxgb4i_snic_init(const struct cxgb4_lld_info *);
+void cxgb4i_snic_cleanup(void);
+struct cxgb4i_snic *cxgb4i_find_snic(struct net_device *, __be32);
+struct cxgb4i_hba *cxgb4i_hba_find_by_netdev(struct net_device *);
+struct cxgb4i_hba *cxgb4i_hba_add(struct cxgb4i_snic *, struct net_device *);
+void cxgb4i_hba_remove(struct cxgb4i_hba *);
+int cxgb4i_iscsi_init(void);
+void cxgb4i_iscsi_cleanup(void);
+
+int cxgb4i_pdu_init(void);
+void cxgb4i_pdu_cleanup(void);
+int cxgb4i_conn_alloc_pdu(struct iscsi_task *, u8);
+int cxgb4i_conn_init_pdu(struct iscsi_task *, unsigned int, unsigned int);
+int cxgb4i_conn_xmit_pdu(struct iscsi_task *);
+int cxgb4i_xmit_pdu_dummy(struct iscsi_task *task);
+
+int cxgb4i_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt);
+void cxgb4i_release_itt(struct iscsi_task *task, itt_t hdr_itt);
+
+int cxgb4i_sport_init(struct cxgb4i_snic *);
+
+static inline void *cplhdr(struct sk_buff *skb)
+{
+	return skb->data;
+}
+
+static inline void cxgb4i_set_iscsi_ipv4(struct cxgb4i_hba *chba, __be32 ipaddr)
+{
+	chba->ipv4addr = ipaddr;
+}
+
+static inline __be32 cxgb4i_get_iscsi_ipv4(struct cxgb4i_hba *chba)
+{
+	return chba->ipv4addr;
+}
+
+#define W_TCB_ULP_TYPE          0
+#define TCB_ULP_TYPE_SHIFT      0
+#define TCB_ULP_TYPE_MASK       0xfULL
+#define TCB_ULP_TYPE(x)         ((x) << TCB_ULP_TYPE_SHIFT)
+
+#define W_TCB_ULP_RAW           0
+#define TCB_ULP_RAW_SHIFT       4
+#define TCB_ULP_RAW_MASK        0xffULL
+#define TCB_ULP_RAW(x)          ((x) << TCB_ULP_RAW_SHIFT)
+
+
+#endif	/* __CXGB4I_H__ */
+
diff --git a/drivers/scsi/cxgb4i/cxgb4i_ddp.c b/drivers/scsi/cxgb4i/cxgb4i_ddp.c
new file mode 100644
index 0000000..d6143c0
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i_ddp.c
@@ -0,0 +1,696 @@
+/*
+ * cxgb4i_ddp.c: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include "cxgb4i.h"
+#include "cxgb4i_ddp.h"
+
+#define DDP_PGIDX_MAX	4
+#define DDP_THRESHOLD	2048
+
+static unsigned char ddp_page_order[DDP_PGIDX_MAX] = {0, 1, 2, 4};
+static unsigned char ddp_page_shift[DDP_PGIDX_MAX] = {12, 13, 14, 16};
+static unsigned char page_idx = DDP_PGIDX_MAX;
+
+static unsigned char sw_tag_idx_bits;
+static unsigned char sw_tag_age_bits;
+
+
+static inline void cxgb4i_ddp_ppod_set(struct pagepod *ppod,
+					struct pagepod_hdr *hdr,
+					struct cxgb4i_gather_list *gl,
+					unsigned int pidx)
+{
+	int i;
+
+	memcpy(ppod, hdr, sizeof(*hdr));
+	for (i = 0; i < (PPOD_PAGES_MAX + 1); i++, pidx++) {
+		ppod->addr[i] = pidx < gl->nelem ?
+			cpu_to_be64(gl->phys_addr[pidx]) : 0ULL;
+	}
+}
+
+static inline void cxgb4i_ddp_ppod_clear(struct pagepod *ppod)
+{
+	memset(ppod, 0, sizeof(*ppod));
+}
+
+static inline void cxgb4i_ddp_ulp_mem_io_set_hdr(struct ulp_mem_io *req,
+					unsigned int wr_len, unsigned int dlen,
+					unsigned int pm_addr)
+{
+	struct ulptx_sgl *sgl;
+
+	INIT_ULPTX_WR(req, wr_len, 0, 0);
+	req->cmd = htonl(ULPTX_CMD(ULP_TX_MEM_WRITE));
+	req->dlen = htonl(ULP_MEMIO_DATA_LEN(dlen >> 5));
+	req->len16 = htonl(DIV_ROUND_UP(wr_len - sizeof(req->wr), 16));
+	req->lock_addr = htonl(ULP_MEMIO_ADDR(pm_addr >> 5));
+
+	sgl = (struct ulptx_sgl *)(req + 1);
+	sgl->cmd_nsge = htonl(ULPTX_CMD(ULP_TX_SC_DSGL) | ULPTX_NSGE(1));
+	sgl->len0 = htonl(dlen);
+}
+
+static int cxgb4i_ddp_ppod_write_sgl(struct cxgb4i_ddp_info *ddp,
+					struct pagepod_hdr *hdr,
+					unsigned int idx,
+					unsigned int npods,
+					struct cxgb4i_gather_list *gl,
+					unsigned int gl_pidx)
+{
+	unsigned int dlen = PPOD_SIZE * npods;
+	unsigned int pm_addr = idx * PPOD_SIZE + ddp->llimit;
+	unsigned int wr_len = roundup(sizeof(struct ulp_mem_io) +
+					sizeof(struct ulptx_sgl), 16);
+	struct sk_buff *skb = alloc_skb(wr_len + dlen, GFP_ATOMIC);
+	struct ulp_mem_io *req;
+	struct ulptx_sgl *sgl;
+	struct pagepod *ppod;
+	unsigned int i;
+
+	if (!skb) {
+		cxgb4i_log_error("snic 0x%p, idx %u, npods %u, OOM\n",
+				ddp->snic, idx, npods);
+		return -ENOMEM;
+	}
+
+	memset(skb->data, 0, wr_len + dlen);
+	skb->queue_mapping = CPL_PRIORITY_CONTROL;
+
+	req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+	cxgb4i_ddp_ulp_mem_io_set_hdr(req, wr_len, dlen, pm_addr);
+	sgl = (struct ulptx_sgl *)(req + 1);
+	ppod = (struct pagepod *)(sgl + 1);
+	sgl->addr0 = cpu_to_be64(virt_to_phys(ppod));
+
+	for (i = 0; i < npods; i++, ppod++, gl_pidx += PPOD_PAGES_MAX) {
+		if (!hdr && !gl)
+			cxgb4i_ddp_ppod_clear(ppod);
+		else
+			cxgb4i_ddp_ppod_set(ppod, hdr, gl, gl_pidx);
+
+	}
+
+	cxgb4_ofld_send(ddp->snic->lldi.ports[0], skb);
+
+	return 0;
+}
+
+static int cxgb4i_ddp_set_map(struct cxgb4i_ddp_info *ddp,
+					struct pagepod_hdr *hdr,
+					unsigned int idx,
+					unsigned int npods,
+					struct cxgb4i_gather_list *gl)
+{
+	unsigned int pidx = 0;
+	unsigned int w_npods = 0;
+	unsigned int cnt;
+	int err = 0;
+
+	for (; w_npods < npods; idx += cnt, w_npods += cnt,
+					pidx += PPOD_PAGES_MAX) {
+		cnt = npods - w_npods;
+		if (cnt > ULPMEM_DSGL_MAX_NPPODS)
+			cnt = ULPMEM_DSGL_MAX_NPPODS;
+		err = cxgb4i_ddp_ppod_write_sgl(ddp, hdr, idx, cnt, gl, pidx);
+
+		if (err < 0)
+			break;
+	}
+
+	return err;
+}
+
+static void cxgb4i_ddp_clear_map(struct cxgb4i_ddp_info *ddp,
+						unsigned int tag,
+						unsigned int idx,
+						unsigned int npods)
+{
+	int err;
+	unsigned int w_npods = 0;
+	unsigned int cnt;
+
+	for (; w_npods < npods; idx += cnt, w_npods += cnt) {
+		cnt = npods - w_npods;
+
+		if (cnt > ULPMEM_DSGL_MAX_NPPODS)
+			cnt = ULPMEM_DSGL_MAX_NPPODS;
+		err = cxgb4i_ddp_ppod_write_sgl(ddp, NULL, idx, cnt, NULL, 0);
+
+		if (err < 0)
+			break;
+	}
+}
+
+static inline int cxgb4i_ddp_find_unused_entries(struct cxgb4i_ddp_info *ddp,
+					unsigned int start, unsigned int max,
+					unsigned int count,
+					struct cxgb4i_gather_list *gl)
+{
+	unsigned int i, j, k;
+
+	/*  not enough entries */
+	if ((max - start) < count)
+		return -EBUSY;
+
+	max -= count;
+	spin_lock(&ddp->map_lock);
+	for (i = start; i < max;) {
+		for (j = 0, k = i; j < count; j++, k++) {
+			if (ddp->gl_map[k])
+				break;
+		}
+		if (j == count) {
+			for (j = 0, k = i; j < count; j++, k++)
+				ddp->gl_map[k] = gl;
+			spin_unlock(&ddp->map_lock);
+			return i;
+		}
+		i += j + 1;
+	}
+	spin_unlock(&ddp->map_lock);
+	return -EBUSY;
+}
+
+static inline void cxgb4i_ddp_unmark_entries(struct cxgb4i_ddp_info *ddp,
+							int start, int count)
+{
+	spin_lock(&ddp->map_lock);
+	memset(&ddp->gl_map[start], 0,
+			count * sizeof(struct cxgb4i_gather_list *));
+	spin_unlock(&ddp->map_lock);
+}
+
+static int cxgb4i_ddp_find_page_index(unsigned long pgsz)
+{
+	int i;
+
+	for (i = 0; i < DDP_PGIDX_MAX; i++) {
+		if (pgsz == (1UL << ddp_page_shift[i]))
+			return i;
+	}
+	cxgb4i_log_debug("ddp page size 0x%lx not supported\n", pgsz);
+
+	return DDP_PGIDX_MAX;
+}
+
+static int cxgb4i_ddp_adjust_page_table(void)
+{
+	int i;
+	unsigned int base_order, order;
+
+	if (PAGE_SIZE < (1UL << ddp_page_shift[0])) {
+		cxgb4i_log_info("PAGE_SIZE 0x%lx too small, min 0x%lx\n",
+				PAGE_SIZE, 1UL << ddp_page_shift[0]);
+		return -EINVAL;
+	}
+
+	base_order = get_order(1UL << ddp_page_shift[0]);
+	order = get_order(1UL << PAGE_SHIFT);
+
+	for (i = 0; i < DDP_PGIDX_MAX; i++) {
+		/* first is the kernel page size, then just doubling the size */
+		ddp_page_order[i] = order - base_order + i;
+		ddp_page_shift[i] = PAGE_SHIFT + i;
+	}
+
+	return 0;
+}
+
+static inline void cxgb4i_ddp_gl_unmap(struct pci_dev *pdev,
+					struct cxgb4i_gather_list *gl)
+{
+	int i;
+
+	for (i = 0; i < gl->nelem; i++)
+		pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE,
+				PCI_DMA_FROMDEVICE);
+}
+
+static inline int cxgb4i_ddp_gl_map(struct pci_dev *pdev,
+				struct cxgb4i_gather_list *gl)
+{
+	int i;
+
+	for (i = 0; i < gl->nelem; i++) {
+		gl->phys_addr[i] = pci_map_page(pdev, gl->pages[i], 0,
+						PAGE_SIZE,
+						PCI_DMA_FROMDEVICE);
+		if (unlikely(pci_dma_mapping_error(pdev, gl->phys_addr[i])))
+			goto unmap;
+	}
+
+	return i;
+
+unmap:
+	if (i) {
+		unsigned int nelem = gl->nelem;
+
+		gl->nelem = i;
+		cxgb4i_ddp_gl_unmap(pdev, gl);
+		gl->nelem = nelem;
+	}
+	return -ENOMEM;
+}
+
+struct cxgb4i_gather_list *cxgb4i_ddp_make_gl(unsigned int xferlen,
+						struct scatterlist *sgl,
+						unsigned int sgcnt,
+						struct pci_dev *pdev,
+						gfp_t gfp)
+{
+	struct cxgb4i_gather_list *gl;
+	struct scatterlist *sg = sgl;
+	struct page *sgpage = sg_page(sg);
+	unsigned int sglen = sg->length;
+	unsigned int sgoffset = sg->offset;
+	unsigned int npages = (xferlen + sgoffset + PAGE_SIZE - 1) >>
+				PAGE_SHIFT;
+	int i = 1, j = 0;
+
+	if (xferlen < DDP_THRESHOLD) {
+		cxgb4i_log_debug("xfer %u < threshold %u, no ddp.\n",
+				xferlen, DDP_THRESHOLD);
+		return NULL;
+	}
+
+	gl = kzalloc(sizeof(struct cxgb4i_gather_list) +
+			npages * (sizeof(dma_addr_t) + sizeof(struct page *)),
+			gfp);
+	if (!gl)
+		return NULL;
+
+	gl->pages = (struct page **)&gl->phys_addr[npages];
+	gl->length = xferlen;
+	gl->offset = sgoffset;
+	gl->pages[0] = sgpage;
+
+	sg = sg_next(sg);
+	while (sg) {
+		struct page *page = sg_page(sg);
+
+		if (sgpage == page && sg->offset == sgoffset + sglen)
+			sglen += sg->length;
+		else {
+			/*  make sure the sgl is fit for ddp:
+			 *  each has the same page size, and
+			 *  all of the middle pages are used completely
+			 */
+			if ((j && sgoffset) || ((i != sgcnt - 1) &&
+					 ((sglen + sgoffset) & ~PAGE_MASK)))
+				goto error_out;
+
+			j++;
+			if (j == gl->nelem || sg->offset)
+				goto error_out;
+			gl->pages[j] = page;
+			sglen = sg->length;
+			sgoffset = sg->offset;
+			sgpage = page;
+		}
+		i++;
+		sg = sg_next(sg);
+	}
+	gl->nelem = ++j;
+
+	if (cxgb4i_ddp_gl_map(pdev, gl) < 0)
+		goto error_out;
+
+	return gl;
+
+error_out:
+	kfree(gl);
+	return NULL;
+}
+
+void cxgb4i_ddp_release_gl(struct cxgb4i_gather_list *gl,
+				struct pci_dev *pdev)
+{
+	cxgb4i_ddp_gl_unmap(pdev, gl);
+	kfree(gl);
+}
+
+int cxgb4i_ddp_tag_reserve(struct cxgb4i_snic *snic, unsigned int tid,
+				struct cxgb4i_tag_format *tformat, u32 *tagp,
+				struct cxgb4i_gather_list *gl, gfp_t gfp)
+{
+	struct cxgb4i_ddp_info *ddp = snic->ddp;
+	struct pagepod_hdr hdr;
+	unsigned int npods;
+	int idx = -1;
+	int err = -ENOMEM;
+	u32 sw_tag = *tagp;
+	u32 tag;
+
+	if (page_idx >= DDP_PGIDX_MAX || !ddp || !gl || !gl->nelem ||
+			gl->length < DDP_THRESHOLD) {
+		cxgb4i_log_debug("pgidx %u, xfer %u/%u, NO ddp.\n",
+				page_idx, gl->length, DDP_THRESHOLD);
+		return -EINVAL;
+	}
+
+	npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+
+	if (ddp->idx_last == ddp->nppods)
+		idx = cxgb4i_ddp_find_unused_entries(ddp, 0, ddp->nppods,
+							npods, gl);
+	else {
+		idx = cxgb4i_ddp_find_unused_entries(ddp, ddp->idx_last + 1,
+							ddp->nppods, npods,
+							gl);
+		if (idx < 0 && ddp->idx_last >= npods) {
+			idx = cxgb4i_ddp_find_unused_entries(ddp, 0,
+				min(ddp->idx_last + npods, ddp->nppods),
+							npods, gl);
+		}
+	}
+	if (idx < 0) {
+		cxgb4i_log_debug("xferlen %u, gl %u, npods %u NO DDP.\n",
+				gl->length, gl->nelem, npods);
+		return idx;
+	}
+
+	tag = cxgb4i_ddp_tag_base(tformat, sw_tag);
+	tag |= idx << PPOD_IDX_SHIFT;
+
+	hdr.rsvd = 0;
+	hdr.vld_tid = htonl(PPOD_VALID_FLAG | PPOD_TID(tid));
+	hdr.pgsz_tag_clr = htonl(tag & ddp->rsvd_tag_mask);
+	hdr.max_offset = htonl(gl->length);
+	hdr.page_offset = htonl(gl->offset);
+
+	err = cxgb4i_ddp_set_map(ddp, &hdr, idx, npods, gl);
+	if (err < 0)
+		goto unmark_entries;
+
+	ddp->idx_last = idx;
+	cxgb4i_log_debug("xfer %u, gl %u,%u, tid 0x%x, 0x%x -> 0x%x(%u,%u).\n",
+			gl->length, gl->nelem, gl->offset, tid, sw_tag, tag,
+			idx, npods);
+	*tagp = tag;
+	return 0;
+
+unmark_entries:
+	cxgb4i_ddp_unmark_entries(ddp, idx, npods);
+	return err;
+}
+
+void cxgb4i_ddp_tag_release(struct cxgb4i_snic *snic, u32 tag)
+{
+	struct cxgb4i_ddp_info *ddp = snic->ddp;
+	u32 idx;
+
+	if (!ddp) {
+		cxgb4i_log_error("release ddp tag 0x%x, ddp NULL.\n", tag);
+		return;
+	}
+
+	idx = (tag >> PPOD_IDX_SHIFT) & ddp->idx_mask;
+	if (idx < ddp->nppods) {
+		struct cxgb4i_gather_list *gl = ddp->gl_map[idx];
+		unsigned int npods;
+
+		if (!gl || !gl->nelem) {
+			cxgb4i_log_error("rel 0x%x, idx 0x%x, gl 0x%p, %u\n",
+					tag, idx, gl, gl ? gl->nelem : 0);
+			return;
+		}
+		npods = (gl->nelem + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+		cxgb4i_log_debug("ddp tag 0x%x, release idx 0x%x, npods %u.\n",
+				tag, idx, npods);
+		cxgb4i_ddp_clear_map(ddp, tag, idx, npods);
+		cxgb4i_ddp_unmark_entries(ddp, idx, npods);
+		cxgb4i_ddp_release_gl(gl, ddp->pdev);
+	} else
+		cxgb4i_log_error("ddp tag 0x%x, idx 0x%x > max 0x%x.\n",
+				tag, idx, ddp->nppods);
+}
+
+static int cxgb4i_ddp_setup_conn_pgidx(struct cxgb4i_sock *csk,
+					unsigned int tid, int pg_idx,
+					bool reply)
+{
+	struct sk_buff *skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
+					GFP_KERNEL);
+	struct cpl_set_tcb_field *req;
+	u64 val = pg_idx < DDP_PGIDX_MAX ? pg_idx : 0;
+
+	if (!skb)
+		return -ENOMEM;
+
+	/*  set up ulp submode and page size */
+	val = (val & 0x03) << 2;
+	val |= TCB_ULP_TYPE(ULP_MODE_ISCSI);
+	req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, tid);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, csk->hwtid));
+	req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid));
+	req->word_cookie = htons(TCB_WORD(W_TCB_ULP_RAW));
+	req->mask = cpu_to_be64(TCB_ULP_TYPE(TCB_ULP_TYPE_MASK));
+	req->val = cpu_to_be64(val);
+
+	skb->queue_mapping = CPL_PRIORITY_CONTROL;
+
+	cxgb4_ofld_send(csk->snic->lldi.ports[0], skb);
+	return 0;
+}
+
+int cxgb4i_ddp_setup_conn_host_pagesize(struct cxgb4i_sock *csk,
+						unsigned int tid,
+						int reply)
+{
+	return cxgb4i_ddp_setup_conn_pgidx(csk, tid, page_idx, reply);
+}
+
+int cxgb4i_ddp_setup_conn_pagesize(struct cxgb4i_sock *csk, unsigned int tid,
+					int reply, unsigned long pgsz)
+{
+	int pgidx = cxgb4i_ddp_find_page_index(pgsz);
+
+	return cxgb4i_ddp_setup_conn_pgidx(csk, tid, pgidx, reply);
+}
+
+int cxgb4i_ddp_setup_conn_digest(struct cxgb4i_sock *csk, unsigned int tid,
+				int hcrc, int dcrc, int reply)
+{
+	struct sk_buff *skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
+					GFP_KERNEL);
+	struct cpl_set_tcb_field *req;
+	u64 val = (hcrc ? ULP_CRC_HEADER : 0) | (dcrc ? ULP_CRC_DATA : 0);
+	val = TCB_ULP_RAW(val);
+	val |= TCB_ULP_TYPE(ULP_MODE_ISCSI);
+
+	if (!skb)
+		return -ENOMEM;
+
+	/*  set up ulp submode and page size */
+	req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, tid);
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
+	req->reply_ctrl = htons(NO_REPLY(reply) | QUEUENO(csk->rss_qid));
+	req->word_cookie = htons(TCB_WORD(W_TCB_ULP_RAW));
+	req->mask = cpu_to_be64(TCB_ULP_RAW(TCB_ULP_RAW_MASK));
+	req->val = cpu_to_be64(val);
+
+	skb->queue_mapping = CPL_PRIORITY_CONTROL;
+
+	cxgb4_ofld_send(csk->snic->lldi.ports[0], skb);
+	return 0;
+}
+
+int cxgb4i_ddp_info(struct cxgb4i_snic *snic,
+			struct cxgb4i_tag_format *tformat,
+			unsigned int *txsz, unsigned int *rxsz)
+{
+	struct cxgb4i_ddp_info *ddp;
+	unsigned char idx_bits;
+
+	if (!tformat || !snic->ddp)
+		return -EINVAL;
+
+	ddp = (struct cxgb4i_ddp_info *)snic->ddp;
+
+	idx_bits = 32 - tformat->sw_bits;
+	tformat->rsvd_bits = ddp->idx_bits;
+	tformat->rsvd_shift = PPOD_IDX_SHIFT;
+	tformat->rsvd_mask = (1 << tformat->rsvd_bits) - 1;
+
+	cxgb4i_log_info("tag format: sw %u, rsvd %u,%u, mask 0x%x.\n",
+			tformat->sw_bits, tformat->rsvd_bits,
+			tformat->rsvd_shift, tformat->rsvd_mask);
+
+	*txsz = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+			ddp->max_txsz - ISCSI_PDU_NONPAYLOAD_LEN);
+	*rxsz = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+			ddp->max_rxsz - ISCSI_PDU_NONPAYLOAD_LEN);
+	cxgb4i_log_info("max payload size: %u/%u, %u/%u.\n",
+			*txsz, ddp->max_txsz, *rxsz, ddp->max_rxsz);
+	return 0;
+}
+
+static void __cxgb4i_ddp_cleanup(struct kref *kref)
+{
+	int i = 0;
+	struct cxgb4i_ddp_info *ddp = container_of(kref,
+						struct cxgb4i_ddp_info,
+						refcnt);
+
+	cxgb4i_log_info("kref release ddp 0x%p, snic 0x%p\n", ddp, ddp->snic);
+
+	ddp->snic->ddp = NULL;
+
+	while (i < ddp->nppods) {
+		struct cxgb4i_gather_list *gl = ddp->gl_map[i];
+
+		if (gl) {
+			int npods = (gl->nelem + PPOD_PAGES_MAX - 1) >>
+							PPOD_PAGES_SHIFT;
+			cxgb4i_log_info("snic 0x%p, ddp %d + %d\n",
+						ddp->snic, i, npods);
+			kfree(gl);
+			i += npods;
+		} else
+			i++;
+	}
+	cxgb4i_free_big_mem(ddp);
+}
+
+
+static void __cxgb4i_ddp_init(struct cxgb4i_snic *snic)
+{
+	struct cxgb4i_ddp_info *ddp = snic->ddp;
+	unsigned int ppmax, bits, tagmask, pgsz_factor[4];
+	int i;
+
+	if (ddp) {
+		kref_get(&ddp->refcnt);
+		cxgb4i_log_warn("snic 0x%p, ddp 0x%p already set up\n",
+				snic, snic->ddp);
+		return;
+	}
+
+	sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1;
+	sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1;
+	snic->tag_format.sw_bits = sw_tag_idx_bits + sw_tag_age_bits;
+
+	cxgb4i_log_info("tag itt 0x%x, %u bits, age 0x%x, %u bits\n",
+			ISCSI_ITT_MASK, sw_tag_idx_bits,
+			ISCSI_AGE_MASK, sw_tag_age_bits);
+
+	ppmax = (snic->lldi.vr->iscsi.size >> PPOD_SIZE_SHIFT);
+	bits = __ilog2_u32(ppmax) + 1;
+	if (bits > PPOD_IDX_MAX_SIZE)
+		bits = PPOD_IDX_MAX_SIZE;
+	ppmax = (1 << (bits - 1)) - 1;
+
+	ddp = cxgb4i_alloc_big_mem(sizeof(struct cxgb4i_ddp_info) +
+			ppmax * (sizeof(struct cxgb4i_gather_list *) +
+				sizeof(struct sk_buff *)),
+				GFP_KERNEL);
+	if (!ddp) {
+		cxgb4i_log_warn("snic 0x%p unable to alloc ddp 0x%d, "
+			       "ddp disabled\n", snic, ppmax);
+		return;
+	}
+
+	ddp->gl_map = (struct cxgb4i_gather_list **)(ddp + 1);
+	spin_lock_init(&ddp->map_lock);
+	kref_init(&ddp->refcnt);
+
+	ddp->snic = snic;
+	ddp->pdev = snic->lldi.pdev;
+	ddp->max_txsz = min_t(unsigned int,
+				snic->lldi.iscsi_iolen,
+				ULP2_MAX_PKT_SIZE);
+	ddp->max_rxsz = min_t(unsigned int,
+				snic->lldi.iscsi_iolen,
+				ULP2_MAX_PKT_SIZE);
+	ddp->llimit = snic->lldi.vr->iscsi.start;
+	ddp->ulimit = ddp->llimit + snic->lldi.vr->iscsi.size;
+	ddp->nppods = ppmax;
+	ddp->idx_last = ppmax;
+	ddp->idx_bits = bits;
+	ddp->idx_mask = (1 << bits) - 1;
+	ddp->rsvd_tag_mask = (1 << (bits + PPOD_IDX_SHIFT)) - 1;
+
+	tagmask = ddp->idx_mask << PPOD_IDX_SHIFT;
+	for (i = 0; i < DDP_PGIDX_MAX; i++)
+		pgsz_factor[i] = ddp_page_order[i];
+
+	cxgb4_iscsi_init(snic->lldi.ports[0], tagmask, pgsz_factor);
+	snic->ddp = ddp;
+
+	snic->tag_format.rsvd_bits = ddp->idx_bits;
+	snic->tag_format.rsvd_shift = PPOD_IDX_SHIFT;
+	snic->tag_format.rsvd_mask = ((1 << snic->tag_format.rsvd_bits) - 1);
+
+	cxgb4i_log_info("tag format: sw %u, rsvd %u,%u, mask 0x%x.\n",
+						snic->tag_format.sw_bits,
+						snic->tag_format.rsvd_bits,
+						snic->tag_format.rsvd_shift,
+						snic->tag_format.rsvd_mask);
+
+	snic->tx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+				ddp->max_txsz - ISCSI_PDU_NONPAYLOAD_LEN);
+	snic->rx_max_size = min_t(unsigned int, ULP2_MAX_PDU_PAYLOAD,
+				ddp->max_rxsz - ISCSI_PDU_NONPAYLOAD_LEN);
+
+	cxgb4i_log_info("max payload size: %u/%u, %u/%u.\n",
+			snic->tx_max_size, ddp->max_txsz,
+			snic->rx_max_size, ddp->max_rxsz);
+
+	cxgb4i_log_info("snic 0x%p, nppods %u, bits %u, mask 0x%x,0x%x "
+			"pkt %u/%u, %u/%u\n",
+			snic, ppmax, ddp->idx_bits, ddp->idx_mask,
+			ddp->rsvd_tag_mask, ddp->max_txsz,
+			snic->lldi.iscsi_iolen,
+			ddp->max_rxsz, snic->lldi.iscsi_iolen);
+
+	return;
+}
+
+void cxgb4i_ddp_init(struct cxgb4i_snic *snic)
+{
+	if (page_idx == DDP_PGIDX_MAX) {
+		page_idx = cxgb4i_ddp_find_page_index(PAGE_SIZE);
+
+		if (page_idx == DDP_PGIDX_MAX) {
+			cxgb4i_log_info("system PAGE_SIZE %lu, update hw\n",
+					PAGE_SIZE);
+
+			if (cxgb4i_ddp_adjust_page_table()) {
+				cxgb4i_log_info("PAGE_SIZE %lu, ddp disabled\n",
+						PAGE_SIZE);
+				return;
+			}
+			page_idx = cxgb4i_ddp_find_page_index(PAGE_SIZE);
+		}
+		cxgb4i_log_info("system PAGE_SIZE %lu, ddp idx %u\n",
+				PAGE_SIZE, page_idx);
+	}
+
+	__cxgb4i_ddp_init(snic);
+}
+
+void cxgb4i_ddp_cleanup(struct cxgb4i_snic *snic)
+{
+	struct cxgb4i_ddp_info *ddp = snic->ddp;
+
+	cxgb4i_log_info("snic 0x%p, release ddp 0x%p\n", snic, ddp);
+	if (ddp)
+		kref_put(&ddp->refcnt, __cxgb4i_ddp_cleanup);
+}
+
diff --git a/drivers/scsi/cxgb4i/cxgb4i_ddp.h b/drivers/scsi/cxgb4i/cxgb4i_ddp.h
new file mode 100644
index 0000000..e749377
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i_ddp.h
@@ -0,0 +1,238 @@
+/*
+ * cxgb4i_ddp.h: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#ifndef	__CXGB4I_DDP_H__
+#define	__CXGB4I_DDP_H__
+
+#include <linux/vmalloc.h>
+
+struct cxgb4i_sock;
+
+struct cxgb4i_tag_format {
+	unsigned char sw_bits;
+	unsigned char rsvd_bits;
+	unsigned char rsvd_shift;
+	unsigned char filler[1];
+	unsigned int rsvd_mask;
+};
+
+struct cxgb4i_gather_list {
+	unsigned int tag;
+	unsigned int length;
+	unsigned int offset;
+	unsigned int nelem;
+	struct page **pages;
+	dma_addr_t phys_addr[0];
+};
+
+struct cxgb4i_ddp_info {
+	struct list_head list;
+	struct kref refcnt;
+	struct cxgb4i_snic *snic;
+	struct pci_dev *pdev;
+	unsigned int max_txsz;
+	unsigned int max_rxsz;
+	unsigned int llimit;
+	unsigned int ulimit;
+	unsigned int nppods;
+	unsigned int idx_last;
+	unsigned char idx_bits;
+	unsigned char filler[3];
+	unsigned int idx_mask;
+	unsigned int rsvd_tag_mask;
+	spinlock_t map_lock;
+	struct cxgb4i_gather_list **gl_map;
+};
+
+#define ISCSI_PDU_NONPAYLOAD_LEN	312 /* bhs(48) + ahs(256) + digest(8)*/
+#define ULP2_MAX_PKT_SIZE		16224
+#define ULP2_MAX_PDU_PAYLOAD	\
+	(ULP2_MAX_PKT_SIZE - ISCSI_PDU_NONPAYLOAD_LEN)
+/* # of pages a pagepod can hold without needing another pagepod */
+#define PPOD_PAGES			4
+#define PPOD_PAGES_MAX			4
+#define PPOD_PAGES_SHIFT		2       /*  4 pages per pod */
+
+struct pagepod_hdr {
+	unsigned int vld_tid;
+	unsigned int pgsz_tag_clr;
+	unsigned int max_offset;
+	unsigned int page_offset;
+	unsigned long long rsvd;
+};
+
+struct pagepod {
+	struct pagepod_hdr hdr;
+	unsigned long long addr[PPOD_PAGES_MAX + 1];
+};
+
+#define PPOD_SIZE               sizeof(struct pagepod)  /*  64 */
+#define PPOD_SIZE_SHIFT         6
+
+#define ULPMEM_DSGL_MAX_NPPODS	16	/*  1024/PPOD_SIZE */
+#define ULPMEM_IDATA_MAX_NPPODS	4	/*  256/PPOD_SIZE */
+#define PCIE_MEMWIN_MAX_NPPODS	16	/*  1024/PPOD_SIZE */
+
+#define PPOD_COLOR_SHIFT	0
+#define PPOD_COLOR_MASK		0x3F
+#define PPOD_COLOR_SIZE         6
+#define PPOD_COLOR(x)		((x) << PPOD_COLOR_SHIFT)
+
+#define PPOD_TAG_SHIFT	6
+#define PPOD_TAG_MASK	0xFFFFFF
+#define PPOD_TAG(x)	((x) << PPOD_TAG_SHIFT)
+
+#define PPOD_PGSZ_SHIFT	30
+#define PPOD_PGSZ_MASK	0x3
+#define PPOD_PGSZ(x)	((x) << PPOD_PGSZ_SHIFT)
+
+#define PPOD_TID_SHIFT	32
+#define PPOD_TID_MASK	0xFFFFFF
+#define PPOD_TID(x)	((__u64)(x) << PPOD_TID_SHIFT)
+
+#define PPOD_VALID_SHIFT	56
+#define PPOD_VALID(x)	((__u64)(x) << PPOD_VALID_SHIFT)
+#define PPOD_VALID_FLAG	PPOD_VALID(1ULL)
+
+#define PPOD_LEN_SHIFT	32
+#define PPOD_LEN_MASK	0xFFFFFFFF
+#define PPOD_LEN(x)	((__u64)(x) << PPOD_LEN_SHIFT)
+
+#define PPOD_OFST_SHIFT	0
+#define PPOD_OFST_MASK	0xFFFFFFFF
+#define PPOD_OFST(x)	((x) << PPOD_OFST_SHIFT)
+
+#define PPOD_IDX_SHIFT          PPOD_COLOR_SIZE
+#define PPOD_IDX_MAX_SIZE       24
+
+
+static inline void *cxgb4i_alloc_big_mem(unsigned int size,
+						gfp_t gfp)
+{
+	void *p = kmalloc(size, gfp);
+	if (!p)
+		p = vmalloc(size);
+	if (p)
+		memset(p, 0, size);
+	return p;
+}
+
+static inline void cxgb4i_free_big_mem(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+
+static inline int cxgb4i_is_ddp_tag(struct cxgb4i_tag_format *tformat, u32 tag)
+{
+	return !(tag & (1 << (tformat->rsvd_bits + tformat->rsvd_shift - 1)));
+}
+
+static inline int cxgb4i_sw_tag_usable(struct cxgb4i_tag_format *tformat,
+					u32 sw_tag)
+{
+	sw_tag >>= (32 - tformat->rsvd_bits);
+	return !sw_tag;
+}
+
+static inline u32 cxgb4i_set_non_ddp_tag(struct cxgb4i_tag_format *tformat,
+					u32 sw_tag)
+{
+	unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1;
+
+	u32 mask = (1 << shift) - 1;
+
+	if (sw_tag && (sw_tag & ~mask)) {
+		u32 v1 = sw_tag & ((1 << shift) - 1);
+		u32 v2 = (sw_tag >> (shift - 1)) << shift;
+
+		return v2 | v1 | 1 << shift;
+	}
+
+	return sw_tag | 1 << shift;
+}
+
+static inline u32 cxgb4i_ddp_tag_base(struct cxgb4i_tag_format *tformat,
+					u32 sw_tag)
+{
+	u32 mask = (1 << tformat->rsvd_shift) - 1;
+
+	if (sw_tag && (sw_tag & ~mask)) {
+		u32 v1 = sw_tag & mask;
+		u32 v2 = sw_tag >> tformat->rsvd_shift;
+
+		v2 <<= tformat->rsvd_bits + tformat->rsvd_shift;
+
+		return v2 | v1;
+	}
+
+	return sw_tag;
+}
+
+static inline u32 cxgb4i_tag_rsvd_bits(struct cxgb4i_tag_format *tformat,
+					u32 tag)
+{
+	if (cxgb4i_is_ddp_tag(tformat, tag))
+		return (tag >> tformat->rsvd_shift) & tformat->rsvd_mask;
+
+	return 0;
+}
+
+static inline u32 cxgb4i_tag_nonrsvd_bits(struct cxgb4i_tag_format *tformat,
+					u32 tag)
+{
+	unsigned char shift = tformat->rsvd_bits + tformat->rsvd_shift - 1;
+	u32 v1, v2;
+
+	if (cxgb4i_is_ddp_tag(tformat, tag)) {
+		v1 = tag & ((1 << tformat->rsvd_shift) - 1);
+		v2 = (tag >> (shift + 1)) << tformat->rsvd_shift;
+	} else {
+		u32 mask = (1 << shift) - 1;
+		tag &= ~(1 << shift);
+		v1 = tag & mask;
+		v2 = (tag >> 1) & ~mask;
+	}
+	return v1 | v2;
+}
+
+int cxgb4i_ddp_tag_reserve(struct cxgb4i_snic *, unsigned int,
+				struct cxgb4i_tag_format *, u32 *,
+				struct cxgb4i_gather_list *, gfp_t);
+void cxgb4i_ddp_tag_release(struct cxgb4i_snic *, u32);
+struct cxgb4i_gather_list *cxgb4i_ddp_make_gl(unsigned int,
+					struct scatterlist *,
+					unsigned int,
+					struct pci_dev *,
+					gfp_t);
+void cxgb4i_ddp_release_gl(struct cxgb4i_gather_list *,
+			struct pci_dev *pdev);
+int cxgb4i_ddp_setup_conn_host_pagesize(struct cxgb4i_sock*, unsigned int,
+					int);
+int cxgb3i_ddp_setup_conn_pagesize(struct cxgb4i_sock *, unsigned int, int,
+					unsigned long);
+
+int cxgb4i_ddp_setup_conn_digest(struct cxgb4i_sock *, unsigned int,
+				int, int, int);
+int cxgb4i_ddp_find_page_idx(unsigned long);
+int cxgb4i_snic_ddp_info(struct cxgb4i_snic *, struct cxgb4i_tag_format *,
+			unsigned int *, unsigned int *);
+
+void cxgb4i_ddp_init(struct cxgb4i_snic *);
+void cxgb4i_ddp_cleanup(struct cxgb4i_snic *);
+
+#endif	/* __CXGB4I_DDP_H__ */
+
diff --git a/drivers/scsi/cxgb4i/cxgb4i_offload.c b/drivers/scsi/cxgb4i/cxgb4i_offload.c
new file mode 100644
index 0000000..f98b020
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i_offload.c
@@ -0,0 +1,1853 @@
+/*
+ * cxgb4i_offload.c: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#include <linux/if_vlan.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+#include "cxgb4i.h"
+#include "cxgb4i_offload.h"
+#include "cxgb4i_pdu.h"
+
+static int cxgb4i_rcv_win = 256 * 1024;
+module_param(cxgb4i_rcv_win, int, 0644);
+MODULE_PARM_DESC(cxgb4i_rcv_win, "TCP reveive window in bytes");
+
+static int cxgb4i_snd_win = 128 * 1024;
+module_param(cxgb4i_snd_win, int, 0644);
+MODULE_PARM_DESC(cxgb4i_snd_win, "TCP send window in bytes");
+
+static int cxgb4i_rx_credit_thres = 10 * 1024;
+module_param(cxgb4i_rx_credit_thres, int, 0644);
+MODULE_PARM_DESC(cxgb4i_rx_credit_thres,
+		"RX credits return threshold in bytes (default=10KB)");
+
+static unsigned int cxgb4i_max_connect = (8 * 1024);
+module_param(cxgb4i_max_connect, uint, 0644);
+MODULE_PARM_DESC(cxgb4i_max_connect, "Maximum number of connections");
+
+static unsigned short cxgb4i_sport_base = 20000;
+module_param(cxgb4i_sport_base, ushort, 0644);
+MODULE_PARM_DESC(cxgb4i_sport_base, "Starting port number (default 20000)");
+
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define RCV_BUFSIZ_MASK	0x3FFU
+
+static void cxgb4i_sock_release_offload_resources(struct cxgb4i_sock *);
+static void cxgb4i_sock_conn_closing(struct cxgb4i_sock *);
+static int cxgb4i_sock_push_tx_frames(struct cxgb4i_sock *, int);
+
+
+#define MAX_IMM_TX_PKT_LEN 128
+
+/*
+ * is_ofld_imm - check whether a packet can be sent as immediate data
+ * @skb: the packet
+ *
+ * Returns true if a packet can be sent as an offload WR with immediate
+ * data.  We currently use the same limit as for Ethernet packets.
+ */
+static inline int is_ofld_imm(const struct sk_buff *skb)
+{
+	return skb->len <= (MAX_IMM_TX_PKT_LEN -
+			sizeof(struct fw_ofld_tx_data_wr));
+}
+
+static int cxgb4i_sock_get_port(struct cxgb4i_sock *csk)
+{
+	unsigned int start;
+	int idx;
+
+	if (!csk->snic->pmap)
+		goto error_out;
+
+	if (csk->saddr.sin_port) {
+		cxgb4i_log_error("connect, sin_port none ZERO %u\n",
+				ntohs(csk->saddr.sin_port));
+		return -EADDRINUSE;
+	}
+
+	spin_lock_bh(&csk->snic->pmap->lock);
+	start = idx = csk->snic->pmap->next;
+
+	do {
+		if (++idx >= cxgb4i_max_connect)
+			idx = 0;
+		if (!csk->snic->pmap->port_csk[idx]) {
+			csk->saddr.sin_port = htons(cxgb4i_sport_base + idx);
+			csk->snic->pmap->next = idx;
+			csk->snic->pmap->port_csk[idx] = csk;
+			spin_unlock_bh(&csk->snic->pmap->lock);
+
+			cxgb4i_conn_debug("reserved port %u\n",
+						cxgb4i_sport_base + idx);
+
+			return 0;
+		}
+	} while (idx != start);
+	spin_unlock_bh(&csk->snic->pmap->lock);
+
+error_out:
+	return -EADDRNOTAVAIL;
+}
+
+static void cxgb4i_sock_put_port(struct cxgb4i_sock *csk)
+{
+	if (csk->saddr.sin_port) {
+		int idx = ntohs(csk->saddr.sin_port) - cxgb4i_sport_base;
+
+		csk->saddr.sin_port = 0;
+		if (idx < 0 || idx >= cxgb4i_sport_base)
+			return;
+
+		spin_lock_bh(&csk->snic->pmap->lock);
+		csk->snic->pmap->port_csk[idx] = NULL;
+		spin_unlock_bh(&csk->snic->pmap->lock);
+
+		cxgb4i_conn_debug("released port %u\n",
+				cxgb4i_sport_base + idx);
+	}
+}
+
+static inline void cxgb4i_sock_set_flag(struct cxgb4i_sock *csk,
+					enum cxgb4i_sock_flags flag)
+{
+	__set_bit(flag, &csk->flags);
+	cxgb4i_conn_debug("csk 0x%p, set %d, state %u, flags 0x%lu\n",
+			csk, flag, csk->state, csk->flags);
+}
+
+static inline void cxgb4i_sock_clear_flag(struct cxgb4i_sock *csk,
+					enum cxgb4i_sock_flags flag)
+{
+	__clear_bit(flag, &csk->flags);
+	cxgb4i_conn_debug("csk 0x%p, clear %d, state %u, flags 0x%lu\n",
+			csk, flag, csk->state, csk->flags);
+}
+
+static inline int cxgb4i_sock_flag(struct cxgb4i_sock *csk,
+				enum cxgb4i_sock_flags flag)
+{
+	if (csk == NULL)
+		return 0;
+
+	return test_bit(flag, &csk->flags);
+}
+
+static void cxgb4i_sock_set_state(struct cxgb4i_sock *csk, int state)
+{
+	csk->state = state;
+}
+
+static inline void cxgb4i_sock_hold(struct cxgb4i_sock *csk)
+{
+	atomic_inc(&csk->refcnt);
+}
+
+static inline void cxgb4i_sock_put(struct cxgb4i_sock *csk)
+{
+	if (atomic_dec_and_test(&csk->refcnt)) {
+		cxgb4i_conn_debug("free csk 0x%p, state %u, flags 0x%lx\n",
+				csk, csk->state, csk->flags);
+		kfree(csk);
+	}
+}
+
+static void cxgb4i_sock_closed(struct cxgb4i_sock *csk)
+{
+	cxgb4i_conn_debug("csk 0x%p, state %u, flags 0x%lx\n",
+			csk, csk->state, csk->flags);
+
+	cxgb4i_sock_put_port(csk);
+	cxgb4i_sock_release_offload_resources(csk);
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_CLOSED);
+	cxgb4i_sock_conn_closing(csk);
+}
+
+static unsigned int cxgb4i_find_best_mtu(struct cxgb4i_snic *snic,
+						unsigned short mtu)
+{
+	int i = 0;
+
+	while (i < NMTUS - 1 && snic->lldi.mtus[i + 1] <= mtu)
+		++i;
+
+	return i;
+}
+
+static unsigned int cxgb4i_select_mss(struct cxgb4i_sock *csk,
+						unsigned int pmtu)
+{
+	unsigned int idx;
+	struct dst_entry *dst = csk->dst;
+	u16 advmss = dst_metric(dst, RTAX_ADVMSS);
+
+	if (advmss > pmtu - 40)
+		advmss = pmtu - 40;
+	if (advmss < csk->snic->lldi.mtus[0] - 40)
+		advmss = csk->snic->lldi.mtus[0] - 40;
+	idx = cxgb4i_find_best_mtu(csk->snic, advmss + 40);
+
+	return idx;
+}
+
+static inline int cxgb4i_sock_compute_wscale(int win)
+{
+	int wscale = 0;
+
+	while (wscale < 14 && (65535 << wscale) < win)
+		wscale++;
+
+	return wscale;
+}
+
+static void cxgb4i_sock_make_act_open_req(struct cxgb4i_sock *csk,
+					   struct sk_buff *skb,
+					   unsigned int qid_atid,
+					   struct l2t_entry *e)
+{
+	struct cpl_act_open_req *req;
+	unsigned long long opt0;
+	unsigned int opt2;
+	int wscale;
+
+	cxgb4i_conn_debug("csk 0x%p, atid 0x%x\n", csk, qid_atid);
+
+	wscale = cxgb4i_sock_compute_wscale(csk->mss_idx);
+
+	opt0 = KEEP_ALIVE(1) |
+		WND_SCALE(wscale) |
+		MSS_IDX(csk->mss_idx) |
+		L2T_IDX(csk->l2t->idx) |
+		TX_CHAN(csk->tx_chan) |
+		SMAC_SEL(csk->smac_idx) |
+		RCV_BUFSIZ(cxgb4i_rcv_win >> 10);
+
+	opt2 = RX_CHANNEL(0) |
+		RSS_QUEUE_VALID |
+		RSS_QUEUE(csk->rss_qid);
+
+	skb->queue_mapping = CPL_PRIORITY_SETUP;
+	req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req));
+	INIT_TP_WR(req, 0);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+					qid_atid));
+	req->local_port = csk->saddr.sin_port;
+	req->peer_port = csk->daddr.sin_port;
+	req->local_ip = csk->saddr.sin_addr.s_addr;
+	req->peer_ip = csk->daddr.sin_addr.s_addr;
+	req->opt0 = cpu_to_be64(opt0);
+	req->params = 0;
+	req->opt2 = cpu_to_be32(opt2);
+}
+
+static void cxgb4i_fail_act_open(struct cxgb4i_sock *csk, int errno)
+{
+	cxgb4i_conn_debug("csk 0%p, state %u, flag 0x%lx\n", csk,
+			csk->state, csk->flags);
+	csk->err = errno;
+	cxgb4i_sock_closed(csk);
+}
+
+static void cxgb4i_act_open_req_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk = (struct cxgb4i_sock *)skb->sk;
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+	if (csk->state == CXGB4I_CSK_ST_CONNECTING)
+		cxgb4i_fail_act_open(csk, -EHOSTUNREACH);
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+	__kfree_skb(skb);
+}
+
+static void cxgb4i_sock_skb_entail(struct cxgb4i_sock *csk,
+					struct sk_buff *skb,
+					int flags)
+{
+	cxgb4i_skb_tcp_seq(skb) = csk->write_seq;
+	cxgb4i_skb_flags(skb) = flags;
+	__skb_queue_tail(&csk->write_queue, skb);
+}
+
+static void cxgb4i_sock_send_close_req(struct cxgb4i_sock *csk)
+{
+	struct sk_buff *skb = csk->cpl_close;
+	struct cpl_close_con_req *req = (struct cpl_close_con_req *)skb->head;
+	unsigned int tid = csk->hwtid;
+
+	csk->cpl_close = NULL;
+
+	skb->queue_mapping = CPL_PRIORITY_DATA;
+	INIT_TP_WR(req, tid);
+
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+	req->rsvd = 0;
+
+	cxgb4i_sock_skb_entail(csk, skb, CXGB4I_SKCB_FLAG_NO_APPEND);
+	if (csk->state != CXGB4I_CSK_ST_CONNECTING)
+		cxgb4i_sock_push_tx_frames(csk, 1);
+}
+
+static void cxgb4i_sock_abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+	struct cpl_abort_req *req = cplhdr(skb);
+	struct cxgb4i_sock *csk = (struct cxgb4i_sock *)handle;
+
+	req->cmd = CPL_ABORT_NO_RST;
+	cxgb4_ofld_send(csk->snic->lldi.ports[csk->port_id], skb);
+}
+
+static inline void cxgb4i_sock_purge_write_queue(struct cxgb4i_sock *csk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&csk->write_queue)))
+		__kfree_skb(skb);
+}
+
+static void cxgb4i_sock_send_abort_req(struct cxgb4i_sock *csk)
+{
+	struct cpl_abort_req *req;
+	struct sk_buff *skb = csk->cpl_abort_req;
+
+	if (unlikely(csk->state == CXGB4I_CSK_ST_ABORTING) ||
+			!skb || !csk->snic)
+		return;
+
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_ABORTING);
+
+	cxgb4i_conn_debug("csk 0x%p, flag ABORT_RPL + ABORT_SHUT\n", csk);
+
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_FL_ABORT_RPL_PENDING);
+
+	cxgb4i_sock_purge_write_queue(csk);
+
+	csk->cpl_abort_req = NULL;
+	req = (struct cpl_abort_req *)skb->head;
+
+	skb->queue_mapping = CPL_PRIORITY_DATA;
+	t4_set_arp_err_handler(skb, csk, cxgb4i_sock_abort_arp_failure);
+	INIT_TP_WR(req, csk->hwtid);
+	OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, csk->hwtid));
+	req->rsvd0 = htonl(csk->snd_nxt);
+	req->rsvd1 = !cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_TX_DATA_SENT);
+	req->cmd = CPL_ABORT_SEND_RST;
+
+	cxgb4_l2t_send(csk->snic->lldi.ports[csk->port_id], skb, csk->l2t);
+}
+
+static void cxgb4i_sock_send_abort_rpl(struct cxgb4i_sock *csk, int rst_status)
+{
+	struct sk_buff *skb = csk->cpl_abort_rpl;
+	struct cpl_abort_rpl *rpl = (struct cpl_abort_rpl *)skb->head;
+
+	csk->cpl_abort_rpl = NULL;
+
+	skb->queue_mapping = CPL_PRIORITY_DATA;
+	INIT_TP_WR(rpl, csk->hwtid);
+	OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, csk->hwtid));
+	rpl->cmd = rst_status;
+
+	cxgb4_ofld_send(csk->snic->lldi.ports[csk->port_id], skb);
+}
+
+static u32 cxgb4i_csk_send_rx_credits(struct cxgb4i_sock *csk, u32 credits)
+{
+	struct sk_buff *skb;
+	struct cpl_rx_data_ack *req;
+	int wrlen = roundup(sizeof(*req), 16);
+
+	skb = alloc_skb(wrlen, GFP_ATOMIC);
+	if (!skb)
+		return 0;
+
+	req = (struct cpl_rx_data_ack *)__skb_put(skb, wrlen);
+	memset(req, 0, wrlen);
+	skb->queue_mapping = CPL_PRIORITY_ACK;
+	INIT_TP_WR(req, csk->hwtid);
+	OPCODE_TID(req) =
+		cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK, csk->hwtid));
+	req->credit_dack = cpu_to_be32(RX_CREDITS(credits) | RX_FORCE_ACK(1));
+	cxgb4_ofld_send(csk->snic->lldi.ports[csk->port_id], skb);
+	return credits;
+}
+
+
+#define SKB_WR_LIST_SIZE	(MAX_SKB_FRAGS + 2)
+
+static const unsigned int cxgb4i_ulp_extra_len[] = { 0, 4, 4, 8 };
+static inline unsigned int ulp_extra_len(const struct sk_buff *skb)
+{
+	return cxgb4i_ulp_extra_len[cxgb4i_skb_ulp_mode(skb) & 3];
+}
+
+static inline void cxgb4i_sock_reset_wr_list(struct cxgb4i_sock *csk)
+{
+	csk->wr_pending_head = csk->wr_pending_tail = NULL;
+}
+
+static inline void cxgb4i_sock_enqueue_wr(struct cxgb4i_sock *csk,
+						struct sk_buff *skb)
+{
+	cxgb4i_skb_tx_wr_next(skb) = NULL;
+
+	/*
+	 * We want to take an extra reference since both us and the driver
+	 * need to free the packet before it's really freed. We know there's
+	 * just one user currently so we use atomic_set rather than skb_get
+	 * to avoid the atomic op.
+	 */
+	atomic_set(&skb->users, 2);
+
+	if (!csk->wr_pending_head)
+		csk->wr_pending_head = skb;
+
+	else
+		cxgb4i_skb_tx_wr_next(csk->wr_pending_tail) = skb;
+
+	csk->wr_pending_tail = skb;
+}
+
+static int cxgb4i_sock_count_pending_wrs(const struct cxgb4i_sock *csk)
+{
+	int n = 0;
+	const struct sk_buff *skb = csk->wr_pending_head;
+
+	while (skb) {
+		n += skb->csum;
+		skb = cxgb4i_skb_tx_wr_next(skb);
+	}
+	return n;
+}
+
+static inline struct sk_buff *cxgb4i_sock_peek_wr(const struct cxgb4i_sock *csk)
+{
+	return csk->wr_pending_head;
+}
+
+static inline void cxgb4i_sock_free_wr_skb(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static inline struct sk_buff *cxgb4i_sock_dequeue_wr(struct cxgb4i_sock *csk)
+{
+	struct sk_buff *skb = csk->wr_pending_head;
+
+	if (likely(skb)) {
+		csk->wr_pending_head = cxgb4i_skb_tx_wr_next(skb);
+		cxgb4i_skb_tx_wr_next(skb) = NULL;
+	}
+	return skb;
+}
+
+static void cxgb4i_sock_purge_wr_queue(struct cxgb4i_sock *csk)
+{
+	struct sk_buff *skb;
+
+	while ((skb = cxgb4i_sock_dequeue_wr(csk)) != NULL)
+		cxgb4i_sock_free_wr_skb(skb);
+}
+
+/*
+ * sgl_len - calculates the size of an SGL of the given capacity
+ * @n: the number of SGL entries
+ * Calculates the number of flits needed for a scatter/gather list that
+ * can hold the given number of entries.
+ */
+static inline unsigned int sgl_len(unsigned int n)
+{
+	n--;
+	return (3 * n) / 2 + (n & 1) + 2;
+}
+
+/*
+ * calc_tx_flits_ofld - calculate # of flits for an offload packet
+ * @skb: the packet
+ *
+ * Returns the number of flits needed for the given offload packet.
+ * These packets are already fully constructed and no additional headers
+ * will be added.
+ */
+static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb)
+{
+	unsigned int flits, cnt;
+
+	if (is_ofld_imm(skb))
+		return DIV_ROUND_UP(skb->len, 8);
+
+	flits = skb_transport_offset(skb) / 8;
+	cnt = skb_shinfo(skb)->nr_frags;
+	if (skb->tail != skb->transport_header)
+		cnt++;
+	return flits + sgl_len(cnt);
+}
+
+static inline void cxgb4i_sock_send_tx_flowc_wr(struct cxgb4i_sock *csk)
+{
+	struct sk_buff *skb;
+	struct fw_flowc_wr *flowc;
+	int flowclen, i;
+
+	flowclen = 80;
+	skb = alloc_skb(flowclen, GFP_ATOMIC);
+	flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+
+	flowc->op_to_nparams =
+		htonl(FW_WR_OP(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS(8));
+	flowc->flowid_len16 =
+		htonl(FW_WR_LEN16(DIV_ROUND_UP(72, 16)) |
+				FW_WR_FLOWID(csk->hwtid));
+
+	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+	flowc->mnemval[0].val = htonl(0);
+	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+	flowc->mnemval[1].val = htonl(csk->tx_chan);
+	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+	flowc->mnemval[2].val = htonl(csk->tx_chan);
+	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+	flowc->mnemval[3].val = htonl(csk->rss_qid);
+	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+	flowc->mnemval[4].val = htonl(csk->snd_nxt);
+	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+	flowc->mnemval[5].val = htonl(csk->rcv_nxt);
+	flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+	flowc->mnemval[6].val = htonl(cxgb4i_snd_win);
+	flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+	flowc->mnemval[7].val = htonl(csk->mss_idx);
+	flowc->mnemval[8].mnemonic = 0;
+	flowc->mnemval[8].val = 0;
+	for (i = 0; i < 9; i++) {
+		flowc->mnemval[i].r4[0] = 0;
+		flowc->mnemval[i].r4[1] = 0;
+		flowc->mnemval[i].r4[2] = 0;
+	}
+
+	skb->queue_mapping = CPL_PRIORITY_DATA;
+
+	cxgb4_ofld_send(csk->snic->lldi.ports[csk->port_id], skb);
+}
+
+static inline void cxgb4i_sock_make_tx_data_wr(struct cxgb4i_sock *csk,
+						struct sk_buff *skb, int dlen,
+						int len, u32 credits,
+						int req_completion)
+{
+	struct fw_ofld_tx_data_wr *req;
+	unsigned int wr_ulp_mode;
+
+	if (is_ofld_imm(skb)) {
+			req = (struct fw_ofld_tx_data_wr *)
+				__skb_push(skb, sizeof(*req));
+			req->op_to_immdlen =
+				cpu_to_be32(FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+					FW_WR_COMPL(req_completion) |
+					FW_WR_IMMDLEN(dlen));
+			req->flowid_len16 =
+				cpu_to_be32(FW_WR_FLOWID(csk->hwtid) |
+						FW_WR_LEN16(credits));
+	} else {
+		req = (struct fw_ofld_tx_data_wr *)
+			__skb_push(skb, sizeof(*req));
+		req->op_to_immdlen =
+			cpu_to_be32(FW_WR_OP(FW_OFLD_TX_DATA_WR) |
+					FW_WR_COMPL(req_completion) |
+					FW_WR_IMMDLEN(0));
+		req->flowid_len16 =
+			cpu_to_be32(FW_WR_FLOWID(csk->hwtid) |
+					FW_WR_LEN16(credits));
+	}
+
+	wr_ulp_mode =
+		FW_OFLD_TX_DATA_WR_ULPMODE(cxgb4i_skb_ulp_mode(skb) >> 4) |
+		FW_OFLD_TX_DATA_WR_ULPSUBMODE(cxgb4i_skb_ulp_mode(skb) & 3);
+
+	req->tunnel_to_proxy = cpu_to_be32(wr_ulp_mode) |
+		FW_OFLD_TX_DATA_WR_SHOVE(skb_peek(&csk->write_queue) ? 0 : 1);
+
+	req->plen = cpu_to_be32(len);
+
+	if (!cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_TX_DATA_SENT))
+		cxgb4i_sock_set_flag(csk, CXGB4I_CSK_FL_TX_DATA_SENT);
+}
+
+static void cxgb4i_sock_arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static int cxgb4i_sock_push_tx_frames(struct cxgb4i_sock *csk,
+						int req_completion)
+{
+	int total_size = 0;
+	struct sk_buff *skb;
+	struct cxgb4i_snic *snic;
+
+	if (unlikely(csk->state == CXGB4I_CSK_ST_CONNECTING ||
+				csk->state == CXGB4I_CSK_ST_CLOSE_WAIT_1 ||
+				csk->state >= CXGB4I_CSK_ST_ABORTING)) {
+		cxgb4i_tx_debug("csk 0x%p, in closing state %u.\n",
+				csk, csk->state);
+		return 0;
+	}
+
+	snic = csk->snic;
+
+	while (csk->wr_cred
+			&& (skb = skb_peek(&csk->write_queue)) != NULL) {
+		int dlen;
+		int len;
+		unsigned int credits_needed;
+
+		dlen = len = skb->len;
+		skb_reset_transport_header(skb);
+
+		if (is_ofld_imm(skb))
+			credits_needed = DIV_ROUND_UP(dlen +
+					sizeof(struct fw_ofld_tx_data_wr), 16);
+		else
+			credits_needed = DIV_ROUND_UP(8 *
+					calc_tx_flits_ofld(skb)+
+					sizeof(struct fw_ofld_tx_data_wr), 16);
+
+		if (csk->wr_cred < credits_needed) {
+			cxgb4i_tx_debug("csk 0x%p, skb len %u/%u, "
+					"wr %d < %u.\n",
+					csk, skb->len, skb->data_len,
+					credits_needed, csk->wr_cred);
+			break;
+		}
+
+		__skb_unlink(skb, &csk->write_queue);
+		skb->queue_mapping = CPL_PRIORITY_DATA;
+		skb->csum = credits_needed; /* remember this until the WR_ACK */
+		csk->wr_cred -= credits_needed;
+		csk->wr_una_cred += credits_needed;
+		cxgb4i_sock_enqueue_wr(csk, skb);
+
+		cxgb4i_tx_debug("csk 0x%p, enqueue, skb len %u/%u, "
+				"wr %d, left %u, unack %u.\n",
+				csk, skb->len, skb->data_len,
+				credits_needed, csk->wr_cred,
+				csk->wr_una_cred);
+
+
+		if (likely(cxgb4i_skb_flags(skb) &
+					CXGB4I_SKCB_FLAG_NEED_HDR)) {
+			len += ulp_extra_len(skb);
+			if (!cxgb4i_sock_flag(csk,
+						CXGB4I_CSK_FL_TX_DATA_SENT)) {
+				cxgb4i_sock_send_tx_flowc_wr(csk);
+				skb->csum += 5;
+				csk->wr_cred -= 5;
+				csk->wr_una_cred += 5;
+			}
+
+			if ((req_completion &&
+				csk->wr_una_cred == credits_needed) ||
+				(cxgb4i_skb_flags(skb) &
+				  CXGB4I_SKCB_FLAG_COMPL) ||
+				csk->wr_una_cred >= csk->wr_max_cred / 2) {
+				req_completion = 1;
+				csk->wr_una_cred = 0;
+			}
+			cxgb4i_sock_make_tx_data_wr(csk, skb, dlen, len,
+							credits_needed,
+							req_completion);
+			csk->snd_nxt += len;
+
+			if (req_completion)
+				cxgb4i_skb_flags(skb) &=
+					~CXGB4I_SKCB_FLAG_NEED_HDR;
+		}
+
+		total_size += skb->truesize;
+		t4_set_arp_err_handler(skb, csk,
+					cxgb4i_sock_arp_failure_discard);
+		cxgb4_l2t_send(snic->lldi.ports[csk->port_id], skb, csk->l2t);
+	}
+	return total_size;
+}
+
+static inline void cxgb4i_sock_free_atid(struct cxgb4i_sock *csk)
+{
+	cxgb4_free_atid(csk->snic->lldi.tids, csk->atid);
+	cxgb4i_sock_put(csk);
+}
+
+static void cxgb4i_sock_established(struct cxgb4i_sock *csk, u32 snd_isn,
+					unsigned int opt)
+{
+	cxgb4i_conn_debug("csk 0x%p, state %u.\n", csk, csk->state);
+
+	csk->write_seq = csk->snd_nxt = csk->snd_una = snd_isn;
+
+	/*
+	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
+	 * pass through opt0.
+	 */
+	if (cxgb4i_rcv_win > (RCV_BUFSIZ_MASK << 10))
+		csk->rcv_wup -= cxgb4i_rcv_win - (RCV_BUFSIZ_MASK << 10);
+
+	dst_confirm(csk->dst);
+
+	smp_mb();
+
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_ESTABLISHED);
+}
+
+static int cxgb4i_cpl_act_establish(struct cxgb4i_snic *snic,
+						struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	unsigned int atid = GET_TID_TID(ntohl(req->tos_atid));
+	struct tid_info *t = snic->lldi.tids;
+	u32 rcv_isn = be32_to_cpu(req->rcv_isn);
+
+	csk = lookup_atid(t, atid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_conn_debug("csk 0x%p, state %u, flag 0x%lx\n",
+				csk, csk->state, csk->flags);
+	csk->hwtid = hwtid;
+	cxgb4i_sock_hold(csk);
+	cxgb4_insert_tid(snic->lldi.tids, csk, hwtid);
+	cxgb4_free_atid(snic->lldi.tids, atid);
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state != CXGB4I_CSK_ST_CONNECTING))
+		cxgb4i_log_error("TID %u expected SYN_SENT, got EST., s %u\n",
+				csk->hwtid, csk->state);
+
+	csk->copied_seq = csk->rcv_wup = csk->rcv_nxt = rcv_isn;
+	cxgb4i_sock_established(csk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+	__kfree_skb(skb);
+
+	if (unlikely(cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ACTIVE_CLOSE_NEEDED)))
+		cxgb4i_sock_send_abort_req(csk);
+	else {
+		if (skb_queue_len(&csk->write_queue))
+			cxgb4i_sock_push_tx_frames(csk, 1);
+
+		cxgb4i_conn_tx_open(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+
+	return 0;
+}
+
+static int act_open_rpl_status_to_errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return -ECONNREFUSED;
+	case CPL_ERR_ARP_MISS:
+		return -EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return -ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return -ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		cxgb4i_log_error("ACTIVE_OPEN_RPL: 4-tuple in use\n");
+		return -EADDRINUSE;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+		status != CPL_ERR_ARP_MISS;
+}
+
+static void cxgb4i_sock_act_open_retry_timer(unsigned long data)
+{
+	struct sk_buff *skb;
+	struct cxgb4i_sock *csk = (struct cxgb4i_sock *)data;
+
+	cxgb4i_conn_debug("csk 0x%p, state %u.\n", csk, csk->state);
+
+	spin_lock_bh(&csk->lock);
+	skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC);
+	if (!skb)
+		cxgb4i_fail_act_open(csk, -ENOMEM);
+	else {
+		unsigned int qid_atid  = csk->rss_qid << 14;
+		qid_atid |= (unsigned int)csk->atid;
+		skb->sk = (struct sock *)csk;
+		t4_set_arp_err_handler(skb, csk,
+					cxgb4i_act_open_req_arp_failure);
+		cxgb4i_sock_make_act_open_req(csk, skb, qid_atid, csk->l2t);
+		cxgb4_l2t_send(csk->snic->lldi.ports[csk->port_id], skb,
+								csk->l2t);
+	}
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+}
+
+static int cxgb4i_cpl_act_open_rpl(struct cxgb4i_snic *snic,
+						struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+	unsigned int atid =
+		GET_TID_TID(GET_AOPEN_ATID(be32_to_cpu(rpl->atid_status)));
+	struct tid_info *t = snic->lldi.tids;
+	unsigned int status = GET_AOPEN_STATUS(be32_to_cpu(rpl->atid_status));
+
+	csk = lookup_atid(t, atid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", atid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	cxgb4i_conn_debug("rcv, status 0x%x, csk 0x%p, csk->state %u, "
+			"csk->flag 0x%lx, csk->atid %u.\n",
+			status, csk, csk->state, csk->flags, csk->hwtid);
+
+	if (status & act_open_has_tid(status))
+		cxgb4_remove_tid(snic->lldi.tids, csk->port_id, GET_TID(rpl));
+
+	if (status == CPL_ERR_CONN_EXIST &&
+			csk->retry_timer.function !=
+			cxgb4i_sock_act_open_retry_timer) {
+		csk->retry_timer.function = cxgb4i_sock_act_open_retry_timer;
+		if (!mod_timer(&csk->retry_timer, jiffies + HZ / 2))
+			cxgb4i_sock_hold(csk);
+	} else
+
+		cxgb4i_fail_act_open(csk, act_open_rpl_status_to_errno(status));
+
+	__kfree_skb(skb);
+
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+
+	return 0;
+}
+
+static int cxgb4i_cpl_peer_close(struct cxgb4i_snic *snic, struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_peer_close *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	struct tid_info *t = snic->lldi.tids;
+
+	csk = lookup_tid(t, hwtid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_RPL_PENDING))
+		goto out;
+
+	switch (csk->state) {
+	case CXGB4I_CSK_ST_ESTABLISHED:
+		cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_PASSIVE_CLOSE);
+		break;
+	case CXGB4I_CSK_ST_ACTIVE_CLOSE:
+		cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_CLOSE_WAIT_2);
+		break;
+	case CXGB4I_CSK_ST_CLOSE_WAIT_1:
+		cxgb4i_sock_closed(csk);
+		break;
+	case CXGB4I_CSK_ST_ABORTING:
+		break;
+	default:
+		cxgb4i_log_error("peer close, TID %u in bad state %u\n",
+				csk->hwtid, csk->state);
+	}
+
+	cxgb4i_sock_conn_closing(csk);
+
+out:
+	__kfree_skb(skb);
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+
+	return 0;
+}
+
+static int cxgb4i_cpl_close_con_rpl(struct cxgb4i_snic *snic,
+						struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_close_con_rpl *rpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(rpl);
+	struct tid_info *t = snic->lldi.tids;
+
+	csk = lookup_tid(t, hwtid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	cxgb4i_conn_debug("csk 0x%p, state %u, flag 0x%lx.\n",
+			csk, csk->state, csk->flags);
+
+	csk->snd_una = ntohl(rpl->snd_nxt) - 1;
+
+	if (cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_RPL_PENDING))
+		goto out;
+
+	switch (csk->state) {
+	case CXGB4I_CSK_ST_ACTIVE_CLOSE:
+		cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_CLOSE_WAIT_1);
+		break;
+	case CXGB4I_CSK_ST_CLOSE_WAIT_1:
+	case CXGB4I_CSK_ST_CLOSE_WAIT_2:
+		cxgb4i_sock_closed(csk);
+		break;
+	case CXGB4I_CSK_ST_ABORTING:
+		break;
+	default:
+		cxgb4i_log_error("close_rpl, TID %u in bad state %u\n",
+				csk->hwtid, csk->state);
+	}
+out:
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+	kfree_skb(skb);
+
+	return 0;
+}
+
+static int abort_status_to_errno(struct cxgb4i_sock *csk, int abort_reason,
+								int *need_rst)
+{
+	switch (abort_reason) {
+	case CPL_ERR_BAD_SYN: /* fall through */
+	case CPL_ERR_CONN_RESET:
+		return csk->state > CXGB4I_CSK_ST_ESTABLISHED ?
+			-EPIPE : -ECONNRESET;
+	case CPL_ERR_XMIT_TIMEDOUT:
+	case CPL_ERR_PERSIST_TIMEDOUT:
+	case CPL_ERR_FINWAIT2_TIMEDOUT:
+	case CPL_ERR_KEEPALIVE_TIMEDOUT:
+		return -ETIMEDOUT;
+	default:
+		return -EIO;
+	}
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static inline int is_neg_adv_abort(unsigned int status)
+{
+	return status == CPL_ERR_RTX_NEG_ADVICE ||
+		status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static int cxgb4i_cpl_abort_req_rss(struct cxgb4i_snic *snic,
+						struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+	unsigned int hwtid = GET_TID(req);
+	struct tid_info *t = snic->lldi.tids;
+	int rst_status = CPL_ABORT_NO_RST;
+
+	csk = lookup_tid(t, hwtid);
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (is_neg_adv_abort(req->status)) {
+		__kfree_skb(skb);
+		return 0;
+	}
+
+	if (!cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_REQ_RCVD)) {
+		cxgb4i_sock_set_flag(csk, CXGB4I_CSK_FL_ABORT_REQ_RCVD);
+		cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_ABORTING);
+		__kfree_skb(skb);
+		return 0;
+	}
+
+	cxgb4i_sock_clear_flag(csk, CXGB4I_CSK_FL_ABORT_REQ_RCVD);
+	cxgb4i_sock_send_abort_rpl(csk, rst_status);
+
+	if (!cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_RPL_PENDING)) {
+		csk->err = abort_status_to_errno(csk, req->status,
+							&rst_status);
+		cxgb4i_sock_closed(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+
+	return 0;
+}
+
+static int cxgb4i_cpl_abort_rpl_rss(struct cxgb4i_snic *snic,
+						struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(rpl);
+	struct tid_info *t = snic->lldi.tids;
+
+	if (rpl->status == CPL_ERR_ABORT_FAILED)
+		goto out;
+
+	csk = lookup_tid(t, hwtid);
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		goto out;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	if (cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_RPL_PENDING)) {
+		if (!cxgb4i_sock_flag(csk, CXGB4I_CSK_FL_ABORT_RPL_RCVD))
+			cxgb4i_sock_set_flag(csk,
+						CXGB4I_CSK_FL_ABORT_RPL_RCVD);
+		else {
+			cxgb4i_sock_clear_flag(csk,
+						CXGB4I_CSK_FL_ABORT_RPL_RCVD);
+			cxgb4i_sock_clear_flag(csk,
+					CXGB4I_CSK_FL_ABORT_RPL_PENDING);
+
+			if (cxgb4i_sock_flag(csk,
+						CXGB4I_CSK_FL_ABORT_REQ_RCVD))
+				cxgb4i_log_error("tid %u, ABORT_RPL_RSS\n",
+						csk->hwtid);
+
+			cxgb4i_sock_closed(csk);
+		}
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+
+out:
+	__kfree_skb(skb);
+	return 0;
+}
+
+static int cxgb4i_cpl_iscsi_hdr(struct cxgb4i_snic *snic, struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_iscsi_hdr *cpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(cpl);
+	struct tid_info *t = snic->lldi.tids;
+	struct sk_buff *lskb;
+
+	csk = lookup_tid(t, hwtid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state >= CXGB4I_CSK_ST_PASSIVE_CLOSE)) {
+		if (csk->state != CXGB4I_CSK_ST_ABORTING)
+			goto abort_conn;
+	}
+
+	cxgb4i_skb_tcp_seq(skb) = ntohl(cpl->seq);
+	cxgb4i_skb_flags(skb) = 0;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(*cpl));
+	__pskb_trim(skb, ntohs(cpl->len));
+
+	if (!csk->skb_ulp_lhdr) {
+		unsigned char *byte;
+		csk->skb_ulp_lhdr = skb;
+		lskb = csk->skb_ulp_lhdr;
+
+		cxgb4i_skb_flags(lskb) = CXGB4I_SKCB_FLAG_HDR_RCVD;
+
+		if (cxgb4i_skb_tcp_seq(lskb) != csk->rcv_nxt) {
+			cxgb4i_log_error("tid 0x%x, CPL_ISCSI_HDR, bad seq got "
+					"0x%x, exp 0x%x\n",
+					csk->hwtid,
+					cxgb4i_skb_tcp_seq(lskb),
+					csk->rcv_nxt);
+		}
+
+		byte = skb->data;
+		cxgb4i_skb_rx_pdulen(skb) = ntohs(cpl->pdu_len_ddp) - 40;
+		csk->rcv_nxt += cxgb4i_skb_rx_pdulen(lskb);
+	} else {
+		lskb = csk->skb_ulp_lhdr;
+		cxgb4i_skb_flags(lskb) |= CXGB4I_SKCB_FLAG_DATA_RCVD;
+		cxgb4i_skb_flags(skb) = CXGB4I_SKCB_FLAG_DATA_RCVD;
+		cxgb4i_log_debug("csk 0x%p, tid 0x%x skb 0x%p, pdu data, "
+				" header 0x%p.\n",
+				csk, csk->hwtid, skb, lskb);
+	}
+
+	__skb_queue_tail(&csk->receive_queue, skb);
+
+	spin_unlock_bh(&csk->lock);
+
+	return 0;
+
+abort_conn:
+	cxgb4i_sock_send_abort_req(csk);
+	__kfree_skb(skb);
+	spin_unlock_bh(&csk->lock);
+
+	return -EINVAL;
+}
+
+static int cxgb4i_cpl_rx_data_ddp(struct cxgb4i_snic *snic, struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct sk_buff *lskb;
+	struct cpl_rx_data_ddp *rpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(rpl);
+	struct tid_info *t = snic->lldi.tids;
+	unsigned int status;
+
+	csk = lookup_tid(t, hwtid);
+
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	spin_lock_bh(&csk->lock);
+
+	if (unlikely(csk->state >= CXGB4I_CSK_ST_PASSIVE_CLOSE)) {
+		if (csk->state != CXGB4I_CSK_ST_ABORTING)
+			goto abort_conn;
+	}
+
+	if (!csk->skb_ulp_lhdr) {
+		cxgb4i_log_error("tid 0x%x, rcv RX_DATA_DDP w/o pdu header\n",
+				csk->hwtid);
+		goto abort_conn;
+	}
+
+	lskb = csk->skb_ulp_lhdr;
+	cxgb4i_skb_flags(lskb) |= CXGB4I_SKCB_FLAG_STATUS_RCVD;
+
+	if (ntohs(rpl->len) != cxgb4i_skb_rx_pdulen(lskb)) {
+		cxgb4i_log_error("tid 0x%x, RX_DATA_DDP pdulen %u != %u.\n",
+				csk->hwtid, ntohs(rpl->len),
+				cxgb4i_skb_rx_pdulen(lskb));
+	}
+
+	cxgb4i_skb_rx_ddigest(lskb) = ntohl(rpl->ulp_crc);
+	status = ntohl(rpl->ddpvld);
+
+	if (status & (1 << RX_DDP_STATUS_HCRC_SHIFT))
+		cxgb4i_skb_ulp_mode(skb) |= ULP2_FLAG_HCRC_ERROR;
+	if (status & (1 << RX_DDP_STATUS_DCRC_SHIFT))
+		cxgb4i_skb_ulp_mode(skb) |= ULP2_FLAG_DCRC_ERROR;
+	if (status & (1 << RX_DDP_STATUS_PAD_SHIFT))
+		cxgb4i_skb_ulp_mode(skb) |= ULP2_FLAG_PAD_ERROR;
+	if ((cxgb4i_skb_flags(lskb) & ULP2_FLAG_DATA_READY))
+		cxgb4i_skb_ulp_mode(skb) |= ULP2_FLAG_DATA_DDPED;
+
+	csk->skb_ulp_lhdr = NULL;
+
+	__kfree_skb(skb);
+	cxgb4i_conn_pdu_ready(csk);
+	spin_unlock_bh(&csk->lock);
+
+	return 0;
+
+abort_conn:
+	cxgb4i_sock_send_abort_req(csk);
+	__kfree_skb(skb);
+	spin_unlock_bh(&csk->lock);
+	return -EINVAL;
+}
+
+static void check_wr_invariants(const struct cxgb4i_sock *csk)
+{
+	int pending = cxgb4i_sock_count_pending_wrs(csk);
+
+	if (unlikely(csk->wr_cred + pending != csk->wr_max_cred))
+		printk(KERN_ERR "TID %u: credit imbalance: avail %u, "
+				"pending %u, total should be %u\n",
+				csk->hwtid,
+				csk->wr_cred,
+				pending,
+				csk->wr_max_cred);
+}
+
+static int cxgb4i_cpl_fw4_ack(struct cxgb4i_snic *snic, struct sk_buff *skb)
+{
+	struct cxgb4i_sock *csk;
+	struct cpl_fw4_ack *rpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(rpl);
+	struct tid_info *t = snic->lldi.tids;
+	unsigned char credits;
+	unsigned int snd_una;
+
+	csk = lookup_tid(t, hwtid);
+	if (unlikely(!csk)) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		kfree_skb(skb);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	credits = rpl->credits;
+	snd_una = be32_to_cpu(rpl->snd_una);
+
+	cxgb4i_tx_debug("%u WR credits, avail %u, unack %u, TID %u, state %u\n",
+				credits, csk->wr_cred, csk->wr_una_cred,
+						csk->hwtid, csk->state);
+
+	csk->wr_cred += credits;
+
+	if (csk->wr_una_cred > csk->wr_max_cred - csk->wr_cred)
+		csk->wr_una_cred = csk->wr_max_cred - csk->wr_cred;
+
+	while (credits) {
+		struct sk_buff *p = cxgb4i_sock_peek_wr(csk);
+
+		if (unlikely(!p)) {
+			cxgb4i_log_error("%u WR_ACK credits for TID %u with "
+					"nothing pending, state %u\n",
+					credits, csk->hwtid, csk->state);
+			break;
+		}
+
+		if (unlikely(credits < p->csum)) {
+			p->csum -= credits;
+		} else {
+			cxgb4i_sock_dequeue_wr(csk);
+			credits -= p->csum;
+			cxgb4i_sock_free_wr_skb(p);
+		}
+	}
+
+	check_wr_invariants(csk);
+
+	if (rpl->seq_vld) {
+		if (unlikely(before(snd_una, csk->snd_una))) {
+			cxgb4i_log_error("TID %u, unexpected sequence # %u "
+					"in WR_ACK snd_una %u\n",
+					csk->hwtid, snd_una, csk->snd_una);
+			goto out_free;
+		}
+	}
+
+	if (csk->snd_una != snd_una) {
+		csk->snd_una = snd_una;
+		dst_confirm(csk->dst);
+	}
+
+	if (skb_queue_len(&csk->write_queue)) {
+		if (cxgb4i_sock_push_tx_frames(csk, 0))
+			cxgb4i_conn_tx_open(csk);
+	} else
+		cxgb4i_conn_tx_open(csk);
+
+	goto out;
+
+out_free:
+
+	__kfree_skb(skb);
+
+out:
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+
+	return 0;
+}
+
+static int cxgb4i_cpl_set_tcb_rpl(struct cxgb4i_snic *snic, struct sk_buff *skb)
+{
+	struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+	unsigned int hwtid = GET_TID(rpl);
+	struct tid_info *t = snic->lldi.tids;
+	struct cxgb4i_sock *csk;
+
+	csk = lookup_tid(t, hwtid);
+
+	if (!csk) {
+		cxgb4i_log_error("can't find connection for tid %u\n", hwtid);
+		__kfree_skb(skb);
+		return CPL_RET_UNKNOWN_TID;
+	}
+
+	spin_lock_bh(&csk->lock);
+
+	if (rpl->status != CPL_ERR_NONE) {
+		cxgb4i_log_error("Unexpected SET_TCB_RPL status %u "
+				 "for tid %u\n", rpl->status, GET_TID(rpl));
+	}
+
+	__kfree_skb(skb);
+	spin_unlock_bh(&csk->lock);
+
+	return 0;
+}
+
+static void cxgb4i_sock_free_cpl_skbs(struct cxgb4i_sock *csk)
+{
+	if (csk->cpl_close)
+		kfree_skb(csk->cpl_close);
+	if (csk->cpl_abort_req)
+		kfree_skb(csk->cpl_abort_req);
+	if (csk->cpl_abort_rpl)
+		kfree_skb(csk->cpl_abort_rpl);
+}
+
+static int cxgb4i_alloc_cpl_skbs(struct cxgb4i_sock *csk)
+{
+	csk->cpl_close = alloc_skb(sizeof(struct cpl_close_con_req),
+					GFP_KERNEL);
+	if (!csk->cpl_close)
+		return -ENOMEM;
+	skb_put(csk->cpl_close, sizeof(struct cpl_close_con_req));
+
+	csk->cpl_abort_req = alloc_skb(sizeof(struct cpl_abort_req),
+					GFP_KERNEL);
+	if (!csk->cpl_abort_req)
+		goto free_cpl_skbs;
+	skb_put(csk->cpl_abort_req, sizeof(struct cpl_abort_req));
+
+	csk->cpl_abort_rpl = alloc_skb(sizeof(struct cpl_abort_rpl),
+					GFP_KERNEL);
+	if (!csk->cpl_abort_rpl)
+		goto free_cpl_skbs;
+	skb_put(csk->cpl_abort_rpl, sizeof(struct cpl_abort_rpl));
+
+	return 0;
+
+free_cpl_skbs:
+	cxgb4i_sock_free_cpl_skbs(csk);
+	return -ENOMEM;
+}
+
+static void cxgb4i_sock_release_offload_resources(struct cxgb4i_sock *csk)
+{
+
+	cxgb4i_sock_free_cpl_skbs(csk);
+
+	if (csk->wr_cred != csk->wr_max_cred) {
+		cxgb4i_sock_purge_wr_queue(csk);
+		cxgb4i_sock_reset_wr_list(csk);
+	}
+
+	if (csk->l2t) {
+		cxgb4_l2t_release(csk->l2t);
+		csk->l2t = NULL;
+	}
+
+	if (csk->state == CXGB4I_CSK_ST_CONNECTING)
+		cxgb4i_sock_free_atid(csk);
+	else {
+		cxgb4_remove_tid(csk->snic->lldi.tids, 0, csk->hwtid);
+		cxgb4i_sock_put(csk);
+	}
+
+	csk->dst = NULL;
+	csk->snic = NULL;
+}
+
+struct cxgb4i_sock *cxgb4i_sock_create(struct cxgb4i_snic *snic)
+{
+	struct cxgb4i_sock *csk = NULL;
+
+	csk = kzalloc(sizeof(*csk), GFP_KERNEL);
+	if (!csk)
+		return NULL;
+
+	if (cxgb4i_alloc_cpl_skbs(csk) < 0)
+		goto free_csk;
+
+	cxgb4i_conn_debug("alloc csk: 0x%p\n", csk);
+
+	csk->flags = 0;
+	spin_lock_init(&csk->lock);
+	atomic_set(&csk->refcnt, 1);
+	skb_queue_head_init(&csk->receive_queue);
+	skb_queue_head_init(&csk->write_queue);
+	setup_timer(&csk->retry_timer, NULL, (unsigned long)csk);
+	rwlock_init(&csk->callback_lock);
+	csk->snic = snic;
+
+	return csk;
+
+free_csk:
+	cxgb4i_api_debug("csk alloc failed %p, baling out\n", csk);
+	kfree(csk);
+	return NULL;
+}
+
+static void cxgb4i_sock_active_close(struct cxgb4i_sock *csk)
+{
+	int data_lost;
+	int close_req = 0;
+
+	cxgb4i_conn_debug("csk 0x%p, state %u, flags %lu\n",
+			csk, csk->state, csk->flags);
+
+	dst_confirm(csk->dst);
+
+	cxgb4i_sock_hold(csk);
+	spin_lock_bh(&csk->lock);
+
+	data_lost = skb_queue_len(&csk->receive_queue);
+	__skb_queue_purge(&csk->receive_queue);
+
+	switch (csk->state) {
+	case CXGB4I_CSK_ST_CLOSED:
+	case CXGB4I_CSK_ST_ACTIVE_CLOSE:
+	case CXGB4I_CSK_ST_CLOSE_WAIT_1:
+	case CXGB4I_CSK_ST_CLOSE_WAIT_2:
+	case CXGB4I_CSK_ST_ABORTING:
+		break;
+
+	case CXGB4I_CSK_ST_CONNECTING:
+		cxgb4i_sock_set_flag(csk, CXGB4I_CSK_FL_ACTIVE_CLOSE_NEEDED);
+		break;
+	case CXGB4I_CSK_ST_ESTABLISHED:
+		close_req = 1;
+		cxgb4i_sock_set_flag(csk, CXGB4I_CSK_ST_CLOSE_WAIT_2);
+		break;
+	}
+
+	if (close_req) {
+		if (data_lost)
+			cxgb4i_sock_send_abort_req(csk);
+		else
+			cxgb4i_sock_send_close_req(csk);
+	}
+
+	spin_unlock_bh(&csk->lock);
+	cxgb4i_sock_put(csk);
+}
+
+void cxgb4i_sock_release(struct cxgb4i_sock *csk)
+{
+	cxgb4i_conn_debug("csk 0x%p, state %u, flags %lu\n",
+			csk, csk->state, csk->flags);
+
+	if (unlikely(csk->state == CXGB4I_CSK_ST_CONNECTING))
+		cxgb4i_sock_set_state(csk,
+				CXGB4I_CSK_FL_ACTIVE_CLOSE_NEEDED);
+	else if (likely(csk->state != CXGB4I_CSK_ST_CLOSED))
+		cxgb4i_sock_active_close(csk);
+
+	cxgb4i_sock_put(csk);
+}
+
+static int is_cxgb4_dev(struct net_device *dev, struct cxgb4i_snic *snic)
+{
+	struct net_device *ndev = dev;
+	int i;
+
+	if (dev->priv_flags & IFF_802_1Q_VLAN)
+		ndev = vlan_dev_real_dev(dev);
+
+	for (i = 0; i < snic->lldi.nports; i++) {
+		if (ndev == snic->lldi.ports[i])
+			return 1;
+	}
+
+	return 0;
+}
+
+static struct net_device *cxgb4i_find_egress_dev(struct net_device *root_dev,
+						struct cxgb4i_snic *snic)
+{
+	while (root_dev) {
+		if (root_dev->priv_flags & IFF_802_1Q_VLAN)
+			root_dev = vlan_dev_real_dev(root_dev);
+		else if (is_cxgb4_dev(root_dev, snic))
+			return root_dev;
+		else
+			return NULL;
+	}
+
+	return NULL;
+}
+
+static struct rtable *find_route(struct net_device *dev,
+				__be32 saddr, __be32 daddr,
+				__be16 sport, __be16 dport,
+				u8 tos)
+{
+	struct rtable *rt;
+	struct flowi fl = {
+		.oif = dev ? dev->ifindex : 0,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = daddr,
+				.saddr = saddr,
+				.tos = tos }
+			},
+		.proto = IPPROTO_TCP,
+		.uli_u = {
+			.ports = {
+				.sport = sport,
+				.dport = dport }
+			}
+	};
+
+	if (ip_route_output_flow(dev ? dev_net(dev) : &init_net,
+					&rt, &fl, NULL, 0))
+		return NULL;
+
+	return rt;
+}
+
+static int cxgb4i_init_act_open(struct cxgb4i_sock *csk,
+					struct net_device *dev)
+{
+	struct dst_entry *dst = csk->dst;
+	struct sk_buff *skb;
+	struct port_info *pi = netdev_priv(dev);
+
+	cxgb4i_conn_debug("csk 0x%p, state %u, flags 0x%lx\n",
+			csk, csk->state, csk->flags);
+
+	csk->atid = cxgb4_alloc_atid(csk->snic->lldi.tids, csk);
+	if (csk->atid == -1) {
+		cxgb4i_log_error("cannot alloc atid\n");
+		goto out_err;
+	}
+
+	csk->l2t = cxgb4_l2t_get(csk->snic->lldi.l2t,
+				csk->dst->neighbour,
+				dev, 0);
+	if (!csk->l2t) {
+		cxgb4i_log_error("cannot alloc l2t\n");
+		goto free_atid;
+	}
+
+	skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_KERNEL);
+	if (!skb)
+		goto free_l2t;
+
+	skb->sk = (struct sock *)csk;
+	t4_set_arp_err_handler(skb, csk, cxgb4i_act_open_req_arp_failure);
+
+	cxgb4i_sock_hold(csk);
+
+	csk->wr_max_cred = csk->wr_cred = csk->snic->lldi.wr_cred;
+	csk->port_id = pi->port_id;
+	csk->rss_qid = csk->snic->lldi.rxq_ids[csk->port_id];
+	csk->tx_chan = pi->tx_chan;
+	csk->smac_idx = csk->tx_chan << 1;
+	csk->wr_una_cred = 0;
+	csk->mss_idx = cxgb4i_select_mss(csk, dst_mtu(dst));
+	csk->err = 0;
+
+	cxgb4i_sock_reset_wr_list(csk);
+
+	cxgb4i_sock_make_act_open_req(csk, skb,
+					((csk->rss_qid << 14) |
+					 (csk->atid)), csk->l2t);
+	cxgb4_l2t_send(csk->snic->lldi.ports[csk->port_id], skb, csk->l2t);
+	return 0;
+
+free_l2t:
+	cxgb4_l2t_release(csk->l2t);
+
+free_atid:
+	cxgb4_free_atid(csk->snic->lldi.tids, csk->atid);
+
+out_err:
+
+	return -EINVAL;;
+}
+
+static struct net_device *cxgb4i_find_dev(struct net_device *dev,
+							__be32 ipaddr)
+{
+	struct flowi fl;
+	struct rtable *rt;
+	int err;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.nl_u.ip4_u.daddr = ipaddr;
+
+	err = ip_route_output_key(dev ? dev_net(dev) : &init_net, &rt, &fl);
+	if (!err)
+		return (&rt->u.dst)->dev;
+
+	return NULL;
+}
+
+int cxgb4i_sock_connect(struct net_device *dev, struct cxgb4i_sock *csk,
+						struct sockaddr_in *sin)
+{
+	struct rtable *rt;
+	__be32 sipv4 = 0;
+	struct net_device *dstdev;
+	struct cxgb4i_hba *chba = NULL;
+	int err;
+
+	cxgb4i_conn_debug("csk 0x%p, dev 0x%p\n", csk, dev);
+
+	if (sin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	csk->daddr.sin_port = sin->sin_port;
+	csk->daddr.sin_addr.s_addr = sin->sin_addr.s_addr;
+
+	dstdev = cxgb4i_find_dev(dev, sin->sin_addr.s_addr);
+	if (!dstdev || !is_cxgb4_dev(dstdev, csk->snic))
+		return -ENETUNREACH;
+
+	if (dstdev->priv_flags & IFF_802_1Q_VLAN)
+		dev = dstdev;
+
+	rt = find_route(dev, csk->saddr.sin_addr.s_addr,
+			csk->daddr.sin_addr.s_addr,
+			csk->saddr.sin_port,
+			csk->daddr.sin_port,
+			0);
+	if (rt == NULL) {
+		cxgb4i_conn_debug("no route to %pI4, port %u, dev %s, "
+					"snic 0x%p\n",
+					&csk->daddr.sin_addr.s_addr,
+					ntohs(csk->daddr.sin_port),
+					dev ? dev->name : "any",
+					csk->snic);
+		return -ENETUNREACH;
+	}
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		cxgb4i_conn_debug("multi-cast route to %pI4, port %u, "
+					"dev %s, snic 0x%p\n",
+					&csk->daddr.sin_addr.s_addr,
+					ntohs(csk->daddr.sin_port),
+					dev ? dev->name : "any",
+					csk->snic);
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (!csk->saddr.sin_addr.s_addr)
+		csk->saddr.sin_addr.s_addr = rt->rt_src;
+
+	csk->dst = &rt->u.dst;
+
+	dev = cxgb4i_find_egress_dev(csk->dst->dev, csk->snic);
+	if (dev == NULL) {
+		cxgb4i_conn_debug("csk: 0x%p, egress dev NULL\n", csk);
+		return -ENETUNREACH;
+	}
+
+	err = cxgb4i_sock_get_port(csk);
+	if (err)
+		return err;
+
+	cxgb4i_conn_debug("csk: 0x%p get port: %u\n",
+			csk, ntohs(csk->saddr.sin_port));
+
+	chba = cxgb4i_hba_find_by_netdev(csk->dst->dev);
+
+	sipv4 = cxgb4i_get_iscsi_ipv4(chba);
+	if (!sipv4) {
+		cxgb4i_conn_debug("csk: 0x%p, iscsi is not configured\n", csk);
+		sipv4 = csk->saddr.sin_addr.s_addr;
+		cxgb4i_set_iscsi_ipv4(chba, sipv4);
+	} else
+		csk->saddr.sin_addr.s_addr = sipv4;
+
+	cxgb4i_conn_debug("csk: 0x%p, %pI4:[%u], %pI4:[%u] SYN_SENT\n",
+				csk,
+				&csk->saddr.sin_addr.s_addr,
+				ntohs(csk->saddr.sin_port),
+				&csk->daddr.sin_addr.s_addr,
+				ntohs(csk->daddr.sin_port));
+
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_CONNECTING);
+
+	if (!cxgb4i_init_act_open(csk, dev))
+		return 0;
+
+	err = -ENOTSUPP;
+
+	cxgb4i_conn_debug("csk 0x%p -> closed\n", csk);
+	cxgb4i_sock_set_state(csk, CXGB4I_CSK_ST_CLOSED);
+	ip_rt_put(rt);
+	cxgb4i_sock_put_port(csk);
+
+	return err;
+}
+
+void cxgb4i_sock_rx_credits(struct cxgb4i_sock *csk, int copied)
+{
+	int must_send;
+	u32 credits;
+
+	if (csk->state != CXGB4I_CSK_ST_ESTABLISHED)
+		return;
+
+	credits = csk->copied_seq - csk->rcv_wup;
+	if (unlikely(!credits))
+		return;
+
+	if (unlikely(cxgb4i_rx_credit_thres == 0))
+		return;
+
+	must_send = credits + 16384 >= cxgb4i_rcv_win;
+
+	if (must_send || credits >= cxgb4i_rx_credit_thres)
+		csk->rcv_wup += cxgb4i_csk_send_rx_credits(csk, credits);
+}
+
+int cxgb4i_sock_send_pdus(struct cxgb4i_sock *csk, struct sk_buff *skb)
+{
+	struct sk_buff *next;
+	int err, copied = 0;
+
+	spin_lock_bh(&csk->lock);
+
+	if (csk->state != CXGB4I_CSK_ST_ESTABLISHED) {
+		cxgb4i_tx_debug("csk 0x%p, not in est. state %u.\n",
+			      csk, csk->state);
+		err = -EAGAIN;
+		goto out_err;
+	}
+
+	if (csk->err) {
+		cxgb4i_tx_debug("csk 0x%p, err %d.\n", csk, csk->err);
+		err = -EPIPE;
+		goto out_err;
+	}
+
+	if (csk->write_seq - csk->snd_una >= cxgb4i_snd_win) {
+		cxgb4i_tx_debug("csk 0x%p, snd %u - %u > %u.\n",
+				csk, csk->write_seq, csk->snd_una,
+				cxgb4i_snd_win);
+		err = -ENOBUFS;
+		goto out_err;
+	}
+
+	while (skb) {
+		int frags = skb_shinfo(skb)->nr_frags +
+				(skb->len != skb->data_len);
+
+		if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) {
+			cxgb4i_tx_debug("csk 0x%p, skb head.\n", csk);
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		if (frags >= SKB_WR_LIST_SIZE) {
+			cxgb4i_log_error("csk 0x%p, tx frags %d, len %u,%u.\n",
+					 csk, skb_shinfo(skb)->nr_frags,
+					 skb->len, skb->data_len);
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		next = skb->next;
+		skb->next = NULL;
+		cxgb4i_sock_skb_entail(csk, skb,
+				CXGB4I_SKCB_FLAG_NO_APPEND |
+				CXGB4I_SKCB_FLAG_NEED_HDR);
+		copied += skb->len;
+		csk->write_seq += skb->len + ulp_extra_len(skb);
+		skb = next;
+	}
+done:
+	if (likely(skb_queue_len(&csk->write_queue)))
+		cxgb4i_sock_push_tx_frames(csk, 1);
+	spin_unlock_bh(&csk->lock);
+	return copied;
+
+out_err:
+	if (copied == 0 && err == -EPIPE)
+		copied = csk->err ? csk->err : -EPIPE;
+	else
+		copied = err;
+	goto done;
+}
+
+static void cxgb4i_sock_conn_closing(struct cxgb4i_sock *csk)
+{
+	struct iscsi_conn *conn = csk->user_data;
+
+	read_lock(&csk->callback_lock);
+	if (conn && csk->state != CXGB4I_CSK_ST_ESTABLISHED)
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	read_unlock(&csk->callback_lock);
+}
+
+int cxgb4i_ofld_init(struct cxgb4i_snic *snic)
+{
+	struct cxgb4i_ports_map *ports;
+	int mapsize = (cxgb4i_max_connect * sizeof(struct cxgb4i_sock));
+
+	ports = cxgb4i_alloc_big_mem(sizeof(*ports) + mapsize, GFP_KERNEL);
+	if (!ports)
+		return -ENOMEM;
+
+	spin_lock_init(&ports->lock);
+	snic->pmap = ports;
+
+	snic->funcs[CPL_ACT_ESTABLISH] = cxgb4i_cpl_act_establish;
+	snic->funcs[CPL_ACT_OPEN_RPL] = cxgb4i_cpl_act_open_rpl;
+	snic->funcs[CPL_PEER_CLOSE] = cxgb4i_cpl_peer_close;
+	snic->funcs[CPL_ABORT_REQ_RSS] = cxgb4i_cpl_abort_req_rss;
+	snic->funcs[CPL_ABORT_RPL_RSS] = cxgb4i_cpl_abort_rpl_rss;
+	snic->funcs[CPL_CLOSE_CON_RPL] = cxgb4i_cpl_close_con_rpl;
+	snic->funcs[CPL_FW4_ACK] = cxgb4i_cpl_fw4_ack;
+	snic->funcs[CPL_ISCSI_HDR] = cxgb4i_cpl_iscsi_hdr;
+	snic->funcs[CPL_SET_TCB_RPL] = cxgb4i_cpl_set_tcb_rpl;
+	snic->funcs[CPL_RX_DATA_DDP] = cxgb4i_cpl_rx_data_ddp;
+
+	return 0;
+}
+
+void cxgb4i_ofld_cleanup(struct cxgb4i_snic *snic)
+{
+	cxgb4i_free_big_mem(snic->pmap);
+}
+
diff --git a/drivers/scsi/cxgb4i/cxgb4i_offload.h b/drivers/scsi/cxgb4i/cxgb4i_offload.h
new file mode 100644
index 0000000..40ca066
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i_offload.h
@@ -0,0 +1,171 @@
+/*
+ * cxgb4i_offload.h: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#ifndef	__CXGB4I_OFFLOAD_H__
+#define	__CXGB4I_OFFLOAD_H__
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+
+#define	CXGB4I_MAX_CONN	0xFFF
+
+enum {
+	CPL_RET_BUF_DONE = 1,
+	CPL_RET_BAD_MSG = 2,
+	CPL_RET_UNKNOWN_TID = 4
+};
+
+struct cxgb4i_sock {
+	struct net_device *egdev;
+	struct cxgb4i_snic *snic;
+
+	unsigned long flags;
+	unsigned int qset;
+	unsigned int rss_qid;
+
+	unsigned int hwtid;
+	unsigned int atid;
+
+	unsigned int tx_chan;
+	unsigned int rx_chan;
+	unsigned int mss_idx;
+	unsigned int smac_idx;
+	unsigned char port_id;
+
+	struct l2t_entry *l2t;
+
+	int wr_max_cred;
+	int wr_cred;
+	int wr_una_cred;
+
+	struct sk_buff *wr_pending_head;
+	struct sk_buff *wr_pending_tail;
+	struct sk_buff *cpl_close;
+	struct sk_buff *cpl_abort_req;
+	struct sk_buff *cpl_abort_rpl;
+	struct sk_buff *skb_ulp_lhdr;
+	spinlock_t lock;
+	atomic_t refcnt;
+	volatile unsigned int state;
+	struct sockaddr_in saddr;
+	struct sockaddr_in daddr;
+	struct dst_entry *dst;
+	struct sk_buff_head receive_queue;
+	struct sk_buff_head write_queue;
+	struct timer_list retry_timer;
+	int err;
+	rwlock_t callback_lock;
+	void *user_data;
+
+	u32 rcv_nxt;
+	u32 copied_seq;
+	u32 rcv_wup;
+	u32 snd_nxt;
+	u32 snd_una;
+	u32 write_seq;
+};
+
+enum cxgb4i_sock_states{
+	CXGB4I_CSK_ST_CONNECTING = 1,
+	CXGB4I_CSK_ST_ESTABLISHED,
+	CXGB4I_CSK_ST_ACTIVE_CLOSE,
+	CXGB4I_CSK_ST_PASSIVE_CLOSE,
+	CXGB4I_CSK_ST_CLOSE_WAIT_1,
+	CXGB4I_CSK_ST_CLOSE_WAIT_2,
+	CXGB4I_CSK_ST_ABORTING,
+	CXGB4I_CSK_ST_CLOSED,
+};
+
+static inline unsigned int cxgb4i_sock_is_closing(
+			const struct cxgb4i_sock *csk)
+{
+	return csk->state >= CXGB4I_CSK_ST_ACTIVE_CLOSE;
+}
+
+static inline unsigned int cxgb4i_sock_is_established(
+				const struct cxgb4i_sock *csk)
+{
+	return csk->state == CXGB4I_CSK_ST_ESTABLISHED;
+}
+
+enum cxgb4i_sock_flags {
+	CXGB4I_CSK_FL_ABORT_RPL_RCVD,	/*received one ABORT_RPL_RSS message */
+	CXGB4I_CSK_FL_ABORT_REQ_RCVD,	/*received one ABORT_REQ_RSS message */
+	CXGB4I_CSK_FL_ABORT_RPL_PENDING,	/* expecting an abort reply */
+	CXGB4I_CSK_FL_TX_DATA_SENT,	/* already sent a TX_DATA WR */
+	CXGB4I_CSK_FL_ACTIVE_CLOSE_NEEDED,	/* need to be closed */
+	CXGB4I_CSK_FL_OFFLOAD_DOWN		/* offload function off */
+};
+
+struct cxgb4i_sock *cxgb4i_sock_create(struct cxgb4i_snic *);
+void cxgb4i_sock_release(struct cxgb4i_sock *);
+int cxgb4i_sock_connect(struct net_device *, struct cxgb4i_sock *,
+			struct sockaddr_in *);
+void cxgb4i_sock_rx_credits(struct cxgb4i_sock *, int);
+int cxgb4i_sock_send_pdus(struct cxgb4i_sock *, struct sk_buff *);
+
+struct cxgb4i_skb_rx_cb {
+	__u32 ddigest;
+	__u32 pdulen;
+};
+
+struct cxgb4i_skb_tx_cb {
+	struct l2t_skb_cb l2t;
+	struct sk_buff *wr_next;
+};
+
+struct cxgb4i_skb_cb {
+	__u16 flags;
+	__u16 ulp_mode;
+	__u32 seq;
+
+	union {
+		struct cxgb4i_skb_rx_cb rx;
+		struct cxgb4i_skb_tx_cb tx;
+	};
+};
+
+#define CXGB4I_SKB_CB(skb)	((struct cxgb4i_skb_cb *)&((skb)->cb[0]))
+#define cxgb4i_skb_flags(skb)	(CXGB4I_SKB_CB(skb)->flags)
+#define cxgb4i_skb_ulp_mode(skb)	(CXGB4I_SKB_CB(skb)->ulp_mode)
+#define cxgb4i_skb_tcp_seq(skb)		(CXGB4I_SKB_CB(skb)->seq)
+#define cxgb4i_skb_rx_ddigest(skb)	(CXGB4I_SKB_CB(skb)->rx.ddigest)
+#define cxgb4i_skb_rx_pdulen(skb)	(CXGB4I_SKB_CB(skb)->rx.pdulen)
+#define cxgb4i_skb_tx_wr_next(skb)	(CXGB4I_SKB_CB(skb)->tx.wr_next)
+
+enum cxgb4i_skcb_flags {
+	CXGB4I_SKCB_FLAG_NEED_HDR = 1 << 0,	/* packet needs a header */
+	CXGB4I_SKCB_FLAG_NO_APPEND = 1 << 1,	/* don't grow this skb */
+	CXGB4I_SKCB_FLAG_COMPL = 1 << 2,	/* request WR completion */
+	CXGB4I_SKCB_FLAG_HDR_RCVD = 1 << 3,	/* recieved header pdu */
+	CXGB4I_SKCB_FLAG_DATA_RCVD = 1 << 4,	/*  recieved data pdu */
+	CXGB4I_SKCB_FLAG_STATUS_RCVD = 1 << 5,	/* recieved ddp status */
+};
+
+/*
+ * sge_opaque_hdr -
+ * Opaque version of structure the SGE stores at skb->head of TX_DATA packets
+ * and for which we must reserve space.
+ */
+struct sge_opaque_hdr {
+	void *dev;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
+/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */
+#define TX_HEADER_LEN \
+	(sizeof(struct fw_ofld_tx_data_wr) + sizeof(struct sge_opaque_hdr))
+#define SKB_TX_HEADROOM	SKB_MAX_HEAD(TX_HEADER_LEN)
+
+#endif	/* __CXGB4I_OFFLOAD_H__ */
+
diff --git a/drivers/scsi/cxgb4i/cxgb4i_snic.c b/drivers/scsi/cxgb4i/cxgb4i_snic.c
new file mode 100644
index 0000000..36be666
--- /dev/null
+++ b/drivers/scsi/cxgb4i/cxgb4i_snic.c
@@ -0,0 +1,253 @@
+/*
+ * cxgb4i_snic.c: Chelsio T4 iSCSI driver.
+ *
+ * Copyright (c) 2010 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ * Written by: Rakesh Ranjan (rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org)
+ */
+
+#include <net/route.h>
+
+#include "cxgb4i.h"
+
+#define	DRV_MODULE_NAME		"cxgb4i"
+#define	DRV_MODULE_VERSION	"0.90"
+#define	DRV_MODULE_RELDATE	"04/08/2010"
+
+static char version[] =
+	"Chelsio T4 iSCSI driver " DRV_MODULE_NAME
+	" v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+
+MODULE_AUTHOR("Rakesh Ranjan <rranjan-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>");
+MODULE_DESCRIPTION("Chelsio T4 iSCSI driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+
+static LIST_HEAD(snic_list);
+static DEFINE_MUTEX(snic_rwlock);
+
+static cxgb4i_cplhandler_func cxgb4i_cplhandlers[NUM_CPL_CMDS];
+static void *cxgb4i_uld_add(const struct cxgb4_lld_info *linfo);
+static int cxgb4i_uld_rx_handler(void *handle, const __be64 *rsp,
+				const struct pkt_gl *pgl);
+static int cxgb4i_uld_state_change(void *handle, enum cxgb4_state state);
+
+static struct cxgb4_uld_info cxgb4i_uld_info = {
+	.name = "cxgb4i",
+	.add = cxgb4i_uld_add,
+	.rx_handler = cxgb4i_uld_rx_handler,
+	.state_change = cxgb4i_uld_state_change,
+};
+
+struct cxgb4i_hba *cxgb4i_hba_find_by_netdev(struct net_device *dev)
+{
+	int i;
+	struct cxgb4i_snic *snic = NULL;;
+
+	if (dev->priv_flags & IFF_802_1Q_VLAN)
+		dev = vlan_dev_real_dev(dev);
+
+	mutex_lock(&snic_rwlock);
+	list_for_each_entry(snic, &snic_list, list_head) {
+		for (i = 0; i < snic->hba_cnt; i++) {
+			if (snic->hba[i]->ndev == dev) {
+				mutex_unlock(&snic_rwlock);
+				return snic->hba[i];
+			}
+		}
+	}
+	mutex_unlock(&snic_rwlock);
+	return NULL;
+}
+
+struct cxgb4i_snic *cxgb4i_find_snic(struct net_device *dev, __be32 ipaddr)
+{
+	struct flowi fl;
+	struct rtable *rt;
+	struct net_device *sdev = NULL;
+	struct cxgb4i_snic *snic = NULL, *tmp;
+	int err, i;
+
+	memset(&fl, 0, sizeof(fl));
+	fl.nl_u.ip4_u.daddr = ipaddr;
+
+	err = ip_route_output_key(dev ? dev_net(dev) : &init_net, &rt, &fl);
+	if (err)
+		goto out;
+
+	sdev = (&rt->u.dst)->dev;
+	mutex_lock(&snic_rwlock);
+	list_for_each_entry_safe(snic, tmp, &snic_list, list_head) {
+		if (snic) {
+			for (i = 0; i < snic->lldi.nports; i++) {
+				if (sdev == snic->lldi.ports[i]) {
+					mutex_unlock(&snic_rwlock);
+					return snic;
+				}
+			}
+		}
+	}
+	mutex_unlock(&snic_rwlock);
+
+out:
+	snic = NULL;
+	return snic;
+}
+
+
+struct cxgb4i_snic *cxgb4i_snic_init(const struct cxgb4_lld_info *linfo)
+{
+	struct cxgb4i_snic *snic;
+	int i;
+
+	snic = kzalloc(sizeof(*snic), GFP_KERNEL);
+	if (snic) {
+
+		spin_lock_init(&snic->lock);
+		snic->lldi = *linfo;
+		snic->hba_cnt = snic->lldi.nports;
+		snic->funcs = cxgb4i_cplhandlers;
+
+		cxgb4i_iscsi_init();
+		cxgb4i_pdu_init();
+		cxgb4i_ddp_init(snic);
+		cxgb4i_ofld_init(snic);
+
+		for (i = 0; i < snic->hba_cnt; i++) {
+			snic->hba[i] = cxgb4i_hba_add(snic,
+						snic->lldi.ports[i]);
+			if (!snic->hba[i]) {
+				kfree(snic);
+				snic = ERR_PTR(-ENOMEM);
+				goto out;
+			}
+		}
+
+		mutex_lock(&snic_rwlock);
+		list_add_tail(&snic->list_head, &snic_list);
+		mutex_unlock(&snic_rwlock);
+	} else
+out :
+	snic = ERR_PTR(-ENOMEM);
+
+	return snic;
+}
+
+void cxgb4i_snic_cleanup(void)
+{
+	struct cxgb4i_snic *snic, *tmp;
+	int i;
+
+	mutex_lock(&snic_rwlock);
+	list_for_each_entry_safe(snic, tmp, &snic_list, list_head) {
+		list_del(&snic->list_head);
+
+		for (i = 0; i < snic->hba_cnt; i++) {
+			if (snic->hba[i]) {
+				cxgb4i_hba_remove(snic->hba[i]);
+				snic->hba[i] = NULL;
+			}
+		}
+		cxgb4i_ofld_cleanup(snic);
+		cxgb4i_ddp_cleanup(snic);
+		cxgb4i_log_info("snic 0x%p, %u scsi hosts removed.\n",
+				snic, snic->hba_cnt);
+
+		kfree(snic);
+	}
+	mutex_unlock(&snic_rwlock);
+
+	cxgb4i_pdu_cleanup();
+	cxgb4i_iscsi_cleanup();
+}
+
+static void *cxgb4i_uld_add(const struct cxgb4_lld_info *linfo)
+{
+	struct cxgb4i_snic *snic;
+
+	cxgb4i_log_info("%s", version);
+
+	snic = cxgb4i_snic_init(linfo);
+	if (!snic)
+		goto out;
+out:
+	return snic;
+}
+
+static int cxgb4i_uld_rx_handler(void *handle, const __be64 *rsp,
+				const struct pkt_gl *pgl)
+{
+	struct cxgb4i_snic *snic = handle;
+	struct sk_buff *skb;
+	const struct cpl_act_establish *rpl;
+	unsigned int opcode;
+
+	if (pgl == NULL) {
+		unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+
+		skb = alloc_skb(256, GFP_ATOMIC);
+		if (!skb)
+			goto nomem;
+		__skb_put(skb, len);
+		skb_copy_to_linear_data(skb, &rsp[1], len);
+
+	} else if (pgl == CXGB4_MSG_AN) {
+
+		return 0;
+
+	} else {
+
+		skb = cxgb4_pktgl_to_skb(pgl, 256, 256);
+		if (unlikely(!skb))
+			goto nomem;
+	}
+
+	rpl = cplhdr(skb);
+	opcode = rpl->ot.opcode;
+
+	cxgb4i_api_debug("snic %p, opcode 0x%x, skb %p\n",
+			 snic, opcode, skb);
+
+	BUG_ON(!snic->funcs[opcode]);
+
+	if (snic->funcs[opcode]) {
+		snic->funcs[opcode](snic, skb);
+	} else
+		cxgb4i_log_error("No handler for opcode 0x%x\n",
+				opcode);
+
+	return 0;
+
+nomem:
+	cxgb4i_api_debug("OOM bailing out\n");
+	return 1;
+}
+
+static int cxgb4i_uld_state_change(void *handle, enum cxgb4_state state)
+{
+	return 0;
+}
+
+static int __init cxgb4i_init_module(void)
+{
+	cxgb4_register_uld(CXGB4_ULD_ISCSI, &cxgb4i_uld_info);
+
+	return 0;
+}
+
+static void __exit cxgb4i_exit_module(void)
+{
+
+	cxgb4_unregister_uld(CXGB4_ULD_ISCSI);
+	cxgb4i_snic_cleanup();
+}
+
+module_init(cxgb4i_init_module);
+module_exit(cxgb4i_exit_module);
+
-- 
1.6.6.1

-- 
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To unsubscribe from this group, send email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
For more options, visit this group at http://groups.google.com/group/open-iscsi?hl=en.

^ permalink raw reply related

* [PATCH 1/3] cxgb4i: add build support
From: Rakesh Ranjan @ 2010-04-08 12:14 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	open-iscsi-/JYPxA39Uh5TLH3MbocFFw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, kxie-ut6Up61K2wZBDgjK7y7TUQ,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	James.Bottomley-JuX6DAaQMKPCXq6kfMZ53/egYHeGw8Jk,
	michaelc-hcNo3dDEHLuVc3sceRu5cw, Rakesh Ranjan
In-Reply-To: <1270728855-20951-1-git-send-email-rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

From: Rakesh Ranjan <rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>


Signed-off-by: Rakesh Ranjan <rakesh-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
 drivers/scsi/Kconfig        |    1 +
 drivers/scsi/Makefile       |    1 +
 drivers/scsi/cxgb4i/Kbuild  |    4 ++++
 drivers/scsi/cxgb4i/Kconfig |    7 +++++++
 4 files changed, 13 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/cxgb4i/Kbuild
 create mode 100644 drivers/scsi/cxgb4i/Kconfig

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 75f2336..fc3810a 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -371,6 +371,7 @@ config ISCSI_TCP
 	 http://open-iscsi.org
 
 source "drivers/scsi/cxgb3i/Kconfig"
+source "drivers/scsi/cxgb4i/Kconfig"
 source "drivers/scsi/bnx2i/Kconfig"
 source "drivers/scsi/be2iscsi/Kconfig"
 
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index 92a8c50..d8a90f5 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -133,6 +133,7 @@ obj-$(CONFIG_SCSI_STEX)		+= stex.o
 obj-$(CONFIG_SCSI_MVSAS)	+= mvsas/
 obj-$(CONFIG_PS3_ROM)		+= ps3rom.o
 obj-$(CONFIG_SCSI_CXGB3_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgb3i/
+obj-$(CONFIG_SCSI_CXGB4_ISCSI)	+= libiscsi.o libiscsi_tcp.o cxgb4i/
 obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
 obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
 obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
diff --git a/drivers/scsi/cxgb4i/Kbuild b/drivers/scsi/cxgb4i/Kbuild
new file mode 100644
index 0000000..4c5c40e
--- /dev/null
+++ b/drivers/scsi/cxgb4i/Kbuild
@@ -0,0 +1,4 @@
+EXTRA_CFLAGS += -I$(srctree)/drivers/net/cxgb4
+
+cxgb4i-y := cxgb4i_snic.o cxgb4i_iscsi.o cxgb4i_pdu.o cxgb4i_offload.o cxgb4i_ddp.o
+obj-$(CONFIG_SCSI_CXGB4_ISCSI) += cxgb4i.o
diff --git a/drivers/scsi/cxgb4i/Kconfig b/drivers/scsi/cxgb4i/Kconfig
new file mode 100644
index 0000000..3f33dc2
--- /dev/null
+++ b/drivers/scsi/cxgb4i/Kconfig
@@ -0,0 +1,7 @@
+config SCSI_CXGB4_ISCSI
+	tristate "Chelsio T4 iSCSI support"
+	depends on CHELSIO_T4_DEPENDS
+	select CHELSIO_T4
+	select SCSI_ISCSI_ATTRS
+	---help---
+	This driver supports iSCSI offload for the Chelsio T4 series devices.
-- 
1.6.6.1

-- 
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To unsubscribe from this group, send email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
For more options, visit this group at http://groups.google.com/group/open-iscsi?hl=en.

^ permalink raw reply related

* cxgb4i submission
From: Rakesh Ranjan @ 2010-04-08 12:14 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	open-iscsi-/JYPxA39Uh5TLH3MbocFFw
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, kxie-ut6Up61K2wZBDgjK7y7TUQ,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	James.Bottomley-JuX6DAaQMKPCXq6kfMZ53/egYHeGw8Jk,
	michaelc-hcNo3dDEHLuVc3sceRu5cw

The following 3 patche add a new iscsi LLD driver cxgb4i to enable iscsi offload
support on Chelsio's new 1G and 10G cards. This driver depends upon cxgb4 driver,
posted recently on netdev and now part of net-next repo. Please share you commnets
after review. I will be sending updated patches after getting feedback/commnets.

-- 
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To unsubscribe from this group, send email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
For more options, visit this group at http://groups.google.com/group/open-iscsi?hl=en.

^ permalink raw reply

* Re: dhcp client packet sniffing...
From: David Miller @ 2010-04-08 12:11 UTC (permalink / raw)
  To: herbert; +Cc: netdev
In-Reply-To: <20100408114738.GA23329@gondor.apana.org.au>

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 8 Apr 2010 19:47:38 +0800

> Another approach is to use skb_share.  At some point all protocols
> were designed to handle shared skbs.
> 
> If we could tap into that then it would be an obvious way to
> eliminate the clone.  To do this we'd need to audit all the
> protocols to ensure that they can still handle shared packets
> safely.

I don't even want to call down into the AF_PACKET code for
the case where the filter doesn't pass.

We have the socket pointer etc. already in dev_queue_xmit_nit(), so we
can easily do it.

^ permalink raw reply

* [-next April 8] eHEA driver failure on powerpc
From: Sachin Sant @ 2010-04-08 11:55 UTC (permalink / raw)
  To: linux-next; +Cc: netdev, HERING2
In-Reply-To: <20100408155113.ef8df30c.sfr@canb.auug.org.au>

With today's next release, eHEA network interface on couple
of power6 boxes fails to initialize.

# modprobe ehea
IBM eHEA ethernet device driver (Release EHEA_0102)
  alloc irq_desc for 256 on node 0
  alloc kstat_irqs on node 0
irq: irq 590080 on host null mapped to virtual irq 256
ehea: Error in ehea_plpar_hcall_norets: opcode=26c ret=fffffffffffffffc arg1=8000000003000000 arg2=0 arg3=7000000000050400 arg4=fc9b0000 arg5=200 arg6=0 arg7=0
ehea: Error in ehea_reg_mr_section: register_rpage_mr failed
ehea: Error in ehea_reg_kernel_mr: registering mr failed
ehea: Error in ehea_setup_ports: creating MR failed
ehea 23c00200.lhea: setup_ports failed
ehea: probe of 23c00200.lhea failed with error -5
# lsmod
Module                  Size  Used by
ehea                   82002  0 
ipv6                  386130  24 
fuse                   89953  1 
loop                   20967  0 
dm_mod                105104  0 
sg                     40193  0 
sd_mod                 46334  3 
crc_t10dif              1619  1 sd_mod
ibmvscsic              36002  2 
scsi_transport_srp      8024  1 ibmvscsic
scsi_tgt               17581  1 scsi_transport_srp
scsi_mod              218761  5 sg,sd_mod,ibmvscsic,scsi_transport_srp,scsi_tgt
# ifconfig eth0
eth0: error fetching interface information: Device not found

Yesterday's next worked fine.

Thanks
-Sachin


-- 

---------------------------------
Sachin Sant
IBM Linux Technology Center
India Systems and Technology Labs
Bangalore, India
---------------------------------

^ permalink raw reply

* Re: [PATCH] tcp: Set CHECKSUM_UNNECESSARY in tcp_init_nondata_skb
From: Herbert Xu @ 2010-04-08 11:49 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100408.012617.98660541.davem@davemloft.net>

On Thu, Apr 08, 2010 at 01:26:17AM -0700, David Miller wrote:
> 
> Back in commit 04a0551c87363f100b04d28d7a15a632b70e18e7
> ("loopback: Drop obsolete ip_summed setting") we stopped
> setting CHECKSUM_UNNECESSARY in the loopback xmit.
> 
> This is because such a setting was a lie since it implies that the
> checksum field of the packet is properly filled in.
> 
> Instead what happens normally is that CHECKSUM_PARTIAL is set and
> skb->csum is calculated as needed.
> 
> But this was only happening for TCP data packets (via the
> skb->ip_summed assignment done in tcp_sendmsg()).  It doesn't
> happen for non-data packets like ACKs etc.

I don't understand.  If said packet was going across the network,
how could it possibly have worked?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: dhcp client packet sniffing...
From: Herbert Xu @ 2010-04-08 11:47 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20100408.035049.177640912.davem@davemloft.net>

On Thu, Apr 08, 2010 at 03:50:49AM -0700, David Miller wrote:
> 
> This is an old topic, but looking at traces tonight I was reminded
> about it.
> 
> dhcp clients sniff every packet in the system, the reason it does this
> and the things we can do to make it not have to do so have been
> discussed before.  Actually, I don't remember where we got with
> that and if we were able to make it such that the dhcp client
> doesn't have to do this any more.  Herbert?

The main problem is that it needs to be able to receive packets
destined to an address which is not yet a local address.  IOW,
it needs to be able to bypass the routing table and receive
non-local traffic.

> This means every packet in the machine gets sniffed.
> 
> The DHCP client at least installs a socket filter that only accepts
> the packets that the DHCP client is actually interested in.
> 
> The problem is that we clone the SKB and do some other operations
> before running the socket filter.
> 
> I was thinking, what if we simply move the sk_filter() call up to
> dev_queue_xmit_nit()?  And if sk_filter() rejects we don't even need
> to clone the packet.

Since you're talking about dev_queue_xmit_nit, I presume you're
mainly concerned about the TX direction?

FWIW the DHCP client does not need to see any packets in the TX
direction.  So if we could provide a way to only sniff RX traffic
then that would remove the need to clone for TX.  However, this
may require user-space modifications.

Another approach is to use skb_share.  At some point all protocols
were designed to handle shared skbs.

If we could tap into that then it would be an obvious way to
eliminate the clone.  To do this we'd need to audit all the
protocols to ensure that they can still handle shared packets
safely.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: mmotm 2010-04-05-16-09 uploaded
From: Patrick McHardy @ 2010-04-08 11:41 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Andrew Morton, Peter Zijlstra, Ingo Molnar, David S. Miller,
	linux-kernel, netfilter-devel, netdev
In-Reply-To: <13074.1270663309@localhost>

[-- Attachment #1: Type: text/plain, Size: 1276 bytes --]

Valdis.Kletnieks@vt.edu wrote:
> On Mon, 05 Apr 2010 16:09:45 PDT, akpm@linux-foundation.org said:
>> The mm-of-the-moment snapshot 2010-04-05-16-09 has been uploaded to
>>
>>    http://userweb.kernel.org/~akpm/mmotm/
> 
> Seen in dmesg, 2.6.34-rc2-mmotm0323 didn't do this. Tossing it at all the
> likely suspects, hopefully somebody will recognize it and save me the
> bisecting. ;)
> 
> [   11.488535] ctnetlink v0.93: registering with nfnetlink.
> [   11.488579] 
> [   11.488579] ===================================================
> [   11.489529] [ INFO: suspicious rcu_dereference_check() usage. ]
> [   11.489988] ---------------------------------------------------
> [   11.490494] net/netfilter/nf_conntrack_ecache.c:88 invoked rcu_dereference_check() without protection!
> [   11.491024] 
> [   11.491024] other info that might help us debug this:
> [   11.491025] 
> [   11.492834] 
> [   11.492835] rcu_scheduler_active = 1, debug_locks = 0
> [   11.494124] 1 lock held by swapper/1:
> [   11.494776]  #0:  (nf_ct_ecache_mutex){+.+...}, at: [<ffffffff8148c606>] nf_conntrack_register_notifier+0x1a/0x76
> [   11.495505] 

There are some unnecessary rcu_dereference() calls in the conntrack
notifier registration and unregistration functions.

Does this fix it?


[-- Attachment #2: x --]
[-- Type: text/plain, Size: 1834 bytes --]

diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index d5a9bcd..849614a 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -81,11 +81,9 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
 int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
 {
 	int ret = 0;
-	struct nf_ct_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
-	if (notify != NULL) {
+	if (nf_conntrack_event_cb != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -101,11 +99,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
 
 void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *new)
 {
-	struct nf_ct_event_notifier *notify;
-
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_conntrack_event_cb);
-	BUG_ON(notify != new);
+	BUG_ON(nf_conntrack_event_cb != new);
 	rcu_assign_pointer(nf_conntrack_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }
@@ -114,11 +109,9 @@ EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
 int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *new)
 {
 	int ret = 0;
-	struct nf_exp_event_notifier *notify;
 
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
-	if (notify != NULL) {
+	if (nf_expect_event_cb != NULL) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -134,11 +127,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier);
 
 void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *new)
 {
-	struct nf_exp_event_notifier *notify;
-
 	mutex_lock(&nf_ct_ecache_mutex);
-	notify = rcu_dereference(nf_expect_event_cb);
-	BUG_ON(notify != new);
+	BUG_ON(nf_expect_event_cb != new);
 	rcu_assign_pointer(nf_expect_event_cb, NULL);
 	mutex_unlock(&nf_ct_ecache_mutex);
 }

^ permalink raw reply related

* Re: [PATCH] IPVS: replace sprintf to snprintf to avoid stack buffer overflow
From: Patrick McHardy @ 2010-04-08 11:37 UTC (permalink / raw)
  To: Simon Horman
  Cc: wzt.wzt, linux-kernel, Wensong Zhang, Julian Anastasov, netdev,
	lvs-devel
In-Reply-To: <20100407223445.GA15810@verge.net.au>

Simon Horman wrote:
> On Wed, Apr 07, 2010 at 06:09:54PM +0200, Patrick McHardy wrote:
>> Simon Horman wrote:
>>> On Tue, Apr 06, 2010 at 10:50:20AM +0800, wzt.wzt@gmail.com wrote:
>>>> IPVS not check the length of pp->name, use sprintf will cause stack buffer overflow.
>>>> struct ip_vs_protocol{} declare name as char *, if register a protocol as:
>>>> struct ip_vs_protocol ip_vs_test = {
>>>>         .name =			"aaaaaaaa....128...aaa",
>>>> 	.debug_packet =         ip_vs_tcpudp_debug_packet,
>>>> };
>>>>
>>>> when called ip_vs_tcpudp_debug_packet(), sprintf(buf, "%s TRUNCATED", pp->name); 
>>>> will cause stack buffer overflow.
>>>>
>>>> Signed-off-by: Zhitong Wang <zhitong.wangzt@alibaba-inc.com>
>>> I think that the simple answer is, don't do that.
>> Indeed.
>>
>>> But your patch seems entirely reasonable to me.
>>>
>>> Acked-by: Simon Horman <horms@verge.net.au>
>>>
>>> Patrick, please consider merging this.
>> I think this fix is a bit silly, we can simply print the name in
>> the pr_debug() statement and avoid both the potential overflow
>> and truncation.
>>
>> How does this look?
> 
> Looks good to me:
> 
> Acked-by: Simon Horman <horms@verge.net.au>

Thanks, I've applied the patch.

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Mark Brown @ 2010-04-08 11:31 UTC (permalink / raw)
  To: Timo Teräs; +Cc: netdev
In-Reply-To: <4BBDBCF9.5060906@iki.fi>

On Thu, Apr 08, 2010 at 02:24:41PM +0300, Timo Teräs wrote:

> Probably the same as http://marc.info/?t=127071006600005&r=1&w=2

> Happens because CONFIG_XFRM_SUB_POLICY is not enabled, and one of
> the helper functions I used did unexpected things in that case.

> Try the following:

Yes, that seems to make things much happier - thanks!  I do have one
patch I came up with while I was looking at the code (though I'm not
sure it's correct), I'll post that once I've had lunch.

^ permalink raw reply

* Re: Crashes in xfrm_lookup
From: Timo Teräs @ 2010-04-08 11:24 UTC (permalink / raw)
  To: Mark Brown; +Cc: netdev
In-Reply-To: <20100408111441.GA14241@sirena.org.uk>

Mark Brown wrote:
> With -next as of today I'm experiencing crashes in the xfrm_lookup code
> when attempting to boot from an NFS root on a SMDK6410 (an ARM based
> development board).  I'm currently investigating what's gone wrong, but
> thought it was better to report early in case it's obvious to someone
> familiar with the code or there's a fix already.

Probably the same as http://marc.info/?t=127071006600005&r=1&w=2

Happens because CONFIG_XFRM_SUB_POLICY is not enabled, and one of
the helper functions I used did unexpected things in that case.

Try the following:

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 625dd61..cccb049 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -735,19 +735,12 @@ static inline void xfrm_pol_put(struct xfrm_policy *policy)
 		xfrm_policy_destroy(policy);
 }
 
-#ifdef CONFIG_XFRM_SUB_POLICY
 static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
 {
 	int i;
 	for (i = npols - 1; i >= 0; --i)
 		xfrm_pol_put(pols[i]);
 }
-#else
-static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
-{
-	xfrm_pol_put(pols[0]);
-}
-#endif
 
 extern void __xfrm_state_destroy(struct xfrm_state *);
 


^ permalink raw reply related

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: Patrick McHardy @ 2010-04-08 11:24 UTC (permalink / raw)
  To: David Miller; +Cc: fw, netdev, johannes
In-Reply-To: <20100408.025412.247148013.davem@davemloft.net>

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Thu, 08 Apr 2010 11:44:43 +0200
> 
>> Either the kernel or the userspace programs have to be updated
>> either way.
> 
> The only case in which we only have to change one side is if we add
> the full set of compat support to the kernel.
> 
> If we take any other option (new XFRM numbers and new datastructures,
> or only convert sendmsg() to do compat translations), it requires both
> the kernel and userspace to change.

You're right of course.

> And the currently existing 32-bit binaries don't work on 64-bit
> kernels because of something that cannot be classified any other way
> than as being a kernel bug.

Agreed, but since its not a new bug, I think we have some flexibility
in how to fix it. In the wireless case we had no real choice but to
add the COMPAT_NETLINK thing, but its a bit sad to add more of this
crap to netlink.

Anyways, I guess there's no reason why we couldn't do both and
transistion to a fixed API in the long term.

^ permalink raw reply

* Crashes in xfrm_lookup
From: Mark Brown @ 2010-04-08 11:14 UTC (permalink / raw)
  To: netdev; +Cc: Timo =?unknown-8bit?B?VGVyw6Rz?=

With -next as of today I'm experiencing crashes in the xfrm_lookup code
when attempting to boot from an NFS root on a SMDK6410 (an ARM based
development board).  I'm currently investigating what's gone wrong, but
thought it was better to report early in case it's obvious to someone
familiar with the code or there's a fix already.

Sample backtrace below:

[    6.540000] Unable to handle kernel NULL pointer dereference at virtual addrc
[    6.540000] pgd = c0004000                                                   
[    6.550000] [0000003c] *pgd=00000000                                         
[    6.550000] Internal error: Oops: 5 [#1]                                     
[    6.550000] last sysfs file:                                                 
[    6.550000] Modules linked in:                                               
[    6.550000] CPU: 0    Not tainted (2.6.34-rc3-next-20100408-00011-g3d8ee7a-)
[    6.550000] PC is at __xfrm_lookup+0x308/0x3a8                               
[    6.550000] LR is at ip_route_output_flow+0x7c/0x210                         

...

[    6.550000] [<c02c1394>] (__xfrm_lookup+0x308/0x3a8) from [<c02889a4>] (ip_r)
[    6.550000] [<c02889a4>] (ip_route_output_flow+0x7c/0x210) from [<c02ade9c>])
[    6.550000] [<c02ade9c>] (udp_sendmsg+0x320/0x5ec) from [<c02b3b20>] (inet_s)
[    6.550000] [<c02b3b20>] (inet_sendmsg+0x58/0x64) from [<c02612d8>] (sock_se)
[    6.550000] [<c02612d8>] (sock_sendmsg+0x88/0xa8) from [<c0261338>] (kernel_)
[    6.550000] [<c0261338>] (kernel_sendmsg+0x40/0x7c) from [<c02d0dd0>] (xs_se)
[    6.550000] [<c02d0dd0>] (xs_send_kvec+0x94/0xa4) from [<c02d0e70>] (xs_send)
[    6.550000] [<c02d0e70>] (xs_sendpages+0x90/0x204) from [<c02d1268>] (xs_udp)
[    6.550000] [<c02d1268>] (xs_udp_send_request+0x3c/0x120) from [<c02cf254>] )
[    6.550000] [<c02cf254>] (xprt_transmit+0x158/0x25c) from [<c02cc9c0>] (call)
[    6.550000] [<c02cc9c0>] (call_transmit+0x204/0x284) from [<c02d380c>] (__rp)
[    6.550000] [<c02d380c>] (__rpc_execute+0x94/0x2c4) from [<c02cd35c>] (rpc_r)
[    6.550000] [<c02cd35c>] (rpc_run_task+0x60/0x6c) from [<c02cd4a0>] (rpc_cal)
[    6.550000] [<c02cd4a0>] (rpc_call_sync+0x58/0x84) from [<c02cd51c>] (rpc_pi)
[    6.550000] [<c02cd51c>] (rpc_ping+0x50/0x70) from [<c02cded0>] (rpc_create+)
[    6.550000] [<c02cded0>] (rpc_create+0x3e4/0x498) from [<c013c194>] (nfs_mou)
[    6.550000] [<c013c194>] (nfs_mount+0xd8/0x1c4) from [<c0016d14>] (nfs_root_)
[    6.550000] [<c0016d14>] (nfs_root_data+0x2cc/0x398) from [<c0008fb8>] (moun)
[    6.550000] [<c0008fb8>] (mount_root+0x1c/0x104) from [<c0009204>] (prepare_)
[    6.550000] [<c0009204>] (prepare_namespace+0x164/0x1c8) from [<c0008478>] ()
[    6.550000] [<c0008478>] (kernel_init+0x108/0x148) from [<c002df5c>] (kernel)


^ permalink raw reply

* HTB - What's the minimal value for 'rate' parameter?
From: Antonio Almeida @ 2010-04-08 11:07 UTC (permalink / raw)
  To: netdev, jarkao2, kaber, davem, devik

Hi!
I've been using HTB for a while, and we've already sent some e-mails
each other when resolving HTB accuracy issue.
When using HTB, I realised that for some configurations the rate limit
doesn't work.
I suspect that the problem is the minimum value of rate parameter,
which I cant figure out what is.

I simple configuration that turns out to be wrong is as fallows: The
root (1:1) gets the link bandwidth configuration; the second (1:2) is
set to 4096Kbit; then I have two branches (1:10 and 1:11) with rate
1024Kbit and ceil 4096Kbit; and finally a leaf class in each branch
(1:111 below 1:11, and 1:101 below 1:10) with rate 8bit and ceil
4096Kbit, and the same priority.
I don't want to have sustained rate, and since I must configure 'rate'
parameter I decide to set it to 8bits - which is the minimal accepted
value. My cue goes for 'rate' parameter. If I set 'rate' parameter to
1Kbit for instance, the problem disappears and the shaping is done
perfectly.

So, I'm looking for help to find out if the problem is actually in
this parameter configuration or if it's just coincidence and I'll get
the same problem ahead :(
What's the minimal value for 'rate' parameter using HTB qdisc?

Here's the tc command output, using leaves rate set to 8bit:

# tc -s class list dev eth1
class htb 1:101 parent 1:10 leaf 101: prio 3 rate 8bit ceil 4096Kbit
burst 225b cburst 3655b
 Sent 42305702 bytes 27943 pkt (dropped 23031, overlimits 0 requeues 0)
 rate 4036Kbit 333pps backlog 0b 126p requeues 0
 lended: 27817 borrowed: 0 giants: 0
 tokens: 1250000000 ctokens: -39387

class htb 1:11 parent 1:2 rate 1024Kbit ceil 4096Kbit burst 2113b cburst 3655b
 Sent 42170956 bytes 27854 pkt (dropped 0, overlimits 0 requeues 0)
 rate 4035Kbit 333pps backlog 0b 0p requeues 0
 lended: 0 borrowed: 0 giants: 0
 tokens: -937499999 ctokens: -42881

class htb 1:10 parent 1:2 rate 1024Kbit ceil 4096Kbit burst 2113b cburst 3655b
 Sent 42114938 bytes 27817 pkt (dropped 0, overlimits 0 requeues 0)
 rate 4035Kbit 333pps backlog 0b 0p requeues 0
 lended: 0 borrowed: 0 giants: 0
 tokens: -937499999 ctokens: -39387

class htb 1:1 root rate 1000Mbit ceil 1000Mbit burst 503375b cburst 503375b
 Sent 84285894 bytes 55671 pkt (dropped 0, overlimits 0 requeues 0)
 rate 8071Kbit 666pps backlog 0b 0p requeues 0
 lended: 0 borrowed: 0 giants: 0
 tokens: 62750 ctokens: 62750

class htb 1:111 parent 1:11 leaf 111: prio 3 rate 8bit ceil 4096Kbit
burst 225b cburst 3655b
 Sent 42363234 bytes 27981 pkt (dropped 23064, overlimits 0 requeues 0)
 rate 4035Kbit 333pps backlog 0b 127p requeues 0
 lended: 27854 borrowed: 0 giants: 0
 tokens: 1250000000 ctokens: -42881

class htb 1:2 parent 1:1 rate 4096Kbit ceil 4096Kbit burst 3655b cburst 3655b
 Sent 84285894 bytes 55671 pkt (dropped 0, overlimits 0 requeues 0)
 rate 8071Kbit 666pps backlog 0b 0p requeues 0
 lended: 0 borrowed: 0 giants: 0
 tokens: -937499999 ctokens: -937499999

class sfq 111:16 parent 111:
 (dropped 0, overlimits 0 requeues 0)
 backlog 0b 127p requeues 0
 allot 1514

class sfq 101:252 parent 101:
 (dropped 0, overlimits 0 requeues 0)
 backlog 0b 126p requeues 0
 allot 1514

Regards
  Antonio Almeida

^ permalink raw reply

* Re: dhcp client packet sniffing...
From: David Miller @ 2010-04-08 11:01 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, herbert
In-Reply-To: <20100408105951.GA16868@hmsreliant.think-freely.org>

From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 8 Apr 2010 06:59:51 -0400

> Dave, sorry to be late to the discussion, but can you refesh us on
> why dhcp has to sniff every packet?

It wants to see DHCP packets from the server.

And it can't use ipv4 RAW sockets due to some limitations wrt. getting
at the link layer headers when using that interfaces.

^ permalink raw reply

* Re: dhcp client packet sniffing...
From: Neil Horman @ 2010-04-08 10:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, herbert
In-Reply-To: <20100408.035049.177640912.davem@davemloft.net>

On Thu, Apr 08, 2010 at 03:50:49AM -0700, David Miller wrote:
> 
> This is an old topic, but looking at traces tonight I was reminded
> about it.
> 
> dhcp clients sniff every packet in the system, the reason it does this
> and the things we can do to make it not have to do so have been
> discussed before.  Actually, I don't remember where we got with
> that and if we were able to make it such that the dhcp client
> doesn't have to do this any more.  Herbert?
> 
Dave, sorry to be late to the discussion, but can you refesh us on why dhcp has
to sniff every packet?
Regards
Neil


^ permalink raw reply

* dhcp client packet sniffing...
From: David Miller @ 2010-04-08 10:50 UTC (permalink / raw)
  To: netdev; +Cc: herbert

This is an old topic, but looking at traces tonight I was reminded
about it.

dhcp clients sniff every packet in the system, the reason it does this
and the things we can do to make it not have to do so have been
discussed before.  Actually, I don't remember where we got with
that and if we were able to make it such that the dhcp client
doesn't have to do this any more.  Herbert?

But, in any event, the fact of the matter is that currently it still
does on many machines.

This means every packet in the machine gets sniffed.

The DHCP client at least installs a socket filter that only accepts
the packets that the DHCP client is actually interested in.

The problem is that we clone the SKB and do some other operations
before running the socket filter.

I was thinking, what if we simply move the sk_filter() call up to
dev_queue_xmit_nit()?  And if sk_filter() rejects we don't even need
to clone the packet.

^ permalink raw reply

* FEC driver: rcv is not +last
From: Matthias Kaehlcke @ 2010-04-08 10:40 UTC (permalink / raw)
  To: netdev; +Cc: Sascha Hauer

hi,

i have problems with the FEC on a i.MX25 3-Stack board. the kernel is
v2.6.34-rc2 plus the following patch:
http://patchwork.ozlabs.org/patch/41235/

the following traces are generated at boot time:

FEC Ethernet Driver
fec: PHY @ 0x1, ID 0x20005ce1 -- unknown PHY!
...
eth0: config: auto-negotiation on, 100FDX, 100HDX, 10FDX, 10HDX.
...
FEC ENET: rcv is not +last
FEC ENET: rcv is not +last
FEC ENET: rcv is not +last
FEC ENET: rcv is not +last
...

the PHY of the board is a DP83840, which is not supported by the
driver. could this be the problem? i tried to make the kernel think
the DP83840 is a DP83848, which is supported, but the behaviour is the
same except the 'unknown PHY' warning.

any idea what could be wrong?

-- 
Matthias Kaehlcke
Embedded Linux Developer
Barcelona

              You can't separate peace from freedom because no
               one can be at peace unless he has his freedom
                              (Malcolm X)
                                                                 .''`.
    using free software / Debian GNU/Linux | http://debian.org  : :'  :
                                                                `. `'`
gpg --keyserver pgp.mit.edu --recv-keys 47D8E5D4                  `-

^ permalink raw reply

* Re: IP/UDP encapsulation
From: ZioPRoTo (Saverio Proto) @ 2010-04-08 10:39 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mark Smith, Gustavo F. Padovan, netdev, marco bonola,
	Behling Mario, L. Aaron Kaplan
In-Reply-To: <1270722573.2215.47.camel@edumazet-laptop>

>> I'm a bit confused. How can tunnelling IP in UDP in IP be faster than IP in IP?
>>
>
> Maybe the 'gateway' doesnt handle IPIP at all ;)

Yes that's the point. IP in UDP is more supported. It has much an
easier way when your traffic travels over the Internet, and maybe you
have to pass some NAT. Some NAT will not handle at all IP in IP
packets.

IP in IP has of course less overhead but you can experience problems
and many network setups. This is why at Freifunk we are thinking of
developing this kernel module.

Regards

Saverio Proto

^ permalink raw reply

* Re: IP/UDP encapsulation
From: Eric Dumazet @ 2010-04-08 10:29 UTC (permalink / raw)
  To: Mark Smith
  Cc: Gustavo F. Padovan, netdev, marco bonola,
	ZioPRoTo (Saverio Proto), Behling Mario, L. Aaron Kaplan
In-Reply-To: <20100408191831.08cd8d7b@opy.nosense.org>

Le jeudi 08 avril 2010 à 19:18 +0930, Mark Smith a écrit :
> On Thu, 8 Apr 2010 04:42:47 -0300
> "Gustavo F. Padovan" <gustavo@padovan.org> wrote:
> 
> > Hi,
> > 
> > I'm looking for some advice on that work. The Freifunk organization is
> > planning work on the IP/UDP encapsulation kernel module as a GSoC
> > project. The idea is to create a IP-in-UDP tunnel like we do for
> > IP-in-IP or IP-in-GRE tunnels. The only way to do that today is to use
> > some VPN software.
> > 
> > The module will export its virtual interface through sockets and will
> > have support for the standard syscalls like the others encapsulation
> > modules.
> > 
> > It will improve the performance of mesh networks that will we be able
> > to use IP-in-UDP rather than IP-in-IP.
> 
> I'm a bit confused. How can tunnelling IP in UDP in IP be faster than IP in IP?
> 

Maybe the 'gateway' doesnt handle IPIP at all ;)

Until 2.6.32, IPIP tunnels were not so scalable then UDP (RCU enabled)
ipip_rcv() was hitting a global rwlock

git describe 8f95dd63a2ab6fe7243c4f0bd2c3266e3a5525ab
v2.6.32-rc3-468-g8f95dd6




^ permalink raw reply

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: David Miller @ 2010-04-08  9:54 UTC (permalink / raw)
  To: kaber; +Cc: fw, netdev, johannes
In-Reply-To: <4BBDA58B.8090301@trash.net>

From: Patrick McHardy <kaber@trash.net>
Date: Thu, 08 Apr 2010 11:44:43 +0200

> Either the kernel or the userspace programs have to be updated
> either way.

The only case in which we only have to change one side is if we add
the full set of compat support to the kernel.

If we take any other option (new XFRM numbers and new datastructures,
or only convert sendmsg() to do compat translations), it requires both
the kernel and userspace to change.

And the currently existing 32-bit binaries don't work on 64-bit
kernels because of something that cannot be classified any other way
than as being a kernel bug.

^ permalink raw reply

* Re: IP/UDP encapsulation
From: Mark Smith @ 2010-04-08  9:48 UTC (permalink / raw)
  To: Gustavo F. Padovan
  Cc: netdev, marco bonola, ZioPRoTo (Saverio Proto), Behling Mario,
	L. Aaron Kaplan
In-Reply-To: <20100408074247.GA19798@vigoh>

On Thu, 8 Apr 2010 04:42:47 -0300
"Gustavo F. Padovan" <gustavo@padovan.org> wrote:

> Hi,
> 
> I'm looking for some advice on that work. The Freifunk organization is
> planning work on the IP/UDP encapsulation kernel module as a GSoC
> project. The idea is to create a IP-in-UDP tunnel like we do for
> IP-in-IP or IP-in-GRE tunnels. The only way to do that today is to use
> some VPN software.
> 
> The module will export its virtual interface through sockets and will
> have support for the standard syscalls like the others encapsulation
> modules.
> 
> It will improve the performance of mesh networks that will we be able
> to use IP-in-UDP rather than IP-in-IP.

I'm a bit confused. How can tunnelling IP in UDP in IP be faster than IP in IP?


> So, instead of push all data to
> local gateway into the mesh all the data can be tunneled to a faster
> server and from there to the Internet. With all the data exiting with
> the same IP address (the fast server IP). That will improve bandwidth,
> especially for upload.
> 
> Is such module acceptable for merge into the Linux Kernel?
> 
> Any comments or suggestions to the module architecture and
> implementation? If you want more information about the module I can
> provide that.
> 
> Regards,
> 
> -- 
> Gustavo F. Padovan
> http://padovan.org
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v3 0/4] xfrm: add x86 CONFIG_COMPAT support
From: Patrick McHardy @ 2010-04-08  9:44 UTC (permalink / raw)
  To: David Miller; +Cc: fw, netdev, johannes
In-Reply-To: <20100407.164842.54065324.davem@davemloft.net>

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 07 Apr 2010 15:45:51 +0200
> 
>> Florian Westphal wrote:
>>> David Miller <davem@davemloft.net> wrote:
>>>> From: Florian Westphal <fw@strlen.de>
>>>> Date: Tue,  6 Apr 2010 00:27:07 +0200
>>> [..]
>>>
>>>>> I sent a patch that solved this by adding a sys_compat_write syscall
>>>>> and a ->compat_aio_write() to struct file_operations to the
>>>>> vfs mailing list, but that patch was ignored by the vfs people,
>>>>> and the x86 folks did not exactly like the idea either.
>>>>>
>>>>> So this leaves three alternatives:
>>>>> 1 - drop the whole idea and keep the current status.
>>>>> 2 - Add new structure definitions (with new numbering) that would work
>>>>>     everywhere, keep the old ones for backwards compatibility (This
>>>>>     was suggested by Arnd Bergmann).
>> Given that there is only a quite small number of users of this
>> interface, that would in my opinion be the best way.
> 
> Can you explain that line of reasoning?
> 
> It's not that there are only "3 or 4 tools" using these interfaces,
> it's the fact that 32-bit binaries of those tools are on millions and
> millions of systems out there.

Yeah, but they're currently not working if used on 64 bit hosts,
so I don't think its unreasonable to consider just the number of
tools that need to be fixed. Either the kernel or the userspace
programs have to be updated either way.

^ permalink raw reply

* Re: [PATCH] tcp: Set CHECKSUM_UNNECESSARY in tcp_init_nondata_skb
From: Joe Perches @ 2010-04-08  9:20 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, herbert
In-Reply-To: <20100408.012617.98660541.davem@davemloft.net>

On Thu, 2010-04-08 at 01:26 -0700, David Miller wrote:
> Fix this by setting skb->ip_summed in the common non-data packet
> constructor.  It already is setting skb->csum to zero.
> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index f181b78..00afbb0 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -349,6 +349,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
>   */
>  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
>  {
> +	skb->ip_summed = CHECKSUM_PARTIAL;
>  	skb->csum = 0;
>  
>  	TCP_SKB_CB(skb)->flags = flags;

There might be trivial value in using the
struct layout order for the sets avoiding
crossing cachelines.

from:

static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum = 0;

	TCP_SKB_CB(skb)->flags = flags;
	TCP_SKB_CB(skb)->sacked = 0;

	skb_shinfo(skb)->gso_segs = 1;
	skb_shinfo(skb)->gso_size = 0;
	skb_shinfo(skb)->gso_type = 0;

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
}

to:

static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	skb->ip_summed = CHECKSUM_PARTIAL; 
	skb->csum = 0;

	TCP_SKB_CB(skb)->seq = seq;
	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
		seq++;
	TCP_SKB_CB(skb)->end_seq = seq;
	TCP_SKB_CB(skb)->sacked = 0;
	TCP_SKB_CB(skb)->flags = flags;

	skb_shinfo(skb)->gso_size = 0;
	skb_shinfo(skb)->gso_segs = 1;
	skb_shinfo(skb)->gso_type = 0;
}



^ permalink raw reply

* Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.
From: xiaohui.xin @ 2010-04-08  9:07 UTC (permalink / raw)
  To: mst; +Cc: netdev, kvm, linux-kernel, jdike, Xin Xiaohui
In-Reply-To: <20100407081800.GC9550@redhat.com>

From: Xin Xiaohui <xiaohui.xin@intel.com>

---
Michael,
This is a small patch for the write logging issue with async queue.
I have made a __vhost_get_vq_desc() func which may compute the log
info with any valid buffer index. The __vhost_get_vq_desc() is 
coming from the code in vq_get_vq_desc().
And I use it to recompute the log info when logging is enabled.

Thanks
Xiaohui

 drivers/vhost/net.c   |   27 ++++++++---
 drivers/vhost/vhost.c |  115 ++++++++++++++++++++++++++++---------------------
 drivers/vhost/vhost.h |    5 ++
 3 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2aafd90..00a45ef 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -115,7 +115,8 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
 	struct kiocb *iocb = NULL;
 	struct vhost_log *vq_log = NULL;
 	int rx_total_len = 0;
-	int log, size;
+	unsigned int head, log, in, out;
+	int size;
 
 	if (vq->link_state != VHOST_VQ_LINK_ASYNC)
 		return;
@@ -130,14 +131,25 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
 				iocb->ki_pos, iocb->ki_nbytes);
 		log = (int)iocb->ki_user_data;
 		size = iocb->ki_nbytes;
+		head = iocb->ki_pos;
 		rx_total_len += iocb->ki_nbytes;
 
 		if (iocb->ki_dtor)
 			iocb->ki_dtor(iocb);
 		kmem_cache_free(net->cache, iocb);
 
-		if (unlikely(vq_log))
+		/* when log is enabled, recomputing the log info is needed,
+		 * since these buffers are in async queue, and may not get
+		 * the log info before.
+		 */
+		if (unlikely(vq_log)) {
+			if (!log)
+				__vhost_get_vq_desc(&net->dev, vq, vq->iov,
+						    ARRAY_SIZE(vq->iov),
+						    &out, &in, vq_log,
+						    &log, head);
 			vhost_log_write(vq, vq_log, log, size);
+		}
 		if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
@@ -313,14 +325,13 @@ static void handle_rx(struct vhost_net *net)
 	vhost_disable_notify(vq);
 	hdr_size = vq->hdr_size;
 
-	/* In async cases, for write logging, the simple way is to get
-	 * the log info always, and really logging is decided later.
-	 * Thus, when logging enabled, we can get log, and when logging
-	 * disabled, we can get log disabled accordingly.
+	/* In async cases, when write log is enabled, in case the submitted
+	 * buffers did not get log info before the log enabling, so we'd
+	 * better recompute the log info when needed. We do this in
+	 * handle_async_rx_events_notify().
 	 */
 
-	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
-		(vq->link_state == VHOST_VQ_LINK_ASYNC) ?
+	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
 	handle_async_rx_events_notify(net, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			   struct iovec iov[], unsigned int iov_size,
 			   unsigned int *out_num, unsigned int *in_num,
-			   struct vhost_log *log, unsigned int *log_num)
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head)
 {
 	struct vring_desc desc;
-	unsigned int i, head, found = 0;
-	u16 last_avail_idx;
+	unsigned int i = head, found = 0;
 	int ret;
 
-	/* Check it isn't doing very strange things with descriptor numbers. */
-	last_avail_idx = vq->last_avail_idx;
-	if (get_user(vq->avail_idx, &vq->avail->idx)) {
-		vq_err(vq, "Failed to access avail idx at %p\n",
-		       &vq->avail->idx);
-		return vq->num;
-	}
-
-	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
-		vq_err(vq, "Guest moved used index from %u to %u",
-		       last_avail_idx, vq->avail_idx);
-		return vq->num;
-	}
-
-	/* If there's nothing new since last we looked, return invalid. */
-	if (vq->avail_idx == last_avail_idx)
-		return vq->num;
-
-	/* Only get avail ring entries after they have been exposed by guest. */
-	rmb();
-
-	/* Grab the next descriptor number they're advertising, and increment
-	 * the index we've seen. */
-	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
-		vq_err(vq, "Failed to read head: idx %d address %p\n",
-		       last_avail_idx,
-		       &vq->avail->ring[last_avail_idx % vq->num]);
-		return vq->num;
-	}
-
-	/* If their number is silly, that's an error. */
-	if (head >= vq->num) {
-		vq_err(vq, "Guest says index %u > %u is available",
-		       head, vq->num);
-		return vq->num;
-	}
-
 	/* When we start there are none of either input nor output. */
 	*out_num = *in_num = 0;
 	if (unlikely(log))
 		*log_num = 0;
 
-	i = head;
 	do {
 		unsigned iov_count = *in_num + *out_num;
 		if (i >= vq->num) {
@@ -833,8 +788,70 @@ unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			*out_num += ret;
 		}
 	} while ((i = next_desc(&desc)) != -1);
+	return head;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access.  Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which
+ * is never a valid descriptor number) if none was found. */
+unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+			   struct iovec iov[], unsigned int iov_size,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num)
+{
+	struct vring_desc desc;
+	unsigned int i, head, found = 0;
+	u16 last_avail_idx;
+	unsigned int ret;
+
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	last_avail_idx = vq->last_avail_idx;
+	if (get_user(vq->avail_idx, &vq->avail->idx)) {
+		vq_err(vq, "Failed to access avail idx at %p\n",
+		       &vq->avail->idx);
+		return vq->num;
+	}
+
+	if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) {
+		vq_err(vq, "Guest moved used index from %u to %u",
+		       last_avail_idx, vq->avail_idx);
+		return vq->num;
+	}
+
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->avail_idx == last_avail_idx)
+		return vq->num;
+
+	/* Only get avail ring entries after they have been exposed by guest. */
+	rmb();
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) {
+		vq_err(vq, "Failed to read head: idx %d address %p\n",
+		       last_avail_idx,
+		       &vq->avail->ring[last_avail_idx % vq->num]);
+		return vq->num;
+	}
+
+	/* If their number is silly, that's an error. */
+	if (head >= vq->num) {
+		vq_err(vq, "Guest says index %u > %u is available",
+		       head, vq->num);
+		return vq->num;
+	}
+
+	ret = __vhost_get_vq_desc(dev, vq, iov, iov_size,
+				  out_num, in_num,
+				  log, log_num, head);
 
 	/* On success, increment avail index. */
+	if (ret == vq->num)
+		return ret;
 	vq->last_avail_idx++;
 	return head;
 }
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index cffe39a..a74a6d4 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -132,6 +132,11 @@ unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,
 			   struct vhost_log *log, unsigned int *log_num);
+unsigned __vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *,
+			   struct iovec iov[], unsigned int iov_count,
+			   unsigned int *out_num, unsigned int *in_num,
+			   struct vhost_log *log, unsigned int *log_num,
+			   unsigned int head);
 void vhost_discard_vq_desc(struct vhost_virtqueue *);
 
 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-- 
1.5.4.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox